1 ;*****************************************************************************
2 ;* deblock-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;*****************************************************************************
25 %include "x86util.asm"
; 16-byte pshufb control mask: gathers bytes column-wise (0,4,8,12,...),
; i.e. performs a 4x4 byte-matrix transpose within one XMM register.
29 transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
38 ; expands to [base],...,[base+7*stride]
; 4-arg form: 8 consecutive row addresses; caller passes base3 = base+3*stride
; and stride3 = 3*stride so each operand needs at most one multiply-by-scale.
39 %define PASS8ROWS(base, base3, stride, stride3) \
40     [base], [base+stride], [base+stride*2], [base3], \
41     [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
; 5-arg overload (NASM selects %define overloads by parameter count):
; same 8 rows shifted by a constant byte offset.
43 %define PASS8ROWS(base, base3, stride, stride3, offset) \
44     PASS8ROWS(base+offset, base3+offset, stride, stride3)
46 ; in: 8 rows of 4 bytes in %4..%11
47 ; out: 4 rows of 8 bytes in m0..m3
; Generic 4x8 gather/transpose; %1..%3 select the punpck granularity chain
; (e.g. bw,wd,dq for bytes). NOTE(review): macro body not visible in this chunk.
48 %macro TRANSPOSE4x8_LOAD 11
77 ; in: 4 rows of 8 bytes in m0..m3
78 ; out: 8 rows of 4 bytes in %1..%8
; Inverse of the load above, scattering back to 8 row addresses.
; NOTE(review): body elided in this view.
79 %macro TRANSPOSE8x4B_STORE 8
; Byte-granularity instantiation of TRANSPOSE4x8_LOAD.
113 %macro TRANSPOSE4x8B_LOAD 8
114     TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
; Word-granularity instantiation (used by the chroma filters below).
117 %macro TRANSPOSE4x8W_LOAD 8
119     TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
126     TRANSPOSE4x4W 0, 1, 2, 3, 4
; Word store counterpart; body elided in this view.
130 %macro TRANSPOSE8x2W_STORE 8
165 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
166 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
; Transpose via SBUTTERFLY interleave stages (bw -> wd -> dq); %9 is the
; scratch/output buffer. NOTE(review): several interleave steps and the
; %endmacro are not visible in this chunk.
167 %macro TRANSPOSE6x8_MEM 9
176     SBUTTERFLY bw, 0, 1, 7
177     SBUTTERFLY bw, 2, 3, 7
178     SBUTTERFLY bw, 4, 5, 7
180     SBUTTERFLY3 bw, m6, %8, m7
181     SBUTTERFLY wd, 0, 2, 3
182     SBUTTERFLY wd, 4, 6, 3
185     SBUTTERFLY3 wd, m1, [%9+0x10], m3
186     SBUTTERFLY wd, 5, 7, 0
187     SBUTTERFLY dq, 1, 5, 0
188     SBUTTERFLY dq, 2, 6, 0
198 ; in: 8 rows of 8 in %1..%8
199 ; out: 8 rows of 8 in %9..%16
; Full 8x8 byte transpose, memory-to-memory, again via the three-stage
; SBUTTERFLY interleave ladder. NOTE(review): interior spills/stores and
; the %endmacro are elided in this view.
200 %macro TRANSPOSE8x8_MEM 16
209     SBUTTERFLY bw, 0, 1, 7
210     SBUTTERFLY bw, 2, 3, 7
211     SBUTTERFLY bw, 4, 5, 7
212     SBUTTERFLY3 bw, m6, %8, m7
214     SBUTTERFLY wd, 0, 2, 5
215     SBUTTERFLY wd, 4, 6, 5
216     SBUTTERFLY wd, 1, 3, 5
219     SBUTTERFLY wd, 6, 7, 5
220     SBUTTERFLY dq, 0, 4, 5
221     SBUTTERFLY dq, 1, 6, 5
226     SBUTTERFLY3 dq, m2, %11, m0
227     SBUTTERFLY dq, 3, 7, 4
; Two DIFF_GT variants follow (unsigned saturated |a-b| threshold compare);
; their bodies are not visible in this chunk.
235 ; out: %4 = |%1-%2|>%3
246 ; out: %4 = |%1-%2|>%3
267 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
268 ; out: m5=beta-1, m7=mask, %3=alpha-1
; LOAD_MASK: broadcast the alpha-1/beta-1 thresholds to all lanes, then
; build the H.264 filterSamplesFlag mask from the three edge conditions.
275     packuswb m4, m4 ; 16x alpha-1
276     packuswb m5, m5 ; 16x beta-1
280     DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
281     DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
283     DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
289 ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; DEBLOCK_P0_Q0: normal-strength p0/q0 update, computed branchlessly with
; pavgb tricks (the "+256" comments denote the bias from unsigned averaging).
292 %macro DEBLOCK_P0_Q0 0
295     pand m5, [pb_1] ; (p0^q0)&1
298     pavgb m3, m0 ; (p1 - q1 + 256)>>1
299     pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
301     pavgb m4, m2 ; (q0 - p0 + 256)>>1
303     paddusb m3, m4 ; d+128+33
; LUMA_Q1: clipped q1/p1 update (symmetric; used for both sides).
316 ; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
317 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
318 ; clobbers: q2, tmp, tc0
322     pavgb %2, %6 ; avg(p2,avg(p0,q0))
324     pand %6, [pb_1] ; (p2^avg(p0,q0))&1
325     psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
335 ;-----------------------------------------------------------------------------
336 ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;   Vertical-edge luma deblock, SSE2, x86-64 path (uses r10/r11 and m8+).
;   NOTE(review): prologue, mask setup and stores are elided in this chunk.
337 ;-----------------------------------------------------------------------------
339 cglobal deblock_v_luma_sse2, 5,5,10
345     add r4, r0 ; pix-3*stride
347     mova m0, [r4+r1] ; p1
348     mova m1, [r4+2*r1] ; p0
350     mova m3, [r0+r1] ; q1
; Expand the four per-edge tc0 bytes so each covers its 4-pixel group.
354     punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
361     DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
366     LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
368     movdqa m4, [r0+2*r1] ; q2
369     DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
374     LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
381 ;-----------------------------------------------------------------------------
382 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;   Horizontal-edge variant: transpose into a stack buffer, run the vertical
;   filter on it, then transpose the 4 modified rows back.
383 ;-----------------------------------------------------------------------------
385 cglobal deblock_h_luma_sse2, 5,7
392     %define pix_tmp rsp+0x30
398     ; transpose 6x16 -> tmp space
399     TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
402     TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
405     ; alpha, beta, tc0 are still in r2d, r3d, r4
406     ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
407     lea r0, [pix_tmp+0x30]
412     call deblock_v_luma_sse2
414     ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
417     movq m0, [pix_tmp+0x18]
418     movq m1, [pix_tmp+0x28]
419     movq m2, [pix_tmp+0x38]
420     movq m3, [pix_tmp+0x48]
421     TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r10, r11)
; Second half of the 16-pixel edge (offsets 0x10..0x40 of the tmp buffer).
427     movq m0, [pix_tmp+0x10]
428     movq m1, [pix_tmp+0x20]
429     movq m2, [pix_tmp+0x30]
430     movq m3, [pix_tmp+0x40]
431     TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r10, r11)
; 32-bit/MMX-width luma deblock template: %1=cpu suffix, %2=v/v8 name part,
; %3=mmsize (8 or 16), also used as the stack-spill stride for tc/mask.
; NOTE(review): stack setup, LOAD_MASK call and final stores are elided here.
442 %macro DEBLOCK_LUMA 3
443 ;-----------------------------------------------------------------------------
444 ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
445 ;-----------------------------------------------------------------------------
446 cglobal deblock_%2_luma_%1, 5,5
451     add r4, r0 ; pix-3*stride
452     %assign pad 2*%3+12-(stack_offset&15)
455     mova m0, [r4+r1] ; p1
456     mova m1, [r4+2*r1] ; p0
458     mova m3, [r0+r1] ; q1
464     punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
; Too few registers here, so tc and the edge mask are spilled to the stack.
465     mova [esp+%3], m4 ; tc
469     mova [esp], m4 ; mask
472     DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
474     pand m4, [esp+%3] ; tc
478     LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
480     mova m4, [r0+2*r1] ; q2
481     DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
482     mova m5, [esp] ; mask
484     mova m5, [esp+%3] ; tc
488     LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
496 ;-----------------------------------------------------------------------------
497 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;   32-bit horizontal variant: args live on the stack (0 register args), the
;   vertical filter is called twice, once per 8-pixel half of the edge.
498 ;-----------------------------------------------------------------------------
500 cglobal deblock_h_luma_%1, 0,5
506     %assign pad 0x78-(stack_offset&15)
508     %define pix_tmp esp+12
510     ; transpose 6x16 -> tmp space
511     TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
514     TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
517     lea r0, [pix_tmp+0x30]
523     call deblock_%2_luma_%1
; Advance the stacked pix/tc0 args in place for the second half.
525     add dword [esp   ], 8 ; pix_tmp+0x38
526     add dword [esp+16], 2 ; tc0+2
527     call deblock_%2_luma_%1
531     ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
536     movq m0, [pix_tmp+0x10]
537     movq m1, [pix_tmp+0x20]
538     movq m2, [pix_tmp+0x30]
539     movq m3, [pix_tmp+0x40]
540     TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)
544     movq m0, [pix_tmp+0x18]
545     movq m1, [pix_tmp+0x28]
546     movq m2, [pix_tmp+0x38]
547     movq m3, [pix_tmp+0x48]
548     TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)
552 %endmacro ; DEBLOCK_LUMA
; Instantiations: 8-pixel MMX and 16-pixel SSE2 variants.
555 DEBLOCK_LUMA mmxext, v8, 8
557 DEBLOCK_LUMA sse2, v, 16
; Strong (intra, bS=4) filter for one side of the edge: computes p0', p1',
; p2' per H.264 8.7 using the biased pavgb/psubb averaging idiom.
; NOTE(review): many intermediate steps are elided in this chunk; the t0..t5
; temporaries are %defined elsewhere in the file.
563 %macro LUMA_INTRA_P012 4 ; p0..p3 in memory
568     pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
581     psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
588     psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
592     pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
597     psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
605     pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
613     mova %1, t1 ; store p0
619     pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
621     paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
626     psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
634     mova %2, t0 ; store p1
635     mova %3, t1 ; store p2
; Re-binds the p/q register aliases so LUMA_INTRA_P012 can be reused for
; the q side; body largely elided in this view.
638 %macro LUMA_INTRA_SWAP_PQ 0
644     %define mask1p mask1q
; Template for the intra (strong) luma filter: %1=cpu, %2=v/v8.
; Spill slots (mask0/mask1p/mask1q) differ between 64-bit (red zone via
; rsp-24) and 32-bit (esp-based spill(x)) builds.
647 %macro DEBLOCK_LUMA_INTRA 2
663     %define mask1q [rsp-24]
667     %define spill(x) [esp+16*x+((stack_offset+4)&15)]
672     %define mask0 spill(2)
673     %define mask1p spill(3)
674     %define mask1q spill(4)
679 ;-----------------------------------------------------------------------------
680 ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
681 ;-----------------------------------------------------------------------------
682 cglobal deblock_%2_luma_intra_%1, 4,6,16
687     lea r5, [r1*3] ; 3*stride
693     add r4, r0 ; pix-4*stride
; 64-bit path: enough registers to keep mask0 and tc-free strong-filter
; conditions entirely in xmm regs.
701     LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
702     SWAP 7, 12 ; m12=mask0
704     pavgb t5, mpb_1 ; alpha/4+1
707     DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
708     DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
709     DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
; 32-bit path: same conditions, different register budget.
716     LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
720     pavgb m4, [pb_1] ; alpha/4+1
721     DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
723     DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
726     DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
; Strong-filter both sides: p side (descending addresses) then q side.
730     LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
732     LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
741 ;-----------------------------------------------------------------------------
742 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;   Horizontal intra edge: full 8x16 transpose to tmp, run vertical intra
;   filter, transpose back (writes 8 rows since 6 can't be stored alone).
743 ;-----------------------------------------------------------------------------
744 cglobal deblock_h_luma_intra_%1, 4,7
752     ; transpose 8x16 -> tmp space
753     TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
756     TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
758     lea r0, [pix_tmp+0x40]
760     call deblock_v_luma_intra_%1
762     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
764     TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
769     TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
; 32-bit variant of the horizontal intra wrapper (two calls, one per half).
773 cglobal deblock_h_luma_intra_%1, 2,4
777     %assign pad 0x8c-(stack_offset&15)
781     ; transpose 8x16 -> tmp space
782     TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
785     TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
787     lea r0, [pix_tmp+0x40]
792     call deblock_%2_luma_intra_%1
794     add dword [rsp], 8 ; pix_tmp+8
795     call deblock_%2_luma_intra_%1
804     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
805     TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
808     TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
812 %endmacro ; DEBLOCK_LUMA_INTRA
815 DEBLOCK_LUMA_INTRA sse2, v
818 DEBLOCK_LUMA_INTRA mmxext, v8
; Chroma helper macros (setup/loop scaffolding for the v/h chroma filters).
; NOTE(review): all four bodies are elided in this chunk.
823 %macro CHROMA_V_START 0
835 %macro CHROMA_H_START 0
848 %macro CHROMA_V_LOOP 1
860 %macro CHROMA_H_LOOP 1
; Inter (tc-based) chroma deblock template; %1=cpu suffix. Both entry points
; share the filtering core via the chroma_inter_body_%1 local subroutine.
875 %macro DEBLOCK_CHROMA 1
876 ;-----------------------------------------------------------------------------
877 ; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
878 ;-----------------------------------------------------------------------------
879 cglobal deblock_v_chroma_%1, 5,6,8
885     call chroma_inter_body_%1
891 ;-----------------------------------------------------------------------------
892 ; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
893 ;-----------------------------------------------------------------------------
894 cglobal deblock_h_chroma_%1, 5,7,8
896     TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
897     call chroma_inter_body_%1
898     TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
904 chroma_inter_body_%1:
912 %endmacro ; DEBLOCK_CHROMA
918 DEBLOCK_CHROMA mmxext
922 ; in: %1=p0 %2=p1 %3=q1
923 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
; Intra chroma p0 update via two rounding-corrected pavgb steps.
924 %macro CHROMA_INTRA_P0 3
927     pand m4, [pb_1] ; m4 = (p0^q1)&1
930     pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
; Intra (no-tc) chroma deblock template; %1=cpu suffix.
936 %macro DEBLOCK_CHROMA_INTRA 1
937 ;-----------------------------------------------------------------------------
938 ; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
939 ;-----------------------------------------------------------------------------
940 cglobal deblock_v_chroma_intra_%1, 4,5,8
946     call chroma_intra_body_%1
952 ;-----------------------------------------------------------------------------
953 ; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
954 ;-----------------------------------------------------------------------------
955 cglobal deblock_h_chroma_intra_%1, 4,6,8
957     TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
958     call chroma_intra_body_%1
959     TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
965 chroma_intra_body_%1:
; Symmetric update: filter p0 against q1 and q0 against p1.
969     CHROMA_INTRA_P0  m1, m0, m3
970     CHROMA_INTRA_P0  m2, m3, m0
978 %endmacro ; DEBLOCK_CHROMA_INTRA
981 DEBLOCK_CHROMA_INTRA sse2
984 DEBLOCK_CHROMA_INTRA mmxext
989 ;-----------------------------------------------------------------------------
990 ; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
991 ;                               uint8_t bs[2][4][4], int mvy_limit, int bframe )
992 ;-----------------------------------------------------------------------------
; scan8start skips the border cells of the scan8-ordered arrays so nnz/ref/mv
; point at the first real 4x4 block; mv entries are 4 bytes (two int16).
994 %define scan8start (4+1*8)
995 %define nnz r0+scan8start
996 %define ref r1+scan8start
997 %define mv  r2+scan8start*4
; Load 4 rows of bytes (current + left-shifted-by-1 copies) from a
; scan8-ordered array. NOTE(review): body partially elided.
1001 %macro LOAD_BYTES_MMX 1
1006     punpckldq m2, [%1+8*1-1]
1007     punpckldq m0, [%1+8*1]
1008     punpckldq m3, [%1+8*3-1]
1009     punpckldq m1, [%1+8*3]
; bs contribution from reference-index differences; bodies elided.
1012 %macro DEBLOCK_STRENGTH_REFS_MMX 0
1023     punpckldq m2, m0 ; row -1, row 0
1024     punpckldq m3, m1 ; row 1, row 2
; bs contribution from motion-vector differences (%2 = byte offset per list).
1033 %macro DEBLOCK_STRENGTH_MVS_MMX 2
; Combine nnz (bs=2) with mv/ref (bs=1) into the final strengths.
1046 %macro DEBLOCK_STRENGTH_NNZ_MMX 1
1053     pminub m4, m6 ; mv ? 1 : 0
1055     paddb m2, m2 ; nnz ? 2 : 0
; XMM byte loader: gathers current rows plus left/top neighbor rows.
1061 %macro LOAD_BYTES_XMM 1
1062     movu m0, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
1066     shufps m2, m1, 0xdd ; cur nnz, all rows
1068     shufps m0, m1, 0xdd ; left neighbors
1070     movd m3, [%1-8] ; could be palignr if nnz was aligned
1072     por m1, m3 ; top neighbors
; MMX-width deblock_strength entry point; interior setup elided in this view.
1076 cglobal deblock_strength_mmxext, 6,6
1077     ; Prepare mv comparison register
1090     DEBLOCK_STRENGTH_REFS_MMX
1093     DEBLOCK_STRENGTH_MVS_MMX bs0, 4
1094     DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8
1107     DEBLOCK_STRENGTH_NNZ_MMX bs0
1108     ; Transpose column output
1109     SBUTTERFLY bw, 2, 3, 4
1110     SBUTTERFLY bw, 2, 3, 4
1115     punpckldq m2, m0 ; row -1, row 0
1116     punpckldq m3, m1 ; row 1, row 2
1117     DEBLOCK_STRENGTH_NNZ_MMX bs1
; XMM-width deblock_strength template; %1=cpu suffix (sse2/ssse3 below).
; Computes |mv difference| per 4x4 block for vertical (unaligned/palignr
; loads at -4 byte offset) and horizontal (row-above at mv+4*8*-1) edges.
; NOTE(review): ref/nnz handling and the bs stores are elided in this chunk.
1122 %macro DEBLOCK_STRENGTH_XMM 1
1123 cglobal deblock_strength_%1, 6,6,8
1124     ; Prepare mv comparison register
; ssse3 path: build left-neighbor mv rows with palignr instead of movu.
1146     palignr m3, [mv+4*8*0-16], 12
1147     palignr m2, [mv+4*8*1-16], 12
1156     palignr m3, [mv+4*8*2-16], 12
1157     palignr m7, [mv+4*8*3-16], 12
; sse2 path: unaligned loads of the mv row shifted left by one block.
1162     movu m0, [mv-4+4*8*0]
1163     movu m1, [mv-4+4*8*1]
1164     movu m2, [mv-4+4*8*2]
1165     movu m3, [mv-4+4*8*3]
1166     psubw m0, [mv+4*8*0]
1167     psubw m1, [mv+4*8*1]
1168     psubw m2, [mv+4*8*2]
1169     psubw m3, [mv+4*8*3]
1173     ABSB2 m0, m2, m1, m3
; Horizontal edges: compare each mv row with the row above (offset -1 row).
1179     mova m0, [mv+4*8*-1]
1180     mova m1, [mv+4*8* 0]
1181     mova m2, [mv+4*8* 1]
1182     mova m3, [mv+4*8* 2]
1186     psubw m3, [mv+4*8* 3]
1189     ABSB2 m0, m2, m1, m3
1206     pminub m4, m6 ; mv ? 1 : 0
1208     paddb m0, m0 ; nnz ? 2 : 0
; Transpose the vertical-edge bs bytes with the pshufb mask defined at the
; top of the file.
1213     pshufb m4, [transpose_shuf]
1226 DEBLOCK_STRENGTH_XMM sse2
; Rebind ABSB2 to the ssse3 (pabsw-based) variant before reinstantiating.
1227 %define ABSB2 ABSB2_SSSE3
1228 DEBLOCK_STRENGTH_XMM ssse3