;*****************************************************************************
;* deblock-a.asm: x86 deblocking
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86util.asm"

transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
; out: %4 = |%1-%2|-%3

; out: %4 = |%1-%2|<%3
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
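; Scalar reference for the ABS_SUB/DIFF_LT idiom (an illustrative C sketch,
; not code from this file): unsigned saturating subtraction yields |a-b|
; without a compare, since whichever difference is negative saturates to 0.
;     int abs_diff = (a > b ? a-b : 0) | (b > a ? b-a : 0); // psubusw x2 + por
;     int lt       = (abs_diff - thresh) < 0;               // psubw + pcmpgtw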
    pshuflw %1, %1, 01010000b
    pshufd  %1, %1, 01010000b

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
    ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
    ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
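; The three ABS_SUB results combine into the standard H.264 filter-enable
; test; as a scalar sketch (illustrative only):
;     int mask = abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta;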
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
    mova    %6, [pw_pixel_max]
; in: %1=x2, %2=x1, %3=p0, %4=q0, %5=mask&tc, %6=tmp
    pavgw   %6, %3, %4 ; (p0+q0+1)>>1
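; Scalar sketch of what LUMA_Q1 produces (illustrative; clip3() is a
; hypothetical clamp helper, and x2 is p2 or q2 depending on the side):
;     int avg = ( p0 + q0 + 1 ) >> 1;              // the pavgw above
;     int x1n = clip3( x1 - tc0, x1 + tc0, ( x2 + avg ) >> 1 );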
%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT m5, %1, bm, m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6

%macro LUMA_H_STORE 2
    movhps  [%1+r1*2-4], m2
    movhps  [%1+r1*4-4], m3
%macro DEBLOCK_LUMA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_%1, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    LOAD_AB m4, m5, r2, r3
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LUMA_DEBLOCK_ONE m1, m0, ms1
    LUMA_DEBLOCK_ONE m2, m3, ms2
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
cglobal deblock_h_luma_%1, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    LOAD_AB m4, m5, r2, r3
    movq    m2, [r0-8] ; y q2 q1 q0
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    TRANSPOSE4x4W 2, 3, 6, 7, 4
    movu    m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    TRANSPOSE4x4W 4, 1, 3, 7, 6
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LUMA_DEBLOCK_ONE m1, m0, ms1
    LUMA_DEBLOCK_ONE m2, m3, ms2
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    lea     r0, [r0+r1*(mmsize/2)]
    lea     r2, [r2+r1*(mmsize/2)]
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;     m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
    DIFF_LT m8, m1, m13, m10, m4
    DIFF_LT m9, m2, m13, m11, m4
    LUMA_Q1 m8, m0, m1, m2, m5, m4
    LUMA_Q1 m9, m3, m1, m2, m5, m4
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
%macro DEBLOCK_LUMA_64 1
cglobal deblock_v_luma_%1, 5,5,15
    LOAD_AB m12, m13, r2, r3
    DEBLOCK_LUMA_INTER_SSE2

cglobal deblock_h_luma_%1, 5,7,15
    LOAD_AB m12, m13, r2, r3
    movu    m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    DEBLOCK_LUMA_INTER_SSE2
    TRANSPOSE4x4W 0, 1, 2, 3, 4
; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
    paddw   t0, %9 ; (p2 + p1 + p0 + q0 + 2)
    paddw   t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
    psrlw   t1, 2  ; (2*p1 + p0 + q1 + 2)/4
    psrlw   t0, 3  ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
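; For reference, the strong (intra) filter equations this macro builds up,
; written as scalar C (a sketch of the standard formulas, not literal code):
;     p0n = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
;     p1n = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
;     p2n = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;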
%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
    paddw   t3, [pw_2]         ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
%macro LUMA_H_INTRA_LOAD 0
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq    [r0+r1*2-8], m%3
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq    [r0+r1*2-8], m%3
    movhps  [r4+r1-8], m%2
    movhps  [r4+r1*2-8], m%3
    movhps  [r4+r5-8], m%4
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movhps  [r4+r1*2], m%7
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 1
cglobal deblock_v_luma_intra_%1, 4,7,16
    lea     r5, [r1*3]         ; 3*stride
    add     r4, r0             ; pix-4*stride
    LOAD_AB aa, bb, r2d, r3d
    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    paddw   t2, m0             ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_%1, 4,7,16
    %assign pad 24-(stack_offset&15)
    lea     r5, [r1*3]         ; 3*stride
    add     r4, r0             ; pix+4*stride
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    paddw   m1, m0             ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

DEBLOCK_LUMA_INTRA_64 sse2
DEBLOCK_LUMA_INTRA_64 avx
%macro DEBLOCK_LUMA_INTRA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_%1, 4,7,8*(mmsize/16)
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5]   ; p0
    mova    m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_%1, 4,7,8*(mmsize/16)
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    LUMA_INTRA_INTER t8, t9, t10, t5, t6
    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
    lea     r0, [r0+r1*(mmsize/2)]
    lea     r4, [r4+r1*(mmsize/2)]

DEBLOCK_LUMA_INTRA mmxext
DEBLOCK_LUMA_INTRA sse2
DEBLOCK_LUMA_INTRA avx
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)
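; Usage example (illustrative): with base3 = base + 3*stride and
; stride3 = 3*stride, PASS8ROWS(r0, r3, r1, r5) expands to the addresses of
; rows 0..7: [r0], [r0+r1], [r0+r1*2], [r3], [r3+r1], [r3+r1*2], [r3+r5],
; [r3+r1*4].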
; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8

%macro TRANSPOSE4x8W_LOAD 8
    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
    TRANSPOSE4x4W 0, 1, 2, 3, 4

%macro TRANSPOSE8x2W_STORE 8

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
; out: %4 = |%1-%2|>%3

; out: %4 = |%1-%2|>%3

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
%macro LOAD_MASK 2-3
    packuswb m4, m4 ; 16x alpha-1
    packuswb m5, m5 ; 16x beta-1
    DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
%macro DEBLOCK_P0_Q0 0
    pxor    m5, m1, m2 ; p0^q0
    pand    m5, [pb_1] ; (p0^q0)&1
    pavgb   m3, m0     ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+((p1-q1)>>2)
    pavgb   m4, m2     ; (q0 - p0 + 256)>>1
    paddusb m3, m4     ; d+128+33
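; The biased pavgb arithmetic above evaluates the standard weak-filter delta;
; scalar sketch (clip3() hypothetical; the SIMD code carries a +128 bias):
;     int delta = clip3( -tc, tc, ( 4*(q0-p0) + (p1-q1) + 4 ) >> 3 );
;     p0n = p0 + delta;
;     q0n = q0 - delta;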
; in: %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
    pavgb   %2, %6     ; avg(p2,avg(p0,q0))
    pand    %6, [pb_1] ; (p2^avg(p0,q0))&1
    psubusb %2, %6     ; (p2+((p0+q0+1)>>1))>>1
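; Scalar sketch of the averaging above (illustrative): the pand/psubusb pair
; cancels the round-up of the outer pavgb, so the stored result matches
;     clip3( q1 - tc0, q1 + tc0, ( q2 + ((p0+q0+1)>>1) ) >> 1 );
; with clip3() a hypothetical clamp helper.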
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 1
cglobal deblock_v_luma_%1, 5,5,10
    add     r4, r0        ; pix-3*stride
    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m3, [r0+r1]   ; q1
    punpcklbw m8, m8      ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    movdqa  m3, [r4]      ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_%1, 5,7
    lea     r11, [r10+r10*2]
    %define pix_tmp rsp+0x30

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
    lea     r0, [pix_tmp+0x30]
    call    deblock_v_luma_%1

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    movq    m0, [pix_tmp+0x18]
    movq    m1, [pix_tmp+0x28]
    movq    m2, [pix_tmp+0x38]
    movq    m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)

    movq    m0, [pix_tmp+0x10]
    movq    m1, [pix_tmp+0x20]
    movq    m2, [pix_tmp+0x30]
    movq    m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_%2_luma_%1, 5,5
    add     r4, r0        ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)
    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m3, [r0+r1]   ; q1
    punpcklbw m4, m4      ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova    [esp+%3], m4  ; tc
    mova    [esp], m4     ; mask
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m4, [esp+%3]  ; tc
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova    m5, [esp]     ; mask
    mova    m5, [esp+%3]  ; tc
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_%1, 0,5
    %assign pad 0x78-(stack_offset&15)
    %define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    lea     r0, [pix_tmp+0x30]
    call    deblock_%2_luma_%1
    add     dword [esp], 8    ; pix_tmp+0x38
    add     dword [esp+16], 2 ; tc0+2
    call    deblock_%2_luma_%1

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    movq    m0, [pix_tmp+0x10]
    movq    m1, [pix_tmp+0x20]
    movq    m2, [pix_tmp+0x30]
    movq    m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    movq    m0, [pix_tmp+0x18]
    movq    m1, [pix_tmp+0x28]
    movq    m2, [pix_tmp+0x38]
    movq    m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
%endmacro ; DEBLOCK_LUMA

DEBLOCK_LUMA mmxext, v8, 8
DEBLOCK_LUMA sse2, v, 16
DEBLOCK_LUMA avx, v, 16
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    pavgb   t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psubb   t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
    psubb   t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pavgb   t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psubb   t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
    pavgb   t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
    mova    %1, t1 ; store p0
    pavgb   t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb   t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psubb   t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
    mova    %2, t0 ; store p1
    mova    %3, t1 ; store p2
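; Note (hedged): the nested pavgb chain above evaluates the same strong-filter
; sums as the high-bit-depth LUMA_INTRA_P012 (e.g. p1' = (p2+p1+p0+q0+2)>>2);
; the psubb corrections remove the rounding error the nested averages would
; otherwise accumulate.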
%macro LUMA_INTRA_SWAP_PQ 0
    %define mask1p mask1q

%macro DEBLOCK_LUMA_INTRA 2
    %define mask1q [rsp-24]
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define q2 [r0+2*r1]
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]

;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_%2_luma_intra_%1, 4,6,16
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix-4*stride
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12      ; m12=mask0
    pavgb   t5, mpb_1  ; alpha/4+1
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_%1, 4,7
    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea     r0, [pix_tmp+0x40]
    call    deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)

cglobal deblock_h_luma_intra_%1, 2,4
    %assign pad 0x8c-(stack_offset&15)

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea     r0, [pix_tmp+0x40]
    call    deblock_%2_luma_intra_%1
    add     dword [rsp], 8 ; pix_tmp+8
    call    deblock_%2_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

DEBLOCK_LUMA_INTRA sse2, v
DEBLOCK_LUMA_INTRA avx , v
DEBLOCK_LUMA_INTRA mmxext, v8
%endif ; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
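; Scalar reference for the intra chroma filter implemented below
; (illustrative C sketch of the standard equations):
;     p0n = ( 2*p1 + p0 + q1 + 2 ) >> 2;
;     q0n = ( 2*q1 + q0 + p1 + 2 ) >> 2;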
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
%macro CHROMA_H_LOAD 0-1
    movq    m0, [r0-8] ; p1 p1 p0 p0
    movq    m2, [r0]   ; q0 q0 q1 q1
    punpckldq m0, m5 ; p1
    punpckhdq m1, m5 ; p0
    punpckldq m2, m7 ; q0
    punpckhdq m3, m7 ; q1
    movq    m4, [r0+r1*2-8]
    punpckldq m0, m5 ; p1 ... p0 ...
    punpckldq m2, m7 ; q0 ... q1 ...
    punpckhqdq m1, m0, m4 ; p0
    punpcklqdq m0, m4 ; p1
    punpckhqdq m3, m2, m6 ; q1
    punpcklqdq m2, m6 ; q0

%macro CHROMA_V_LOAD 1
    mova    m1, [r0+r1] ; p0
    mova    m3, [%1+r1] ; q1

; clobbers: m1, m2, m3
%macro CHROMA_H_STORE 0-1
    SBUTTERFLY dq, 1, 2, 3
    movq    [r0+r1*2-4], m2
    movhps  [r0+r1-4], m1
    movhps  [r0+%1-4], m2

%macro CHROMA_V_STORE 0
%macro DEBLOCK_CHROMA 1
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_%1, 7,7,8*(mmsize/16)
    call    deblock_inter_body_%1

;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_%1, 5,7,8*(mmsize/16)
    call    deblock_inter_body_%1
    lea     r0, [r0+r1*(mmsize/4)]

deblock_inter_body_%1:
    RESET_MM_PERMUTATION
    LOAD_AB m4, m5, r2, r3
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6

;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_%1, 4,6,8*(mmsize/16)
    call    deblock_intra_body_%1

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_%1, 4,6,8*(mmsize/16)
    call    deblock_intra_body_%1
    lea     r0, [r0+r1*(mmsize/4)]

deblock_intra_body_%1:
    RESET_MM_PERMUTATION
    LOAD_AB m4, m5, r2, r3
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6

DEBLOCK_CHROMA mmxext
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
%macro CHROMA_V_START 0

%macro CHROMA_H_START 0

%macro CHROMA_V_LOOP 1

%macro CHROMA_H_LOOP 1

%macro DEBLOCK_CHROMA 1
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_%1, 5,6,8
    call    chroma_inter_body_%1

;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_%1, 5,7,8
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call    chroma_inter_body_%1
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)

    RESET_MM_PERMUTATION
chroma_inter_body_%1:
%endmacro ; DEBLOCK_CHROMA

DEBLOCK_CHROMA mmxext
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %2     ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
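; pavgb rounds up, so avg(p1, avg(p0,q1)) alone can over-round; subtracting
; the masked carry first restores the exact value (illustrative identity):
;     avg( p1, avg(p0,q1) - ((p0^q1)&1) ) == ( p0 + q1 + 2*p1 + 2 ) >> 2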
%macro DEBLOCK_CHROMA_INTRA 1
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_%1, 4,5,8
    call    chroma_intra_body_%1

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_%1, 4,6,8
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call    chroma_intra_body_%1
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)

    RESET_MM_PERMUTATION
chroma_intra_body_%1:
    CHROMA_INTRA_P0 m1, m0, m3
    CHROMA_INTRA_P0 m2, m3, m0
%endmacro ; DEBLOCK_CHROMA_INTRA

DEBLOCK_CHROMA_INTRA sse2
DEBLOCK_CHROMA_INTRA avx
DEBLOCK_CHROMA_INTRA mmxext
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
;                        uint8_t bs[2][4][4], int mvy_limit, int bframe )
;-----------------------------------------------------------------------------
%define scan8start (4+1*8)
%define nnz r0+scan8start
%define ref r1+scan8start
%define mv  r2+scan8start*4
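; Illustrative sketch of the per-edge strength rule computed below (a hedged
; summary of the C reference, deblock_strength_c):
;     if( nnz[cur] || nnz[prev] )
;         bs = 2;
;     else if( ref[cur] != ref[prev] ||
;              abs( mv[cur][0] - mv[prev][0] ) >= 4 ||
;              abs( mv[cur][1] - mv[prev][1] ) >= mvy_limit )
;         bs = 1;
;     else
;         bs = 0;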
%macro LOAD_BYTES_MMX 1
    punpckldq m2, [%1+8*1-1]
    punpckldq m0, [%1+8*1]
    punpckldq m3, [%1+8*3-1]
    punpckldq m1, [%1+8*3]

%macro DEBLOCK_STRENGTH_REFS_MMX 0
    punpckldq m2, m0 ; row -1, row 0
    punpckldq m3, m1 ; row 1, row 2

%macro DEBLOCK_STRENGTH_MVS_MMX 2

%macro DEBLOCK_STRENGTH_NNZ_MMX 1
    pminub  m4, m6 ; mv ? 1 : 0
    paddb   m2, m2 ; nnz ? 2 : 0

%macro LOAD_BYTES_XMM 1
    movu    m0, [%1-4]   ; FIXME could be aligned if we changed nnz's allocation
    shufps  m2, m1, 0xdd ; cur nnz, all rows
    shufps  m0, m1, 0xdd ; left neighbors
    movd    m3, [%1-8]   ; could be palignr if nnz was aligned
    por     m1, m3       ; top neighbors

cglobal deblock_strength_mmxext, 6,6
    ; Prepare mv comparison register
    DEBLOCK_STRENGTH_REFS_MMX
    DEBLOCK_STRENGTH_MVS_MMX bs0, 4
    DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8
    DEBLOCK_STRENGTH_NNZ_MMX bs0
    ; Transpose column output
    SBUTTERFLY bw, 2, 3, 4
    SBUTTERFLY bw, 2, 3, 4
    punpckldq m2, m0 ; row -1, row 0
    punpckldq m3, m1 ; row 1, row 2
    DEBLOCK_STRENGTH_NNZ_MMX bs1
%macro DEBLOCK_STRENGTH_XMM 1
cglobal deblock_strength_%1, 6,6,8
    ; Prepare mv comparison register
    palignr m3, m0, [mv+4*8*0-16], 12
    palignr m2, m1, [mv+4*8*1-16], 12
    palignr m3, m2, [mv+4*8*2-16], 12
    palignr m7, m1, [mv+4*8*3-16], 12
    movu    m0, [mv-4+4*8*0]
    movu    m1, [mv-4+4*8*1]
    movu    m2, [mv-4+4*8*2]
    movu    m3, [mv-4+4*8*3]
    psubw   m0, [mv+4*8*0]
    psubw   m1, [mv+4*8*1]
    psubw   m2, [mv+4*8*2]
    psubw   m3, [mv+4*8*3]
    ABSB2   m0, m2, m1, m3
    mova    m0, [mv+4*8*-1]
    mova    m1, [mv+4*8* 0]
    mova    m2, [mv+4*8* 1]
    mova    m3, [mv+4*8* 2]
    psubw   m3, [mv+4*8* 3]
    ABSB2   m0, m2, m1, m3
    pminub  m4, m6 ; mv ? 1 : 0
    paddb   m0, m0 ; nnz ? 2 : 0
    pshufb  m4, [transpose_shuf]

DEBLOCK_STRENGTH_XMM sse2
%define ABSB2 ABSB2_SSSE3
DEBLOCK_STRENGTH_XMM ssse3
DEBLOCK_STRENGTH_XMM avx