1 ;******************************************************************************
2 ;* VP8 MMXEXT optimizations
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86inc.asm"
24 %include "libavutil/x86/x86util.asm"
; Subpel/bilinear filter coefficient tables (one 16-byte row visible per label).
; The functions below address them as [table + register offset].
; NOTE(review): offsets used below (e.g. +r5*8-16) reach beyond a single
; 16-byte row, so further rows presumably follow each label -- confirm.
28 fourtap_filter_hw_m: times 4 dw -6, 123 ; 4x word pair (-6, 123)
37 sixtap_filter_hw_m: times 4 dw 2, -11 ; 4x word pair (2, -11)
47 fourtap_filter_hb_m: times 8 db -6, 123 ; 8x byte pair (-6, 123)
56 sixtap_filter_hb_m: times 8 db 2, 1 ; 8x byte pair (2, 1)
66 fourtap_filter_v_m: times 8 dw -6 ; single coefficient splatted to 8 words
83 sixtap_filter_v_m: times 8 dw 2 ; single coefficient splatted to 8 words
102 bilinear_filter_vw_m: times 8 dw 1 ; single weight splatted to 8 words
110 bilinear_filter_vb_m: times 8 db 7, 1 ; 8x byte pair (7, 1)
; Aliases through which the code addresses the filter tables.
; First group: every alias resolves to r11 (the functions below load r11 with
; the matching *_m table via lea before using the alias).
; Second group: every alias resolves directly to the *_m symbol.
; NOTE(review): these look like the two arms of a conditional whose
; %if/%else/%endif lines are not visible here -- confirm. As written, the
; second group overrides the first.
119 %define fourtap_filter_hw r11
120 %define sixtap_filter_hw r11
121 %define fourtap_filter_hb r11
122 %define sixtap_filter_hb r11
123 %define fourtap_filter_v r11
124 %define sixtap_filter_v r11
125 %define bilinear_filter_vw r11
126 %define bilinear_filter_vb r11
128 %define fourtap_filter_hw fourtap_filter_hw_m
129 %define sixtap_filter_hw sixtap_filter_hw_m
130 %define fourtap_filter_hb fourtap_filter_hb_m
131 %define sixtap_filter_hb sixtap_filter_hb_m
132 %define fourtap_filter_v fourtap_filter_v_m
133 %define sixtap_filter_v sixtap_filter_v_m
134 %define bilinear_filter_vw bilinear_filter_vw_m
135 %define bilinear_filter_vb bilinear_filter_vb_m
; 16-byte index tables, used below as pshufb masks / register-loaded shuffles.
; Each one gathers eight byte pairs from a source row.
138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 ; pairs (i, i+1)
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 ; pairs (i+2, i+3)
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 ; pairs (i, i+5)
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 ; pairs (i+1, i+2)
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 ; pairs (i+3, i+4)
; Word constants referenced by the IDCT comments below (20091 / 17734).
145 pw_20091: times 4 dw 20091
146 pw_17734: times 4 dw 17734
; Byte-pair constants (x, 63); not referenced in the visible part of the file.
148 pb_27_63: times 8 db 27, 63
149 pb_18_63: times 8 db 18, 63
150 pb_9_63: times 8 db 9, 63
168 ;-----------------------------------------------------------------------------
169 ; subpel MC functions:
171 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
172 ; uint8_t *src, int srcstride,
173 ; int height, int mx, int my);
174 ;-----------------------------------------------------------------------------
; FILTER_SSSE3 size, n_a, n_b
; Emits the SSSE3 subpel filters put_vp8_epel<size>_{h6,h4,v4,v6}_ssse3.
;   %1 = block size used in the function names
;   %2 = last cglobal argument for h6/v4/v6, %3 = last cglobal argument for h4
;        (presumably the XMM register count per x86inc -- confirm)
; All four functions use the byte-pair (*_hb) coefficient tables.
176 %macro FILTER_SSSE3 3
177 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
179 mova m3, [filter_h6_shuf2] ; byte pairs (i+1, i+2)
180 mova m4, [filter_h6_shuf3] ; byte pairs (i+3, i+4)
182 lea r11, [sixtap_filter_hb_m] ; table base for the r11-based alias
184 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
185 mova m6, [sixtap_filter_hb+r5*8-32] ; second 16-byte coefficient row
186 mova m7, [sixtap_filter_hb+r5*8-16] ; third 16-byte coefficient row
193 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
194 ; shuffle with a memory operand
197 pshufb m0, [filter_h6_shuf1] ; byte pairs (i, i+5)
209 movh [r0], m0 ; store
218 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
221 mova m3, [filter_h2_shuf] ; byte pairs (i, i+1)
222 mova m4, [filter_h4_shuf] ; byte pairs (i+2, i+3)
224 lea r11, [fourtap_filter_hb_m] ; table base for the r11-based alias
226 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
227 mova m6, [fourtap_filter_hb+r5] ; second 16-byte coefficient row
240 movh [r0], m0 ; store
249 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
252 lea r11, [fourtap_filter_hb_m] ; table base for the r11-based alias
254 mova m5, [fourtap_filter_hb+r6-16] ; same two-row layout as h4, indexed by r6
255 mova m6, [fourtap_filter_hb+r6]
266 movh m3, [r2+2*r3] ; read new row
288 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
291 lea r11, [sixtap_filter_hb_m] ; table base for the r11-based alias
293 lea r6, [sixtap_filter_hb+r6*8] ; r6 = pointer just past the three coefficient rows
307 movh m5, [r2+2*r3] ; read new row
314 pmaddubsw m6, [r6-48] ; coefficient rows are used as memory operands here
315 pmaddubsw m1, [r6-32]
316 pmaddubsw m7, [r6-16]
341 ; 4x4 block, H-only 4-tap filter
; Uses the word-pair table (fourtap_filter_hw): mm4 = F0/F1, mm5 = F2/F3.
; NOTE(review): mm6 is used as the zero operand for byte->word unpacking and
; mm7 as the rounding constant; their setup is not visible here -- confirm.
342 cglobal put_vp8_epel4_h4_mmxext, 6, 6
345 lea r11, [fourtap_filter_hw_m] ; table base for the r11-based alias
347 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
348 movq mm5, [fourtap_filter_hw+r5]
353 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
355 ; first set of 2 pixels
356 movq mm2, mm1 ; byte ABCD..
357 punpcklbw mm1, mm6 ; byte->word ABCD
358 pshufw mm0, mm2, 9 ; byte CDEF..
359 punpcklbw mm0, mm6 ; byte->word CDEF
360 pshufw mm3, mm1, 0x94 ; word ABBC
361 pshufw mm1, mm0, 0x94 ; word CDDE
362 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
363 movq mm0, mm1 ; backup for second set of pixels
364 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
365 paddd mm3, mm1 ; finish 1st 2px
367 ; second set of 2 pixels, use backup of above
368 punpckhbw mm2, mm6 ; byte->word EFGH
369 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
370 pshufw mm1, mm2, 0x94 ; word EFFG
371 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
372 paddd mm0, mm1 ; finish 2nd 2px
374 ; merge two sets of 2 pixels into one set of 4, round/clip/store
375 packssdw mm3, mm0 ; merge dword->word (4px)
376 paddsw mm3, mm7 ; rounding
378 packuswb mm3, mm6 ; clip and word->bytes
379 movd [r0], mm3 ; store
388 ; 4x4 block, H-only 6-tap filter
; Uses the word-pair table (sixtap_filter_hw): mm4 = F0/F1, mm5 = F2/F3,
; mm6 = F4/F5.
; NOTE(review): mm3 doubles as the zero operand for unpacking and as scratch,
; and mm7 is the rounding constant; the instructions that (re)zero mm3 and
; load mm7 are not visible here -- confirm.
389 cglobal put_vp8_epel4_h6_mmxext, 6, 6
392 lea r11, [sixtap_filter_hw_m] ; table base for the r11-based alias
394 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
395 movq mm5, [sixtap_filter_hw+r5*8-32]
396 movq mm6, [sixtap_filter_hw+r5*8-16]
401 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
403 ; first set of 2 pixels
404 movq mm2, mm1 ; byte ABCD..
405 punpcklbw mm1, mm3 ; byte->word ABCD
406 pshufw mm0, mm2, 0x9 ; byte CDEF..
407 punpckhbw mm2, mm3 ; byte->word EFGH
408 punpcklbw mm0, mm3 ; byte->word CDEF
409 pshufw mm1, mm1, 0x94 ; word ABBC
410 pshufw mm2, mm2, 0x94 ; word EFFG
411 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
412 pshufw mm3, mm0, 0x94 ; word CDDE
413 movq mm0, mm3 ; backup for second set of pixels
414 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
415 paddd mm1, mm3 ; add to 1st 2px cache
416 movq mm3, mm2 ; backup for second set of pixels
417 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
418 paddd mm1, mm2 ; finish 1st 2px
420 ; second set of 2 pixels, use backup of above
421 movd mm2, [r2+3] ; byte FGHI (prevent overreads)
422 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
423 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
424 paddd mm0, mm3 ; add to 2nd 2px cache
426 punpcklbw mm2, mm3 ; byte->word FGHI
427 pshufw mm2, mm2, 0xE9 ; word GHHI
428 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
429 paddd mm0, mm2 ; finish 2nd 2px
431 ; merge two sets of 2 pixels into one set of 4, round/clip/store
432 packssdw mm1, mm0 ; merge dword->word (4px)
433 paddsw mm1, mm7 ; rounding
435 packuswb mm1, mm3 ; clip and word->bytes
436 movd [r0], mm1 ; store
; 8-wide horizontal 4-tap filter (SSE2). Only table setup and the final store
; are visible here.
446 cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
449 lea r11, [fourtap_filter_v_m] ; this h filter uses the word-splat (*_v) table
451 lea r5, [fourtap_filter_v+r5-32] ; r5 = pointer into the coefficient table
484 movh [r0], m0 ; store
; 8-wide horizontal 6-tap filter (SSE2). Only table setup and the final store
; are visible here.
493 cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
497 lea r11, [sixtap_filter_v_m] ; this h filter uses the word-splat (*_v) table
499 lea r5, [sixtap_filter_v+r5-96] ; r5 = pointer into the coefficient table
546 movh [r0], m0 ; store
; Vertical-only subpel filters put_vp8_epel%2_{v4,v6}_%1, using the word-splat
; (*_v) coefficient tables. %3 is forwarded as the last cglobal argument.
; NOTE(review): the enclosing %macro line is not visible here; the invocation
; at the end suggests it is "FILTER_V opt, size, n" -- confirm.
556 ; V-only 4-tap filter (block size is %2; the mmxext instantiation below uses 4)
557 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
560 lea r11, [fourtap_filter_v_m] ; table base for the r11-based alias
562 lea r6, [fourtap_filter_v+r6-32] ; r6 = pointer into the coefficient table
578 ; first calculate negative taps (to prevent losing positive overflows)
579 movh m4, [r2+2*r3] ; read new row
586 ; then calculate positive taps
609 ; V-only 6-tap filter (block size is %2; the mmxext instantiation below uses 4)
610 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
614 lea r11, [sixtap_filter_v_m] ; table base for the r11-based alias
616 lea r6, [sixtap_filter_v+r6-96] ; r6 = pointer into the coefficient table
636 ; first calculate negative taps (to prevent losing positive overflows)
643 ; then calculate positive taps
644 movh m5, [r2+2*r3] ; read new row
675 FILTER_V mmxext, 4, 0
; FILTER_BILINEAR opt, size, n
; Emits put_vp8_bilinear<size>_{v,h}_<opt>, using the word-splat weight table
; (bilinear_filter_vw). %3 is forwarded as the last cglobal argument.
; Both functions load two weight rows into m4/m5, indexed by r5 and r6.
679 %macro FILTER_BILINEAR 3
680 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
685 lea r11, [bilinear_filter_vw_m] ; table base for the r11-based alias
688 mova m4, [bilinear_filter_vw+r5-16]
689 mova m5, [bilinear_filter_vw+r6-16]
725 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
730 lea r11, [bilinear_filter_vw_m] ; the h variant uses the same (vw) table
733 mova m4, [bilinear_filter_vw+r6-16] ; note: r6/r5 roles are swapped vs. the v variant
734 mova m5, [bilinear_filter_vw+r5-16]
773 FILTER_BILINEAR mmxext, 4, 0
775 FILTER_BILINEAR sse2, 8, 7
; FILTER_BILINEAR_SSSE3 size
; Emits put_vp8_bilinear<size>_{v,h}_ssse3, using the byte-pair weight table
; (bilinear_filter_vb); a single weight row is loaded into m3.
777 %macro FILTER_BILINEAR_SSSE3 1
778 cglobal put_vp8_bilinear%1_v_ssse3, 7,7
781 lea r11, [bilinear_filter_vb_m] ; table base for the r11-based alias
784 mova m3, [bilinear_filter_vb+r6-16] ; weight row indexed by r6
814 cglobal put_vp8_bilinear%1_h_ssse3, 7,7
817 lea r11, [bilinear_filter_vb_m] ; table base for the r11-based alias
820 mova m2, [filter_h2_shuf] ; byte pairs (i, i+1) of the source row
821 mova m3, [bilinear_filter_vb+r5-16] ; weight row indexed by r5
852 FILTER_BILINEAR_SSSE3 4
854 FILTER_BILINEAR_SSSE3 8
; Plain block copies (no filtering). Only the entry line of the 8-wide variant
; is visible here.
856 cglobal put_vp8_pixels8_mmx, 5,5
; 16-wide copy with MMX: two rows per step, each row as two 8-byte halves.
; r2/r3 = source pointer/stride, r0/r1 = destination pointer/stride.
868 cglobal put_vp8_pixels16_mmx, 5,5
870 movq mm0, [r2+r3*0+0] ; row 0, bytes 0-7
871 movq mm1, [r2+r3*0+8] ; row 0, bytes 8-15
872 movq mm2, [r2+r3*1+0] ; row 1, bytes 0-7
873 movq mm3, [r2+r3*1+8] ; row 1, bytes 8-15
875 movq [r0+r1*0+0], mm0
876 movq [r0+r1*0+8], mm1
877 movq [r0+r1*1+0], mm2
878 movq [r0+r1*1+8], mm3
; 16-wide copy with SSE: two rows per step, one 16-byte register per row.
; Loads use movups (source may be unaligned); stores use movaps, so the
; destination rows must be 16-byte aligned.
884 cglobal put_vp8_pixels16_sse, 5,5,2
886 movups xmm0, [r2+r3*0] ; source row 0
887 movups xmm1, [r2+r3*1] ; source row 1
889 movaps [r0+r1*0], xmm0 ; destination row 0
890 movaps [r0+r1*1], xmm1 ; destination row 1
896 ;-----------------------------------------------------------------------------
897 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
898 ;-----------------------------------------------------------------------------
; DC-only IDCT + add, MMX version. Only the final ADD_DC invocation is visible;
; the ADD_DC macro itself is defined outside the visible lines.
920 cglobal vp8_idct_dc_add_mmx, 3, 3
939 ADD_DC m0, m1, 0, movh
; SSE4 version. The visible tail writes dwords 1 and 3 of m2 to [r0+r2] and
; [r1+r2] (r2 = stride per the prototype above).
943 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
967 pextrd [r0+r2], m2, 1
969 pextrd [r1+r2], m2, 3
972 ;-----------------------------------------------------------------------------
973 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
974 ;-----------------------------------------------------------------------------
; DC-only IDCT + add for four luma blocks (MMX).
; The four DC words are read at 32-byte intervals from r1 (one per block),
; packed into one register, then replicated per pixel before ADD_DC.
; NOTE(review): the instructions between the visible groups (where m6/m7 and
; m1 are produced) are not visible here.
977 cglobal vp8_idct_dc_add4y_mmx, 3, 3
979 movd m0, [r1+32*0] ; A
980 movd m1, [r1+32*2] ; C
981 punpcklwd m0, [r1+32*1] ; A B
982 punpcklwd m1, [r1+32*3] ; C D
983 punpckldq m0, m1 ; A B C D
996 punpcklbw m0, m0 ; AABBCCDD
997 punpcklbw m6, m6 ; AABBCCDD
1000 punpcklbw m0, m0 ; AAAABBBB
1001 punpckhbw m1, m1 ; CCCCDDDD
1002 punpcklbw m6, m6 ; AAAABBBB
1003 punpckhbw m7, m7 ; CCCCDDDD
1007 ADD_DC m0, m6, 0, mova ; blocks A/B at byte offset 0
1008 ADD_DC m1, m7, 8, mova ; blocks C/D at byte offset 8
; DC-only IDCT + add for four luma blocks (SSE2). Same DC gathering as the MMX
; version; a single ADD_DC at offset 0 is visible here.
1012 cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
1014 movd m0, [r1+32*0] ; A
1015 movd m1, [r1+32*2] ; C
1016 punpcklwd m0, [r1+32*1] ; A B
1017 punpcklwd m1, [r1+32*3] ; C D
1018 punpckldq m0, m1 ; A B C D
1038 ADD_DC m0, m1, 0, mova
1041 ;-----------------------------------------------------------------------------
1042 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
1043 ;-----------------------------------------------------------------------------
; DC-only IDCT + add for four chroma blocks (MMX).
; DC gathering and replication match vp8_idct_dc_add4y_mmx; unlike that
; function, both ADD_DC invocations here use byte offset 0.
; NOTE(review): the pointer adjustment between the two ADD_DC calls is not
; visible here -- confirm.
1046 cglobal vp8_idct_dc_add4uv_mmx, 3, 3
1048 movd m0, [r1+32*0] ; A
1049 movd m1, [r1+32*2] ; C
1050 punpcklwd m0, [r1+32*1] ; A B
1051 punpcklwd m1, [r1+32*3] ; C D
1052 punpckldq m0, m1 ; A B C D
1065 punpcklbw m0, m0 ; AABBCCDD
1066 punpcklbw m6, m6 ; AABBCCDD
1069 punpcklbw m0, m0 ; AAAABBBB
1070 punpckhbw m1, m1 ; CCCCDDDD
1071 punpcklbw m6, m6 ; AAAABBBB
1072 punpckhbw m7, m7 ; CCCCDDDD
1076 ADD_DC m0, m6, 0, mova ; blocks A/B
1079 ADD_DC m1, m7, 0, mova ; blocks C/D
1082 ;-----------------------------------------------------------------------------
1083 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
1084 ;-----------------------------------------------------------------------------
1086 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
1087 ; this macro assumes that m6/m7 have words for 20091/17734 loaded
; VP8_MULTIPLY_SUMSUB a, b, tmp1, tmp2
; %3/%4 are multiplied (high word) by m6 and %1/%2 by m7.
; NOTE(review): m7 holds 17734 per the comment above while the inline comments
; say 35468 (= 2*17734); the doubling and the add/sub steps are presumably in
; lines not visible here -- confirm.
1088 %macro VP8_MULTIPLY_SUMSUB 4
1091 pmulhw %3, m6 ;20091(1)
1092 pmulhw %4, m6 ;20091(2)
1097 pmulhw %1, m7 ;35468(1)
1098 pmulhw %2, m7 ;35468(2)
1103 ; calculate x0=%1+%3; x1=%1-%3
1104 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
1105 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
1106 ; %5/%6 are temporary registers
1107 ; we assume m6/m7 have constant words 20091/17734 loaded in them
; VP8_IDCT_TRANSFORM4x4_1D r0, r1, r2, r3, tmp1, tmp2
; One 1-D pass of the 4x4 IDCT over register indexes %1-%4 (see the formula
; comment above); %5/%6 are scratch register indexes.
1108 %macro VP8_IDCT_TRANSFORM4x4_1D 6
1109 SUMSUB_BA w, %3, %1, %5 ;t0, t1
1110 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
1111 SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3
1112 SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2
; VP8_IDCT_ADD opt
; Emits vp8_idct_add_<opt>: two 1-D transform passes, each followed by a 4x4
; word transpose, then STORE_DIFFx2 for the rows addressed via r0 and r1.
1118 %macro VP8_IDCT_ADD 1
1119 cglobal vp8_idct_add_%1, 3, 3
; NOTE(review): xmm0 is stored over both 16-byte halves of the coefficient
; block; presumably xmm0 is zero here (clearing the block) -- confirm.
1129 movaps [r1+ 0], xmm0
1130 movaps [r1+16], xmm0
1140 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 ; first pass
1141 TRANSPOSE4x4W 0, 1, 2, 3, 4
1143 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 ; second pass
1144 TRANSPOSE4x4W 0, 1, 2, 3, 4
1149 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
1150 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
1158 ;-----------------------------------------------------------------------------
1159 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
1160 ;-----------------------------------------------------------------------------
; SCATTER_WHT a, b, idx
; Writes 16-bit values from r1w/r2w to [r0 + 2*16*(k+%3)], i.e. at 32-byte
; intervals: k = 0/1, 4/5, 8/9, 12/13.
; NOTE(review): the instructions that load r1/r2 from %1/%2 between the store
; pairs are not visible here.
1162 %macro SCATTER_WHT 3
1165 mov [r0+2*16*(0+%3)], r1w
1166 mov [r0+2*16*(1+%3)], r2w
1171 mov [r0+2*16*(4+%3)], r1w
1172 mov [r0+2*16*(5+%3)], r2w
1175 mov [r0+2*16*(8+%3)], r1w
1176 mov [r0+2*16*(9+%3)], r2w
1179 mov [r0+2*16*(12+%3)], r1w
1180 mov [r0+2*16*(13+%3)], r2w
; HADAMARD4_1D a, b, c, d
; One 1-D pass over four word registers, built from two sum/difference stages.
1183 %macro HADAMARD4_1D 4
1184 SUMSUB_BADC w, %2, %1, %4, %3
1185 SUMSUB_BADC w, %4, %2, %3, %1
; vp8_luma_dc_wht_<opt>: 1-D pass, 4x4 word transpose, second 1-D pass.
; NOTE(review): the enclosing %macro line and the output scatter are not
; visible here.
1190 cglobal vp8_luma_dc_wht_%1, 2,3
; NOTE(review): xmm0 is stored over both 16-byte halves of dc[]; presumably
; xmm0 is zero here (clearing the input) -- confirm.
1197 movaps [r1+ 0], xmm0
1198 movaps [r1+16], xmm0
1206 HADAMARD4_1D 0, 1, 2, 3
1207 TRANSPOSE4x4W 0, 1, 2, 3, 4
1209 HADAMARD4_1D 0, 1, 2, 3
1223 ;-----------------------------------------------------------------------------
1224 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
1225 ;-----------------------------------------------------------------------------
1227 ; macro called with 7 mm register indexes as argument, and 4 regular registers
1229 ; first 4 mm registers will carry the transposed pixel data
1230 ; the other three are scratchspace (one would be sufficient, but this allows
1231 ; for more spreading/pipelining and thus faster execution on OOE CPUs)
1233 ; first two regular registers are buf+4*stride and buf+5*stride
1234 ; third is -stride, fourth is +stride
; READ_8x4_INTERLEAVED m0..m6 (register indexes), p4, p5, -stride, +stride
; Loads 4 bytes from each of 8 rows and byte-interleaves them pairwise.
; Results: m%1 = A/B, m%2 = C/D, m%3 = E/F, m%4 = G/H; m%5-m%7 are clobbered.
1235 %macro READ_8x4_INTERLEAVED 11
1236 ; interleave 8 (A-H) rows of 4 pixels each
1237 movd m%1, [%8+%10*4] ; A0-3
1238 movd m%5, [%9+%10*4] ; B0-3
1239 movd m%2, [%8+%10*2] ; C0-3
1240 movd m%6, [%8+%10] ; D0-3
1241 movd m%3, [%8] ; E0-3
1242 movd m%7, [%9] ; F0-3
1243 movd m%4, [%9+%11] ; G0-3
1244 punpcklbw m%1, m%5 ; A/B interleaved
1245 movd m%5, [%9+%11*2] ; H0-3
1246 punpcklbw m%2, m%6 ; C/D interleaved
1247 punpcklbw m%3, m%7 ; E/F interleaved
1248 punpcklbw m%4, m%5 ; G/H interleaved
1251 ; macro called with 7 mm register indexes as argument, and 5 regular registers
1252 ; first 11 mean the same as READ_8x4_INTERLEAVED above
1253 ; fifth regular register is scratchspace to reach the bottom 8 rows, it
1254 ; will be set to second regular register + 8*stride at the end
; Results: m%1 = A/B/I/J, m%2 = C/D/K/L, m%3 = E/F/M/N, m%4 = G/H/O/P;
; m%5-m%7 are clobbered.
; NOTE(review): the updates of %12 between the two read groups are not visible
; here.
1255 %macro READ_16x4_INTERLEAVED 12
1256 ; interleave 16 (A-P) rows of 4 pixels each
1259 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
1260 movd m%1, [%8+%10*4] ; A0-3
1261 movd m%3, [%12+%10*4] ; I0-3
1262 movd m%2, [%8+%10*2] ; C0-3
1263 movd m%4, [%12+%10*2] ; K0-3
1264 movd m%6, [%8+%10] ; D0-3
1265 movd m%5, [%12+%10] ; L0-3
1266 movd m%7, [%12] ; M0-3
1268 punpcklbw m%1, m%3 ; A/I
1269 movd m%3, [%8] ; E0-3
1270 punpcklbw m%2, m%4 ; C/K
1271 punpcklbw m%6, m%5 ; D/L
1272 punpcklbw m%3, m%7 ; E/M
1273 punpcklbw m%2, m%6 ; C/D/K/L interleaved
1275 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
1276 movd m%5, [%9+%10*4] ; B0-3
1277 movd m%4, [%12+%10*4] ; J0-3
1278 movd m%7, [%9] ; F0-3
1279 movd m%6, [%12] ; N0-3
1280 punpcklbw m%5, m%4 ; B/J
1281 punpcklbw m%7, m%6 ; F/N
1282 punpcklbw m%1, m%5 ; A/B/I/J interleaved
1283 punpcklbw m%3, m%7 ; E/F/M/N interleaved
1284 movd m%4, [%9+%11] ; G0-3
1285 movd m%6, [%12+%11] ; O0-3
1286 movd m%5, [%9+%11*2] ; H0-3
1287 movd m%7, [%12+%11*2] ; P0-3
1288 punpcklbw m%4, m%6 ; G/O
1289 punpcklbw m%5, m%7 ; H/P
1290 punpcklbw m%4, m%5 ; G/H/O/P interleaved
1293 ; write 4 mm registers of 2 dwords each
1294 ; first four arguments are mm register indexes containing source data
1295 ; last four are registers containing buf+4*stride, buf+5*stride,
1296 ; -stride and +stride
1298 ; write out (2 dwords per register)
1313 ; write 4 xmm registers of 4 dwords each
1314 ; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
1315 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
1316 ; we add 1*stride to the third regular register in the process
1317 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
1318 ; same memory region), or 8 if they cover two separate buffers (third one points to
1319 ; a different memory region than the first two), allowing for more optimal code for
; NOTE(review): the sentence above is cut off in the visible text -- the case it
; refers to needs to be restored from the original source.
1321 %macro WRITE_4x4D 10
1322 ; write out (4 dwords per register), start with dwords zero
1373 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
1374 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
; NOTE(review): argument 3 is described twice below; the two descriptions
; presumably apply to different variants of the macro (a GPR-clobbering one
; vs. the pointer-based SSE4 one) -- confirm and label them accordingly.
1376 ; 3 is a general-purpose register that we will clobber
1378 ; 3 is a pointer to the destination's 5th line
1379 ; 4 is a pointer to the destination's 4th line
1380 ; 5/6 is -stride and +stride
; WRITE_8W_SSE2: only the macro header is visible here.
1409 %macro WRITE_8W_SSE2 5
; WRITE_8W_SSE4 reg, ptrA, ptrB, -stride, +stride
; Extracts words of %1 straight to memory with pextrw; %2/%3 are base
; pointers, %4/%5 the negative/positive stride.
1437 %macro WRITE_8W_SSE4 5
1438 pextrw [%3+%4*4], %1, 0
1439 pextrw [%2+%4*4], %1, 1
1440 pextrw [%3+%4*2], %1, 2
1441 pextrw [%3+%4 ], %1, 3
1444 pextrw [%2+%5 ], %1, 6
1445 pextrw [%2+%5*2], %1, 7
; SPLATB_REG_<isa> dst, gpr[, tmp]
; Per-ISA implementations selected through "%define SPLATB_REG ..." before each
; group of loop-filter instantiations; used below to splat a byte argument
; (e.g. "flim") into a vector register. Only the macro headers are visible here.
; The MMX/MMXEXT/SSE2 variants take 2-3 arguments, the SSSE3 variant exactly 3.
1448 %macro SPLATB_REG_MMX 2-3
1455 %macro SPLATB_REG_MMXEXT 2-3
1461 %macro SPLATB_REG_SSE2 2-3
1468 %macro SPLATB_REG_SSSE3 3
; SIMPLE_LOOPFILTER opt, dir, nregs, nxmm
; Emits vp8_<dir>_loop_filter_simple_<opt>.
;   %1 = ISA suffix, %2 = v or h, %3/%4 = second/third cglobal arguments.
; Visible stages: splat flim, read p1/p0/q0/q1 (directly, or via an
; interleaved read plus transpose), build the filter mask from
; abs(p0-q0)*2 + abs(p1-q1)/2 <= flim, apply filter_common, write back.
; NOTE(review): several %else/%endif lines and intermediate instructions are
; not visible here; the notes below only describe what is shown.
1473 %macro SIMPLE_LOOPFILTER 4
1474 cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
1475 %if mmsize == 8 ; mmx/mmxext
1483 SPLATB_REG m7, r2, m0 ; splat "flim" into register
1485 ; set up indexes to address 4 rows
1492 %if mmsize == 8 ; mmx / mmxext
1496 ; read 4 half/full rows of pixels
1497 mova m0, [r0+r1*2] ; p1
1498 mova m1, [r0+r1] ; p0
1500 mova m3, [r0+r2] ; q1
1504 %if mmsize == 8 ; mmx/mmxext
1505 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
1507 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
1509 TRANSPOSE4x4W 0, 1, 2, 3, 4
1513 mova m5, m2 ; m5=backup of q0
1514 mova m6, m1 ; m6=backup of p0
1515 psubusb m1, m2 ; p0-q0
1516 psubusb m2, m6 ; q0-p0
1517 por m1, m2 ; FFABS(p0-q0)
1518 paddusb m1, m1 ; m1=FFABS(p0-q0)*2
1522 psubusb m3, m0 ; q1-p1
1523 psubusb m0, m4 ; p1-q1
1524 por m3, m0 ; FFABS(p1-q1)
1528 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
1530 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
1534 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
1536 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
1540 psubsb m5, m0 ; q0-p0 (signed)
1543 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
1544 pand m2, m3 ; apply filter mask (m3)
1548 paddsb m2, [pb_4] ; f1<<3=a+4
1549 paddsb m1, [pb_3] ; f2<<3=a+3
1551 pand m1, m3 ; cache f2<<3
1555 pcmpgtb m0, m2 ; which values are <0?
1556 psubb m3, m2 ; -f1<<3
1562 paddusb m4, m3 ; q0-f1
1566 pcmpgtb m0, m1 ; which values are <0?
1567 psubb m3, m1 ; -f2<<3
1573 psubusb m6, m3 ; p0+f2
1581 SBUTTERFLY bw, 6, 4, 0 ; interleave the filtered p0/q0 bytes for the write-out
1583 %if mmsize == 16 ; sse2
1587 WRITE_8W m6, r4, r0, r1, r2
1592 WRITE_8W m4, r3, r4, r1, r2
1594 WRITE_2x4W m6, m4, r4, r0, r1, r2
1598 %if mmsize == 8 ; mmx/mmxext
1601 add r0, 8 ; advance 8 cols = pixels
1603 lea r0, [r0+r2*8-1] ; advance 8 rows = lines
; Instantiations: SPLATB_REG / WRITE_8W are re-%defined before each group so
; the macro body picks up the matching per-ISA helper.
1614 %define SPLATB_REG SPLATB_REG_MMX
1615 SIMPLE_LOOPFILTER mmx, v, 4, 0
1616 SIMPLE_LOOPFILTER mmx, h, 5, 0
1617 %define SPLATB_REG SPLATB_REG_MMXEXT
1618 SIMPLE_LOOPFILTER mmxext, v, 4, 0
1619 SIMPLE_LOOPFILTER mmxext, h, 5, 0
1621 %define SPLATB_REG SPLATB_REG_SSE2
1622 %define WRITE_8W WRITE_8W_SSE2
1623 SIMPLE_LOOPFILTER sse2, v, 3, 8
1624 SIMPLE_LOOPFILTER sse2, h, 5, 8
1625 %define SPLATB_REG SPLATB_REG_SSSE3
1626 SIMPLE_LOOPFILTER ssse3, v, 3, 8
1627 SIMPLE_LOOPFILTER ssse3, h, 5, 8
1628 %define WRITE_8W WRITE_8W_SSE4
1629 SIMPLE_LOOPFILTER sse4, h, 5, 8 ; only the h variant gets an SSE4 build (WRITE_8W differs)
1631 ;-----------------------------------------------------------------------------
1632 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
1633 ; int flimE, int flimI, int hev_thr);
1634 ;-----------------------------------------------------------------------------
; INNER_LOOPFILTER opt, dir, nregs, size, nxmm
; Emits vp8_<dir>_loop_filter16y_inner_<opt> (size == 16, luma) or
; vp8_<dir>_loop_filter8uv_inner_<opt> (size == 8, chroma).
;   %1 = ISA suffix, %2 = v or h, %3 = second cglobal argument,
;   %4 = 8 or 16, %5 = third cglobal argument.
; Visible stages: name the argument registers, splat E/I/hev_thresh (to an
; aligned stack area when m8 is not defined, to registers otherwise), load the
; p3..q3 rows (with a transpose for the h direction), build the normal_limit
; and high-edge-variance masks, run filter_common, and write p1..q1 back.
; NOTE(review): many %else/%endif lines, loop labels and intermediate
; instructions are not visible here; notes below only describe what is shown.
1636 %macro INNER_LOOPFILTER 5
1637 %if %4 == 8 ; chroma
1638 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
1640 %define mstride_reg r2
1643 %define hev_thr_reg r5
1645 cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
1646 %define mstride_reg r1
1649 %define hev_thr_reg r4
1650 %ifdef m8 ; x86-64, sse2
1652 %elif mmsize == 16 ; x86-32, sse2
1654 %else ; x86-32, mmx/mmxext
1659 %define stride_reg E_reg
1660 %define dst2_reg I_reg
1662 %define stack_reg hev_thr_reg
1671 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
1672 ; splat function arguments
1673 SPLATB_REG m0, E_reg, m7 ; E
1674 SPLATB_REG m1, I_reg, m7 ; I
1675 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh
1678 mov stack_reg, rsp ; backup stack pointer
1679 and rsp, ~(mmsize-1) ; align stack
1681 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
1684 sub rsp, mmsize * 5 ; extra storage space for transposes
1687 %define flim_E [rsp]
1688 %define flim_I [rsp+mmsize]
1689 %define hev_thr [rsp+mmsize*2]
1690 %define mask_res [rsp+mmsize*3]
1691 %define p0backup [rsp+mmsize*3] ; shares its slot with mask_res
1692 %define q0backup [rsp+mmsize*4]
1698 %else ; sse2 on x86-64
1703 %define mask_res m12
1704 %define p0backup m12 ; shares its register with mask_res
1707 ; splat function arguments
1708 SPLATB_REG flim_E, E_reg, m7 ; E
1709 SPLATB_REG flim_I, I_reg, m7 ; I
1710 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh
1713 %if mmsize == 8 && %4 == 16 ; mmx/mmxext
1716 mov stride_reg, mstride_reg
1719 lea dst_reg, [dst_reg + stride_reg*4-4]
1721 lea dst8_reg, [dst8_reg+ stride_reg*4-4]
1729 lea dst2_reg, [dst_reg + stride_reg]
1731 %if %4 == 8 && mmsize == 16
1736 movrow m0, [dst_reg +mstride_reg*4] ; p3
1737 movrow m1, [dst2_reg+mstride_reg*4] ; p2
1738 movrow m2, [dst_reg +mstride_reg*2] ; p1
1739 movrow m5, [dst2_reg] ; q1
1740 movrow m6, [dst2_reg+ stride_reg] ; q2
1741 movrow m7, [dst2_reg+ stride_reg*2] ; q3
1742 %if mmsize == 16 && %4 == 8
; chroma with 16-byte registers: the second plane (dst8_reg) fills the high halves
1743 movhps m0, [dst8_reg+mstride_reg*4]
1744 movhps m2, [dst8_reg+mstride_reg*2]
1745 add dst8_reg, stride_reg
1746 movhps m1, [dst8_reg+mstride_reg*4]
1747 movhps m5, [dst8_reg]
1748 movhps m6, [dst8_reg+ stride_reg]
1749 movhps m7, [dst8_reg+ stride_reg*2]
1750 add dst8_reg, mstride_reg ; undo the temporary +stride adjustment
1752 %elif mmsize == 8 ; mmx/mmxext (h)
1753 ; read 8 rows of 8px each
1754 movu m0, [dst_reg +mstride_reg*4]
1755 movu m1, [dst2_reg+mstride_reg*4]
1756 movu m2, [dst_reg +mstride_reg*2]
1757 movu m3, [dst_reg +mstride_reg]
1760 movu m6, [dst2_reg+ stride_reg]
1763 TRANSPOSE4x4B 0, 1, 2, 3, 7
1765 movu m7, [dst2_reg+ stride_reg*2]
1766 TRANSPOSE4x4B 4, 5, 6, 7, 1
1767 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1768 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1769 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1771 mova q0backup, m2 ; store q0
1772 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1773 mova p0backup, m5 ; store p0
1780 lea dst8_reg, [dst_reg + stride_reg*8]
1783 ; read 16 rows of 8px each, interleave
1784 movh m0, [dst_reg +mstride_reg*4]
1785 movh m1, [dst8_reg+mstride_reg*4]
1786 movh m2, [dst_reg +mstride_reg*2]
1787 movh m5, [dst8_reg+mstride_reg*2]
1788 movh m3, [dst_reg +mstride_reg]
1789 movh m6, [dst8_reg+mstride_reg]
1792 punpcklbw m0, m1 ; A/I
1793 punpcklbw m2, m5 ; C/K
1794 punpcklbw m3, m6 ; D/L
1795 punpcklbw m4, m7 ; E/M
1797 add dst8_reg, stride_reg
1798 movh m1, [dst2_reg+mstride_reg*4]
1799 movh m6, [dst8_reg+mstride_reg*4]
1802 punpcklbw m1, m6 ; B/J
1803 punpcklbw m5, m7 ; F/N
1804 movh m6, [dst2_reg+ stride_reg]
1805 movh m7, [dst8_reg+ stride_reg]
1806 punpcklbw m6, m7 ; G/O
1809 TRANSPOSE4x4B 0, 1, 2, 3, 7
1815 movh m7, [dst2_reg+ stride_reg*2]
1816 movh m1, [dst8_reg+ stride_reg*2]
1817 punpcklbw m7, m1 ; H/P
1818 TRANSPOSE4x4B 4, 5, 6, 7, 1
1819 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1820 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1821 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1827 mova q0backup, m2 ; store q0
1829 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1833 mova p0backup, m5 ; store p0
1841 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
; each abs() below is the OR of the two saturating unsigned differences
1844 psubusb m4, m0 ; p2-p3
1845 psubusb m0, m1 ; p3-p2
1846 por m0, m4 ; abs(p3-p2)
1850 psubusb m4, m1 ; p1-p2
1851 psubusb m1, m2 ; p2-p1
1852 por m1, m4 ; abs(p2-p1)
1856 psubusb m4, m7 ; q2-q3
1857 psubusb m7, m6 ; q3-q2
1858 por m7, m4 ; abs(q3-q2)
1862 psubusb m4, m6 ; q1-q2
1863 psubusb m6, m5 ; q2-q1
1864 por m6, m4 ; abs(q2-q1)
1873 pcmpeqb m0, m3 ; abs(p3-p2) <= I
1874 pcmpeqb m1, m3 ; abs(p2-p1) <= I
1875 pcmpeqb m7, m3 ; abs(q3-q2) <= I
1876 pcmpeqb m6, m3 ; abs(q2-q1) <= I
1886 ; normal_limit and high_edge_variance for p1-p0, q1-q0
1887 SWAP 7, 3 ; now m7 is zero
1889 movrow m3, [dst_reg +mstride_reg] ; p0
1890 %if mmsize == 16 && %4 == 8
1891 movhps m3, [dst8_reg+mstride_reg]
1903 psubusb m1, m3 ; p1-p0
1904 psubusb m6, m2 ; p0-p1
1905 por m1, m6 ; abs(p1-p0)
1910 pcmpeqb m1, m7 ; abs(p1-p0) <= I
1911 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
1915 pmaxub m0, m1 ; max_I
1916 SWAP 1, 4 ; max_hev_thresh
1919 SWAP 6, 4 ; now m6 is I
1921 movrow m4, [dst_reg] ; q0
1922 %if mmsize == 16 && %4 == 8
1923 movhps m4, [dst8_reg]
1934 psubusb m1, m5 ; q0-q1
1935 psubusb m7, m4 ; q1-q0
1936 por m1, m7 ; abs(q1-q0)
1942 pcmpeqb m1, m6 ; abs(q1-q0) <= I
1943 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
1945 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
1953 pcmpeqb m0, m7 ; max(abs(..)) <= I
1954 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
1959 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
1965 mova m6, m4 ; keep copies of p0/q0 around for later use
1967 psubusb m1, m4 ; p0-q0
1968 psubusb m6, m3 ; q0-p0
1969 por m1, m6 ; abs(q0-p0)
1970 paddusb m1, m1 ; m1=2*abs(q0-p0)
1976 psubusb m7, m5 ; p1-q1
1977 psubusb m6, m2 ; q1-p1
1978 por m7, m6 ; abs(q1-p1)
1981 psrlq m7, 1 ; abs(q1-p1)/2
1982 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
1984 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
1985 pand m0, m7 ; normal_limit result
1987 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
1988 %ifdef m8 ; x86-64 && sse2
1990 %define pb_80_var m8
1991 %else ; x86-32 or mmx/mmxext
1992 %define pb_80_var [pb_80]
1998 psubsb m1, m7 ; (signed) q0-p0
2003 psubsb m6, m7 ; (signed) p1-q1
2008 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
2027 paddusb m3, m1 ; p0+f2
2038 paddusb m4, m1 ; q0-f1
2066 paddusb m5, m1 ; q1-a
2067 paddusb m2, m0 ; p1+a
; write-back of the four modified rows p1, p0, q0, q1
2071 movrow [dst_reg +mstride_reg*2], m2
2072 movrow [dst_reg +mstride_reg ], m3
2073 movrow [dst_reg], m4
2074 movrow [dst_reg + stride_reg ], m5
2075 %if mmsize == 16 && %4 == 8
2076 movhps [dst8_reg+mstride_reg*2], m2
2077 movhps [dst8_reg+mstride_reg ], m3
2078 movhps [dst8_reg], m4
2079 movhps [dst8_reg+ stride_reg ], m5
2086 TRANSPOSE4x4B 2, 3, 4, 5, 6 ; transpose p1..q1 back before the dword writes
2088 %if mmsize == 8 ; mmx/mmxext (h)
2089 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
2091 lea dst8_reg, [dst8_reg+mstride_reg+2]
2092 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
2097 %if %4 == 8 ; chroma
2101 cmp dst_reg, dst8_reg
2102 mov dst_reg, dst8_reg
2106 lea dst_reg, [dst_reg + stride_reg*8-2]
2115 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
2116 mov rsp, stack_reg ; restore stack pointer
; Instantiations: SPLATB_REG is re-%defined before each ISA group.
; NOTE(review): the sse2/ssse3 groups instantiate "h, ..., 16" twice (5 and 6
; registers); these are presumably alternative arms of a conditional whose
; %if/%else lines are not visible here -- confirm.
2122 %define SPLATB_REG SPLATB_REG_MMX
2123 INNER_LOOPFILTER mmx, v, 6, 16, 0
2124 INNER_LOOPFILTER mmx, h, 6, 16, 0
2125 INNER_LOOPFILTER mmx, v, 6, 8, 0
2126 INNER_LOOPFILTER mmx, h, 6, 8, 0
2128 %define SPLATB_REG SPLATB_REG_MMXEXT
2129 INNER_LOOPFILTER mmxext, v, 6, 16, 0
2130 INNER_LOOPFILTER mmxext, h, 6, 16, 0
2131 INNER_LOOPFILTER mmxext, v, 6, 8, 0
2132 INNER_LOOPFILTER mmxext, h, 6, 8, 0
2135 %define SPLATB_REG SPLATB_REG_SSE2
2136 INNER_LOOPFILTER sse2, v, 5, 16, 13
2138 INNER_LOOPFILTER sse2, h, 5, 16, 13
2140 INNER_LOOPFILTER sse2, h, 6, 16, 13
2142 INNER_LOOPFILTER sse2, v, 6, 8, 13
2143 INNER_LOOPFILTER sse2, h, 6, 8, 13
2145 %define SPLATB_REG SPLATB_REG_SSSE3
2146 INNER_LOOPFILTER ssse3, v, 5, 16, 13
2148 INNER_LOOPFILTER ssse3, h, 5, 16, 13
2150 INNER_LOOPFILTER ssse3, h, 6, 16, 13
2152 INNER_LOOPFILTER ssse3, v, 6, 8, 13
2153 INNER_LOOPFILTER ssse3, h, 6, 8, 13
2155 ;-----------------------------------------------------------------------------
2156 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
2157 ; int flimE, int flimI, int hev_thr);
2158 ;-----------------------------------------------------------------------------
2160 %macro MBEDGE_LOOPFILTER 5
2161 %if %4 == 8 ; chroma
2162 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
2164 %define mstride_reg r2
2167 %define hev_thr_reg r5
2169 cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
2170 %define mstride_reg r1
2173 %define hev_thr_reg r4
2174 %ifdef m8 ; x86-64, sse2
2176 %elif mmsize == 16 ; x86-32, sse2
2178 %else ; x86-32, mmx/mmxext
2183 %define stride_reg E_reg
2184 %define dst2_reg I_reg
2186 %define stack_reg hev_thr_reg
2189 %define ssse3_or_higher 0
2192 %define ssse3_or_higher 1
2200 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
2201 ; splat function arguments
2202 SPLATB_REG m0, E_reg, m7 ; E
2203 SPLATB_REG m1, I_reg, m7 ; I
2204 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh
2207 mov stack_reg, rsp ; backup stack pointer
2208 and rsp, ~(mmsize-1) ; align stack
2212 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
2214 ; [4]=filter tmp result
2215 ; [5]/[6] = p2/q2 backup
2216 ; [7]=lim_res sign result
2219 %define flim_E [rsp]
2220 %define flim_I [rsp+mmsize]
2221 %define hev_thr [rsp+mmsize*2]
2222 %define mask_res [rsp+mmsize*3]
2223 %define lim_res [rsp+mmsize*4]
2224 %define p0backup [rsp+mmsize*3]
2225 %define q0backup [rsp+mmsize*4]
2226 %define p2backup [rsp+mmsize*5]
2227 %define q2backup [rsp+mmsize*6]
2229 %define lim_sign [rsp]
2231 %define lim_sign [rsp+mmsize*7]
2238 %else ; sse2 on x86-64
2243 %define mask_res m12
2245 %define p0backup m12
2247 %define p2backup m13
2248 %define q2backup m14
2251 ; splat function arguments
2252 SPLATB_REG flim_E, E_reg, m7 ; E
2253 SPLATB_REG flim_I, I_reg, m7 ; I
2254 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh
2257 %if mmsize == 8 && %4 == 16 ; mmx/mmxext
2260 mov stride_reg, mstride_reg
2263 lea dst_reg, [dst_reg + stride_reg*4-4]
2265 lea dst8_reg, [dst8_reg+ stride_reg*4-4]
2273 lea dst2_reg, [dst_reg + stride_reg]
2275 %if %4 == 8 && mmsize == 16
2280 movrow m0, [dst_reg +mstride_reg*4] ; p3
2281 movrow m1, [dst2_reg+mstride_reg*4] ; p2
2282 movrow m2, [dst_reg +mstride_reg*2] ; p1
2283 movrow m5, [dst2_reg] ; q1
2284 movrow m6, [dst2_reg+ stride_reg] ; q2
2285 movrow m7, [dst2_reg+ stride_reg*2] ; q3
2286 %if mmsize == 16 && %4 == 8
2287 movhps m0, [dst8_reg+mstride_reg*4]
2288 movhps m2, [dst8_reg+mstride_reg*2]
2289 add dst8_reg, stride_reg
2290 movhps m1, [dst8_reg+mstride_reg*4]
2291 movhps m5, [dst8_reg]
2292 movhps m6, [dst8_reg+ stride_reg]
2293 movhps m7, [dst8_reg+ stride_reg*2]
2294 add dst8_reg, mstride_reg
2296 %elif mmsize == 8 ; mmx/mmxext (h)
2297 ; read 8 rows of 8px each
2298 movu m0, [dst_reg +mstride_reg*4]
2299 movu m1, [dst2_reg+mstride_reg*4]
2300 movu m2, [dst_reg +mstride_reg*2]
2301 movu m3, [dst_reg +mstride_reg]
2304 movu m6, [dst2_reg+ stride_reg]
2307 TRANSPOSE4x4B 0, 1, 2, 3, 7
2309 movu m7, [dst2_reg+ stride_reg*2]
2310 TRANSPOSE4x4B 4, 5, 6, 7, 1
2311 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
2312 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
2313 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
2315 mova q0backup, m2 ; store q0
2316 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
2317 mova p0backup, m5 ; store p0
2324 lea dst8_reg, [dst_reg + stride_reg*8]
2327 ; read 16 rows of 8px each, interleave
2328 movh m0, [dst_reg +mstride_reg*4]
2329 movh m1, [dst8_reg+mstride_reg*4]
2330 movh m2, [dst_reg +mstride_reg*2]
2331 movh m5, [dst8_reg+mstride_reg*2]
2332 movh m3, [dst_reg +mstride_reg]
2333 movh m6, [dst8_reg+mstride_reg]
2336 punpcklbw m0, m1 ; A/I
2337 punpcklbw m2, m5 ; C/K
2338 punpcklbw m3, m6 ; D/L
2339 punpcklbw m4, m7 ; E/M
2341 add dst8_reg, stride_reg
2342 movh m1, [dst2_reg+mstride_reg*4]
2343 movh m6, [dst8_reg+mstride_reg*4]
2346 punpcklbw m1, m6 ; B/J
2347 punpcklbw m5, m7 ; F/N
2348 movh m6, [dst2_reg+ stride_reg]
2349 movh m7, [dst8_reg+ stride_reg]
2350 punpcklbw m6, m7 ; G/O
2353 TRANSPOSE4x4B 0, 1, 2, 3, 7
2359 movh m7, [dst2_reg+ stride_reg*2]
2360 movh m1, [dst8_reg+ stride_reg*2]
2361 punpcklbw m7, m1 ; H/P
2362 TRANSPOSE4x4B 4, 5, 6, 7, 1
2363 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
2364 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
2365 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
2371 mova q0backup, m2 ; store q0
2373 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
2377 mova p0backup, m5 ; store p0
2385 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
2388 psubusb m4, m0 ; p2-p3
2389 psubusb m0, m1 ; p3-p2
2390 por m0, m4 ; abs(p3-p2)
2394 psubusb m4, m1 ; p1-p2
2396 psubusb m1, m2 ; p2-p1
2397 por m1, m4 ; abs(p2-p1)
2401 psubusb m4, m7 ; q2-q3
2402 psubusb m7, m6 ; q3-q2
2403 por m7, m4 ; abs(q3-q2)
2407 psubusb m4, m6 ; q1-q2
2409 psubusb m6, m5 ; q2-q1
2410 por m6, m4 ; abs(q2-q1)
2419 pcmpeqb m0, m3 ; abs(p3-p2) <= I
2420 pcmpeqb m1, m3 ; abs(p2-p1) <= I
2421 pcmpeqb m7, m3 ; abs(q3-q2) <= I
2422 pcmpeqb m6, m3 ; abs(q2-q1) <= I
2432 ; normal_limit and high_edge_variance for p1-p0, q1-q0
2433 SWAP 7, 3 ; now m7 is zero
2435 movrow m3, [dst_reg +mstride_reg] ; p0
2436 %if mmsize == 16 && %4 == 8
2437 movhps m3, [dst8_reg+mstride_reg]
2449 psubusb m1, m3 ; p1-p0
2450 psubusb m6, m2 ; p0-p1
2451 por m1, m6 ; abs(p1-p0)
2456 pcmpeqb m1, m7 ; abs(p1-p0) <= I
2457 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
2461 pmaxub m0, m1 ; max_I
2462 SWAP 1, 4 ; max_hev_thresh
2465 SWAP 6, 4 ; now m6 is I
2467 movrow m4, [dst_reg] ; q0
2468 %if mmsize == 16 && %4 == 8
2469 movhps m4, [dst8_reg]
2480 psubusb m1, m5 ; q0-q1
2481 psubusb m7, m4 ; q1-q0
2482 por m1, m7 ; abs(q1-q0)
2488 pcmpeqb m1, m6 ; abs(q1-q0) <= I
2489 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
2491 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
2499 pcmpeqb m0, m7 ; max(abs(..)) <= I
2500 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
2505 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
2511 mova m6, m4 ; keep copies of p0/q0 around for later use
2513 psubusb m1, m4 ; p0-q0
2514 psubusb m6, m3 ; q0-p0
2515 por m1, m6 ; abs(q0-p0)
2516 paddusb m1, m1 ; m1=2*abs(q0-p0)
2522 psubusb m7, m5 ; p1-q1
2523 psubusb m6, m2 ; q1-p1
2524 por m7, m6 ; abs(q1-p1)
2527 psrlq m7, 1 ; abs(q1-p1)/2
2528 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
2530 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
2531 pand m0, m7 ; normal_limit result
2533 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
2534 %ifdef m8 ; x86-64 && sse2
2536 %define pb_80_var m8
2537 %else ; x86-32 or mmx/mmxext
2538 %define pb_80_var [pb_80]
2544 psubsb m1, m7 ; (signed) q0-p0
2549 psubsb m6, m7 ; (signed) p1-q1
2556 mova lim_res, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge
2563 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
2581 paddusb m3, m1 ; p0+f2
2592 paddusb m4, m1 ; q0-f1
2594 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
2607 pcmpgtb m0, m1 ; which are negative
2609 punpcklbw m6, m7 ; interleave with "1" for rounding
2612 punpcklbw m6, m0 ; signed byte->word
2622 SWAP 0, 10 ; don't lose lim_sign copy
2635 mova mask_res, m6 ; backup for later in filter
2644 packsswb m6, m1 ; a0
2650 mova m6, [pb_18_63] ; pipelining
2654 paddusb m3, m0 ; p0+a0
2655 psubusb m4, m0 ; q0-a0
2684 packsswb m6, m1 ; a1
2694 paddusb m2, m0 ; p1+a1
2695 psubusb m5, m0 ; q1-a1
2729 packsswb m6, m1 ; a1
2743 paddusb m1, m7 ; p1+a1
2744 psubusb m6, m7 ; q1-a1
2748 movrow [dst2_reg+mstride_reg*4], m1
2749 movrow [dst_reg +mstride_reg*2], m2
2750 movrow [dst_reg +mstride_reg ], m3
2751 movrow [dst_reg], m4
2752 movrow [dst2_reg], m5
2753 movrow [dst2_reg+ stride_reg ], m6
2754 %if mmsize == 16 && %4 == 8
2755 add dst8_reg, mstride_reg
2756 movhps [dst8_reg+mstride_reg*2], m1
2757 movhps [dst8_reg+mstride_reg ], m2
2758 movhps [dst8_reg], m3
2759 add dst8_reg, stride_reg
2760 movhps [dst8_reg], m4
2761 movhps [dst8_reg+ stride_reg ], m5
2762 movhps [dst8_reg+ stride_reg*2], m6
2769 TRANSPOSE4x4B 1, 2, 3, 4, 0
2770 SBUTTERFLY bw, 5, 6, 0
2772 %if mmsize == 8 ; mmx/mmxext (h)
2773 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
2775 WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
2777 lea dst8_reg, [dst8_reg+mstride_reg+1]
2778 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
2779 lea dst_reg, [dst2_reg+mstride_reg+4]
2780 lea dst8_reg, [dst8_reg+mstride_reg+4]
2784 WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg
2786 lea dst2_reg, [dst8_reg+ stride_reg]
2788 WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
2793 %if %4 == 8 ; chroma
2797 cmp dst_reg, dst8_reg
2798 mov dst_reg, dst8_reg
2802 lea dst_reg, [dst_reg + stride_reg*8-5]
2811 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
2812 mov rsp, stack_reg ; restore stack pointer
; Instantiations of the MBEDGE_LOOPFILTER macro, one per instruction set
; and variant.  Arguments as written below: instruction-set name, v or h,
; 5 or 6, 16 or 8, and 0 or 15.  The macro body compares the fourth
; argument against 16 and 8 and labels the 8 case "chroma".  The v/h letter
; presumably selects a vertical or horizontal edge, and the third and fifth
; arguments are presumably register counts -- TODO confirm against the
; macro header.
;
; SPLATB_REG and WRITE_8W are invoked by name inside the macro body, so each
; %define below picks the implementation used by the instantiations that
; follow it.  The order of these lines therefore matters.

; --- mmx: byte splat via SPLATB_REG_MMX ---
2818 %define SPLATB_REG SPLATB_REG_MMX
2819 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0
2820 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0
2821 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0
2822 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0
; --- mmxext: same four variants, byte splat via SPLATB_REG_MMXEXT ---
2824 %define SPLATB_REG SPLATB_REG_MMXEXT
2825 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
2826 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
2827 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
2828 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0
; --- sse2: SPLATB_REG_SSE2 and WRITE_8W_SSE2; last argument becomes 15 ---
; NOTE(review): "h, 5, 16" and "h, 6, 16" are both instantiated for the same
; instruction set here and in the ssse3/sse4 groups below.  They look like
; alternatives that a conditional would choose between (the macro body tests
; m8 with %ifdef/%ifndef) -- confirm that no %ifdef/%else/%endif lines are
; missing around them.
2831 %define SPLATB_REG SPLATB_REG_SSE2
2832 %define WRITE_8W WRITE_8W_SSE2
2833 MBEDGE_LOOPFILTER sse2, v, 5, 16, 15
2835 MBEDGE_LOOPFILTER sse2, h, 5, 16, 15
2837 MBEDGE_LOOPFILTER sse2, h, 6, 16, 15
2839 MBEDGE_LOOPFILTER sse2, v, 6, 8, 15
2840 MBEDGE_LOOPFILTER sse2, h, 6, 8, 15
; --- ssse3: only SPLATB_REG changes; WRITE_8W stays WRITE_8W_SSE2 ---
2842 %define SPLATB_REG SPLATB_REG_SSSE3
2843 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15
2845 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15
2847 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15
2849 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15
2850 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15
; --- sse4: only WRITE_8W changes (SPLATB_REG stays SPLATB_REG_SSSE3), and
; only h variants are instantiated ---
2852 %define WRITE_8W WRITE_8W_SSE4
2854 MBEDGE_LOOPFILTER sse4, h, 5, 16, 15
2856 MBEDGE_LOOPFILTER sse4, h, 6, 16, 15
2858 MBEDGE_LOOPFILTER sse4, h, 6, 8, 15