;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
fourtap_filter_hw_m:  times 4 dw  -6, 123
sixtap_filter_hw_m:   times 4 dw   2, -11
fourtap_filter_hb_m:  times 8 db  -6, 123
sixtap_filter_hb_m:   times 8 db   2,   1
fourtap_filter_v_m:   times 8 dw  -6
sixtap_filter_v_m:    times 8 dw   2
bilinear_filter_vw_m: times 8 dw   1
bilinear_filter_vb_m: times 8 db   7,   1
%define fourtap_filter_hw  r11
%define sixtap_filter_hw   r11
%define fourtap_filter_hb  r11
%define sixtap_filter_hb   r11
%define fourtap_filter_v   r11
%define sixtap_filter_v    r11
%define bilinear_filter_vw r11
%define bilinear_filter_vb r11
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
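
; The filter_h*_shuf masks are pshufb source indexes that place the adjacent
; pixel pairs of a row next to each other, so one pmaddubsw against the
; db-interleaved tap pairs above yields one word per output pixel. E.g. with
; src bytes {a,b,c,d,e,f,g,h,i}, filter_h2_shuf produces
; {a,b, b,c, c,d, d,e, e,f, f,g, g,h, h,i}.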
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height, int mx, int my);
;-----------------------------------------------------------------------------
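
; As a reference for the SIMD code below, the scalar per-pixel operation is
; roughly (sketch only; taps F[0..5] come from the VP8 subpel filter table
; selected by mx/my, and the 4-tap filters use only F[1..4]):
;
;   dst[x] = clip_uint8((F[0] * src[x-2] + F[1] * src[x-1] +
;                        F[2] * src[x]   + F[3] * src[x+1] +
;                        F[4] * src[x+2] + F[5] * src[x+3] + 64) >> 7);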
%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
    lea       r11, [sixtap_filter_hb_m]
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

    ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
    ; shuffle with a memory operand
    pshufb    m0, [filter_h6_shuf1]
    movh      [r0], m0            ; store
cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
    lea       r11, [fourtap_filter_hb_m]
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]
    movh      [r0], m0            ; store
cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    lea       r11, [fourtap_filter_hb_m]
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    movh      m3, [r2+2*r3]       ; read new row

cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea       r11, [sixtap_filter_hb_m]
    lea       r6, [sixtap_filter_hb+r6*8]
    movh      m5, [r2+2*r3]       ; read new row
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
; 4x4 block, H-only 4-tap filter
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    lea       r11, [fourtap_filter_hw_m]
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm1, [r2-1]         ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1            ; byte ABCD..
    punpcklbw mm1, mm6            ; byte->word ABCD
    pshufw    mm0, mm2, 0x9       ; byte CDEF..
    punpcklbw mm0, mm6            ; byte->word CDEF
    pshufw    mm3, mm1, 0x94      ; word ABBC
    pshufw    mm1, mm0, 0x94      ; word CDDE
    pmaddwd   mm3, mm4            ; multiply 2px with F0/F1
    movq      mm0, mm1            ; backup for second set of pixels
    pmaddwd   mm1, mm5            ; multiply 2px with F2/F3
    paddd     mm3, mm1            ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6            ; byte->word EFGH
    pmaddwd   mm0, mm4            ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94      ; word EFFG
    pmaddwd   mm1, mm5            ; multiply 2px with F2/F3
    paddd     mm0, mm1            ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0            ; merge dword->word (4px)
    paddsw    mm3, mm7            ; rounding
    packuswb  mm3, mm6            ; clip and word->bytes
    movd      [r0], mm3           ; store
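
; Note on the pshufw immediates above: each 2-bit field (LSBs first) selects
; one source word. 0x94 = 10'01'01'00b picks words {0,1,1,2}, turning word
; ABCD into ABBC; 0x9 = 00'00'10'01b picks {1,2,0,0}, so the low half of a
; byte-packed ABCDEFGH register becomes CDEF.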
; 4x4 block, H-only 6-tap filter
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r11, [sixtap_filter_hw_m]
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm1, [r2-2]         ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1            ; byte ABCD..
    punpcklbw mm1, mm3            ; byte->word ABCD
    pshufw    mm0, mm2, 0x9       ; byte CDEF..
    punpckhbw mm2, mm3            ; byte->word EFGH
    punpcklbw mm0, mm3            ; byte->word CDEF
    pshufw    mm1, mm1, 0x94      ; word ABBC
    pshufw    mm2, mm2, 0x94      ; word EFFG
    pmaddwd   mm1, mm4            ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94      ; word CDDE
    movq      mm0, mm3            ; backup for second set of pixels
    pmaddwd   mm3, mm5            ; multiply 2px with F2/F3
    paddd     mm1, mm3            ; add to 1st 2px cache
    movq      mm3, mm2            ; backup for second set of pixels
    pmaddwd   mm2, mm6            ; multiply 2px with F4/F5
    paddd     mm1, mm2            ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]         ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4            ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5            ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3            ; add to 2nd 2px cache
    punpcklbw mm2, mm3            ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9      ; word GHHI
    pmaddwd   mm2, mm6            ; multiply 2px with F4/F5
    paddd     mm0, mm2            ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0            ; merge dword->word (4px)
    paddsw    mm1, mm7            ; rounding
    packuswb  mm1, mm3            ; clip and word->bytes
    movd      [r0], mm1           ; store
; 8x4 block, H-only 4-tap filter
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    lea       r11, [fourtap_filter_hw_m]
    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    punpcklbw m0, m7              ; ABCDEFGH
    psrldq    m1, 2               ; BCDEFGH
    psrldq    m2, 4               ; CDEFGH
    punpcklwd m0, m1              ; ABBCCDDE
    punpcklwd m2, m3              ; CDDEEFFG
    punpcklbw m1, m7              ; ABCDEFGH
    psrldq    m2, 2               ; BCDEFGH
    psrldq    m3, 4               ; CDEFGH
    punpcklwd m1, m2              ; ABBCCDDE
    punpcklwd m3, m4              ; CDDEEFFG
    movh      [r0], m0            ; store
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea       r11, [sixtap_filter_hw_m]
    lea       r5, [sixtap_filter_hw+r5*8]
    punpcklbw m0, m7              ; ABCDEFGH
    psrldq    m1, 2               ; BCDEFGH
    psrldq    m2, 4               ; CDEFGH
    punpcklbw m4, m7              ; EFGH
    punpcklwd m0, m1              ; ABBCCDDE
    punpcklwd m2, m3              ; CDDEEFFG
    punpcklwd m4, m5              ; EFFGGHHI
    punpcklbw m6, m7              ; ABCDEFGH
    psrldq    m1, 2               ; BCDEFGH
    psrldq    m2, 4               ; CDEFGH
    punpcklbw m4, m7              ; EFGH
    punpcklwd m6, m1              ; ABBCCDDE
    punpcklwd m2, m3              ; CDDEEFFG
    punpcklwd m4, m5              ; EFFGGHHI
    movh      [r0], m0            ; store
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    lea       r11, [fourtap_filter_v_m]
    lea       r6, [fourtap_filter_v+r6-32]

    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]       ; read new row

    ; then calculate positive taps

; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    lea       r11, [sixtap_filter_v_m]
    lea       r6, [sixtap_filter_v+r6-96]

    ; first calculate negative taps (to prevent losing positive overflows)

    ; then calculate positive taps
    movh      m5, [r2+2*r3]       ; read new row

FILTER_V mmxext, 4, 0
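
; The bilinear functions compute, per pixel (scalar sketch matching the 3-bit
; weights in bilinear_filter_vw/vb above; names illustrative):
;
;   int a = 8 - mx, b = mx;       // my instead of mx for the _v variants
;   dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;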
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7, 7, %3
    lea       r11, [bilinear_filter_vw_m]
    mova      m4, [bilinear_filter_vw+r5-16]
    mova      m5, [bilinear_filter_vw+r6-16]

cglobal put_vp8_bilinear%2_h_%1, 7, 7, %3
    lea       r11, [bilinear_filter_vw_m]
    mova      m4, [bilinear_filter_vw+r6-16]
    mova      m5, [bilinear_filter_vw+r5-16]

FILTER_BILINEAR mmxext, 4, 0
FILTER_BILINEAR sse2,   8, 7
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7, 7
    lea       r11, [bilinear_filter_vb_m]
    mova      m3, [bilinear_filter_vb+r6-16]

cglobal put_vp8_bilinear%1_h_ssse3, 7, 7
    lea       r11, [bilinear_filter_vb_m]
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16]

FILTER_BILINEAR_SSSE3 4
FILTER_BILINEAR_SSSE3 8
cglobal put_vp8_pixels8_mmx, 5, 5

cglobal put_vp8_pixels16_mmx, 5, 5
    movq      mm0, [r2+r3*0+0]
    movq      mm1, [r2+r3*0+8]
    movq      mm2, [r2+r3*1+0]
    movq      mm3, [r2+r3*1+8]
    movq      [r0+r1*0+0], mm0
    movq      [r0+r1*0+8], mm1
    movq      [r0+r1*1+0], mm2
    movq      [r0+r1*1+8], mm3

cglobal put_vp8_pixels16_sse, 5, 5, 2
    movups    xmm0, [r2+r3*0]
    movups    xmm1, [r2+r3*1]
    movaps    [r0+r1*0], xmm0
    movaps    [r0+r1*1], xmm1
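    ; (movups/movaps: the source rows carry no alignment guarantee, while the
    ;  destination is assumed to be 16-byte aligned)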
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
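
; Scalar equivalent (sketch): the DC coefficient is rounded, then added to
; every pixel of the 4x4 block with unsigned saturation:
;
;   int dc = (block[0] + 4) >> 3;
;   for (y = 0; y < 4; y++)
;       for (x = 0; x < 4; x++)
;           dst[x + y * stride] = clip_uint8(dst[x + y * stride] + dc);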
cglobal vp8_idct_dc_add_mmx, 3, 3

cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    pshuflw    xmm0, xmm0, 0
    punpcklqdq xmm0, xmm0
    pextrd     [r0+r2], xmm2, 1
    pextrd     [r1+r2], xmm2, 3
;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
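;
; 35468/65536 ~= sqrt(2)*sin(pi/8) and (20091+65536)/65536 ~= sqrt(2)*cos(pi/8)
; are the VP8 transform constants. 35468 does not fit in a signed word, so
; pw_17734 (= 35468/2) is stored and the macro doubles the input first:
;   mul_35468(x) = pmulhw(x + x, 17734) == (x * 35468) >> 16
;   mul_20091(x) = x + pmulhw(x, 20091) == x + ((x * 20091) >> 16)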
%macro VP8_MULTIPLY_SUMSUB 4
    pmulhw    %3, m6              ; 20091(1)
    pmulhw    %4, m6              ; 20091(2)
    pmulhw    %1, m7              ; 35468(1)
    pmulhw    %2, m7              ; 35468(2)
; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5      ; t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6 ; t2, t3
    SUMSUB_BA           m%4, m%3, m%5      ; tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5      ; tmp1, tmp2

cglobal vp8_idct_add_mmx, 3, 3
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------
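
; Scalar view (sketch): the 16 luma DC coefficients are run through a 4x4
; Walsh-Hadamard transform (two HADAMARD4_1D passes with a transpose in
; between; the second pass rounds with (x + 3) >> 3), and result i is written
; back to coefficient 0 of luma block i. A DCTELEM is 2 bytes and each block
; holds 16 of them, hence the [r0+2*16*n] stores in SCATTER_WHT below.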
%macro SCATTER_WHT 1
    mov       [r0+2*16*0], r1w
    mov       [r0+2*16*1], r2w
    mov       [r0+2*16*2], r1w
    mov       [r0+2*16*3], r2w

%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1

cglobal vp8_luma_dc_wht_mmxext, 2, 3
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    HADAMARD4_1D  0, 1, 2, 3
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
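
; Scalar sketch of the simple loop filter (per the VP8 spec; clip_int8() is
; signed saturation, and the SIMD code biases pixels by 0x80 to do the signed
; math on unsigned bytes):
;
;   if (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= flim) {
;       int a  = clip_int8(clip_int8(p1 - q1) + 3 * (q0 - p0));
;       int f1 = clip_int8(a + 4) >> 3;
;       int f2 = clip_int8(a + 3) >> 3;
;       q0 = clip_uint8(q0 - f1);
;       p0 = clip_uint8(p0 + f2);
;   }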
; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd      m%1, [%8+%10*4]     ; A0-3
    movd      m%5, [%9+%10*4]     ; B0-3
    movd      m%2, [%8+%10*2]     ; C0-3
    movd      m%6, [%8+%10]       ; D0-3
    movd      m%3, [%8]           ; E0-3
    movd      m%7, [%9]           ; F0-3
    movd      m%4, [%9+%11]       ; G0-3
    punpcklbw m%1, m%5            ; A/B interleaved
    movd      m%5, [%9+%11*2]     ; H0-3
    punpcklbw m%2, m%6            ; C/D interleaved
    punpcklbw m%3, m%7            ; E/F interleaved
    punpcklbw m%4, m%5            ; G/H interleaved
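    ; the low halves of m%1-m%4 now hold the byte-interleaved row pairs
    ; A0 B0 A1 B1 A2 B2 A3 B3 / C,D / E,F / G,H, ready for the word-level
    ; transpose performed by the caller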
; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd      m%1, [%8+%10*4]     ; A0-3
    movd      m%3, [%12+%10*4]    ; I0-3
    movd      m%2, [%8+%10*2]     ; C0-3
    movd      m%4, [%12+%10*2]    ; K0-3
    movd      m%6, [%8+%10]       ; D0-3
    movd      m%5, [%12+%10]      ; L0-3
    movd      m%7, [%12]          ; M0-3
    punpcklbw m%1, m%3            ; A/I
    movd      m%3, [%8]           ; E0-3
    punpcklbw m%2, m%4            ; C/K
    punpcklbw m%6, m%5            ; D/L
    punpcklbw m%3, m%7            ; E/M
    punpcklbw m%2, m%6            ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd      m%5, [%9+%10*4]     ; B0-3
    movd      m%4, [%12+%10*4]    ; J0-3
    movd      m%7, [%9]           ; F0-3
    movd      m%6, [%12]          ; N0-3
    punpcklbw m%5, m%4            ; B/J
    punpcklbw m%7, m%6            ; F/N
    punpcklbw m%1, m%5            ; A/B/I/J interleaved
    punpcklbw m%3, m%7            ; E/F/M/N interleaved
    movd      m%4, [%9+%11]       ; G0-3
    movd      m%6, [%12+%11]      ; O0-3
    movd      m%5, [%9+%11*2]     ; H0-3
    movd      m%7, [%12+%11*2]    ; P0-3
    punpcklbw m%4, m%6            ; G/O
    punpcklbw m%5, m%7            ; H/P
    punpcklbw m%4, m%5            ; G/H/O/P interleaved
; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
    ; write out (2 dwords per register)

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the former (single-buffer) case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
%if mmsize == 16 ; sse2
%macro SIMPLE_LOOPFILTER 3
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
    mov       r5, rsp             ; backup stack pointer
    and       rsp, ~(mmsize-1)    ; align stack
%if mmsize == 8 ; mmx/mmxext
    SPLATB_REG m7, r2, %1         ; splat "flim" into register

    ; set up indexes to address 4 rows
    sub       rsp, mmsize*2       ; (aligned) storage space for saving p1/q1
%if mmsize == 8 ; mmx/mmxext

    ; read 4 half/full rows of pixels
    mova      m0, [r0+r1*2]       ; p1
    mova      m1, [r0+r1]         ; p0
    mova      m3, [r0+r2]         ; q1
%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
    TRANSPOSE4x4W         0, 1, 2, 3, 4
    mova      [rsp], m0           ; store p1
    mova      [rsp+mmsize], m3    ; store q1

    mova      m5, m2              ; m5=backup of q0
    mova      m6, m1              ; m6=backup of p0
    psubusb   m1, m2              ; p0-q0
    psubusb   m2, m6              ; q0-p0
    por       m1, m2              ; FFABS(p0-q0)
    paddusb   m1, m1              ; m1=FFABS(p0-q0)*2
    psubusb   m3, m0              ; q1-p1
    psubusb   m0, m4              ; p1-q1
    por       m3, m0              ; FFABS(p1-q1)
    psubsb    m2, m4              ; m2=p1-q1 (signed) backup for below
    psrlq     m3, 1               ; m3=FFABS(p1-q1)/2, this can be used signed
    pcmpeqb   m3, m1              ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    psubsb    m5, m0              ; q0-p0 (signed)
    paddsb    m2, m5              ; a=(p1-q1) + 3*(q0-p0)
    pand      m2, m3              ; apply filter mask (m3)
    paddsb    m2, [pb_4]          ; f1<<3=a+4
    paddsb    m1, [pb_3]          ; f2<<3=a+3
    pand      m1, m3              ; cache f2<<3
    pcmpgtb   m0, m2              ; which values are <0?
    psubb     m3, m2              ; -f1<<3
    paddusb   m4, m3              ; q0-f1
    pcmpgtb   m0, m1              ; which values are <0?
    psubb     m3, m1              ; -f2<<3
    psubusb   m6, m3              ; p0+f2
    mova      m3, [rsp+mmsize]    ; q1

    TRANSPOSE4x4B 0, 1, 2, 3, 4
%if mmsize == 16 ; sse2
    add       r3, r1              ; change from r4*8*stride to r0+8*stride
    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16
    WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
%if mmsize == 8 ; mmx/mmxext
    add       r0, 8               ; advance 8 cols = pixels
    lea       r0, [r0+r2*8]       ; advance 8 rows = lines
    mov       rsp, r5             ; restore stack pointer
    mov       rsp, r5             ; restore stack pointer

SIMPLE_LOOPFILTER mmx,    v, 4
SIMPLE_LOOPFILTER mmx,    h, 6
SIMPLE_LOOPFILTER mmxext, v, 4
SIMPLE_LOOPFILTER mmxext, h, 6
SIMPLE_LOOPFILTER sse2,   v, 3
SIMPLE_LOOPFILTER sse2,   h, 6
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
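
; Scalar sketch of the mask computations done below (per the VP8 spec, with
; E = flimE and I = flimI; the SIMD code evaluates this for 8/16 pixels at
; once):
;
;   filter_mask = abs(p3-p2) <= I && abs(p2-p1) <= I && abs(p1-p0) <= I &&
;                 abs(q3-q2) <= I && abs(q2-q1) <= I && abs(q1-q0) <= I &&
;                 abs(p0-q0) * 2 + abs(p1-q1) / 2 <= E;
;   hev         = abs(p1-p0) > hev_thr || abs(q1-q0) > hev_thr;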
%macro INNER_LOOPFILTER 5
%if %4 == 8 ; chroma
cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
%define mstride_reg r2
%define hev_thr_reg r5
cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%define mstride_reg r1
%define hev_thr_reg r4
%ifdef m8 ; x86-64, sse2
%elif mmsize == 16 ; x86-32, sse2
%else ; x86-32, mmx/mmxext
%define stride_reg E_reg
%define dst2_reg   I_reg
%define stack_reg  hev_thr_reg

%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
    SPLATB_REG m0, E_reg, %1      ; E
    SPLATB_REG m1, I_reg, %1      ; I
    SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh

    mov       stack_reg, rsp      ; backup stack pointer
    and       rsp, ~(mmsize-1)    ; align stack
    sub       rsp, mmsize * 4     ; stack layout: [0]=E, [1]=I, [2]=hev_thr, [3]=mask_res
    sub       rsp, mmsize * 5     ; extra storage space for transposes
%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
%else ; sse2 on x86-64
%define mask_res m12

    ; splat function arguments
    SPLATB_REG flim_E, E_reg, %1        ; E
    SPLATB_REG flim_I, I_reg, %1        ; I
    SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh
%if mmsize == 8 && %4 == 16 ; mmx/mmxext
    mov       stride_reg, mstride_reg
    lea       dst_reg,  [dst_reg  + stride_reg*4-4]
    lea       dst8_reg, [dst8_reg + stride_reg*4-4]
    lea       dst2_reg, [dst_reg + stride_reg]
%if %4 == 8 && mmsize == 16
    movrow    m0, [dst_reg +mstride_reg*4] ; p3
    movrow    m1, [dst2_reg+mstride_reg*4] ; p2
    movrow    m2, [dst_reg +mstride_reg*2] ; p1
    movrow    m5, [dst2_reg]               ; q1
    movrow    m6, [dst2_reg+ stride_reg]   ; q2
    movrow    m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    movhps    m0, [dst8_reg+mstride_reg*4]
    movhps    m2, [dst8_reg+mstride_reg*2]
    add       dst8_reg, stride_reg
    movhps    m1, [dst8_reg+mstride_reg*4]
    movhps    m5, [dst8_reg]
    movhps    m6, [dst8_reg+ stride_reg]
    movhps    m7, [dst8_reg+ stride_reg*2]
    add       dst8_reg, mstride_reg
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu      m0, [dst_reg +mstride_reg*4]
    movu      m1, [dst2_reg+mstride_reg*4]
    movu      m2, [dst_reg +mstride_reg*2]
    movu      m3, [dst_reg +mstride_reg]
    movu      m6, [dst2_reg+ stride_reg]
    TRANSPOSE4x4B 0, 1, 2, 3, 7
    mova      [rsp+mmsize*4], m1
    movu      m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1        ; p3/p2
    SBUTTERFLY dq, 2, 6, 1        ; q0/q1
    SBUTTERFLY dq, 3, 7, 1        ; q2/q3
    mova      m1, [rsp+mmsize*4]
    mova      [rsp+mmsize*4], m2  ; store q0
    SBUTTERFLY dq, 1, 5, 2        ; p1/p0
    mova      [rsp+mmsize*3], m5  ; store p0
    lea       dst8_reg, [dst_reg + stride_reg*8]

    ; read 16 rows of 8px each, interleave
    movh      m0, [dst_reg +mstride_reg*4]
    movh      m1, [dst8_reg+mstride_reg*4]
    movh      m2, [dst_reg +mstride_reg*2]
    movh      m5, [dst8_reg+mstride_reg*2]
    movh      m3, [dst_reg +mstride_reg]
    movh      m6, [dst8_reg+mstride_reg]
    punpcklbw m0, m1              ; A/I
    punpcklbw m2, m5              ; C/K
    punpcklbw m3, m6              ; D/L
    punpcklbw m4, m7              ; E/M
    add       dst8_reg, stride_reg
    movh      m1, [dst2_reg+mstride_reg*4]
    movh      m6, [dst8_reg+mstride_reg*4]
    punpcklbw m1, m6              ; B/J
    punpcklbw m5, m7              ; F/N
    movh      m6, [dst2_reg+ stride_reg]
    movh      m7, [dst8_reg+ stride_reg]
    punpcklbw m6, m7              ; G/O
    TRANSPOSE4x4B 0, 1, 2, 3, 7
    mova      [rsp+mmsize*4], m1
    movh      m7, [dst2_reg+ stride_reg*2]
    movh      m1, [dst8_reg+ stride_reg*2]
    punpcklbw m7, m1              ; H/P
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1        ; p3/p2
    SBUTTERFLY dq, 2, 6, 1        ; q0/q1
    SBUTTERFLY dq, 3, 7, 1        ; q2/q3
    mova      m1, [rsp+mmsize*4]
    mova      [rsp+mmsize*4], m2  ; store q0
    SBUTTERFLY dq, 1, 5, 2        ; p1/p0
    mova      [rsp+mmsize*3], m5  ; store p0
    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    psubusb   m4, m0              ; p2-p3
    psubusb   m0, m1              ; p3-p2
    por       m0, m4              ; abs(p3-p2)
    psubusb   m4, m1              ; p1-p2
    psubusb   m1, m2              ; p2-p1
    por       m1, m4              ; abs(p2-p1)
    psubusb   m4, m7              ; q2-q3
    psubusb   m7, m6              ; q3-q2
    por       m7, m4              ; abs(q3-q2)
    psubusb   m4, m6              ; q1-q2
    psubusb   m6, m5              ; q2-q1
    por       m6, m4              ; abs(q2-q1)
    mova      m4, [rsp+mmsize]
    pcmpeqb   m0, m3              ; abs(p3-p2) <= I
    pcmpeqb   m1, m3              ; abs(p2-p1) <= I
    pcmpeqb   m7, m3              ; abs(q3-q2) <= I
    pcmpeqb   m6, m3              ; abs(q2-q1) <= I

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP      7, 3                ; now m7 is zero
    movrow    m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps    m3, [dst8_reg+mstride_reg]
    mova      m3, [rsp+mmsize*3]
    psubusb   m1, m3              ; p1-p0
    psubusb   m6, m2              ; p0-p1
    por       m1, m6              ; abs(p1-p0)
    pcmpeqb   m1, m7              ; abs(p1-p0) <= I
    pcmpeqb   m6, m7              ; abs(p1-p0) <= hev_thresh
    mova      [rsp+mmsize*3], m6
    pmaxub    m0, m1              ; max_I
    SWAP      1, 4                ; max_hev_thresh
    SWAP      6, 4                ; now m6 is I
    movrow    m4, [dst_reg]       ; q0
%if mmsize == 16 && %4 == 8
    movhps    m4, [dst8_reg]
    mova      m4, [rsp+mmsize*4]
    psubusb   m1, m5              ; q0-q1
    psubusb   m7, m4              ; q1-q0
    por       m1, m7              ; abs(q1-q0)
    pcmpeqb   m1, m6              ; abs(q1-q0) <= I
    pcmpeqb   m7, m6              ; abs(q1-q0) <= hev_thresh
    mova      m6, [rsp+mmsize*3]
    pand      m0, m1              ; abs([pq][321]-[pq][210]) <= I
    pcmpeqb   m0, m7              ; max(abs(..)) <= I
    pcmpeqb   m6, m7              ; !(max(abs..) > thresh)
    mova      [rsp+mmsize*3], m6  ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)

    mova      m6, m4              ; keep copies of p0/q0 around for later use
    psubusb   m1, m4              ; p0-q0
    psubusb   m6, m3              ; q0-p0
    por       m1, m6              ; abs(q0-p0)
    paddusb   m1, m1              ; m1=2*abs(q0-p0)
    psubusb   m7, m5              ; p1-q1
    psubusb   m6, m2              ; q1-p1
    por       m7, m6              ; abs(q1-p1)
    psrlq     m7, 1               ; abs(q1-p1)/2
    paddusb   m7, m1              ; abs(q0-p0)*2+abs(q1-p1)/2
    pcmpeqb   m7, m6              ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand      m0, m7              ; normal_limit result
    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
    psubsb    m1, m7              ; (signed) q0-p0
    psubsb    m6, m7              ; (signed) p1-q1
    paddsb    m7, m1              ; 3*(q0-p0)+is4tap?(p1-q1)
    paddusb   m3, m1              ; p0+f2
    paddusb   m4, m1              ; q0-f1
    mova      m6, [rsp+mmsize*3]
    paddusb   m5, m1              ; q1-a
    paddusb   m2, m0              ; p1+a

    movrow    [dst_reg +mstride_reg*2], m2
    movrow    [dst_reg +mstride_reg ], m3
    movrow    [dst_reg], m4
    movrow    [dst_reg + stride_reg ], m5
%if mmsize == 16 && %4 == 8
    movhps    [dst8_reg+mstride_reg*2], m2
    movhps    [dst8_reg+mstride_reg ], m3
    movhps    [dst8_reg], m4
    movhps    [dst8_reg+ stride_reg ], m5
    TRANSPOSE4x4B 2, 3, 4, 5, 6
%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
    lea       dst8_reg, [dst8_reg+mstride_reg+2]
    WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
%if %4 == 8 ; chroma
    cmp       dst_reg, dst8_reg
    mov       dst_reg, dst8_reg
    lea       dst_reg, [dst_reg + stride_reg*8-2]
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov       rsp, stack_reg      ; restore stack pointer
INNER_LOOPFILTER mmx,    v, 6, 16, 8
INNER_LOOPFILTER mmx,    h, 6, 16, 8
INNER_LOOPFILTER mmxext, v, 6, 16, 8
INNER_LOOPFILTER mmxext, h, 6, 16, 8
INNER_LOOPFILTER mmx,    v, 6,  8, 8
INNER_LOOPFILTER mmx,    h, 6,  8, 8
INNER_LOOPFILTER mmxext, v, 6,  8, 8
INNER_LOOPFILTER mmxext, h, 6,  8, 8
INNER_LOOPFILTER sse2,   v, 5, 16, 13
INNER_LOOPFILTER sse2,   h, 5, 16, 13
INNER_LOOPFILTER sse2,   h, 6, 16, 13
INNER_LOOPFILTER sse2,   v, 6,  8, 13
INNER_LOOPFILTER sse2,   h, 6,  8, 13