1 ;******************************************************************************
2 ;* VP8 MMXEXT optimizations
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
24 %include "x86util.asm"
; Filter coefficient tables. Only the first "times" row after each label is
; visible in this excerpt; the functions below address these labels with an
; extra register offset, so more rows presumably follow each one - confirm.
; *_hw_m: word pairs (loaded as pmaddwd multiplicands by the mmxext code).
; *_hb_m / *_vb_m: byte pairs; *_v_m / *_vw_m: a single word repeated.
28 fourtap_filter_hw_m: times 4 dw -6, 123
37 sixtap_filter_hw_m: times 4 dw 2, -11
47 fourtap_filter_hb_m: times 8 db -6, -1
56 sixtap_filter_hb_m: times 8 db 2, 1
66 fourtap_filter_v_m: times 8 dw -6
83 sixtap_filter_v_m: times 8 dw 2
102 bilinear_filter_vw_m: times 8 dw 1
110 bilinear_filter_vb_m: times 8 db 7, 1
; Two alternative definitions of every table alias are listed back to back.
; Group 1 maps each alias to r11; the functions below load the address of the
; table they need into r11 with "lea r11, [<table>_m]" before using the alias.
; NOTE(review): the conditional directives selecting between the two groups
; are not visible in this excerpt - presumably a PIC / non-PIC switch; confirm.
119 %define fourtap_filter_hw r11
120 %define sixtap_filter_hw r11
121 %define fourtap_filter_hb r11
122 %define sixtap_filter_hb r11
123 %define fourtap_filter_v r11
124 %define sixtap_filter_v r11
125 %define bilinear_filter_vw r11
126 %define bilinear_filter_vb r11
; Group 2 maps each alias directly to the table label.
128 %define fourtap_filter_hw fourtap_filter_hw_m
129 %define sixtap_filter_hw sixtap_filter_hw_m
130 %define fourtap_filter_hb fourtap_filter_hb_m
131 %define sixtap_filter_hb sixtap_filter_hb_m
132 %define fourtap_filter_v fourtap_filter_v_m
133 %define sixtap_filter_v sixtap_filter_v_m
134 %define bilinear_filter_vw bilinear_filter_vw_m
135 %define bilinear_filter_vb bilinear_filter_vb_m
; 16-byte byte-index masks. For output pair i (i = 0..7) the two indices are:
;   filter_h2_shuf : i,   i+1
;   filter_h4_shuf : i,   i+3
;   filter_h6_shuf1: i,   i+5
;   filter_h6_shuf2: i+1, i+2
;   filter_h6_shuf3: i+3, i+4
138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
; Four copies each of the words 20091 and 17734; the IDCT macro comments
; further down expect these values in m6/m7 (the loads are not visible here).
145 pw_20091: times 4 dw 20091
146 pw_17734: times 4 dw 17734
154 ;-----------------------------------------------------------------------------
155 ; subpel MC functions:
157 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
158 ; uint8_t *src, int srcstride,
159 ; int height, int mx, int my);
160 ;-----------------------------------------------------------------------------
162 ; 4x4 block, H-only 4-tap filter
; Registers as used in the visible lines: r2 = address the pixels are read
; from (starting one byte to its left), r0 = address the 4 result bytes are
; written to, r5 = byte offset into the 4-tap word table. Given the prototype
; above and the cglobal argument order these are presumably src, dst and mx
; - confirm.
; NOTE(review): gaps in the embedded line numbers show omitted instructions
; (setup of mm6/mm7, the shift before packing, the row loop, the return).
; The inline comments imply mm6 = 0 and mm7 = rounding constant.
163 cglobal put_vp8_epel4_h4_mmxext, 6, 6
166 lea r11, [fourtap_filter_hw_m] ; table base for the r11-relative alias
168 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
169 movq mm5, [fourtap_filter_hw+r5]
174 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
176 ; first set of 2 pixels
177 movq mm2, mm1 ; byte ABCD..
178 punpcklbw mm1, mm6 ; byte->word ABCD
179 pshufw mm0, mm2, 9 ; byte CDEF..
180 punpcklbw mm0, mm6 ; byte->word CDEF
181 pshufw mm3, mm1, 0x94 ; word ABBC
182 pshufw mm1, mm0, 0x94 ; word CDDE
183 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
184 movq mm0, mm1 ; backup for second set of pixels
185 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
186 paddd mm3, mm1 ; finish 1st 2px
188 ; second set of 2 pixels, use backup of above
189 punpckhbw mm2, mm6 ; byte->word EFGH
190 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
191 pshufw mm1, mm2, 0x94 ; word EFFG
192 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
193 paddd mm0, mm1 ; finish 2nd 2px
195 ; merge two sets of 2 pixels into one set of 4, round/clip/store
196 packssdw mm3, mm0 ; merge dword->word (4px)
197 paddsw mm3, mm7 ; rounding
199 packuswb mm3, mm6 ; clip and word->bytes
200 movd [r0], mm3 ; store
209 ; 4x4 block, H-only 6-tap filter
; Registers as used in the visible lines: r2 = address the pixels are read
; from (starting two bytes to its left), r0 = address the 4 result bytes are
; written to, r5 = scaled index into the 6-tap word table. Presumably src,
; dst and mx per the prototype above - confirm.
; NOTE(review): gaps in the embedded line numbers show omitted instructions
; (setup of mm3/mm7, the shift before packing, the row loop, the return).
; mm3 is used as the unpack partner at the top and again after it has been
; overwritten by the pmaddwd/backup sequence, so it must be re-zeroed by an
; instruction not visible here - confirm against the full file.
210 cglobal put_vp8_epel4_h6_mmxext, 6, 6
213 lea r11, [sixtap_filter_hw_m] ; table base for the r11-relative alias
215 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
216 movq mm5, [sixtap_filter_hw+r5*8-32]
217 movq mm6, [sixtap_filter_hw+r5*8-16]
222 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
224 ; first set of 2 pixels
225 movq mm2, mm1 ; byte ABCD..
226 punpcklbw mm1, mm3 ; byte->word ABCD
227 pshufw mm0, mm2, 0x9 ; byte CDEF..
228 punpckhbw mm2, mm3 ; byte->word EFGH
229 punpcklbw mm0, mm3 ; byte->word CDEF
230 pshufw mm1, mm1, 0x94 ; word ABBC
231 pshufw mm2, mm2, 0x94 ; word EFFG
232 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
233 pshufw mm3, mm0, 0x94 ; word CDDE
234 movq mm0, mm3 ; backup for second set of pixels
235 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
236 paddd mm1, mm3 ; add to 1st 2px cache
237 movq mm3, mm2 ; backup for second set of pixels
238 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
239 paddd mm1, mm2 ; finish 1st 2px
241 ; second set of 2 pixels, use backup of above
242 movd mm2, [r2+3] ; byte FGHI (prevent overreads)
243 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
244 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
245 paddd mm0, mm3 ; add to 2nd 2px cache
247 punpcklbw mm2, mm3 ; byte->word FGHI
248 pshufw mm2, mm2, 0xE9 ; word GHHI
249 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
250 paddd mm0, mm2 ; finish 2nd 2px
252 ; merge two sets of 2 pixels into one set of 4, round/clip/store
253 packssdw mm1, mm0 ; merge dword->word (4px)
254 paddsw mm1, mm7 ; rounding
256 packuswb mm1, mm3 ; clip and word->bytes
257 movd [r0], mm1 ; store
266 ; 8-pixel-wide block (per the function name), H-only 4-tap filter
; m5/m6 receive two 16-byte coefficient rows selected by r5 (presumably mx).
; The visible lines widen bytes to words against m7 (presumably zero), shift
; copies right by 1 and 2 words, and interleave them into neighbouring pairs.
; NOTE(review): the loads, multiplies, rounding, packing, loop and return are
; not visible in this excerpt, nor is the setup of m7.
268 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
271 lea r11, [fourtap_filter_hw_m] ; table base for the r11-relative alias
273 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
274 mova m6, [fourtap_filter_hw+r5]
279 punpcklbw m0, m7 ; ABCDEFGH
283 psrldq m1, 2 ; BCDEFGH
284 psrldq m2, 4 ; CDEFGH
286 punpcklwd m0, m1 ; ABBCCDDE
287 punpcklwd m2, m3 ; CDDEEFFG
293 punpcklbw m1, m7 ; ABCDEFGH
297 psrldq m2, 2 ; BCDEFGH
298 psrldq m3, 4 ; CDEFGH
300 punpcklwd m1, m2 ; ABBCCDDE
301 punpcklwd m3, m4 ; CDDEEFFG
310 movh [r0], m0 ; store
; H-only 6-tap filter, SSE2, 8 pixels wide per the function name.
; r5 is turned from a table index into a pointer to the selected row of the
; 6-tap word table. The visible lines widen bytes to words against m7
; (presumably zero) and interleave shifted copies into neighbouring pairs.
; NOTE(review): the loads, multiplies, rounding, packing, loop and return are
; not visible in this excerpt, nor is the setup of m7.
319 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
322 lea r11, [sixtap_filter_hw_m] ; table base for the r11-relative alias
324 lea r5, [sixtap_filter_hw+r5*8] ; r5 = address inside the 6-tap table
331 punpcklbw m0, m7 ; ABCDEFGHI
335 psrldq m1, 2 ; BCDEFGH
336 psrldq m2, 4 ; CDEFGH
339 punpcklbw m4, m7 ; EFGH
342 punpcklwd m0, m1 ; ABBCCDDE
343 punpcklwd m2, m3 ; CDDEEFFG
344 punpcklwd m4, m5 ; EFFGGHHI
353 punpcklbw m6, m7 ; ABCDEFGHI
357 psrldq m1, 2 ; BCDEFGH
358 psrldq m2, 4 ; CDEFGH
361 punpcklbw m4, m7 ; EFGH
364 punpcklwd m6, m1 ; ABBCCDDE
365 punpcklwd m2, m3 ; CDDEEFFG
366 punpcklwd m4, m5 ; EFFGGHHI
377 movh [r0], m0 ; store
; H-only 4-tap filter, SSSE3 variant, 8 pixels wide per the function name.
; m3/m4 hold byte-shuffle masks; m5/m6 hold two 16-byte coefficient rows of
; the 4-tap byte table selected by r5 (presumably mx).
; NOTE(review): the shuffle/multiply work, loop and return are not visible in
; this excerpt.
386 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
389 mova m3, [filter_h4_shuf] ; pairs byte i with byte i+3
390 mova m4, [filter_h6_shuf2] ; pairs byte i+1 with byte i+2
392 lea r11, [fourtap_filter_hb_m] ; table base for the r11-relative alias
394 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
395 mova m6, [fourtap_filter_hb+r5]
408 movh [r0], m0 ; store
; H-only 6-tap filter, SSSE3 variant, 8 pixels wide per the function name.
; m3/m4 hold byte-shuffle masks; m5/m6/m7 hold three 16-byte coefficient rows
; of the 6-tap byte table selected by r5 (presumably mx). The third shuffle
; mask is applied straight from memory.
; NOTE(review): the loads, multiplies, rounding, loop and return are not
; visible in this excerpt.
417 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
419 mova m3, [filter_h6_shuf1] ; pairs byte i with byte i+5
420 mova m4, [filter_h6_shuf2] ; pairs byte i+1 with byte i+2
422 lea r11, [sixtap_filter_hb_m] ; table base for the r11-relative alias
424 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
425 mova m6, [sixtap_filter_hb+r5*8-32]
426 mova m7, [sixtap_filter_hb+r5*8-16]
434 pshufb m2, [filter_h6_shuf3] ; pairs byte i+3 with byte i+4
443 movh [r0], m0 ; store
; Body of a macro taking three arguments, as shown by the %1/%2/%3 uses and
; by the "FILTER_V mmxext, 4, 0" instantiation at the end of this block:
;   %1 = suffix of the generated function names
;   %2 = block width used in the function names
;   %3 = third cglobal argument
; NOTE(review): the %macro / %endmacro lines and most of both function bodies
; are not visible in this excerpt.
453 ; V-only 4-tap filter, block width %2
454 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
457 lea r11, [fourtap_filter_v_m] ; table base for the r11-relative alias
459 lea r6, [fourtap_filter_v+r6-32] ; r6 = address inside the 4-tap table
475 ; first calculate negative taps (to prevent losing positive overflows)
476 movh m4, [r2+2*r3] ; read new row
483 ; then calculate positive taps
506 ; V-only 6-tap filter, block width %2
507 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
511 lea r11, [sixtap_filter_v_m] ; table base for the r11-relative alias
513 lea r6, [sixtap_filter_v+r6-96] ; r6 = address inside the 6-tap table
533 ; first calculate negative taps (to prevent losing positive overflows)
540 ; then calculate positive taps
541 movh m5, [r2+2*r3] ; read new row
; Instantiate both functions with suffix mmxext, width 4, third argument 0.
572 FILTER_V mmxext, 4, 0
; V-only 4-tap filter, SSSE3 variant, 8 pixels wide per the function name.
; m5/m6 hold two 16-byte rows of the 4-tap byte table selected by r6
; (seventh argument, presumably my - confirm).
; NOTE(review): the filtering arithmetic, loop and return are not visible in
; this excerpt.
576 cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
579 lea r11, [fourtap_filter_hb_m] ; table base for the r11-relative alias
581 mova m5, [fourtap_filter_hb+r6-16]
582 mova m6, [fourtap_filter_hb+r6]
593 movh m3, [r2+2*r3] ; read new row
; V-only 6-tap filter, SSSE3 variant, 8 pixels wide per the function name.
; r6 (seventh argument, presumably my - confirm) is turned into a pointer
; inside the 6-tap byte table; three coefficient rows are then used straight
; from memory at r6-48, r6-32 and r6-16.
; NOTE(review): the row interleaving, rounding, loop and return are not
; visible in this excerpt.
615 cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
618 lea r11, [sixtap_filter_hb_m] ; table base for the r11-relative alias
620 lea r6, [sixtap_filter_hb+r6*8] ; r6 = address inside the 6-tap table
634 movh m5, [r2+2*r3] ; read new row
641 pmaddubsw m6, [r6-48] ; first coefficient row
642 pmaddubsw m1, [r6-32] ; second coefficient row
643 pmaddubsw m7, [r6-16] ; third coefficient row
; Generates put_vp8_bilinear<%2>_v_<%1> and put_vp8_bilinear<%2>_h_<%1>.
;   %1 = suffix of the generated function names
;   %2 = block width used in the function names
;   %3 = third cglobal argument
; Both functions load m4 and m5 from the bilinear word table using r5 and r6
; as byte offsets; the two functions use the registers in opposite order.
; NOTE(review): the computation of r5/r6, the filtering arithmetic, the
; loops, returns and the %endmacro line are not visible in this excerpt.
662 %macro FILTER_BILINEAR 3
663 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
668 lea r11, [bilinear_filter_vw_m] ; table base for the r11-relative alias
671 mova m4, [bilinear_filter_vw+r5-16]
672 mova m5, [bilinear_filter_vw+r6-16]
708 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
713 lea r11, [bilinear_filter_vw_m] ; table base for the r11-relative alias
716 mova m4, [bilinear_filter_vw+r6-16]
717 mova m5, [bilinear_filter_vw+r5-16]
; Instantiations: (mmxext, width 4, 0) and (sse2, width 8, 7).
756 FILTER_BILINEAR mmxext, 4, 0
758 FILTER_BILINEAR sse2, 8, 7
; Vertical bilinear filter, SSSE3 variant, 8 pixels wide per the function name.
; m3 holds one 16-byte row of the bilinear byte table selected by r6.
; NOTE(review): the computation of r6, the filtering arithmetic, loop and
; return are not visible in this excerpt.
760 cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
763 lea r11, [bilinear_filter_vb_m] ; table base for the r11-relative alias
766 mova m3, [bilinear_filter_vb+r6-16]
; Horizontal bilinear filter, SSSE3 variant, 8 pixels wide per the function name.
; m2 holds the adjacent-byte shuffle mask, m3 one 16-byte row of the bilinear
; byte table selected by r5.
; NOTE(review): the computation of r5, the filtering arithmetic, loop and
; return are not visible in this excerpt.
789 cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
792 lea r11, [bilinear_filter_vb_m] ; table base for the r11-relative alias
795 mova m2, [filter_h2_shuf] ; pairs byte i with byte i+1
796 mova m3, [bilinear_filter_vb+r5-16]
; NOTE(review): only the declaration is visible in this excerpt; the body of
; this function is not shown. Presumably a plain copy of 8-pixel-wide rows,
; judging by the name - confirm against the full file.
818 cglobal put_vp8_pixels8_mmx, 5,5
; Copies two rows of 16 bytes using four 8-byte MMX moves: rows are read from
; r2 and r2+r3 and written to r0 and r0+r1.
; NOTE(review): the pointer advance, loop and return are not visible in this
; excerpt; r0/r1/r2/r3 are presumably dst, dst stride, src, src stride.
830 cglobal put_vp8_pixels16_mmx, 5,5
832 movq mm0, [r2+r3*0+0] ; row 0, bytes 0-7
833 movq mm1, [r2+r3*0+8] ; row 0, bytes 8-15
834 movq mm2, [r2+r3*1+0] ; row 1, bytes 0-7
835 movq mm3, [r2+r3*1+8] ; row 1, bytes 8-15
837 movq [r0+r1*0+0], mm0
838 movq [r0+r1*0+8], mm1
839 movq [r0+r1*1+0], mm2
840 movq [r0+r1*1+8], mm3
; Copies two rows of 16 bytes using SSE: rows are read from r2 and r2+r3 with
; unaligned loads and written to r0 and r0+r1 with aligned stores, so both
; store addresses must be 16-byte aligned.
; NOTE(review): the pointer advance, loop and return are not visible in this
; excerpt.
846 cglobal put_vp8_pixels16_sse, 5,5,2
848 movups xmm0, [r2+r3*0] ; row 0 (unaligned load)
849 movups xmm1, [r2+r3*1] ; row 1 (unaligned load)
851 movaps [r0+r1*0], xmm0 ; row 0 (aligned store)
852 movaps [r0+r1*1], xmm1 ; row 1 (aligned store)
858 ;-----------------------------------------------------------------------------
861 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
862 ;-----------------------------------------------------------------------------
; NOTE(review): only the declaration is visible in this excerpt; the body of
; this function is not shown.
864 cglobal vp8_idct_dc_add_mmx, 3, 3
; SSE4 variant of the DC-only IDCT add. In the visible lines the low word of
; xmm0 is replicated into all eight word lanes, and two 4-byte groups of xmm2
; are written out with pextrd.
; NOTE(review): the DC load, the pixel loads, the add/pack steps, the other
; stores and the return are not visible in this excerpt. r1 is used as a
; store base here, so it must have been repurposed by a line not shown.
900 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
914 pshuflw xmm0, xmm0, 0 ; word 0 -> low four word lanes
915 punpcklqdq xmm0, xmm0 ; low qword -> both halves
924 pextrd [r0+r2], xmm2, 1 ; store dword 1 at r0+r2
926 pextrd [r1+r2], xmm2, 3 ; store dword 3 at r1+r2
929 ;-----------------------------------------------------------------------------
930 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
931 ;-----------------------------------------------------------------------------
933 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
934 ; this macro assumes that m6/m7 have words for 20091/17734 loaded
; %1/%2 are the in/out registers, %3/%4 are scratch registers.
; 35468 = 2*17734: m7 holds 17734, so the operands of the second pmulhw pair
; are presumably doubled beforehand by lines not visible here - confirm.
; NOTE(review): the copies into %3/%4, the add/sub steps and the %endmacro
; line are not visible in this excerpt.
935 %macro VP8_MULTIPLY_SUMSUB 4
938 pmulhw %3, m6 ;20091(1)
939 pmulhw %4, m6 ;20091(2)
944 pmulhw %1, m7 ;35468(1)
945 pmulhw %2, m7 ;35468(2)
950 ; calculate x0=%1+%3; x1=%1-%3
951 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
952 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
953 ; %5/%6 are temporary registers
954 ; we assume m6/m7 have constant words 20091/17734 loaded in them
; Arguments are register numbers; the macro prefixes them with "m".
; SUMSUB_BA is not defined in this excerpt (presumably from x86util.asm).
; NOTE(review): the %endmacro line, and any register renaming needed to match
; the output order stated above, are not visible in this excerpt.
955 %macro VP8_IDCT_TRANSFORM4x4_1D 6
956 SUMSUB_BA m%3, m%1, m%5 ;t0, t1
957 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
958 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3
959 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2
; Full 4x4 IDCT plus add, MMX variant. The visible lines run the 1-D
; transform on m0-m3 twice, each pass followed by a 4x4 word transpose, then
; hand two register pairs to STORE_DIFFx2 with bases r0 and r1 and offset r2.
; TRANSPOSE4x4W and STORE_DIFFx2 are not defined in this excerpt (presumably
; from x86util.asm).
; NOTE(review): the coefficient loads, the setup of m6/m7 and r1, any
; rounding, and the return are not visible in this excerpt. m6/m7 are passed
; to STORE_DIFFx2 after serving as multiplier constants - confirm their
; contents at that point against the full file.
965 cglobal vp8_idct_add_mmx, 3, 3
975 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 ; first 1-D pass
976 TRANSPOSE4x4W 0, 1, 2, 3, 4
978 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 ; second 1-D pass
979 TRANSPOSE4x4W 0, 1, 2, 3, 4
984 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
985 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
989 ;-----------------------------------------------------------------------------
990 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
991 ;-----------------------------------------------------------------------------
; Stores the 16-bit values in r1w and r2w at byte offsets 2*16*2 and 2*16*3
; from r0. With 2-byte elements and 16 elements per block these would be the
; first element of blocks 2 and 3 - presumably the DC slots; confirm.
; NOTE(review): the enclosing macro or function header and the neighbouring
; stores are not visible in this excerpt.
1000 mov [r0+2*16*2], r1w
1001 mov [r0+2*16*3], r2w
; One 1-D pass of a 4-point Hadamard transform, per the macro name, built
; from two SUMSUB_BADC steps. The four arguments are register numbers and the
; macro prefixes them with "m". SUMSUB_BADC is not defined in this excerpt
; (presumably from x86util.asm).
; NOTE(review): the %endmacro line and any trailing register renaming are
; not visible in this excerpt.
1004 %macro HADAMARD4_1D 4
1005 SUMSUB_BADC m%2, m%1, m%4, m%3
1006 SUMSUB_BADC m%4, m%2, m%3, m%1
1011 cglobal vp8_luma_dc_wht_mmxext, 2,3
1016 HADAMARD4_1D 0, 1, 2, 3
1017 TRANSPOSE4x4W 0, 1, 2, 3, 4
1019 HADAMARD4_1D 0, 1, 2, 3