1 ;******************************************************************************
2 ;* VP8 MMXEXT optimizations
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
24 %include "x86util.asm"
; Filter coefficient tables. "times N dw/db ..." repeats the listed value(s)
; N times, so every line below emits one 16-byte row.
; The *_hw, *_v and *_vw tables hold words (dw); the *_hb and *_vb tables
; hold bytes (db).
; NOTE(review): the functions below read these tables at register-derived
; offsets, so further rows presumably follow each label - confirm.
28 fourtap_filter_hw_m: times 4 dw -6, 123
37 sixtap_filter_hw_m: times 4 dw 2, -11
47 fourtap_filter_hb_m: times 8 db -6, -1
56 sixtap_filter_hb_m: times 8 db 2, 1
66 fourtap_filter_v_m: times 8 dw -6
83 sixtap_filter_v_m: times 8 dw 2
102 bilinear_filter_vw_m: times 8 dw 1
110 bilinear_filter_vb_m: times 8 db 7, 1
; Aliases through which the code below addresses the coefficient tables.
; Group 1: every table name resolves to r11; the functions load the address
; of the table they need into r11 with lea before using the alias.
119 %define fourtap_filter_hw r11
120 %define sixtap_filter_hw r11
121 %define fourtap_filter_hb r11
122 %define sixtap_filter_hb r11
123 %define fourtap_filter_v r11
124 %define sixtap_filter_v r11
125 %define bilinear_filter_vw r11
126 %define bilinear_filter_vb r11
; Group 2: every table name resolves directly to its *_m label.
; NOTE(review): both groups define the same names, so they look like
; alternatives picked by a build conditional (e.g. position-independent
; code vs. absolute addressing) - confirm which group is active.
128 %define fourtap_filter_hw fourtap_filter_hw_m
129 %define sixtap_filter_hw sixtap_filter_hw_m
130 %define fourtap_filter_hb fourtap_filter_hb_m
131 %define sixtap_filter_hb sixtap_filter_hb_m
132 %define fourtap_filter_v fourtap_filter_v_m
133 %define sixtap_filter_v sixtap_filter_v_m
134 %define bilinear_filter_vw bilinear_filter_vw_m
135 %define bilinear_filter_vb bilinear_filter_vb_m
; Byte-index tables for the SSSE3 horizontal filters (loaded with mova or used
; directly as a pshufb operand). Each pair of indices selects the two source
; pixels that one byte-pair multiply combines for output pixel i:
;   filter_h2_shuf : (i,   i+1)
;   filter_h4_shuf : (i,   i+3)
;   filter_h6_shuf1: (i,   i+5)
;   filter_h6_shuf2: (i+1, i+2)
;   filter_h6_shuf3: (i+3, i+4)
138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
151 ;-----------------------------------------------------------------------------
152 ; subpel MC functions:
154 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
155 ; uint8_t *src, int srcstride,
156 ; int height, int mx, int my);
157 ;-----------------------------------------------------------------------------
159 ; 4x4 block, H-only 4-tap filter
; Register use in the sequence below:
;   r0  = store address for the 4 output pixels
;   r2  = source address (the load starts one byte to its left)
;   r5  = byte offset into the word coefficient table
;   mm4 = coefficient pair F0/F1, mm5 = coefficient pair F2/F3
; Assumes mm6 == 0 (unpack/pack partner) and mm7 == rounding term
; - TODO confirm where they are initialised.
160 cglobal put_vp8_epel4_h4_mmxext, 6, 6
163 lea r11, [fourtap_filter_hw_m] ; r11 = address of the word coefficient table
165 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
166 movq mm5, [fourtap_filter_hw+r5]
171 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
173 ; first set of 2 pixels
174 movq mm2, mm1 ; byte ABCD..
175 punpcklbw mm1, mm6 ; byte->word ABCD
176 pshufw mm0, mm2, 9 ; byte CDEF..
177 punpcklbw mm0, mm6 ; byte->word CDEF
178 pshufw mm3, mm1, 0x94 ; word ABBC
179 pshufw mm1, mm0, 0x94 ; word CDDE
180 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
181 movq mm0, mm1 ; backup for second set of pixels
182 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
183 paddd mm3, mm1 ; finish 1st 2px
185 ; second set of 2 pixels, use backup of above
186 punpckhbw mm2, mm6 ; byte->word EFGH
187 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
188 pshufw mm1, mm2, 0x94 ; word EFFG
189 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
190 paddd mm0, mm1 ; finish 2nd 2px
192 ; merge two sets of 2 pixels into one set of 4, round/clip/store
193 packssdw mm3, mm0 ; merge dword->word (4px)
194 paddsw mm3, mm7 ; rounding
196 packuswb mm3, mm6 ; clip and word->bytes
197 movd [r0], mm3 ; store
206 ; 4x4 block, H-only 6-tap filter
; Register use in the sequence below:
;   r0  = store address for the 4 output pixels
;   r2  = source address (the load starts two bytes to its left)
;   r5  = row index into the word coefficient table (scaled by 8)
;   mm4 = coefficient pair F0/F1, mm5 = F2/F3, mm6 = F4/F5
; Assumes mm7 == rounding term - TODO confirm where it is initialised.
207 cglobal put_vp8_epel4_h6_mmxext, 6, 6
210 lea r11, [sixtap_filter_hw_m] ; r11 = address of the word coefficient table
212 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
213 movq mm5, [sixtap_filter_hw+r5*8-32]
214 movq mm6, [sixtap_filter_hw+r5*8-16]
219 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
221 ; first set of 2 pixels
; assumes mm3 == 0 for the three unpacks below - TODO confirm
222 movq mm2, mm1 ; byte ABCD..
223 punpcklbw mm1, mm3 ; byte->word ABCD
224 pshufw mm0, mm2, 0x9 ; byte CDEF..
225 punpckhbw mm2, mm3 ; byte->word EFGH
226 punpcklbw mm0, mm3 ; byte->word CDEF
227 pshufw mm1, mm1, 0x94 ; word ABBC
228 pshufw mm2, mm2, 0x94 ; word EFFG
229 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
230 pshufw mm3, mm0, 0x94 ; word CDDE
231 movq mm0, mm3 ; backup for second set of pixels
232 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
233 paddd mm1, mm3 ; add to 1st 2px cache
234 movq mm3, mm2 ; backup for second set of pixels
235 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
236 paddd mm1, mm2 ; finish 1st 2px
238 ; second set of 2 pixels, use backup of above
239 movd mm2, [r2+3] ; byte FGHI (prevent overreads)
240 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
241 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
242 paddd mm0, mm3 ; add to 2nd 2px cache
; NOTE(review): mm3 was used as scratch above; the unpack and the final pack
; below need it to be zero - confirm it is cleared before this point.
244 punpcklbw mm2, mm3 ; byte->word FGHI
245 pshufw mm2, mm2, 0xE9 ; word GHHI
246 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
247 paddd mm0, mm2 ; finish 2nd 2px
249 ; merge two sets of 2 pixels into one set of 4, round/clip/store
250 packssdw mm1, mm0 ; merge dword->word (4px)
251 paddsw mm1, mm7 ; rounding
253 packuswb mm1, mm3 ; clip and word->bytes
254 movd [r0], mm1 ; store
263 ; 8-pixel-wide block (per the epel8 name), H-only 4-tap filter
; m5/m6 hold the two word coefficient rows read at table offsets r5-16 and r5.
; m7 is the partner of every byte->word unpack; assumes m7 == 0 - TODO confirm.
; The same unpack/shift/interleave pattern appears twice, on m0.. and on m1..
; NOTE(review): presumably one pass per half of the 8 output pixels - confirm.
265 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
268 lea r11, [fourtap_filter_hw_m] ; r11 = address of the word coefficient table
270 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
271 mova m6, [fourtap_filter_hw+r5]
276 punpcklbw m0, m7 ; ABCDEFGH
280 psrldq m1, 2 ; BCDEFGH
281 psrldq m2, 4 ; CDEFGH
283 punpcklwd m0, m1 ; ABBCCDDE
284 punpcklwd m2, m3 ; CDDEEFFG
290 punpcklbw m1, m7 ; ABCDEFGH
294 psrldq m2, 2 ; BCDEFGH
295 psrldq m3, 4 ; CDEFGH
297 punpcklwd m1, m2 ; ABBCCDDE
298 punpcklwd m3, m4 ; CDDEEFFG
307 movh [r0], m0 ; store
; 8-pixel-wide block (per the epel8 name), H-only 6-tap filter, SSE2.
; r5 is turned into a pointer into the word coefficient table (table + r5*8).
; m7 is the partner of every byte->word unpack; assumes m7 == 0 - TODO confirm.
; The unpack/shift/interleave pattern appears twice, first building on m0 and
; then on m6.
316 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
319 lea r11, [sixtap_filter_hw_m] ; r11 = address of the word coefficient table
321 lea r5, [sixtap_filter_hw+r5*8] ; r5 = table address + r5*8
328 punpcklbw m0, m7 ; ABCDEFGHI
332 psrldq m1, 2 ; BCDEFGH
333 psrldq m2, 4 ; CDEFGH
336 punpcklbw m4, m7 ; EFGH
339 punpcklwd m0, m1 ; ABBCCDDE
340 punpcklwd m2, m3 ; CDDEEFFG
341 punpcklwd m4, m5 ; EFFGGHHI
350 punpcklbw m6, m7 ; ABCDEFGHI
354 psrldq m1, 2 ; BCDEFGH
355 psrldq m2, 4 ; CDEFGH
358 punpcklbw m4, m7 ; EFGH
361 punpcklwd m6, m1 ; ABBCCDDE
362 punpcklwd m2, m3 ; CDDEEFFG
363 punpcklwd m4, m5 ; EFFGGHHI
374 movh [r0], m0 ; store
; 8-pixel-wide block (per the epel8 name), H-only 4-tap filter, SSSE3.
; Uses the byte coefficient table together with two shuffle masks.
383 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
386 mova m3, [filter_h4_shuf] ; mask pairing pixel i with pixel i+3
387 mova m4, [filter_h6_shuf2] ; mask pairing pixel i+1 with pixel i+2
389 lea r11, [fourtap_filter_hb_m] ; r11 = address of the byte coefficient table
391 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
392 mova m6, [fourtap_filter_hb+r5]
405 movh [r0], m0 ; store
; 8-pixel-wide block (per the epel8 name), H-only 6-tap filter, SSSE3.
; Three byte coefficient rows (m5, m6, m7) and three shuffle masks are used;
; the third mask is applied straight from memory instead of from a register.
414 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
416 mova m3, [filter_h6_shuf1] ; mask pairing pixel i with pixel i+5
417 mova m4, [filter_h6_shuf2] ; mask pairing pixel i+1 with pixel i+2
419 lea r11, [sixtap_filter_hb_m] ; r11 = address of the byte coefficient table
421 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
422 mova m6, [sixtap_filter_hb+r5*8-32]
423 mova m7, [sixtap_filter_hb+r5*8-16]
431 pshufb m2, [filter_h6_shuf3] ; pair pixel i+3 with pixel i+4
440 movh [r0], m0 ; store
450 ; <size>-wide block, V-only 4-tap filter (template: %1 = opt suffix, %2 = size)
; %3 is forwarded as the third cglobal argument.
; r6 is turned into a pointer into the vertical coefficient table
; (table address + r6 - 32).
451 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
454 lea r11, [fourtap_filter_v_m] ; r11 = address of the vertical coefficient table
456 lea r6, [fourtap_filter_v+r6-32]
472 ; first calculate negative taps (to prevent losing positive overflows)
473 movh m4, [r2+2*r3] ; read new row
480 ; then calculate positive taps
503 ; <size>-wide block, V-only 6-tap filter (template: %1 = opt suffix, %2 = size)
; %3 is forwarded as the third cglobal argument.
; r6 is turned into a pointer into the vertical coefficient table
; (table address + r6 - 96).
504 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
508 lea r11, [sixtap_filter_v_m] ; r11 = address of the vertical coefficient table
510 lea r6, [sixtap_filter_v+r6-96]
530 ; first calculate negative taps (to prevent losing positive overflows)
537 ; then calculate positive taps
538 movh m5, [r2+2*r3] ; read new row
; Instantiation with opt = mmxext, size = 4, third cglobal argument = 0.
; NOTE(review): FILTER_V is presumably the macro wrapping the
; put_vp8_epel%2_v4_%1 / put_vp8_epel%2_v6_%1 templates - confirm.
569 FILTER_V mmxext, 4, 0
; 8-pixel-wide block (per the epel8 name), V-only 4-tap filter, SSSE3.
; Reuses the horizontal byte coefficient table (fourtap_filter_hb), indexed by
; r6 instead of r5.
573 cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
576 lea r11, [fourtap_filter_hb_m] ; r11 = address of the byte coefficient table
578 mova m5, [fourtap_filter_hb+r6-16]
579 mova m6, [fourtap_filter_hb+r6]
590 movh m3, [r2+2*r3] ; read new row
; 8-pixel-wide block (per the epel8 name), V-only 6-tap filter, SSSE3.
; Reuses the horizontal byte coefficient table (sixtap_filter_hb). r6 becomes a
; pointer into it and the three coefficient rows are read as memory operands
; at r6-48, r6-32 and r6-16.
612 cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
615 lea r11, [sixtap_filter_hb_m] ; r11 = address of the byte coefficient table
617 lea r6, [sixtap_filter_hb+r6*8] ; r6 = table address + r6*8
631 movh m5, [r2+2*r3] ; read new row
638 pmaddubsw m6, [r6-48] ; multiply by first coefficient row
639 pmaddubsw m1, [r6-32] ; multiply by second coefficient row
640 pmaddubsw m7, [r6-16] ; multiply by third coefficient row
; FILTER_BILINEAR <opt>, <size>, <third cglobal argument>
; Template for the vertical and horizontal bilinear functions
; put_vp8_bilinear<size>_v_<opt> and put_vp8_bilinear<size>_h_<opt>.
; Both variants read the word table bilinear_filter_vw; they differ in which
; of r5/r6 indexes the m4 row and which indexes the m5 row.
; NOTE(review): r5 and r6 are presumably prepared as complementary byte
; offsets before these loads - confirm.
659 %macro FILTER_BILINEAR 3
660 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
665 lea r11, [bilinear_filter_vw_m] ; r11 = address of the word coefficient table
668 mova m4, [bilinear_filter_vw+r5-16] ; vertical: m4 row indexed by r5
669 mova m5, [bilinear_filter_vw+r6-16] ; vertical: m5 row indexed by r6
705 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
710 lea r11, [bilinear_filter_vw_m] ; r11 = address of the word coefficient table
713 mova m4, [bilinear_filter_vw+r6-16] ; horizontal: m4 row indexed by r6
714 mova m5, [bilinear_filter_vw+r5-16] ; horizontal: m5 row indexed by r5
; Instantiate the bilinear template:
;   put_vp8_bilinear4_{v,h}_mmxext (third cglobal argument 0)
;   put_vp8_bilinear8_{v,h}_sse2   (third cglobal argument 7)
753 FILTER_BILINEAR mmxext, 4, 0
755 FILTER_BILINEAR sse2, 8, 7
; 8-pixel-wide (per name) vertical bilinear filter, SSSE3.
; Uses a single row of the byte table bilinear_filter_vb, indexed by r6.
757 cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
760 lea r11, [bilinear_filter_vb_m] ; r11 = address of the byte coefficient table
763 mova m3, [bilinear_filter_vb+r6-16]
; 8-pixel-wide (per name) horizontal bilinear filter, SSSE3.
; Uses one row of the byte table bilinear_filter_vb, indexed by r5, plus the
; filter_h2_shuf mask.
786 cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
789 lea r11, [bilinear_filter_vb_m] ; r11 = address of the byte coefficient table
792 mova m2, [filter_h2_shuf] ; mask pairing pixel i with pixel i+1
793 mova m3, [bilinear_filter_vb+r5-16]
; NOTE(review): presumably a plain 8-pixel-wide block copy, by analogy with
; put_vp8_pixels16_mmx - confirm against the function body.
815 cglobal put_vp8_pixels8_mmx, 5,5
; Copy of 16-byte-wide rows using MMX registers: two rows per pass, each row
; moved as two 8-byte halves.
;   r2 = source address, r3 = source stride
;   r0 = destination address, r1 = destination stride
827 cglobal put_vp8_pixels16_mmx, 5,5
829 movq mm0, [r2+r3*0+0] ; row 0, bytes 0-7
830 movq mm1, [r2+r3*0+8] ; row 0, bytes 8-15
831 movq mm2, [r2+r3*1+0] ; row 1, bytes 0-7
832 movq mm3, [r2+r3*1+8] ; row 1, bytes 8-15
834 movq [r0+r1*0+0], mm0
835 movq [r0+r1*0+8], mm1
836 movq [r0+r1*1+0], mm2
837 movq [r0+r1*1+8], mm3
; Copy of 16-byte-wide rows using SSE registers: two rows per pass.
;   r2 = source address, r3 = source stride
;   r0 = destination address, r1 = destination stride
; Loads use movups (no alignment requirement); stores use movaps, so both
; destination rows must be 16-byte aligned.
843 cglobal put_vp8_pixels16_sse, 5,5,2
845 movups xmm0, [r2+r3*0] ; row 0
846 movups xmm1, [r2+r3*1] ; row 1
848 movaps [r0+r1*0], xmm0
849 movaps [r0+r1*1], xmm1
855 ;-----------------------------------------------------------------------------
858 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
859 ;-----------------------------------------------------------------------------
; MMX variant of vp8_idct_dc_add; three arguments (dst, block, stride) per the
; C prototype in the comment block preceding this function.
861 cglobal vp8_idct_dc_add_mmx, 3, 3
; SSE4 variant of vp8_idct_dc_add.
; The two shuffles broadcast the low word of xmm0 to all eight word lanes;
; the pextrd stores write individual dwords of xmm2 straight to memory.
; NOTE(review): r1 is used as a row address in the last store although the
; second argument is 'block'; presumably it is repointed earlier - confirm.
897 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
911 pshuflw xmm0, xmm0, 0 ; replicate word 0 across the low four words
912 punpcklqdq xmm0, xmm0 ; copy the low qword into the high qword
921 pextrd [r0+r2], xmm2, 1 ; store dword 1 at r0 + r2
923 pextrd [r1+r2], xmm2, 3 ; store dword 3 at r1 + r2
926 ;-----------------------------------------------------------------------------
927 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
928 ;-----------------------------------------------------------------------------
; HADAMARD4_1D a, b, c, d
; Applies two SUMSUB_BADC stages to registers m<a>..m<d>. The second stage
; regroups the registers so that it combines results of the first.
; NOTE(review): SUMSUB_BADC is not defined in this file (presumably it comes
; from x86util.asm and forms sum/difference pairs) - confirm.
941 %macro HADAMARD4_1D 4
942 SUMSUB_BADC m%2, m%1, m%4, m%3
943 SUMSUB_BADC m%4, m%2, m%3, m%1
; Luma DC transform: a 1-D pass over m0-m3, a 4x4 word transpose, then a second
; 1-D pass, i.e. the same 1-D transform applied along both dimensions.
; Two arguments (block, dc) per the C prototype in the comment block preceding
; this function.
948 cglobal vp8_luma_dc_wht_mmxext, 2,3
953 HADAMARD4_1D 0, 1, 2, 3 ; first 1-D pass
954 TRANSPOSE4x4W 0, 1, 2, 3, 4 ; transpose m0-m3 (fifth argument: register 4)
956 HADAMARD4_1D 0, 1, 2, 3 ; second 1-D pass on the transposed data