1 ;*****************************************************************************
2 ;* x86-optimized functions for fspp filter
4 ;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
5 ;* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License along
20 ;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
21 ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; NOTE(review): only part of this file is visible in this excerpt; the
; comments below describe just the lines that are shown.
;
; pb_dither: 64 byte-sized entries (4 continuation rows of 16 values); every
; value 0..63 occurs exactly once. store_slice and store_slice2 take its
; address with "lea ditherq, [pb_dither]" and form an end pointer at
; pb_dither + height*8.
28 pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \
29 8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \
30 2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \
31 10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21
; pw_XXXX: each label is one 16-bit constant repeated four times
; ("times 4 dw"), i.e. 8 bytes per label. The label name spells the hex value.
; The trailing FIX64(x, n) note records the real number x and the number of
; fractional bits n the constant was derived from (value ~= x * 2^n; for the
; negative entry the value is stored as a 16-bit two's-complement word).
32 pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
33 pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
34 pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
35 pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
36 pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
37 pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
38 pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
39 pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
40 pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
41 pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
51 ;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
52 ; ptrdiff_t dst_stride, ptrdiff_t src_stride,
53 ; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;
; NOTE(review): this excerpt shows only some of the function's lines (no loop
; label, branch or return is visible); the comments below describe the
; visible instructions only.
;
; Register names come from the cglobal argument list: the register called
; "dither" first carries the log2_scale argument and is later reused as a
; pointer into pb_dither; "dither_height" first carries height and is later
; turned into an end pointer.
;
; Two cglobal declarations of the same name follow. Presumably they are the
; two arms of a conditional whose %if/%else/%endif lines are not visible here:
; the first asks x86inc to load all 7 arguments into registers (9 GPRs used),
; the second loads only 2 (7 GPRs used) -- confirm against the full file.
55 cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
57 cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
; In the 2-argument variant the strides get no register of their own; the
; names are aliased to r2m/r3m, x86inc's names for the homes of arguments 2
; and 3.
58 %define dst_strideq r2m
59 %define src_strideq r3m
; height (argument 5) and log2_scale (argument 6) are fetched from their
; argument slots.
61 mov dither_heightq, r5m ; height
62 mov ditherq, r6m ; log2_scale
67 sub dst_strideq, widthq ; dst_stride -= widthq
68 movd m5, ditherq ; log2_scale
; xor with -1 (bitwise NOT) followed by add 7 gives ~x + 7 = 6 - x, so m2
; below receives 6 - log2_scale (assuming the lines not shown in between do
; not modify ditherq -- confirm).
69 xor ditherq, -1 ; log2_scale
71 add ditherq, 7 ; log2_scale
74 movd m2, ditherq ; log2_scale
; From here on ditherq no longer holds a scale: it points at pb_dither.
76 lea ditherq, [pb_dither]
77 mov src_strideq, tmp2q ; src_stride replaced by tmp2 (tmp2 is computed in lines not shown)
; End pointer for the dither walk: pb_dither + height*8.
79 lea dither_heightq, [ditherq+dither_heightq*8]
95 movq [srcq+tmpq+8], m7 ; write 8 bytes of m7 back into the src buffer
109 add srcq, src_strideq ; advance src by the (replaced) src_stride
111 add dstq, dst_strideq ; advance dst by dst_stride - widthq
112 cmp ditherq, dither_heightq ; compare dither pointer with the end pointer (the branch is not visible here)
116 ;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
117 ; ptrdiff_t dst_stride, ptrdiff_t src_stride,
118 ; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;
; NOTE(review): this excerpt shows only some of the function's lines (no loop
; label, branch or return is visible); the comments below describe the
; visible instructions only.
;
; As in the cglobal argument list, the register called "dither" first carries
; log2_scale and is later reused as a pointer into pb_dither; "dither_height"
; first carries height and later becomes an end pointer.
;
; Two cglobal declarations of the same name follow. Presumably they are the
; two arms of a conditional whose %if/%else/%endif lines are not visible here:
; the first asks x86inc to load all 7 arguments into registers (9 GPRs used),
; the second loads none automatically (7 GPRs used) -- confirm against the
; full file.
120 cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
122 cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
; In the 0-argument variant the strides get no register of their own; the
; names are aliased to r2m/r3m, x86inc's names for the homes of arguments 2
; and 3.
123 %define dst_strideq r2m
124 %define src_strideq r3m
; height (argument 5) and log2_scale (argument 6) are fetched from their
; argument slots.
128 mov dither_heightq, r5m ; height
129 mov ditherq, r6m ; log2_scale
132 mov tmpq, src_strideq ; tmp = src_stride
134 sub dst_strideq, widthq ; dst_stride -= widthq
135 movd m5, ditherq ; log2_scale
; xor with -1 (bitwise NOT) followed by add 7 gives ~x + 7 = 6 - x, so m2
; below receives 6 - log2_scale (assuming the lines not shown in between do
; not modify ditherq -- confirm).
136 xor ditherq, -1 ; log2_scale
138 add ditherq, 7 ; log2_scale
140 movd m2, ditherq ; log2_scale
; From here on ditherq no longer holds a scale: it points at pb_dither.
142 lea ditherq, [pb_dither]
143 mov src_strideq, tmp2q ; src_stride replaced by tmp2 (tmp2 is computed in lines not shown)
; End pointer for the dither walk: pb_dither + height*8.
145 lea dither_heightq, [ditherq+dither_heightq*8]
; Unlike store_slice, the visible code here also reads the src buffer at
; src + tmp: one 8-byte group is added word-wise into m0 and the following
; 8-byte group is loaded into m6.
161 paddw m0, [srcq+tmpq]
163 movq m6, [srcq+tmpq+8]
167 movq [srcq+tmpq+8], m7 ; write 8 bytes of m7 back into the src buffer
176 add srcq, src_strideq ; advance src by the (replaced) src_stride
178 add dstq, dst_strideq ; advance dst by dst_stride - widthq
179 cmp ditherq, dither_heightq ; compare dither pointer with the end pointer (the branch is not visible here)
183 ;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
;
; NOTE(review): this excerpt shows only the loads from thrn (thr_adr_noq) and
; the stores to thr (thr_adr) for byte offsets 64..120; the arithmetic applied
; between each load and its store, and the handling of offsets below 64, are
; on lines not shown. Presumably each group is multiplied by q, going by the
; function name -- confirm against the full file.
;
; All three arguments are loaded into registers; no further GPRs are requested.
184 cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
; Offsets are written as 8*7+8*k (k = 1..6 -> bytes 64..104) and 14*8 / 14*8+8
; (bytes 112 / 120). Every visible store targets the same offset in thr that
; the matching load read from thrn. Loads are issued ahead of the stores, and
; registers are reused once their previous value has been stored (m1 is
; stored below and then reloaded for offset 120).
208 movq m1, [thrnq+8*7+8]
211 movq m2, [thrnq+8*7+8*2]
214 movq m3, [thrnq+8*7+8*3]
217 movq m4, [thrnq+8*7+8*4]
219 movq [thrq+8*7+8], m1
220 movq m5, [thrnq+8*7+8*5]
222 movq [thrq+8*7+8*2], m2
223 movq m6, [thrnq+8*7+8*6]
225 movq [thrq+8*7+8*3], m3
226 movq m0, [thrnq+14*8]
228 movq [thrq+8*7+8*4], m4
229 movq m1, [thrnq+14*8+8]
231 movq [thrq+8*7+8*5], m5
233 movq [thrq+8*7+8*6], m6
235 movq [thrq+14*8+8], m1
; COLUMN_FDCT: macro taking 1 to 3 parameters; %2 and %3 default to 0.
; In the visible lines only %2 is used, as a byte displacement added to every
; threshold address [thrq+k*16+%2].
;
; NOTE(review): this excerpt omits most of the macro's arithmetic, its use of
; %1/%3 and its %endmacro; the comments describe the visible memory accesses
; only. Rows are addressed as DCTSIZE*r*2 bytes from srcq/outq (DCTSIZE is
; defined outside this excerpt).
238 %macro COLUMN_FDCT 1-3 0, 0
; Pairwise sums of mirrored input rows: 0+7 (m1), 3+4 (m7), 1+6 (m6),
; 2+5 (m2). Each movq/paddw touches one 8-byte group of the row.
239 movq m1, [srcq+DCTSIZE*0*2]
240 movq m7, [srcq+DCTSIZE*3*2]
242 paddw m1, [srcq+DCTSIZE*7*2]
244 paddw m7, [srcq+DCTSIZE*4*2]
246 movq m6, [srcq+DCTSIZE*1*2]
248 movq m2, [srcq+DCTSIZE*2*2]
250 paddw m6, [srcq+DCTSIZE*6*2]
252 paddw m2, [srcq+DCTSIZE*5*2]
; Threshold rows are read from thrq at a 16-byte pitch (k*16 + %2).
260 movq m6, [thrq+4*16+%2]
; Unsigned-saturating add, then unsigned-saturating subtract, of threshold
; row 0 on m5.
264 paddusw m5, [thrq+%2]
269 psubusw m5, [thrq+%2]
; Threshold row 2 is applied to m1 with both wrapping (psubw/paddw) and
; unsigned-saturating (paddusw/psubusw) forms; the instructions between them
; are not shown.
277 psubw m1, [thrq+2*16+%2]
279 movq m7, [thrq+6*16+%2]
281 paddusw m1, [thrq+2*16+%2]
283 paddw m1, [thrq+2*16+%2]
285 psubusw m1, [thrq+2*16+%2]
; Subtractions of input rows 4, 6, 7 and 5: the mirrored partners of rows
; 3, 1, 0 and 2. Only the row2 - row5 pair is fully visible here (m2).
287 psubw m3, [srcq+DCTSIZE*4*2]
291 psubw m4, [srcq+DCTSIZE*6*2]
293 psubw m0, [srcq+DCTSIZE*7*2]
301 movq m2, [srcq+DCTSIZE*2*2]
303 psubw m2, [srcq+DCTSIZE*5*2]
; Remaining threshold rows 3, 5, 1 and 7 are loaded into registers.
324 movq m3, [thrq+3*16+%2]
330 movq m2, [thrq+5*16+%2]
334 movq m7, [thrq+16+%2]
340 movq m4, [thrq+7*16+%2]
; Output rows are both read from and written to outq. Presumably the new
; results are added to the values already stored there (the combining
; instructions are not visible) -- confirm against the full file.
365 movq m5, [outq+DCTSIZE*0*2]
373 movq m7, [outq+DCTSIZE*1*2]
375 movq [outq+DCTSIZE*7*2], m4
379 movq m4, [outq+DCTSIZE*2*2]
383 movq [outq+DCTSIZE*6*2], m6
385 movq m5, [outq+DCTSIZE*5*2]
387 movq m6, [outq+DCTSIZE*3*2]
391 movq [outq+DCTSIZE*1*2], m7
393 movq [outq+DCTSIZE*2*2], m4
395 movq m7, [outq+DCTSIZE*4*2]
397 movq [outq+DCTSIZE*5*2], m5
399 movq [outq+DCTSIZE*3*2], m6
400 movq [outq+DCTSIZE*4*2], m7
; COLUMN_IDCT: macro taking 0 or 1 parameter; %1 defaults to 0 (its use is
; not visible in this excerpt).
;
; NOTE(review): only the final output stage is visible here; the transform
; arithmetic and the %endmacro are on lines not shown. Rows are addressed as
; DCTSIZE*r*2 bytes from outq (DCTSIZE is defined outside this excerpt).
404 %macro COLUMN_IDCT 0-1 0
; Rows 7 and 6 are stored from m6 and m5.
; Rows 1..5 are accumulated: the row already stored at outq is added word-wise
; into the result register (paddw), and the sum is written back to that row.
430 movq [outq+DCTSIZE*7*2], m6
438 paddw m3, [outq+DCTSIZE*1*2]
442 movq [outq+DCTSIZE*6*2], m5
444 paddw m7, [outq+DCTSIZE*2*2]
446 paddw m4, [outq+DCTSIZE*5*2]
448 movq [outq+DCTSIZE*1*2], m3
450 movq [outq+DCTSIZE*2*2], m7
452 paddw m6, [outq+DCTSIZE*4*2]
454 paddw m1, [outq+DCTSIZE*3*2]
456 movq [outq+DCTSIZE*5*2], m4
458 movq [outq+DCTSIZE*4*2], m6
459 movq [outq+DCTSIZE*3*2], m1
463 ;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
;
; NOTE(review): only the declaration and one macro invocation are visible in
; this excerpt; the loop structure and any other macro invocations are on
; lines not shown.
;
; Four arguments (thr, src, out, cnt) are loaded into registers, with a fifth
; GPR named tmp. The extra numeric field (32) is x86inc's stack-space
; request, in bytes.
464 cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
; Expand COLUMN_FDCT with %1 = .idct2 (a local label name), %2 = 8 and
; %3 = 16. Within the visible macro body, %2 is the byte displacement added
; to every threshold address.
473 COLUMN_FDCT .idct2, 8, 16
484 ;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
;
; NOTE(review): this excerpt shows only the function's memory accesses and
; pointer updates; the transform arithmetic, the loop label/branch and the
; return are on lines not shown.
;
; Four arguments are loaded into registers under the names src (workspace),
; dst (output_adr), stride (output_stride) and cnt; a fifth GPR is named
; stride3. The extra numeric field (16) is x86inc's stack-space request, in
; bytes.
485 cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
487 lea stride3q, [strideq+strideq*2] ; stride3 = 3 * stride
; First 8-byte group of workspace rows 0..3 (row pitch DCTSIZE*2 bytes;
; DCTSIZE is defined outside this excerpt).
489 movq m0, [srcq+DCTSIZE*0*2]
490 movq m1, [srcq+DCTSIZE*1*2]
492 movq m2, [srcq+DCTSIZE*2*2]
494 movq m3, [srcq+DCTSIZE*3*2]
; Second 8-byte group (+8 bytes) of the same four rows.
512 movq m3, [srcq+DCTSIZE*0*2+8]
514 movq m2, [srcq+DCTSIZE*1*2+8]
520 movq m5, [srcq+DCTSIZE*2*2+8]
526 movq m6, [srcq+DCTSIZE*3*2+8]
; Output stage: each result register has the existing contents of a dst row
; added to it word-wise (paddw) and is then stored back to that row, so
; results accumulate into dst. The same row offsets (stride*1, stride*2)
; appear twice; presumably dstq is advanced in between on a line not shown --
; confirm against the full file.
578 paddw m1, [dstq+strideq*1]
580 paddw m7, [dstq+strideq*2]
584 movq [dstq+strideq*1], m1
586 movq [dstq+strideq*2], m7
590 paddw m0, [dstq+strideq*2]
592 paddw m3, [dstq+stride3q*1]
596 paddw m6, [dstq+strideq*4]
598 movq [dstq+strideq*2], m0
602 paddw m4, [dstq+strideq*1]
603 add srcq, DCTSIZE*2*4 ; advance the workspace pointer by four rows
604 movq [dstq+stride3q*1], m3
605 movq [dstq+strideq*4], m6
607 movq [dstq+strideq*1], m4
614 ;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
;
; NOTE(review): this excerpt shows only the function's memory accesses and
; one pointer update; the transform arithmetic, the loop label/branch and the
; return are on lines not shown, and the function may continue past the last
; line visible here.
;
; Four arguments are loaded into registers under the names src (data),
; pix (pixels), stride (line_size) and cnt; a fifth GPR is named stride3.
; Despite its name, srcq is only written to in the visible lines. The extra
; numeric field (16) is x86inc's stack-space request, in bytes.
615 cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3
616 lea stride3q, [strideq+strideq*2] ; stride3 = 3 * stride
; 4-byte loads (movd) from pixel rows at offsets stride*1, stride*2, stride*4
; and stride*3 from pixq. Offsets stride*2 and stride*1 are read a second
; time further down; presumably pixq has been moved in between on a line not
; shown -- confirm against the full file.
620 movd m1, [pixq+strideq*1]
622 movd m2, [pixq+strideq*2]
627 movd m3, [pixq+strideq*4]
629 movd m4, [pixq+stride3q*1]
635 movd m3, [pixq+strideq*2]
643 movd m6, [pixq+strideq*1]
; First 8-byte group of data rows 0..3 is written (row pitch DCTSIZE*2 bytes;
; DCTSIZE is defined outside this excerpt).
677 movq [srcq+DCTSIZE*0*2], m0
679 movq [srcq+DCTSIZE*1*2], m5
681 movq [srcq+DCTSIZE*2*2], m6
683 movq [srcq+DCTSIZE*3*2], m7
; Second 8-byte group (+8 bytes) of the same four rows.
717 movq [srcq+DCTSIZE*0*2+8], m2
719 movq [srcq+DCTSIZE*1*2+8], m7
721 movq [srcq+DCTSIZE*2*2+8], m4
723 movq [srcq+DCTSIZE*3*2+8], m5
724 add srcq, DCTSIZE*4*2 ; advance the data pointer by four rows