;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

vp3_idct_data: times 8 dw 64277

pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81

; this is off by one or two for some cases when filter_limit is greater than 63
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
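;
; Roughly, per filtered pixel (mirroring the scalar C fallback; the
; bounding_values clamp table is the int32 array passed in r2):
;   filter_value = (p0 - p3) + 3 * (p2 - p1);
;   filter_value = bounding_values[(filter_value + 4) >> 3];
;   p1 = av_clip_uint8(p1 + filter_value);
;   p2 = av_clip_uint8(p2 - filter_value);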
%macro VP3_LOOP_FILTER 0
    pand         m6, [pb_7]          ; p0&7
    pand         m7, [pb_1F]         ; p0>>3
    pand         m2, [pb_1]          ; (p2^p1)&1
    paddb        m2, m5              ; 3*(p2^p1)&1
    paddb        m2, m6              ; extra bits lost in shifts
    pxor         m1, m0              ; 255 - p3
    pavgb        m1, m2              ; (256 - p3 + extrabits) >> 1
    pxor         m0, m4              ; 255 - p1
    pavgb        m0, m3              ; (256 + p2-p1) >> 1
    pavgb        m1, m0              ; 128+2+(   p2-p1  - p3) >> 2
    pavgb        m1, m0              ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb      m7, m1              ; d+128+1
    movq         m5, [r2+516]        ; flim
%macro STORE_4_WORDS 1

cglobal vp3_v_loop_filter, 3, 4

cglobal vp3_h_loop_filter, 3, 4
    punpcklbw    m6, [r0     -2]
    punpcklbw    m4, [r0+r1  -2]
    punpcklbw    m2, [r0+r1*2-2]
    punpcklbw    m1, [r0+r3  -2]

    TRANSPOSE4x4B 6, 4, 2, 1, 0
    SBUTTERFLY   bw, 4, 3, 5
%macro PAVGB_NO_RND 0
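; Truncating byte average, i.e. (a + b) >> 1. pavgb always rounds up, so
; the rounding bit has to be masked away first; the classic formulation is
;   avg = (a & b) + (((a ^ b) & 0xfe) >> 1)
; which maps directly onto pand/pxor, an 0xfe byte mask, and a shift.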
cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
    lea          stride3q, [strideq+strideq*2]

    mova         m2, [src1q+strideq]
    mova         m3, [src2q+strideq]

    mova         [dstq+strideq], m5
    mova         m0, [src1q+strideq*2]
    mova         m1, [src2q+strideq*2]
    mova         m2, [src1q+stride3q]
    mova         m3, [src2q+stride3q]

    mova         [dstq+strideq*2], m4
    mova         [dstq+stride3q], m5

    lea          src1q, [src1q+strideq*4]
    lea          src2q, [src2q+strideq*4]
    lea          dstq,  [dstq+strideq*4]
; from original comments: The Macro does IDct on 4 1-D Dcts
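;
; The c1..c7 referenced below are cos(N*pi/16) in unsigned .16 fixed point,
; each repeated 8 times so C(x) can index them as 16-byte rows (c1 =
; 0.98079 * 65536 = 64277 above). pmulhw keeps the high 16 bits of a signed
; product, so constants >= 0.5 wrap around to "c - 1" and the code re-adds
; the input afterwards; that is why products first appear as "c*i - i".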
    pmulhw       m4, m6              ; r4 = c3*i3 - i3
    pmulhw       m6, m7              ; r6 = c3*i5 - i5
    pmulhw       m1, m2              ; r1 = c5*i3 - i3
    pmulhw       m5, m7              ; r5 = c5*i5 - i5
    paddw        m4, m2              ; r4 = c3*i3
    paddw        m6, m7              ; r6 = c3*i5
    paddw        m2, m1              ; r2 = c5*i3
    paddw        m7, m5              ; r7 = c5*i5
    movq         m5, m0              ; r5 = c1
    pmulhw       m0, m3              ; r0 = c1*i1 - i1
    paddsw       m4, m7              ; r4 = C = c3*i3 + c5*i5
    pmulhw       m5, m1              ; r5 = c1*i7 - i7
    psubsw       m6, m2              ; r6 = D = c3*i5 - c5*i3
    paddw        m0, m3              ; r0 = c1*i1
    pmulhw       m3, m7              ; r3 = c7*i1
    pmulhw       m7, m1              ; r7 = c7*i7
    paddw        m5, m1              ; r5 = c1*i7
    movq         m1, m2              ; r1 = i2
    pmulhw       m2, C(2)            ; r2 = c2*i2 - i2
    psubsw       m3, m5              ; r3 = B = c7*i1 - c1*i7
    paddsw       m0, m7              ; r0 = A = c1*i1 + c7*i7
    movq         m7, m5              ; r7 = i6
    psubsw       m0, m4              ; r0 = A - C
    pmulhw       m5, C(2)            ; r5 = c2*i6 - i6
    paddw        m2, m1              ; r2 = c2*i2
    pmulhw       m1, C(6)            ; r1 = c6*i2
    paddsw       m4, m4              ; r4 = C + C
    paddsw       m4, m0              ; r4 = C. = A + C
    psubsw       m3, m6              ; r3 = B - D
    paddw        m5, m7              ; r5 = c2*i6
    paddsw       m6, m6              ; r6 = D + D
    pmulhw       m7, C(6)            ; r7 = c6*i6
    paddsw       m6, m3              ; r6 = D. = B + D
    movq         I(1), m4            ; save C. at I(1)
    psubsw       m1, m5              ; r1 = H = c6*i2 - c2*i6
    movq         m5, m3              ; r5 = B - D
    pmulhw       m3, m4              ; r3 = (c4 - 1) * (B - D)
    paddsw       m7, m2              ; r7 = G = c2*i2 + c6*i6
    movq         I(2), m6            ; save D. at I(2)
    movq         m2, m0              ; r2 = A - C
    pmulhw       m0, m4              ; r0 = (c4 - 1) * (A - C)
    paddw        m5, m3              ; r5 = B. = c4 * (B - D)
    psubsw       m5, m1              ; r5 = B.. = B. - H
    paddw        m2, m0              ; r2 = A. = c4 * (A - C)
    psubsw       m6, m3              ; r6 = i0 - i4
    pmulhw       m6, m4              ; r6 = (c4 - 1) * (i0 - i4)
    paddsw       m3, m3              ; r3 = i4 + i4
    paddsw       m1, m1              ; r1 = H + H
    paddsw       m3, m0              ; r3 = i0 + i4
    paddsw       m1, m5              ; r1 = H. = B. + H
    pmulhw       m4, m3              ; r4 = (c4 - 1) * (i0 + i4)
    paddsw       m6, m0              ; r6 = F = c4 * (i0 - i4)
    psubsw       m6, m2              ; r6 = F. = F - A.
    paddsw       m2, m2              ; r2 = A. + A.
    movq         m0, I(1)            ; r0 = C.
    paddsw       m2, m6              ; r2 = A.. = F + A.
    paddw        m4, m3              ; r4 = E = c4 * (i0 + i4)
    psubsw       m2, m1              ; r2 = R2 = A.. - H.
; RowIDCT gets ready to transpose

    movq         m3, I(2)            ; r3 = D.
    psubsw       m4, m7              ; r4 = E. = E - G
    paddsw       m1, m1              ; r1 = H. + H.
    paddsw       m7, m7              ; r7 = G + G
    paddsw       m1, m2              ; r1 = R1 = A.. + H.
    paddsw       m7, m4              ; r7 = G. = E + G
    psubsw       m4, m3              ; r4 = R4 = E. - D.
    psubsw       m6, m5              ; r6 = R6 = F. - B..
    paddsw       m3, m4              ; r3 = R3 = E. + D.
    paddsw       m5, m6              ; r5 = R5 = F. + B..
    psubsw       m7, m0              ; r7 = R7 = G. - C.
    movq         I(1), m1            ; save R1
    paddsw       m0, m7              ; r0 = R0 = G. + C.

; Column IDCT normalizes and stores final results
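;
; Each result is emitted as NRk = (Rk + 8) >> 4. The OC_8 bias is added to
; one member of each (difference, sum) pair only: the sum is then rebuilt
; as "difference + 2*x", which carries the bias along for free.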
    paddsw       m2, OC_8            ; adjust R2 (and R1) for shift
    paddsw       m1, m1              ; r1 = H. + H.
    paddsw       m1, m2              ; r1 = R1 = A.. + H.
    psraw        m2, 4               ; r2 = NR2
    psubsw       m4, m7              ; r4 = E. = E - G
    psraw        m1, 4               ; r1 = NR1
    movq         m3, I(2)            ; r3 = D.
    paddsw       m7, m7              ; r7 = G + G
    movq         I(2), m2            ; store NR2 at I2
    paddsw       m7, m4              ; r7 = G. = E + G
    movq         I(1), m1            ; store NR1 at I1
    psubsw       m4, m3              ; r4 = R4 = E. - D.
    paddsw       m4, OC_8            ; adjust R4 (and R3) for shift
    paddsw       m3, m3              ; r3 = D. + D.
    paddsw       m3, m4              ; r3 = R3 = E. + D.
    psraw        m4, 4               ; r4 = NR4
    psubsw       m6, m5              ; r6 = R6 = F. - B..
    psraw        m3, 4               ; r3 = NR3
    paddsw       m6, OC_8            ; adjust R6 (and R5) for shift
    paddsw       m5, m5              ; r5 = B.. + B..
    paddsw       m5, m6              ; r5 = R5 = F. + B..
    psraw        m6, 4               ; r6 = NR6
    movq         J(4), m4            ; store NR4 at J4
    psraw        m5, 4               ; r5 = NR5
    movq         I(3), m3            ; store NR3 at I3
    psubsw       m7, m0              ; r7 = R7 = G. - C.
    paddsw       m7, OC_8            ; adjust R7 (and R0) for shift
    paddsw       m0, m0              ; r0 = C. + C.
    paddsw       m0, m7              ; r0 = R0 = G. + C.
    psraw        m7, 4               ; r7 = NR7
    movq         J(6), m6            ; store NR6 at J6
    psraw        m0, 4               ; r0 = NR0
    movq         J(5), m5            ; store NR5 at J5
    movq         J(7), m7            ; store NR7 at J7
    movq         I(0), m0            ; store NR0 at I0
; Following macro does two 4x4 transposes in place.

; At entry (we assume):

; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.

; Since r1 is free at entry, we calculate the Js first.
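;
; Each quadrant goes through the usual word/dword interleave ladder, e.g.
; for the e..h rows below:
;   e3 e2 e1 e0 , f3 f2 f1 f0  --punpcklwd-->  f1 e1 f0 e0
;   f1 e1 f0 e0 , h1 g1 h0 g0  --punpckldq-->  h0 g0 f0 e0  (= R4)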
    movq         m1, m4              ; r1 = e3 e2 e1 e0
    punpcklwd    m4, m5              ; r4 = f1 e1 f0 e0
    movq         I(0), m0            ; save a3 a2 a1 a0
    punpckhwd    m1, m5              ; r1 = f3 e3 f2 e2
    movq         m0, m6              ; r0 = g3 g2 g1 g0
    punpcklwd    m6, m7              ; r6 = h1 g1 h0 g0
    movq         m5, m4              ; r5 = f1 e1 f0 e0
    punpckldq    m4, m6              ; r4 = h0 g0 f0 e0 = R4
    punpckhdq    m5, m6              ; r5 = h1 g1 f1 e1 = R5
    movq         m6, m1              ; r6 = f3 e3 f2 e2
    punpckhwd    m0, m7              ; r0 = h3 g3 h2 g2
    punpckhdq    m6, m0              ; r6 = h3 g3 f3 e3 = R7
    movq         m4, I(0)            ; r4 = a3 a2 a1 a0
    punpckldq    m1, m0              ; r1 = h2 g2 f2 e2 = R6
    movq         m5, I(1)            ; r5 = b3 b2 b1 b0
    movq         m0, m4              ; r0 = a3 a2 a1 a0
    punpcklwd    m0, m5              ; r0 = b1 a1 b0 a0
    punpckhwd    m4, m5              ; r4 = b3 a3 b2 a2
    movq         m5, m2              ; r5 = c3 c2 c1 c0
    punpcklwd    m2, m3              ; r2 = d1 c1 d0 c0
    movq         m1, m0              ; r1 = b1 a1 b0 a0
    punpckldq    m0, m2              ; r0 = d0 c0 b0 a0 = R0
    punpckhdq    m1, m2              ; r1 = d1 c1 b1 a1 = R1
    movq         m2, m4              ; r2 = b3 a3 b2 a2
    punpckhwd    m5, m3              ; r5 = d3 c3 d2 c2
    punpckhdq    m4, m5              ; r4 = d3 c3 b3 a3 = R3
    punpckldq    m2, m5              ; r2 = d2 c2 b2 a2 = R2
%macro VP3_1D_IDCT_SSE2 0
    movdqa       m2, I(3)            ; xmm2 = i3
    movdqa       m6, C(3)            ; xmm6 = c3
    movdqa       m4, m2              ; xmm4 = i3
    movdqa       m7, I(5)            ; xmm7 = i5
    pmulhw       m4, m6              ; xmm4 = c3 * i3 - i3
    movdqa       m1, C(5)            ; xmm1 = c5
    pmulhw       m6, m7              ; xmm6 = c3 * i5 - i5
    movdqa       m5, m1              ; xmm5 = c5
    pmulhw       m1, m2              ; xmm1 = c5 * i3 - i3
    movdqa       m3, I(1)            ; xmm3 = i1
    pmulhw       m5, m7              ; xmm5 = c5 * i5 - i5
    movdqa       m0, C(1)            ; xmm0 = c1
    paddw        m4, m2              ; xmm4 = c3 * i3
    paddw        m6, m7              ; xmm6 = c3 * i5
    paddw        m2, m1              ; xmm2 = c5 * i3
    movdqa       m1, I(7)            ; xmm1 = i7
    paddw        m7, m5              ; xmm7 = c5 * i5
    movdqa       m5, m0              ; xmm5 = c1
    pmulhw       m0, m3              ; xmm0 = c1 * i1 - i1
    paddsw       m4, m7              ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw       m5, m1              ; xmm5 = c1 * i7 - i7
    movdqa       m7, C(7)            ; xmm7 = c7
    psubsw       m6, m2              ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw        m0, m3              ; xmm0 = c1 * i1
    pmulhw       m3, m7              ; xmm3 = c7 * i1
    movdqa       m2, I(2)            ; xmm2 = i2
    pmulhw       m7, m1              ; xmm7 = c7 * i7
    paddw        m5, m1              ; xmm5 = c1 * i7
    movdqa       m1, m2              ; xmm1 = i2
    pmulhw       m2, C(2)            ; xmm2 = c2 * i2 - i2
    psubsw       m3, m5              ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa       m5, I(6)            ; xmm5 = i6
    paddsw       m0, m7              ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa       m7, m5              ; xmm7 = i6
    psubsw       m0, m4              ; xmm0 = A - C
    pmulhw       m5, C(2)            ; xmm5 = c2 * i6 - i6
    paddw        m2, m1              ; xmm2 = c2 * i2
    pmulhw       m1, C(6)            ; xmm1 = c6 * i2
    paddsw       m4, m4              ; xmm4 = C + C
    paddsw       m4, m0              ; xmm4 = A + C = C.
    psubsw       m3, m6              ; xmm3 = B - D
    paddw        m5, m7              ; xmm5 = c2 * i6
    paddsw       m6, m6              ; xmm6 = D + D
    pmulhw       m7, C(6)            ; xmm7 = c6 * i6
    paddsw       m6, m3              ; xmm6 = B + D = D.
    movdqa       I(1), m4            ; save C. at I(1)
    psubsw       m1, m5              ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa       m4, C(4)            ; xmm4 = c4
    movdqa       m5, m3              ; xmm5 = B - D
    pmulhw       m3, m4              ; xmm3 = ( c4 - 1 ) * ( B - D )
    paddsw       m7, m2              ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa       I(2), m6            ; save D. at I(2)
    movdqa       m2, m0              ; xmm2 = A - C
    movdqa       m6, I(0)            ; xmm6 = i0
    pmulhw       m0, m4              ; xmm0 = ( c4 - 1 ) * ( A - C )
    paddw        m5, m3              ; xmm5 = c4 * ( B - D ) = B.
    movdqa       m3, I(4)            ; xmm3 = i4
    psubsw       m5, m1              ; xmm5 = B. - H = B..
    paddw        m2, m0              ; xmm2 = c4 * ( A - C ) = A.
    psubsw       m6, m3              ; xmm6 = i0 - i4
    movdqa       m0, m6              ; xmm0 = i0 - i4
    pmulhw       m6, m4              ; xmm6 = ( c4 - 1 ) * ( i0 - i4 )
    paddsw       m3, m3              ; xmm3 = i4 + i4
    paddsw       m1, m1              ; xmm1 = H + H
    paddsw       m3, m0              ; xmm3 = i0 + i4
    paddsw       m1, m5              ; xmm1 = B. + H = H.
    pmulhw       m4, m3              ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw        m6, m0              ; xmm6 = c4 * ( i0 - i4 ) = F
    psubsw       m6, m2              ; xmm6 = F - A. = F.
    paddsw       m2, m2              ; xmm2 = A. + A.
    movdqa       m0, I(1)            ; load C. from I(1)
    paddsw       m2, m6              ; xmm2 = F + A. = A..
    paddw        m4, m3              ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw       m2, m1              ; xmm2 = A.. - H. = R2
    ADD(m2)                          ; adjust R2 and R1 before shifting
    paddsw       m1, m1              ; xmm1 = H. + H.
    paddsw       m1, m2              ; xmm1 = A.. + H. = R1
    SHIFT(m2)                        ; xmm2 = op2
    psubsw       m4, m7              ; xmm4 = E - G = E.
    SHIFT(m1)                        ; xmm1 = op1
    movdqa       m3, I(2)            ; load D. from I(2)
    paddsw       m7, m7              ; xmm7 = G + G
    paddsw       m7, m4              ; xmm7 = E + G = G.
    psubsw       m4, m3              ; xmm4 = E. - D. = R4
    ADD(m4)                          ; adjust R4 and R3 before shifting
    paddsw       m3, m3              ; xmm3 = D. + D.
    paddsw       m3, m4              ; xmm3 = E. + D. = R3
    SHIFT(m4)                        ; xmm4 = op4
    psubsw       m6, m5              ; xmm6 = F. - B.. = R6
    SHIFT(m3)                        ; xmm3 = op3
    ADD(m6)                          ; adjust R6 and R5 before shifting
    paddsw       m5, m5              ; xmm5 = B.. + B..
    paddsw       m5, m6              ; xmm5 = F. + B.. = R5
    SHIFT(m6)                        ; xmm6 = op6
    SHIFT(m5)                        ; xmm5 = op5
    psubsw       m7, m0              ; xmm7 = G. - C. = R7
    ADD(m7)                          ; adjust R7 and R0 before shifting
    paddsw       m0, m0              ; xmm0 = C. + C.
    paddsw       m0, m7              ; xmm0 = G. + C. = R0
    SHIFT(m7)                        ; xmm7 = op7
    SHIFT(m0)                        ; xmm0 = op0
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]

    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8

    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]

    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw  x, 4
%define ADD(x)   paddsw x, [pw_8]
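; i.e. the second (column) pass rounds each output as (x + 8) >> 4, just
; like the OC_8/psraw 4 pair in the MMX column IDCT; for the first (row)
; pass SHIFT/ADD are left empty, so no intermediate rounding occurs.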
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
; eax = quantized input
; ebx = dequantizer matrix
; ecx = IDCT constants
;  M(I) = ecx + MaskOffset(0) + I * 8
;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
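; (legacy register notes from the original 32-bit VP3 code; in this port
; the same values are reached through the I()/J()/C() defines below)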
%define C(x) [vp3_idct_data+16*(x-1)]

; at this point, function has completed dequantization + dezigzag +
; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]

%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]

%define I(x) [%1+16* x   +64]
%define J(x) [%1+16*(x-4)+72]
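; The coefficient block is 8x8 int16_t, 16 bytes per row: a "+8" selects
; columns 4-7 and a "+64" selects rows 4-7, so each I()/J() pair above
; addresses the half-block that one MMX 1-D pass operates on.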
%endif ; mmsize == 16/8
%macro vp3_idct_funcs 0
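; vp3_idct_put(dest, stride, block): full IDCT written straight to dest.
; As in the C version this includes the +128 intra bias, i.e. roughly
; dest[i] = av_clip_uint8(idct(block)[i] + 128); the coefficient block is
; cleared on the way out by the trailing store loop.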
cglobal vp3_idct_put, 3, 4, 9

    mova         m0, [r2+mmsize*0+%%i]
    mova         m1, [r2+mmsize*2+%%i]
    mova         m2, [r2+mmsize*4+%%i]
    mova         m3, [r2+mmsize*6+%%i]

    packsswb     m0, [r2+mmsize*8+%%i]
    packsswb     m1, [r2+mmsize*10+%%i]
    packsswb     m2, [r2+mmsize*12+%%i]
    packsswb     m3, [r2+mmsize*14+%%i]

    packsswb     m0, [r2+mmsize*1+%%i]
    packsswb     m1, [r2+mmsize*3+%%i]
    packsswb     m2, [r2+mmsize*5+%%i]
    packsswb     m3, [r2+mmsize*7+%%i]

    mova         [r2+%%offset], m0
%assign %%offset %%offset+mmsize
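
; vp3_idct_add(dest, stride, block): same IDCT, but summed into the
; existing prediction, i.e. roughly dest[i] = av_clip_uint8(dest[i] +
; idct(block)[i]); the paddsw runs below fold the residual into the
; unpacked dest rows.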
cglobal vp3_idct_add, 3, 4, 9

    paddsw       m0, [r2+ 0+%%i]
    paddsw       m1, [r2+16+%%i]
    paddsw       m2, [r2+32+%%i]
    paddsw       m3, [r2+48+%%i]

    paddsw       m0, [r2+ 0+%%i]
    paddsw       m1, [r2+16+%%i]
    paddsw       m2, [r2+32+%%i]
    paddsw       m5, [r2+64+%%i]
    paddsw       m6, [r2+80+%%i]
    paddsw       m7, [r2+96+%%i]

    paddsw       m3, [r2+48+%%i]
    paddsw       m5, [r2+112+%%i]

%assign %%i %%i+mmsize
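
; vp3_idct_dc_add(dest, stride, block): DC-only shortcut. The lone DC
; coefficient is rounded as in the C version, dc = (block[0] + 15) >> 5,
; broadcast to all lanes and added with saturation to the whole 8x8 block.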
cglobal vp3_idct_dc_add, 3, 4