1 /*****************************************************************************
2 * vdec_idctmmx.S : MMX IDCT implementation
3 *****************************************************************************
4 * Copyright (C) 1999, 2000 VideoLAN
5 * $Id: idctmmx_asm.S,v 1.1 2001/01/13 12:57:20 sam Exp $
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 *****************************************************************************/
25 * the input data is transposed and each 16 bit element in the 8x8 matrix is left-aligned,
27 * for example in 11...1110000 format
28 * If the iDCT is of an I macroblock then 0.5 needs to be added to the DC component
29 * (element[0][0] of the matrix)
/* Pre-scaling table: 8 rows of 8 signed 16-bit fixed-point factors,
 * one per coefficient of the 8x8 block, applied before the iDCT proper.
 * NOTE(review): values look like Q14/Q15 cosine-product prescales
 * (16384 = 1.0 in Q14, 23170 ~ cos(pi/4) in Q15) — confirm against the
 * algorithm notes in the full original file. */
35 preSC: .short 16384,22725,21407,19266,16384,12873,8867,4520
36 .short 22725,31521,29692,26722,22725,17855,12299,6270
37 .short 21407,29692,27969,25172,21407,16819,11585,5906
38 .short 19266,26722,25172,22654,19266,15137,10426,5315
39 .short 16384,22725,21407,19266,16384,12873,8867,4520
40 .short 12873,17855,16819,15137,25746,20228,13933,7103
41 .short 17734,24598,23170,20853,17734,13933,9597,4892
42 .short 18081,25080,23624,21261,18081,14206,9785,4988
/* Packed-word constants used as 64-bit MMX operands by the pmulhw-based
 * iDCT below.  Each `x...` symbol holds four 16-bit words; e.g.
 * 0x5a82 = 23170 ~ cos(pi/4) in Q15, 0x539f = 21407, 0x4546 = 17734,
 * 0x61f8 = 25080.  x0005000200010001 is the "magic" rounding bias added
 * to the V15 term (see comment further down).  The `scratch1..7` objects
 * are 8-byte spill slots for the out1/3/5/7 rows.
 * NOTE(review): the label-definition lines (e.g. "x5a825a825a825a82:")
 * and some .long/.space data lines are not visible in this copy, so the
 * data below appears detached from its .type/.size directives — verify
 * against the original file before editing. */
45 .type x0005000200010001,@object
46 .size x0005000200010001,8
48 .long 0x00010001,0x00050002
50 .type x0040000000000000,@object
51 .size x0040000000000000,8
55 .type x5a825a825a825a82,@object
56 .size x5a825a825a825a82,8
58 .long 0x5a825a82, 0x5a825a82
60 .type x539f539f539f539f,@object
61 .size x539f539f539f539f,8
63 .long 0x539f539f,0x539f539f
65 .type x4546454645464546,@object
66 .size x4546454645464546,8
68 .long 0x45464546,0x45464546
70 .type x61f861f861f861f8,@object
71 .size x61f861f861f861f8,8
73 .long 0x61f861f8,0x61f861f8
75 .type scratch1,@object
80 .type scratch3,@object
85 .type scratch5,@object
90 .type scratch7,@object
101 /* this seems to annoy the compiler in -g mode; is this normal? */
/* vdec_IDCT: 8x8 inverse DCT on signed 16-bit coefficients, MMX,
 * AT&T/GAS syntax, IA-32.
 * NOTE(review): the entry label, prologue and the setup of %esi and
 * %ecx are not visible in this copy.  From usage below, %esi addresses
 * the working 8x8 block (row r at 8*r(%esi)) and %ecx addresses the
 * coefficient source multiplied against the preSC factors — confirm
 * against the full file.  Vnn / tnn names in the comments follow the
 * intermediate-value numbering of the reference (AP-922 style)
 * derivation this code was transcribed from. */
103 .type vdec_IDCT,@function
/* ---- first pass, column 0 (even half): V4,V12,V0,V8 -> V22..V25 ---- */
162 /* column 0: even part
163 * use V4, V12, V0, V8 to produce V22..V25
165 movq 8*12(%ecx), %mm0 /* maybe the first mul can be done together */
166 /* with the dequantization in iHuff module */
167 pmulhw 8*12(%esi), %mm0 /* V12 */
169 pmulhw 8*4(%esi), %mm1 /* V4 */
171 psraw $1, %mm0 /* t64=t66 */
172 pmulhw (%esi), %mm3 /* V0 */
173 movq 8*8(%ecx), %mm5 /* duplicate V4 */
174 movq %mm1, %mm2 /* added 11/1/96 */
175 pmulhw 8*8(%esi),%mm5 /* V8 */
176 psubsw %mm0, %mm1 /* V16 */
177 pmulhw x5a825a825a825a82, %mm1 /* 23170 ->V18 */
178 paddsw %mm0, %mm2 /* V17 */
179 movq %mm2, %mm0 /* duplicate V17 */
180 psraw $1, %mm2 /* t75=t82 */
181 psraw $2, %mm0 /* t72 */
182 movq %mm3, %mm4 /* duplicate V0 */
183 paddsw %mm5, %mm3 /* V19 */
184 psubsw %mm5, %mm4 /* V20 ;mm5 free */
185 /* moved from the block below */
186 movq 8*10(%ecx), %mm7
187 psraw $1, %mm3 /* t74=t81 */
188 movq %mm3, %mm6 /* duplicate t74=t81 */
189 psraw $2, %mm4 /* t77=t79 */
190 psubsw %mm0, %mm1 /* V21 ; mm0 free */
191 paddsw %mm2, %mm3 /* V22 */
192 movq %mm1, %mm5 /* duplicate V21 */
193 paddsw %mm4, %mm1 /* V23 */
194 movq %mm3, 8*4(%esi) /* V22 */
195 psubsw %mm5, %mm4 /* V24; mm5 free */
196 movq %mm1, 8*12(%esi) /* V23 */
197 psubsw %mm2, %mm6 /* V25; mm2 free */
198 movq %mm4, (%esi) /* V24 */
199 /* keep mm6 alive all along the next block */
200 /* movq %mm6, 8*8(%esi) V25 */
/* ---- column 0 (odd half): V2,V6,V10,V14 -> V31,V39,V40,V41 ---- */
201 /* column 0: odd part
202 * use V2, V6, V10, V14 to produce V31, V39, V40, V41
204 /* moved above: movq 8*10(%ecx), %mm7 */
206 pmulhw 8*10(%esi), %mm7 /* V10 */
208 pmulhw 8*6(%esi), %mm0 /* V6 */
210 movq %mm7, %mm3 /* duplicate V10 */
211 pmulhw 8*2(%esi), %mm5 /* V2 */
212 movq 8*14(%ecx), %mm4
213 psubsw %mm0, %mm7 /* V26 */
214 pmulhw 8*14(%esi), %mm4 /* V14 */
215 paddsw %mm0, %mm3 /* V29 ; free mm0 */
216 movq %mm7, %mm1 /* duplicate V26 */
217 psraw $1, %mm3 /* t91=t94 */
218 pmulhw x539f539f539f539f,%mm7 /* V33 */
219 psraw $1, %mm1 /* t96 */
220 movq %mm5, %mm0 /* duplicate V2 */
221 psraw $2, %mm4 /* t85=t87 */
222 paddsw %mm4,%mm5 /* V27 */
223 psubsw %mm4, %mm0 /* V28 ; free mm4 */
224 movq %mm0, %mm2 /* duplicate V28 */
225 psraw $1, %mm5 /* t90=t93 */
226 pmulhw x4546454645464546,%mm0 /* V35 */
227 psraw $1, %mm2 /* t97 */
228 movq %mm5, %mm4 /* duplicate t90=t93 */
229 psubsw %mm2, %mm1 /* V32 ; free mm2 */
230 pmulhw x61f861f861f861f8,%mm1 /* V36 */
231 psllw $1, %mm7 /* t107 */
232 paddsw %mm3, %mm5 /* V31 */
233 psubsw %mm3, %mm4 /* V30 ; free mm3 */
234 pmulhw x5a825a825a825a82,%mm4 /* V34 */
236 psubsw %mm1, %mm0 /* V38 */
237 psubsw %mm7, %mm1 /* V37 ; free mm7 */
238 psllw $1, %mm1 /* t114 */
239 /* move from the next block */
240 movq %mm6, %mm3 /* duplicate V25 */
241 /* move from the next block */
242 movq 8*4(%esi), %mm7 /* V22 */
243 psllw $1, %mm0 /* t110 */
244 psubsw %mm5, %mm0 /* V39 (mm5 needed for next block) */
245 psllw $2, %mm4 /* t112 */
246 /* moved from the next block */
247 movq 8*12(%esi), %mm2 /* V23 */
248 psubsw %mm0, %mm4 /* V40 */
249 paddsw %mm4, %mm1 /* V41; free mm0 */
250 /* moved from the next block */
251 psllw $1, %mm2 /* t117=t125 */
/* ---- column 0: output butterfly, results written back into the block ---- */
252 /* column 0: output butterfly */
254 * movq %mm6, %mm3 duplicate V25
255 * movq 8*4(%esi), %mm7 V22
256 * movq 8*12(%esi), %mm2 V23
257 * psllw $1, %mm2 t117=t125
259 psubsw %mm1, %mm6 /* tm6 */
260 paddsw %mm1, %mm3 /* tm8; free mm1 */
261 movq %mm7, %mm1 /* duplicate V22 */
262 paddsw %mm5, %mm7 /* tm0 */
263 movq %mm3, 8*8(%esi) /* tm8; free mm3 */
264 psubsw %mm5, %mm1 /* tm14; free mm5 */
265 movq %mm6, 8*6(%esi) /* tm6; free mm6 */
266 movq %mm2, %mm3 /* duplicate t117=t125 */
267 movq (%esi), %mm6 /* V24 */
268 paddsw %mm0, %mm2 /* tm2 */
269 movq %mm7, (%esi) /* tm0; free mm7 */
270 psubsw %mm0, %mm3 /* tm12; free mm0 */
271 movq %mm1, 8*14(%esi) /* tm14; free mm1 */
272 psllw $1, %mm6 /* t119=t123 */
273 movq %mm2, 8*2(%esi) /* tm2; free mm2 */
274 movq %mm6, %mm0 /* duplicate t119=t123 */
275 movq %mm3, 8*12(%esi) /* tm12; free mm3 */
276 paddsw %mm4, %mm6 /* tm4 */
277 /* moved from next block */
279 psubsw %mm4, %mm0 /* tm10; free mm4 */
280 /* moved from next block */
281 pmulhw 8*5(%esi), %mm1 /* V5 */
282 movq %mm6, 8*4(%esi) /* tm4; free mm6 */
283 movq %mm0, 8*10(%esi) /* tm10; free mm0 */
/* ---- first pass, column 1: same even/odd butterfly structure as
 * column 0 above, operating on the odd-numbered rows (V1,V5,V9,V13 and
 * V3,V7,V11,V15); loads are interleaved with the previous block for
 * pairing on the original Pentium target. ---- */
284 /* column 1: even part
285 * use V5, V13, V1, V9 to produce V56..V59
287 /* moved to prev block:
288 * movq 8*5(%ecx), %mm1
289 * pmulhw 8*5(%esi), %mm1 V5
291 movq 8*13(%ecx), %mm7
292 psllw $1, %mm1 /* t128=t130 */
293 pmulhw 8*13(%esi), %mm7 /* V13 */
294 movq %mm1, %mm2 /* duplicate t128=t130 */
296 pmulhw 8(%esi), %mm3 /* V1 */
298 psubsw %mm7, %mm1 /* V50 */
299 pmulhw 8*9(%esi), %mm5 /* V9 */
300 paddsw %mm7, %mm2 /* V51 */
301 pmulhw x5a825a825a825a82, %mm1 /* 23170 ->V52 */
302 movq %mm2, %mm6 /* duplicate V51 */
303 psraw $1, %mm2 /* t138=t144 */
304 movq %mm3, %mm4 /* duplicate V1 */
305 psraw $2, %mm6 /* t136 */
306 paddsw %mm5, %mm3 /* V53 */
307 psubsw %mm5, %mm4 /* V54 ;mm5 free */
308 movq %mm3, %mm7 /* duplicate V53 */
309 /* moved from next block */
310 movq 8*11(%ecx), %mm0
311 psraw $1, %mm4 /* t140=t142 */
312 psubsw %mm6, %mm1 /* V55 ; mm6 free */
313 paddsw %mm2, %mm3 /* V56 */
314 movq %mm4, %mm5 /* duplicate t140=t142 */
315 paddsw %mm1, %mm4 /* V57 */
316 movq %mm3, 8*5(%esi) /* V56 */
317 psubsw %mm1, %mm5 /* V58; mm1 free */
318 movq %mm4, 8*13(%esi) /* V57 */
319 psubsw %mm2, %mm7 /* V59; mm2 free */
320 movq %mm5, 8*9(%esi) /* V58 */
321 /* keep mm7 alive all along the next block
322 * movq %mm7, 8(%esi) V59
324 * movq 8*11(%ecx), %mm0
/* ---- column 1 (odd half): V3,V7,V11,V15 ---- */
326 pmulhw 8*11(%esi), %mm0 /* V11 */
328 pmulhw 8*7(%esi), %mm6 /* V7 */
329 movq 8*15(%ecx), %mm4
330 movq %mm0, %mm3 /* duplicate V11 */
331 pmulhw 8*15(%esi), %mm4 /* V15 */
333 psllw $1, %mm6 /* t146=t152 */
334 pmulhw 8*3(%esi), %mm5 /* V3 */
335 paddsw %mm6, %mm0 /* V63 */
336 /* note that V15 computation has a correction step:
337 * this is a 'magic' constant that rebiases the results to be closer to the
338 * expected result. this magic constant can be refined to reduce the error
339 * even more by doing the correction step in a later stage when the number
340 * is actually multiplied by 16
342 paddw x0005000200010001, %mm4
343 psubsw %mm6, %mm3 /* V60 ; free mm6 */
344 psraw $1, %mm0 /* t154=t156 */
345 movq %mm3, %mm1 /* duplicate V60 */
346 pmulhw x539f539f539f539f, %mm1 /* V67 */
347 movq %mm5, %mm6 /* duplicate V3 */
348 psraw $2, %mm4 /* t148=t150 */
349 paddsw %mm4, %mm5 /* V61 */
350 psubsw %mm4, %mm6 /* V62 ; free mm4 */
351 movq %mm5, %mm4 /* duplicate V61 */
352 psllw $1, %mm1 /* t169 */
353 paddsw %mm0, %mm5 /* V65 -> result */
354 psubsw %mm0, %mm4 /* V64 ; free mm0 */
355 pmulhw x5a825a825a825a82, %mm4 /* V68 */
356 psraw $1, %mm3 /* t158 */
357 psubsw %mm6, %mm3 /* V66 */
358 movq %mm5, %mm2 /* duplicate V65 */
359 pmulhw x61f861f861f861f8, %mm3 /* V70 */
360 psllw $1, %mm6 /* t165 */
361 pmulhw x4546454645464546, %mm6 /* V69 */
362 psraw $1, %mm2 /* t172 */
363 /* moved from next block */
364 movq 8*5(%esi), %mm0 /* V56 */
365 psllw $1, %mm4 /* t174 */
366 /* moved from next block */
367 psraw $1, %mm0 /* t177=t188 */
369 psubsw %mm3, %mm6 /* V72 */
370 psubsw %mm1, %mm3 /* V71 ; free mm1 */
371 psubsw %mm2, %mm6 /* V73 ; free mm2 */
372 /* moved from next block */
373 psraw $1, %mm5 /* t178=t189 */
374 psubsw %mm6, %mm4 /* V74 */
375 /* moved from next block */
376 movq %mm0, %mm1 /* duplicate t177=t188 */
377 paddsw %mm4, %mm3 /* V75 */
378 /* moved from next block */
379 paddsw %mm5, %mm0 /* tm1 */
/* ---- column 1: output butterfly (odd rows tm1..tm15) ---- */
389 * free mm0, mm1 & mm2
391 * movq 8*5(%esi), %mm0 V56
392 * psllw $1, %mm0 t177=t188 ! new !!
393 * psllw $1, %mm5 t178=t189 ! new !!
394 * movq %mm0, %mm1 duplicate t177=t188
395 * paddsw %mm5, %mm0 tm1
397 movq 8*13(%esi), %mm2 /* V57 */
398 psubsw %mm5, %mm1 /* tm15; free mm5 */
399 movq %mm0, 8(%esi) /* tm1; free mm0 */
400 psraw $1, %mm7 /* t182=t184 ! new !! */
401 /* save the store as used directly in the transpose
402 * movq %mm1, 120(%esi) tm15; free mm1
404 movq %mm7, %mm5 /* duplicate t182=t184 */
405 psubsw %mm3, %mm7 /* tm7 */
406 paddsw %mm3, %mm5 /* tm9; free mm3 */
407 movq 8*9(%esi), %mm0 /* V58 */
408 movq %mm2, %mm3 /* duplicate V57 */
409 movq %mm7, 8*7(%esi) /* tm7; free mm7 */
410 psubsw %mm6, %mm3 /* tm13 */
411 paddsw %mm6, %mm2 /* tm3 ; free mm6 */
412 /* moved up from the transpose */
414 /* moved up from the transpose */
416 movq %mm0, %mm6 /* duplicate V58 */
417 movq %mm2, 8*3(%esi) /* tm3; free mm2 */
418 paddsw %mm4, %mm0 /* tm5 */
419 psubsw %mm4, %mm6 /* tm11; free mm4 */
420 /* moved up from the transpose */
422 movq %mm0, 8*5(%esi) /* tm5; free mm0 */
423 /* moved up from the transpose */
/* ---- 8x8 transpose between the row pass and the column pass, done as
 * four 4x4 quadrants (M1..M4).  movd stores write one 32-bit half of a
 * transposed row; punpck{l,h}{wd,dq} do the 16-bit word shuffling.
 * NOTE(review): several of the quadrant-load/unpack instructions are
 * elided from this copy (gaps in the residual line numbering). ---- */
425 /* transpose - M4 part
426 * --------- ---------
427 * | M1 | M2 | | M1'| M3'|
428 * --------- --> ---------
429 * | M3 | M4 | | M2'| M4'|
430 * --------- ---------
431 * Two alternatives: use full mmword approach so the following code can be
432 * scheduled before the transpose is done without stores, or use the faster
433 * half mmword stores (when possible)
435 movd %mm3, 8*9+4(%esi) /* MS part of tmt9 */
437 movd %mm7, 8*13+4(%esi) /* MS part of tmt13 */
439 movd %mm5, 8*9(%esi) /* LS part of tmt9 */
440 punpckhdq %mm3, %mm5 /* free mm3 */
441 movd %mm2, 8*13(%esi) /* LS part of tmt13 */
442 punpckhdq %mm7, %mm2 /* free mm7 */
443 /* moved up from the M3 transpose */
445 /* moved up from the M3 transpose */
446 movq 8*10(%esi), %mm1
447 /* moved up from the M3 transpose */
449 /* shuffle the rest of the data, and write it with 2 mmword writes */
450 movq %mm5, 8*11(%esi) /* tmt11 */
451 /* moved up from the M3 transpose */
453 movq %mm2, 8*15(%esi) /* tmt15 */
454 /* moved up from the M3 transpose */
456 /* transpose - M3 part
457 * moved up to previous code section
458 * movq 8*8(%esi), %mm0
459 * movq 8*10(%esi), %mm1
461 * punpcklwd %mm1, %mm0
462 * punpckhwd %mm1, %mm3
464 movq 8*12(%esi), %mm6
465 movq 8*14(%esi), %mm4
467 /* shuffle the data and write the lower parts of the transposed in 4 dwords */
472 punpckhwd %mm4, %mm2 /* free mm4 */
473 punpckldq %mm6, %mm0 /* free mm6 */
474 /* moved from next block */
475 movq 8*13(%esi), %mm4 /* tmt13 */
477 punpckhdq %mm2, %mm7 /* free mm2 */
478 /* moved from next block */
479 movq %mm3, %mm5 /* duplicate tmt5 */
/* ---- second (vertical) pass on transposed data, column 1: even part
 * on tmt1/tmt5/tmt9/tmt13, then odd part on tmt3/tmt7/tmt11/tmt15;
 * same fixed-point constants (23170, 21407, 17734, 25080) as pass 1. ---- */
480 /* column 1: even part (after transpose)
482 * movq %mm3, %mm5 duplicate tmt5
483 * movq 8*13(%esi), %mm4 tmt13
485 psubsw %mm4, %mm3 /* V134 */
486 pmulhw x5a825a825a825a82, %mm3 /* 23170 ->V136 */
487 movq 8*9(%esi), %mm6 /* tmt9 */
488 paddsw %mm4, %mm5 /* V135 ; mm4 free */
489 movq %mm0, %mm4 /* duplicate tmt1 */
490 paddsw %mm6, %mm0 /* V137 */
491 psubsw %mm6, %mm4 /* V138 ; mm6 free */
492 psllw $2, %mm3 /* t290 */
493 psubsw %mm5, %mm3 /* V139 */
494 movq %mm0, %mm6 /* duplicate V137 */
495 paddsw %mm5, %mm0 /* V140 */
496 movq %mm4, %mm2 /* duplicate V138 */
497 paddsw %mm3, %mm2 /* V141 */
498 psubsw %mm3, %mm4 /* V142 ; mm3 free */
499 movq %mm0, 8*9(%esi) /* V140 */
500 psubsw %mm5, %mm6 /* V143 ; mm5 free */
501 /* moved from next block */
502 movq 8*11(%esi), %mm0 /* tmt11 */
503 movq %mm2, 8*13(%esi) /* V141 */
504 /* moved from next block */
505 movq %mm0, %mm2 /* duplicate tmt11 */
506 /* column 1: odd part (after transpose) */
507 /* moved up to the prev block
508 * movq 8*11(%esi), %mm0 tmt11
509 * movq %mm0, %mm2 duplicate tmt11
511 movq 8*15(%esi), %mm5 /* tmt15 */
512 psubsw %mm7, %mm0 /* V144 */
513 movq %mm0, %mm3 /* duplicate V144 */
514 paddsw %mm7, %mm2 /* V147 ; free mm7 */
515 pmulhw x539f539f539f539f, %mm0 /* 21407-> V151 */
516 movq %mm1, %mm7 /* duplicate tmt3 */
517 paddsw %mm5, %mm7 /* V145 */
518 psubsw %mm5, %mm1 /* V146 ; free mm5 */
519 psubsw %mm1, %mm3 /* V150 */
520 movq %mm7, %mm5 /* duplicate V145 */
521 pmulhw x4546454645464546, %mm1 /* 17734-> V153 */
522 psubsw %mm2, %mm5 /* V148 */
523 pmulhw x61f861f861f861f8, %mm3 /* 25080-> V154 */
524 psllw $2, %mm0 /* t311 */
525 pmulhw x5a825a825a825a82, %mm5 /* 23170-> V152 */
526 paddsw %mm2, %mm7 /* V149 ; free mm2 */
527 psllw $1, %mm1 /* t313 */
528 nop /* without the nop - freeze here for one clock */
529 movq %mm3, %mm2 /* duplicate V154 */
530 psubsw %mm0, %mm3 /* V155 ; free mm0 */
531 psubsw %mm2, %mm1 /* V156 ; free mm2 */
532 /* moved from the next block */
533 movq %mm6, %mm2 /* duplicate V143 */
534 /* moved from the next block */
535 movq 8*13(%esi), %mm0 /* V141 */
536 psllw $1, %mm1 /* t315 */
537 psubsw %mm7, %mm1 /* V157 (keep V149) */
538 psllw $2, %mm5 /* t317 */
539 psubsw %mm1, %mm5 /* V158 */
540 psllw $1, %mm3 /* t319 */
541 paddsw %mm5, %mm3 /* V159 */
/* ---- column 1: final butterfly producing out1..out15 (odd rows);
 * out1/3/5/7 are parked in the scratch1/3/5/7 spill slots because all
 * eight MMX registers are live, then the M2/M1 quadrant transpose for
 * column 0's second pass begins. ---- */
542 /* column 1: output butterfly (after transform)
543 * moved to the prev block
544 * movq %mm6, %mm2 duplicate V143
545 * movq 8*13(%esi), %mm0 V141
547 psubsw %mm3, %mm2 /* V163 */
548 paddsw %mm3, %mm6 /* V164 ; free mm3 */
549 movq %mm4, %mm3 /* duplicate V142 */
550 psubsw %mm5, %mm4 /* V165 ; free mm5 */
551 movq %mm2, scratch7 /* out7 */
554 paddsw %mm5, %mm3 /* V162 */
555 movq 8*9(%esi), %mm2 /* V140 */
556 movq %mm0, %mm5 /* duplicate V141 */
557 /* in order not to percolate this line up,
558 * we read 72(%esi) very near to this location
560 movq %mm6, 8*9(%esi) /* out9 */
561 paddsw %mm1, %mm0 /* V161 */
562 movq %mm3, scratch5 /* out5 */
563 psubsw %mm1, %mm5 /* V166 ; free mm1 */
564 movq %mm4, 8*11(%esi) /* out11 */
566 movq %mm0, scratch3 /* out3 */
567 movq %mm2, %mm4 /* duplicate V140 */
568 movq %mm5, 8*13(%esi) /* out13 */
569 paddsw %mm7, %mm2 /* V160 */
570 /* moved from the next block */
572 psubsw %mm7, %mm4 /* V167 ; free mm7 */
573 /* moved from the next block */
576 movq %mm2, scratch1 /* out1 */
577 /* moved from the next block */
579 movq %mm4, 8*15(%esi) /* out15 */
580 /* moved from the next block */
582 /* transpose - M2 parts
583 * moved up to the prev block
585 * movq 8*3(%esi), %mm7
587 * punpcklwd %mm7, %mm0
593 /* shuffle the data and write the lower parts of the transposed in 4 dwords */
594 movd %mm0, 8*8(%esi) /* LS part of tmt8 */
596 movd %mm1, 8*12(%esi) /* LS part of tmt12 */
598 movd %mm5, 8*8+4(%esi) /* MS part of tmt8 */
599 punpckhdq %mm5, %mm0 /* tmt10 */
600 movd %mm3, 8*12+4(%esi) /* MS part of tmt12 */
601 punpckhdq %mm3, %mm1 /* tmt14 */
602 /* transpose - M1 parts */
609 punpckhwd %mm2, %mm6 /* free mm2 */
612 punpckhwd %mm4, %mm3 /* free mm4 */
615 punpckldq %mm5, %mm7 /* tmt0 */
616 punpckhdq %mm5, %mm2 /* tmt2 ; free mm5 */
617 /* shuffle the rest of the data, and write it with 2 mmword writes */
618 punpckldq %mm3, %mm6 /* tmt4 */
619 /* moved from next block */
620 movq %mm2, %mm5 /* duplicate tmt2 */
621 punpckhdq %mm3, %mm4 /* tmt6 ; free mm3 */
622 /* moved from next block */
623 movq %mm0, %mm3 /* duplicate tmt10 */
/* ---- second (vertical) pass, column 0: odd part on tmt2/6/10/14,
 * even part on tmt0/4/8/12, final butterfly writing out0..out14 (even
 * rows) in place, then the out1/3/5/7 rows are restored from the
 * scratch slots into their proper positions. ---- */
624 /* column 0: odd part (after transpose)
625 *moved up to prev block
626 * movq %mm0, %mm3 duplicate tmt10
627 * movq %mm2, %mm5 duplicate tmt2
629 psubsw %mm4, %mm0 /* V110 */
630 paddsw %mm4, %mm3 /* V113 ; free mm4 */
631 movq %mm0, %mm4 /* duplicate V110 */
632 paddsw %mm1, %mm2 /* V111 */
633 pmulhw x539f539f539f539f, %mm0 /* 21407-> V117 */
634 psubsw %mm1, %mm5 /* V112 ; free mm1 */
635 psubsw %mm5, %mm4 /* V116 */
636 movq %mm2, %mm1 /* duplicate V111 */
637 pmulhw x4546454645464546, %mm5 /* 17734-> V119 */
638 psubsw %mm3, %mm2 /* V114 */
639 pmulhw x61f861f861f861f8, %mm4 /* 25080-> V120 */
640 paddsw %mm3, %mm1 /* V115 ; free mm3 */
641 pmulhw x5a825a825a825a82, %mm2 /* 23170-> V118 */
642 psllw $2, %mm0 /* t266 */
643 movq %mm1, (%esi) /* save V115 */
644 psllw $1, %mm5 /* t268 */
645 psubsw %mm4, %mm5 /* V122 */
646 psubsw %mm0, %mm4 /* V121 ; free mm0 */
647 psllw $1, %mm5 /* t270 */
648 psubsw %mm1, %mm5 /* V123 ; free mm1 */
649 psllw $2, %mm2 /* t272 */
650 psubsw %mm5, %mm2 /* V124 (keep V123) */
651 psllw $1, %mm4 /* t274 */
652 movq %mm5, 8*2(%esi) /* save V123 ; free mm5 */
653 paddsw %mm2, %mm4 /* V125 (keep V124) */
654 /* column 0: even part (after transpose) */
655 movq 8*12(%esi), %mm0 /* tmt12 */
656 movq %mm6, %mm3 /* duplicate tmt4 */
657 psubsw %mm0, %mm6 /* V100 */
658 paddsw %mm0, %mm3 /* V101 ; free mm0 */
659 pmulhw x5a825a825a825a82, %mm6 /* 23170 ->V102 */
660 movq %mm7, %mm5 /* duplicate tmt0 */
661 movq 8*8(%esi), %mm1 /* tmt8 */
662 paddsw %mm1, %mm7 /* V103 */
663 psubsw %mm1, %mm5 /* V104 ; free mm1 */
664 movq %mm7, %mm0 /* duplicate V103 */
665 psllw $2, %mm6 /* t245 */
666 paddsw %mm3, %mm7 /* V106 */
667 movq %mm5, %mm1 /* duplicate V104 */
668 psubsw %mm3, %mm6 /* V105 */
669 psubsw %mm3, %mm0 /* V109; free mm3 */
670 paddsw %mm6, %mm5 /* V107 */
671 psubsw %mm6, %mm1 /* V108 ; free mm6 */
672 /* column 0: output butterfly (after transform) */
673 movq %mm1, %mm3 /* duplicate V108 */
674 paddsw %mm2, %mm1 /* out4 */
676 psubsw %mm2, %mm3 /* out10 ; free mm2 */
678 movq %mm0, %mm6 /* duplicate V109 */
679 movq %mm1, 8*4(%esi) /* out4 ; free mm1 */
680 psubsw %mm4, %mm0 /* out6 */
681 movq %mm3, 8*10(%esi) /* out10 ; free mm3 */
683 paddsw %mm4, %mm6 /* out8 ; free mm4 */
684 movq %mm7, %mm1 /* duplicate V106 */
685 movq %mm0, 8*6(%esi) /* out6 ; free mm0 */
687 movq (%esi), %mm4 /* V115 */
688 movq %mm6, 8*8(%esi) /* out8 ; free mm6 */
689 movq %mm5, %mm2 /* duplicate V107 */
690 movq 8*2(%esi), %mm3 /* V123 */
691 paddsw %mm4, %mm7 /* out0 */
692 /* moved up from next block */
695 /* moved up from next block */
697 psubsw %mm4, %mm1 /* out14 ; free mm4 */
698 paddsw %mm3, %mm5 /* out2 */
700 movq %mm7, (%esi) /* out0 ; free mm7 */
702 movq %mm1, 8*14(%esi) /* out14 ; free mm1 */
703 psubsw %mm3, %mm2 /* out12 ; free mm3 */
704 movq %mm5, 8*2(%esi) /* out2 ; free mm5 */
706 /* moved up to the prev block */
708 /* moved up to the prev block */
710 movq %mm2, 8*12(%esi) /* out12 ; free mm2 */
711 /* moved up to the prev block */
713 /* move back the data to its correct place
714 * moved up to the prev block
715 * movq scratch3, %mm0
716 * movq scratch5, %mm6
717 * movq scratch7, %mm4
723 movq %mm0, 8*3(%esi) /* out3 */
725 movq %mm6, 8*5(%esi) /* out5 */
726 movq %mm4, 8*7(%esi) /* out7 */
727 movq %mm1, 8(%esi) /* out1 */
/* ---- final full-matrix transpose, done 4x4 at a time: .L1 handles a
 * diagonal 4x4 block in place, .L2 swaps an off-diagonal pair between
 * (%esi) and (%edi).  %ebx = row stride in elements-of-2-bytes terms
 * ("x_size"), %ecx is advanced to reach row+3, %eax/%edx are the
 * inner/outer loop counters.
 * NOTE(review): the conditional branches that close the .L1/.L2 loops
 * and the loop-exit label are not visible in this copy of the file. ---- */
728 /* transpose matrix */
729 movl $8, %ebx /* ebx is x_size */
730 movl %esi, %edi /* pointer to the matrix */
735 subl $4, %eax /* eax is inner loop variable */
736 addl %ebx, %ecx /* ecx is 6*row size */
737 movl %eax, %edx /* edx is the outer loop variable */
738 .L1: movq (%esi), %mm0 /* first line */
739 movq (%esi,%ebx,4), %mm2 /* third line */
740 movq %mm0, %mm6 /* copy first line */
741 punpcklwd (%esi,%ebx,2), %mm0 /* interleave first and second lines */
742 movq %mm2, %mm7 /* copy third line */
743 punpcklwd (%esi,%ecx), %mm2 /* interleave third and fourth lines */
744 movq %mm0, %mm4 /* copy first intermediate result */
745 movq (%esi,%ebx,2), %mm1 /* second line */
746 /* the next line 'punpcklwd %mm2, %mm0' inverted two pixels. */
747 /* punpckldq make printing cleaner */
748 punpckldq %mm2, %mm0 /* interleave to produce result 1 */
749 movq (%esi,%ecx), %mm3 /* fourth line */
750 punpckhdq %mm2, %mm4 /* interleave to produce result 2 */
751 movq %mm0, (%esi) /* write result 1 */
752 punpckhwd %mm1, %mm6 /* interleave first and second lines */
753 movq %mm4, (%esi,%ebx,2) /* write result 2 */
754 punpckhwd %mm3, %mm7 /* interleave 3rd and 4th lines */
755 movq %mm6, %mm5 /* copy first intermediate result */
756 punpckldq %mm7, %mm6 /* interleave to produce result 3 */
757 leal (%edi,%ebx,8), %edi /* point to 4x4 set 4 rows down */
758 punpckhdq %mm7, %mm5 /* interleave to produce result 4 */
759 movq %mm6, (%esi,%ebx,4) /* write result 3 */
760 movq %mm5, (%esi,%ecx) /* write result 4 */
761 /* check to see if number of rows left is zero */
763 /* last time through you are done and ready to exit */
765 .L2: movq 8(%esi), %mm0 /* first line */
766 movq 8(%esi,%ebx,4), %mm2 /* third line */
767 movq %mm0, %mm6 /* copy first line */
768 punpcklwd 8(%esi,%ebx,2), %mm0 /* interleave first and second lines */
769 movq %mm2, %mm7 /* copy third line */
770 punpcklwd 8(%esi,%ecx), %mm2 /* interleave 3rd and 4th lines */
771 movq %mm0, %mm4 /* copy first intermediate */
772 movq (%edi), %mm1 /* first line */
773 punpckldq %mm2, %mm0 /* interleave to produce 1st result */
774 movq (%edi,%ebx,4), %mm3 /* third line */
775 punpckhdq %mm2, %mm4 /* interleave to produce 2nd result */
776 punpckhwd 8(%esi,%ebx,2), %mm6 /* interleave 1st and 2nd lines */
777 movq %mm1, %mm2 /* copy first line */
778 punpckhwd 8(%esi,%ecx), %mm7 /* interleave 3rd and 4th lines */
779 movq %mm6, %mm5 /* copy first intermediate */
780 movq %mm0, (%edi) /* write result 1 */
781 punpckhdq %mm7, %mm5 /* produce third result */
782 punpcklwd (%edi,%ebx,2), %mm1 /* interleave 1st and 2nd lines */
783 movq %mm3, %mm0 /* copy third line */
784 punpckhwd (%edi,%ebx,2), %mm2 /* interleave 1st and 2nd lines */
785 movq %mm4, (%edi,%ebx,2) /* write result 2 */
786 punpckldq %mm7, %mm6 /* produce fourth result */
787 punpcklwd (%edi,%ecx), %mm3 /* interleave 3rd and 4th lines */
788 movq %mm1, %mm4 /* copy first intermediate */
789 movq %mm6, (%edi,%ebx,4) /* write result 3 */
791 punpckhwd (%edi,%ecx), %mm0 /* interleave 3rd and 4th lines */
792 movq %mm2, %mm6 /* copy second intermediate */
793 movq %mm5, (%edi,%ecx) /* write result 4 */
794 punpckhdq %mm3, %mm4 /* produce second result */
795 movq %mm1, 8(%esi) /* write result 5 */
796 punpckldq %mm0, %mm2 /* produce third result */
797 movq %mm4, 8(%esi,%ebx,2) /* write result 6 */
798 punpckhdq %mm0, %mm6 /* produce fourth result */
799 movq %mm2, 8(%esi,%ebx,4) /* write result 7 */
800 movq %mm6, 8(%esi,%ecx) /* write result 8 */
801 /* increment %esi to point to next 4x4 block in same row */
803 /* increment %edi to point to nxt 4x4 block below current */
804 leal (%edi,%ebx,8), %edi
805 sub $4, %eax /* decrement inner loop var */
807 /* %edi points to start of second row in block just finished */
809 leal 8(%esi,%ebx,8), %esi
811 /* subtract the number of bytes in last row */
812 /* now we point to spot where row=col */
813 subl $8, %edx /* decrement remaining rows; NOTE(review): old comment said "sub 4" but the immediate is 8 — verify against the original */
817 /* reset x_size to outer loop variable to start new row */
829 .size vdec_IDCT,.Lfe1-vdec_IDCT