1 /*****************************************************************************
2 * video_yuv_mmx.S: YUV transformation, optimized for MMX processors
3 *****************************************************************************
4 * Copyright (C) 1999, 2000 VideoLAN
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
21 *****************************************************************************/
23 /*****************************************************************************
24 * The following function is defined: ConvertYUV420RGB16MMX
26 * This function performs YUV12-to-RGB16 color conversion for H26x.
27 * It handles any format in which there are three fields, the low
28 * order field being B and fully contained in the low order byte, the
29 * second field being G and being somewhere in bits 4 through 11,
30 * and the high order field being R and fully contained in the high order byte.
33 * The YUV12 input is planar, 8 bits per pel. The Y plane may have
34 * a pitch of up to 768. It may have a width less than or equal
35 * to the pitch. It must be DWORD aligned, and preferably QWORD
36 * aligned. Pitch and Width must be a multiple of four. For best
37 * performance, Pitch should not be 4 more than a multiple of 32.
38 * Height may be any amount, but must be a multiple of two. The U
39 * and V planes may have a different pitch than the Y plane, subject
40 * to the same limitations.
41 *****************************************************************************/
/*
 * Packed 64-bit constants used as MMX memory operands by the converter.
 * Each entry is 8 bytes (two .long values or one .quad).
 * Only Minusg, Yadd and Ymul are referenced by the code visible in this
 * part of the file; notes on the other entries are assumptions to confirm.
 */
55 Minusg: .long 0x00800080, 0x00800080 /* 4 words of 128; psubw'ed from the unpacked U and V words */
56 Yadd: .long 0x10101010, 0x10101010 /* 8 bytes of 16; psubusb'ed (unsigned saturating) from 8 Y bytes */
57 VtR: .long 0x00660066, 0x00660066 /* 4 words of 102; NOTE(review): presumably V-to-R coefficient - confirm */
58 VtG: .long 0x00340034, 0x00340034 /* 4 words of 52; NOTE(review): presumably V-to-G coefficient - confirm */
59 UtG: .long 0x00190019, 0x00190019 /* 4 words of 25; NOTE(review): presumably U-to-G coefficient - confirm */
60 UtB: .long 0x00810081, 0x00810081 /* 4 words of 129; NOTE(review): presumably U-to-B coefficient - confirm */
61 Ymul: .long 0x004a004a, 0x004a004a /* 4 words of 74; pmullw multiplier applied to the Y-16 words */
62 UVtG: .long 0x00340019, 0x00340019 /* words 25,52,25,52 (low word first); NOTE(review): presumably paired U/V-to-G coefficients - confirm */
63 VtRUtB: .long 0x01990205, 0x01990205 /* words 517,409,517,409 (low word first); NOTE(review): not referenced in view - confirm use */
/*
 * Byte masks with the top 4, 3 and 2 bits set in every byte.
 * NOTE(review): not referenced in the visible code; names and values suggest
 * they are the values stored into R/G/BUpperLimit for the
 * paddusb/psubusb clamp - confirm.
 */
64 fourbitu: .quad 0xf0f0f0f0f0f0f0f0
65 fivebitu: .quad 0xe0e0e0e0e0e0e0e0
66 sixbitu: .quad 0xc0c0c0c0c0c0c0c0
/*
 * Stack frame layout for ConvertYUV420RGB16MMX.
 * All symbols below are byte offsets used as displacements from %esp
 * after the function has executed "subl $LocalFrameSize,%esp".
 */
70 #define LocalFrameSize 156 /* bytes of locals; highest slot is BUpperLimit (148) + 8 */
71 #define RegisterStorageSize 16 /* NOTE(review): presumably four saved 32-bit registers; the pushes are not visible here - confirm */
73 //#define DOUBLE /* doubles the number of columns */
/*
 * Arguments: one 4-byte slot each, located above the local frame and the
 * register save area.
 * NOTE(review): the initial +4 presumably skips the return address - confirm.
 */
76 #define YPlane LocalFrameSize + RegisterStorageSize + 4
77 #define UPlane LocalFrameSize + RegisterStorageSize + 8
78 #define VPlane LocalFrameSize + RegisterStorageSize + 12
79 #define FrameWidth LocalFrameSize + RegisterStorageSize + 16
80 #define FrameHeight LocalFrameSize + RegisterStorageSize + 20
81 #define YPitch LocalFrameSize + RegisterStorageSize + 24
82 #define ChromaPitch LocalFrameSize + RegisterStorageSize + 28
83 #define AspectAdjustmentCount LocalFrameSize + RegisterStorageSize + 32
84 #define ColorConvertedFrame LocalFrameSize + RegisterStorageSize + 36
85 #define DCIOffset LocalFrameSize + RegisterStorageSize + 40
86 #define CCOffsetToLine0 LocalFrameSize + RegisterStorageSize + 44
87 #define CCOPitch LocalFrameSize + RegisterStorageSize + 48
88 #define CCType LocalFrameSize + RegisterStorageSize + 52
89 #define EndOfArgList LocalFrameSize + RegisterStorageSize + 56
91 /* Locals (on local stack frame) */
/*
 * 4-byte scalar slots.
 * NOTE(review): the code also uses CCOCursor, YCursor and temp_mmx, whose
 * definitions are not visible here; offsets 0, 12 and 44..91 are left
 * unassigned by the definitions below - confirm where those slots live.
 */
93 #define CCOSkipDistance 4
94 #define ChromaLineLen 8
96 #define DistanceFromVToU 16
97 #define EndOfChromaLine 20
98 #define AspectCount 24
99 #define AspectBaseCount 28
100 #define tmpYCursorEven 32
101 #define tmpYCursorOdd 36
102 #define tmpCCOPitch 40
/*
 * 8-byte slots used as 64-bit MMX memory operands.
 * The shift counts are written as two 32-bit halves (offset and offset+4)
 * and read by psllq/psllw/psraw; the upper limits are written with movq
 * and read by the paddusb/psubusb pairs.
 */
104 #define RLeftShift 92
105 #define GLeftShift 100
106 #define RRightShift 108
107 #define GRightShift 116
108 #define BRightShift 124
109 #define RUpperLimit 132
110 #define GUpperLimit 140
111 #define BUpperLimit 148
114 * extern void C ConvertYUV420RGB16MMX (
122 * UN AspectAdjustmentCount,
123 * U8* ColorConvertedFrame,
125 * U32 CCOffsetToLine0,
129 * The local variables are on the stack,
130 * The tables are in the one and only data segment.
132 * CCOffsetToLine0 is relative to ColorConvertedFrame.
133 * CCType used by RGB color convertors to determine the exact conversion type.
140 .globl ConvertYUV420RGB16MMX
141 ConvertYUV420RGB16MMX:
148 subl $LocalFrameSize,%esp
149 movl CCType(%esp),%eax
153 jmp *RGB_formats(,%eax,4)
157 movl $2,%ebx /* 10-8 for byte shift */
158 movl %ebx,RLeftShift(%esp)
159 movl %eax,RLeftShift+4(%esp)
161 movl %ebx,GLeftShift(%esp)
162 movl %eax,GLeftShift+4(%esp)
164 movl %ebx,RRightShift(%esp)
165 movl %eax,RRightShift+4(%esp)
166 movl %ebx,GRightShift(%esp)
167 movl %eax,GRightShift+4(%esp)
168 movl %ebx,BRightShift(%esp)
169 movl %eax,BRightShift+4(%esp)
171 movq %mm0,RUpperLimit(%esp)
172 movq %mm0,GUpperLimit(%esp)
173 movq %mm0,BUpperLimit(%esp)
178 movl $2,%ebx /* 8-6 */
179 movl %ebx,RLeftShift(%esp)
180 movl %eax,RLeftShift+4(%esp)
182 movl %ebx,GLeftShift(%esp)
183 movl %eax,GLeftShift+4(%esp)
185 movl %ebx,RRightShift(%esp)
186 movl %eax,RRightShift+4(%esp)
187 movl %ebx,GRightShift(%esp)
188 movl %eax,GRightShift+4(%esp)
190 movl %ebx,BRightShift(%esp)
191 movl %eax,BRightShift+4(%esp)
193 movq %mm0,RUpperLimit(%esp)
194 movq %mm0,GUpperLimit(%esp)
196 movq %mm0,BUpperLimit(%esp)
201 movl $2,%ebx /* 8-6 */
202 movl %ebx,RLeftShift(%esp)
203 movl %eax,RLeftShift+4(%esp)
205 movl %ebx,GLeftShift(%esp)
206 movl %eax,GLeftShift+4(%esp)
208 movl %ebx,RRightShift(%esp)
209 movl %eax,RRightShift+4(%esp)
211 movl %ebx,GRightShift(%esp)
212 movl %eax,GRightShift+4(%esp)
213 movl %ebx,BRightShift(%esp)
214 movl %eax,BRightShift+4(%esp)
216 movq %mm0,RUpperLimit(%esp)
218 movq %mm0,GUpperLimit(%esp)
219 movq %mm0,BUpperLimit(%esp)
224 movl $3,%ebx /* 8-5 */
225 movl %ebx,RLeftShift(%esp)
226 movl %eax,RLeftShift+4(%esp)
228 movl %ebx,GLeftShift(%esp)
229 movl %eax,GLeftShift+4(%esp)
231 movl %ebx,RRightShift(%esp)
232 movl %eax,RRightShift+4(%esp)
233 movl %ebx,BRightShift(%esp)
234 movl %eax,BRightShift+4(%esp)
236 movl %ebx,GRightShift(%esp)
237 movl %eax,GRightShift+4(%esp)
239 movq %mm0,RUpperLimit(%esp)
240 movq %mm0,BUpperLimit(%esp)
242 movq %mm0,GUpperLimit(%esp)
246 movl VPlane(%esp),%ebx
247 movl UPlane(%esp),%ecx
249 movl %ecx,DistanceFromVToU(%esp)
251 movl ColorConvertedFrame(%esp),%eax
252 addl DCIOffset(%esp),%eax
253 addl CCOffsetToLine0(%esp),%eax
254 movl %eax,CCOCursor(%esp)
257 movl YPitch(%esp),%ecx
258 movl FrameWidth(%esp),%ebx
259 movl CCOPitch(%esp),%eax
260 subl %ebx,%eax /* CCOPitch-FrameWidth */
261 subl %ebx,%eax /* CCOPitch-2*FrameWidth */
262 sarl %ebx /* FrameWidth/2 */
263 movl YPlane(%esp),%esi /* Fetch cursor over luma plane. */
264 movl %ebx,ChromaLineLen(%esp) /* FrameWidth/2 */
265 movl %eax,CCOSkipDistance(%esp) /* CCOPitch-2*FrameWidth */
266 movl %esi,YCursor(%esp)
267 movl AspectAdjustmentCount(%esp),%edx
268 movl VPlane(%esp),%esi
272 movl %edx,AspectCount(%esp)
273 movl %edx,AspectBaseCount(%esp)
276 movl ChromaLineLen(%esp),%edi
277 movl %edi,EndOfChromaLine(%esp)
278 movl CCOCursor(%esp),%edi
280 movl DistanceFromVToU(%esp),%edx
281 movl YCursor(%esp),%ebp /* Fetch Y cursor. */
282 movl FrameWidth(%esp),%ebx
285 movl %ebp,tmpYCursorEven(%esp)
286 movl YPitch(%esp),%eax
288 movl %ebp,tmpYCursorOdd(%esp)
294 movl %ebx,FrameWidth(%esp)
301 movl AspectCount(%esp),%ebp
302 movl FrameWidth(%esp),%ebx
304 movl CCOPitch(%esp),%eax
305 movl %eax,tmpCCOPitch(%esp)
309 addl AspectAdjustmentCount(%esp),%ebp
310 movl %eax,tmpCCOPitch(%esp)
312 movl %ebp,AspectCount(%esp)
315 movl tmpYCursorEven(%esp),%ebp
316 /* here is even line */
317 movd (%edx,%ebx,),%mm1 /* 4 u values */
318 pxor %mm0,%mm0 /* mm0=0 */
319 movd (%esi,%ebx,),%mm2 /* 4 v values */
320 punpcklbw %mm0,%mm1 /* get 4 unsign u */
321 psubw Minusg,%mm1 /* get 4 unsign u-128 */
322 punpcklbw %mm0,%mm2 /* get unsign v */
323 psubw Minusg,%mm2 /* get unsign v-128 */
324 movq %mm1,%mm3 /* save the u-128 unsign */
325 movq %mm1,%mm5 /* save u-128 unsign */
326 punpcklwd %mm2,%mm1 /* get 2 low u, v unsign pairs */
328 punpckhwd %mm2,%mm3 /* create high 2 unsign uv pairs */
330 movq %mm2,temp_mmx(%esp) /* save v-128 */
331 movq (%ebp,%ebx,2),%mm6 /* mm6 has 8 y pixels */
332 psubusb Yadd,%mm6 /* mm6 has 8 y-16 pixels */
333 packssdw %mm3,%mm1 /* packed the results to signed words */
334 movq %mm6,%mm7 /* save the 8 y-16 pixels */
335 punpcklbw %mm0,%mm6 /* mm6 has 4 low y-16 unsign */
337 punpckhbw %mm0,%mm7 /* mm7 has 4 high y-16 unsign */
340 movq %mm1,temp_mmx+8(%esp) /* save 4 chroma G values */
341 punpcklwd %mm1,%mm1 /* chroma G replicate low 2 */
342 movq %mm6,%mm0 /* low y */
343 punpckhwd %mm4,%mm4 /* chroma G replicate high 2 */
344 movq %mm7,%mm3 /* high y */
345 psubw %mm1,%mm6 /* 4 low G */
346 psraw GRightShift(%esp),%mm6
347 psubw %mm4,%mm7 /* 4 high G values in signed 16 bit */
349 punpcklwd %mm5,%mm5 /* replicate the 2 low u pixels */
352 psraw GRightShift(%esp),%mm7
354 packuswb %mm7,%mm6 /* mm6: G7 G6 G5 G4 G3 G2 G1 G0 */
355 movq %mm5,temp_mmx+16(%esp) /* low chroma B */
356 paddw %mm0,%mm5 /* 4 low B values in signed 16 bit */
357 movq %mm2,temp_mmx+40(%esp) /* high chroma B */
358 paddw %mm3,%mm2 /* 4 high B values in signed 16 bit */
359 psraw BRightShift(%esp),%mm5 /* low B scaled down by 6+(8-5) */
360 psraw BRightShift(%esp),%mm2 /* high B scaled down by 6+(8-5) */
361 packuswb %mm2,%mm5 /* mm5: B7 B6 B5 B4 B3 B2 B1 B0 */
363 movq temp_mmx(%esp),%mm2 /* 4 v values */
364 movq %mm5,%mm1 /* save B */
366 punpcklwd %mm2,%mm2 /* replicate the 2 low v pixels */
370 paddusb BUpperLimit(%esp),%mm1 /* mm1: saturate B+0FF-15 */
371 movq %mm2,temp_mmx+24(%esp) /* low chroma R */
372 paddw %mm0,%mm2 /* 4 low R values in signed 16 bit */
373 psraw RRightShift(%esp),%mm2 /* low R scaled down by 6+(8-5) */
374 pxor %mm4,%mm4 /* mm4=0 for 8->16 conversion */
375 movq %mm7,temp_mmx+32(%esp) /* high chroma R */
376 paddw %mm3,%mm7 /* 4 high R values in signed 16 bit */
377 psraw RRightShift(%esp),%mm7 /* high R scaled down by 6+(8-5) */
378 psubusb BUpperLimit(%esp),%mm1
379 packuswb %mm7,%mm2 /* mm2: R7 R6 R5 R4 R3 R2 R1 R0 */
380 paddusb GUpperLimit(%esp),%mm6 /* G fast patch ih */
381 psubusb GUpperLimit(%esp),%mm6 /* fast patch ih */
382 paddusb RUpperLimit(%esp),%mm2 /* R */
383 psubusb RUpperLimit(%esp),%mm2
386 * here we are packing from RGB24 to RGB16
388 * mm6: G7 G6 G5 G4 G3 G2 G1 G0
389 * mm1: B7 B6 B5 B4 B3 B2 B1 B0
390 * mm2: R7 R6 R5 R4 R3 R2 R1 R0
391 * assuming 8 original pixels in 0-H representation on mm6, mm1, mm2
392 * when H=2**xBITS-1 (x is for R G B)
394 * mm1- result: 4 low RGB16
395 * mm7- result: 4 high RGB16
396 * using: mm0- zero register
397 * mm3- temporary results
399 * for (i=0; i<8; i++) {
400 * RGB[i]=256*(R[i]<<(8-5))+(G[i]<<5)+B[i];
404 psllq RLeftShift(%esp),%mm2 /* position R in the most significant
406 movq %mm1,%mm7 /* mm1: Save B */
409 * note: no need for shift to place B on the least significant part of the byte
410 * R in left position, B in the right position so they can be combined
413 punpcklbw %mm2,%mm1 /* mm1: 4 low 16 bit RB */
414 pxor %mm0,%mm0 /* mm0: 0 */
415 punpckhbw %mm2,%mm7 /* mm7: 4 high 16 bit RB */
416 movq %mm6,%mm3 /* mm3: G */
417 punpcklbw %mm0,%mm6 /* mm6: low 4 G 16 bit */
418 psllw GLeftShift(%esp),%mm6 /* shift low G 5 positions */
419 punpckhbw %mm0,%mm3 /* mm3: high 4 G 16 bit */
420 por %mm6,%mm1 /* mm1: low RBG16 */
421 psllw GLeftShift(%esp),%mm3 /* shift high G 5 positions */
422 por %mm3,%mm7 /* mm7: high RBG16 */
424 movl tmpYCursorOdd(%esp),%ebp /* moved to here to save cycles
426 movq %mm1,(%edi) /* !! aligned */
428 /*- start odd line */
429 movq (%ebp,%ebx,2),%mm1 /* mm1 has 8 y pixels */
431 psubusb Yadd,%mm1 /* mm1 has 8 pixels y-16 */
433 punpcklbw %mm2,%mm1 /* get 4 low y-16 unsign pixels word */
434 pmullw Ymul,%mm1 /* low 4 luminance contribution */
435 punpckhbw %mm2,%mm5 /* 4 high y-16 */
436 pmullw Ymul,%mm5 /* high 4 luminance contribution */
437 movq %mm7,8(%edi) /* !! aligned */
439 paddw temp_mmx+24(%esp),%mm0 /* low 4 R */
441 psraw RRightShift(%esp),%mm0 /* low R scaled down by 6+(8-5) */
442 paddw temp_mmx+32(%esp),%mm5 /* high 4 R */
444 psraw RRightShift(%esp),%mm5 /* high R scaled down by 6+(8-5) */
445 paddw temp_mmx+16(%esp),%mm2 /* low 4 B */
446 packuswb %mm5,%mm0 /* mm0: R7 R6 R5 R4 R3 R2 R1 R0 */
447 psraw BRightShift(%esp),%mm2 /* low B scaled down by 6+(8-5) */
449 paddw temp_mmx+40(%esp),%mm6 /* high 4 B */
450 psraw BRightShift(%esp),%mm6 /* high B scaled down by 6+(8-5) */
451 movq temp_mmx+8(%esp),%mm3 /* chroma G low 4 */
452 packuswb %mm6,%mm2 /* mm2: B7 B6 B5 B4 B3 B2 B1 B0 */
454 punpcklwd %mm3,%mm3 /* replicate low 2 */
455 punpckhwd %mm4,%mm4 /* replicate high 2 */
456 psubw %mm3,%mm1 /* 4 low G */
457 psraw GRightShift(%esp),%mm1 /* low G scaled down by 6+(8-5) */
458 psubw %mm4,%mm5 /* 4 high G values in signed 16 bit */
459 psraw GRightShift(%esp),%mm5 /* high G scaled down by 6+(8-5) */
460 paddusb BUpperLimit(%esp),%mm2 /* mm2: saturate B+0FF-15 */
461 packuswb %mm5,%mm1 /*mm1: G7 G6 G5 G4 G3 G2 G1 G0 */
462 psubusb BUpperLimit(%esp),%mm2
463 paddusb GUpperLimit(%esp),%mm1 /* G */
464 psubusb GUpperLimit(%esp),%mm1
465 paddusb RUpperLimit(%esp),%mm0 /* R */
466 movl tmpCCOPitch(%esp),%eax
467 psubusb RUpperLimit(%esp),%mm0
470 * here we are packing from RGB24 to RGB16
471 * mm1: G7 G6 G5 G4 G3 G2 G1 G0
472 * mm2: B7 B6 B5 B4 B3 B2 B1 B0
473 * mm0: R7 R6 R5 R4 R3 R2 R1 R0
475 * mm2- result: 4 low RGB16
476 * mm7- result: 4 high RGB16
477 * using: mm4- zero register
478 * mm3- temporary results
481 psllq RLeftShift(%esp),%mm0 /* position R in the most significant
483 movq %mm2,%mm7 /* mm7: Save B */
486 * note: no need for shift to place B on the least significant part of the byte
487 * R in left position, B in the right position so they can be combined
490 punpcklbw %mm0,%mm2 /* mm2: 4 low 16 bit RB */
491 pxor %mm4,%mm4 /* mm4: 0 */
492 movq %mm1,%mm3 /* mm3: G */
493 punpckhbw %mm0,%mm7 /* mm7: 4 high 16 bit RB */
494 punpcklbw %mm4,%mm1 /* mm1: low 4 G 16 bit */
495 punpckhbw %mm4,%mm3 /* mm3: high 4 G 16 bit */
496 psllw GLeftShift(%esp),%mm1 /* shift low G 5 positions */
497 por %mm1,%mm2 /* mm2: low RBG16 */
498 psllw GLeftShift(%esp),%mm3 /* shift high G 5 positions */
499 por %mm3,%mm7 /* mm7: high RBG16 */
509 movq %mm0,(%edi,%eax,)
510 movq %mm1,8(%edi,%eax,)
511 movq %mm3,16(%edi,%eax,)
512 movq %mm5,24(%edi,%eax,)
517 movq %mm2,(%edi,%eax,)
518 movq %mm7,8(%edi,%eax,) /* aligned */
519 addl $16,%edi /* ih take 16 bytes (8 pixels-16 bit) */
520 addl $4,%ebx /* ? to take 4 pixels together
525 addl CCOSkipDistance(%esp),%edi /* go to begin of next line */
526 addl tmpCCOPitch(%esp),%edi /* skip odd line (if it is needed) */
528 // Lebp CCOPitch ; skip odd line
533 // Addeax AspectBaseCount
540 movl YPitch(%esp),%eax
541 movl tmpYCursorOdd(%esp),%ebp
542 addl %eax,%ebp /* skip one line */
543 // lea ebp, [ebp+2*eax] /* skip two lines */
544 movl %ebp,tmpYCursorEven(%esp)
545 // Sebp tmpYCursorOdd
547 addl %eax,%ebp /* skip one line */
548 movl %ebp,tmpYCursorOdd(%esp)
549 // Lebp tmpYCursorEven
550 // lea ebp, [ebp+2*eax]
551 // Sebp tmpYCursorEven
554 addl ChromaPitch(%esp),%esi
555 addl ChromaPitch(%esp),%edx
558 // Leax YLimit /* Done with last line? */
560 // jbe PrepareChromaLine
561 subw $2,FrameHeight(%esp)
564 /******************************************************************************/
568 addl $LocalFrameSize,%esp