1 /*****************************************************************************
2 * video_yuv_mmx.S: YUV transformation, optimized for MMX processors
3 *****************************************************************************
4 * Copyright (C) 1999, 2000 VideoLAN
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 *****************************************************************************/
24 /*****************************************************************************
25 * Following functions are defined:
27 * This function performs YUV12-to-RGB16 color conversion for H26x.
28 * It handles any format in which there are three fields, the low
29 * order field being B and fully contained in the low order byte, the
30 * second field being G and being somewhere in bits 4 through 11,
31 * and the high order field being R and fully contained in the high order byte.
34 * The YUV12 input is planar, 8 bits per pel. The Y plane may have
35 * a pitch of up to 768. It may have a width less than or equal
36 * to the pitch. It must be DWORD aligned, and preferably QWORD
37 * aligned. Pitch and Width must be a multiple of four. For best
38 * performance, Pitch should not be 4 more than a multiple of 32.
39 * Height may be any amount, but must be a multiple of two. The U
40 * and V planes may have a different pitch than the Y plane, subject
41 * to the same limitations.
42 *****************************************************************************/
/*
 * 64-bit MMX constants (two .long values or one .quad each).
 * Minusg, Yadd and Ymul are referenced by the conversion code visible in
 * this file; the other entries are not referenced by any line visible in
 * this excerpt, so their descriptions below are hedged.
 */
/* 0x0080 (128) in each 16-bit lane: subtracted from the unpacked U and V words. */
56 Minusg: .long 0x00800080, 0x00800080
/* 0x10 (16) in each byte: subtracted from the Y bytes with unsigned saturation. */
57 Yadd: .long 0x10101010, 0x10101010
/* 0x0066 (102) per word; presumably the V-to-R coefficient -- TODO confirm. */
58 VtR: .long 0x00660066, 0x00660066
/* 0x0034 (52) per word; presumably the V-to-G coefficient -- TODO confirm. */
59 VtG: .long 0x00340034, 0x00340034
/* 0x0019 (25) per word; presumably the U-to-G coefficient -- TODO confirm. */
60 UtG: .long 0x00190019, 0x00190019
/* 0x0081 (129) per word; presumably the U-to-B coefficient -- TODO confirm. */
61 UtB: .long 0x00810081, 0x00810081
/* 0x004a (74) per word: multiplied (pmullw) with the unpacked y-16 words. */
62 Ymul: .long 0x004a004a, 0x004a004a
/* Words alternate 0x0019, 0x0034 (the UtG and VtG values interleaved). */
63 UVtG: .long 0x00340019, 0x00340019
/* Words alternate 0x0205 (517), 0x0199 (409); use not visible here. */
64 VtRUtB: .long 0x01990205, 0x01990205
/*
 * Byte masks 0xf0 / 0xe0 / 0xc0.  Presumably the values loaded into %mm0
 * before it is stored to the R/G/B UpperLimit slots (the loads are not
 * visible here); with the paddusb/psubusb pair used by the conversion code
 * such a limit clamps a byte to 0xFF-limit, i.e. 0x0f, 0x1f or 0x3f.
 */
65 fourbitu: .quad 0xf0f0f0f0f0f0f0f0
66 fivebitu: .quad 0xe0e0e0e0e0e0e0e0
67 sixbitu: .quad 0xc0c0c0c0c0c0c0c0
/*
 * Stack frame layout.  Every offset below is used as OFFSET(%esp) after the
 * function has executed "subl $LocalFrameSize,%esp".
 */
/* Bytes reserved for locals (the highest local slot is at 148 and is 8 bytes wide). */
71 #define LocalFrameSize 156
/*
 * Bytes between the locals and the return address; presumably four saved
 * 32-bit registers (the pushes are not visible in this excerpt).
 */
72 #define RegisterStorageSize 16
74 //#define DOUBLE /* doubles the number of columns */
/*
 * Incoming arguments, one 4-byte slot each.  The first slot sits 4 bytes
 * above the saved registers, presumably to skip the return address.
 */
77 #define YPlane LocalFrameSize + RegisterStorageSize + 4
78 #define UPlane LocalFrameSize + RegisterStorageSize + 8
79 #define VPlane LocalFrameSize + RegisterStorageSize + 12
80 #define FrameWidth LocalFrameSize + RegisterStorageSize + 16
81 #define FrameHeight LocalFrameSize + RegisterStorageSize + 20
82 #define YPitch LocalFrameSize + RegisterStorageSize + 24
83 #define ChromaPitch LocalFrameSize + RegisterStorageSize + 28
84 #define AspectAdjustmentCount LocalFrameSize + RegisterStorageSize + 32
85 #define ColorConvertedFrame LocalFrameSize + RegisterStorageSize + 36
86 #define DCIOffset LocalFrameSize + RegisterStorageSize + 40
87 #define CCOffsetToLine0 LocalFrameSize + RegisterStorageSize + 44
88 #define CCOPitch LocalFrameSize + RegisterStorageSize + 48
89 #define CCType LocalFrameSize + RegisterStorageSize + 52
/* Offset just past the last argument slot. */
90 #define EndOfArgList LocalFrameSize + RegisterStorageSize + 56
92 /* Locals (on local stack frame) */
/*
 * 32-bit scalar locals, addressed as OFFSET(%esp).
 * NOTE(review): the code also uses CCOCursor, YCursor and temp_mmx, whose
 * definitions are not among the lines visible in this excerpt.
 */
94 #define CCOSkipDistance 4
95 #define ChromaLineLen 8
97 #define DistanceFromVToU 16
98 #define EndOfChromaLine 20
99 #define AspectCount 24
100 #define AspectBaseCount 28
101 #define tmpYCursorEven 32
102 #define tmpYCursorOdd 36
103 #define tmpCCOPitch 40
/*
 * 64-bit slots (offsets step by 8).  The shift slots are written as two
 * dwords (OFFSET and OFFSET+4) and read as MMX shift-count operands; the
 * UpperLimit slots are written with movq and read by paddusb/psubusb.
 */
105 #define RLeftShift 92
106 #define GLeftShift 100
107 #define RRightShift 108
108 #define GRightShift 116
109 #define BRightShift 124
110 #define RUpperLimit 132
111 #define GUpperLimit 140
112 #define BUpperLimit 148
115 * extern void C ConvertYUV420RGB16MMX (
123 * UN AspectAdjustmentCount,
124 * U8* ColorConvertedFrame,
126 * U32 CCOffsetToLine0,
130 * The local variables are on the stack,
131 * The tables are in the one and only data segment.
133 * CCOffsetToLine0 is relative to ColorConvertedFrame.
134 * CCType is used by the RGB color converters to determine the exact conversion type.
/*
 * ConvertYUV420RGB16MMX: exported entry point.
 * The visible code reserves LocalFrameSize bytes of locals, loads the CCType
 * argument into %eax and jumps indirectly through the RGB_formats table
 * (4-byte entries indexed by %eax).
 * NOTE(review): the embedded original numbering jumps from 142 to 149 and
 * from 150 to 154, so some instructions (presumably the register saves
 * implied by RegisterStorageSize, and anything that adjusts %eax before the
 * jump) are missing from this excerpt; the RGB_formats table is not visible
 * either.
 */
141 .globl ConvertYUV420RGB16MMX
142 ConvertYUV420RGB16MMX:
/* Reserve the local frame; all OFFSET(%esp) names assume this adjustment. */
149 subl $LocalFrameSize,%esp
150 movl CCType(%esp),%eax
/* Dispatch on the conversion type: target address = RGB_formats[%eax]. */
154 jmp *RGB_formats(,%eax,4)
/*
 * Per-format setup: four sequences (presumably the targets of the
 * RGB_formats dispatch; their labels are not visible in this excerpt).
 * Each sequence fills the five 64-bit shift-count slots and the three
 * 64-bit UpperLimit slots in the local frame.  A shift-count slot is
 * written as two dwords: the low dword from %ebx, the high dword from %eax.
 * NOTE(review): in the visible code %eax still holds CCType and %mm0 is
 * never loaded; the embedded numbering skips lines between several of the
 * stores, so instructions that zero %eax and reload %ebx / %mm0 between
 * stores are presumably missing -- TODO confirm against the full file.
 */
/* First sequence (initial count 2). */
158 movl $2,%ebx /* 10-8 for byte shift */
159 movl %ebx,RLeftShift(%esp)
160 movl %eax,RLeftShift+4(%esp)
162 movl %ebx,GLeftShift(%esp)
163 movl %eax,GLeftShift+4(%esp)
165 movl %ebx,RRightShift(%esp)
166 movl %eax,RRightShift+4(%esp)
167 movl %ebx,GRightShift(%esp)
168 movl %eax,GRightShift+4(%esp)
169 movl %ebx,BRightShift(%esp)
170 movl %eax,BRightShift+4(%esp)
172 movq %mm0,RUpperLimit(%esp)
173 movq %mm0,GUpperLimit(%esp)
174 movq %mm0,BUpperLimit(%esp)
/* Second sequence (initial count 2). */
179 movl $2,%ebx /* 8-6 */
180 movl %ebx,RLeftShift(%esp)
181 movl %eax,RLeftShift+4(%esp)
183 movl %ebx,GLeftShift(%esp)
184 movl %eax,GLeftShift+4(%esp)
186 movl %ebx,RRightShift(%esp)
187 movl %eax,RRightShift+4(%esp)
188 movl %ebx,GRightShift(%esp)
189 movl %eax,GRightShift+4(%esp)
191 movl %ebx,BRightShift(%esp)
192 movl %eax,BRightShift+4(%esp)
194 movq %mm0,RUpperLimit(%esp)
195 movq %mm0,GUpperLimit(%esp)
197 movq %mm0,BUpperLimit(%esp)
/* Third sequence (initial count 2). */
202 movl $2,%ebx /* 8-6 */
203 movl %ebx,RLeftShift(%esp)
204 movl %eax,RLeftShift+4(%esp)
206 movl %ebx,GLeftShift(%esp)
207 movl %eax,GLeftShift+4(%esp)
209 movl %ebx,RRightShift(%esp)
210 movl %eax,RRightShift+4(%esp)
212 movl %ebx,GRightShift(%esp)
213 movl %eax,GRightShift+4(%esp)
214 movl %ebx,BRightShift(%esp)
215 movl %eax,BRightShift+4(%esp)
217 movq %mm0,RUpperLimit(%esp)
219 movq %mm0,GUpperLimit(%esp)
220 movq %mm0,BUpperLimit(%esp)
/* Fourth sequence (initial count 3); stores B before G, unlike the others. */
225 movl $3,%ebx /* 8-5 */
226 movl %ebx,RLeftShift(%esp)
227 movl %eax,RLeftShift+4(%esp)
229 movl %ebx,GLeftShift(%esp)
230 movl %eax,GLeftShift+4(%esp)
232 movl %ebx,RRightShift(%esp)
233 movl %eax,RRightShift+4(%esp)
234 movl %ebx,BRightShift(%esp)
235 movl %eax,BRightShift+4(%esp)
237 movl %ebx,GRightShift(%esp)
238 movl %eax,GRightShift+4(%esp)
240 movq %mm0,RUpperLimit(%esp)
241 movq %mm0,BUpperLimit(%esp)
243 movq %mm0,GUpperLimit(%esp)
/*
 * Cursor and pitch initialisation for the conversion loop.
 * NOTE(review): the embedded original numbering skips lines in this block
 * (e.g. 248->250, 283->286, 287->289, 289->295), so arithmetic on some of
 * the values stored below is presumably missing from this excerpt.
 */
247 movl VPlane(%esp),%ebx
248 movl UPlane(%esp),%ecx
/* As visible here %ecx is still UPlane; a subtraction of %ebx is presumably missing -- TODO confirm. */
250 movl %ecx,DistanceFromVToU(%esp)
/* CCOCursor = ColorConvertedFrame + DCIOffset + CCOffsetToLine0 (output cursor). */
252 movl ColorConvertedFrame(%esp),%eax
253 addl DCIOffset(%esp),%eax
254 addl CCOffsetToLine0(%esp),%eax
255 movl %eax,CCOCursor(%esp)
258 movl YPitch(%esp),%ecx
259 movl FrameWidth(%esp),%ebx
260 movl CCOPitch(%esp),%eax
261 subl %ebx,%eax /* CCOPitch-FrameWidth */
262 subl %ebx,%eax /* CCOPitch-2*FrameWidth */
263 sarl %ebx /* FrameWidth/2 */
264 movl YPlane(%esp),%esi /* Fetch cursor over luma plane. */
265 movl %ebx,ChromaLineLen(%esp) /* FrameWidth/2 */
266 movl %eax,CCOSkipDistance(%esp) /* CCOPitch-2*FrameWidth */
267 movl %esi,YCursor(%esp)
268 movl AspectAdjustmentCount(%esp),%edx
/* %esi becomes the V-plane cursor used by the chroma loads. */
269 movl VPlane(%esp),%esi
273 movl %edx,AspectCount(%esp)
274 movl %edx,AspectBaseCount(%esp)
277 movl ChromaLineLen(%esp),%edi
278 movl %edi,EndOfChromaLine(%esp)
/* %edi becomes the output cursor, %edx the base used for the U loads. */
279 movl CCOCursor(%esp),%edi
281 movl DistanceFromVToU(%esp),%edx
282 movl YCursor(%esp),%ebp /* Fetch Y cursor. */
283 movl FrameWidth(%esp),%ebx
286 movl %ebp,tmpYCursorEven(%esp)
287 movl YPitch(%esp),%eax
/* As visible here the same %ebp is stored for both cursors; an add of %eax (YPitch) is presumably missing. */
289 movl %ebp,tmpYCursorOdd(%esp)
/* Writes %ebx back into the FrameWidth argument slot. */
295 movl %ebx,FrameWidth(%esp)
302 movl AspectCount(%esp),%ebp
303 movl FrameWidth(%esp),%ebx
/* tmpCCOPitch holds the byte distance from the even output line to the odd one. */
305 movl CCOPitch(%esp),%eax
306 movl %eax,tmpCCOPitch(%esp)
310 addl AspectAdjustmentCount(%esp),%ebp
311 movl %eax,tmpCCOPitch(%esp)
313 movl %ebp,AspectCount(%esp)
/*
 * Even output line: combines 8 luma bytes with 4 U and 4 V bytes into
 * eight 16-bit pixels.  Addressing as visible: U at (%edx,%ebx), V at
 * (%esi,%ebx), Y at (%ebp,%ebx,2); %edi is the output cursor.
 * The chroma terms are saved in temp_mmx+0..+40 for reuse by the odd line.
 * NOTE(review): the embedded original numbering skips lines here (e.g.
 * 327->329->331, 338->341, 348->350->353, 365->367->371), so some
 * instructions are missing from this excerpt -- for instance nothing
 * visible loads %mm4 before the punpckhwd at 344.  The existing per-line
 * comments presumably describe the complete original sequence.
 */
316 movl tmpYCursorEven(%esp),%ebp
317 /* here is even line */
318 movd (%edx,%ebx,),%mm1 /* 4 u values */
319 pxor %mm0,%mm0 /* mm0=0 */
320 movd (%esi,%ebx,),%mm2 /* 4 v values */
321 punpcklbw %mm0,%mm1 /* get 4 unsign u */
322 psubw Minusg,%mm1 /* get 4 unsign u-128 */
323 punpcklbw %mm0,%mm2 /* get unsign v */
324 psubw Minusg,%mm2 /* get unsign v-128 */
325 movq %mm1,%mm3 /* save the u-128 unsign */
326 movq %mm1,%mm5 /* save u-128 unsign */
327 punpcklwd %mm2,%mm1 /* get 2 low u, v unsign pairs */
329 punpckhwd %mm2,%mm3 /* create high 2 unsign uv pairs */
331 movq %mm2,temp_mmx(%esp) /* save v-128 */
332 movq (%ebp,%ebx,2),%mm6 /* mm6 has 8 y pixels */
333 psubusb Yadd,%mm6 /* mm6 has 8 y-16 pixels */
334 packssdw %mm3,%mm1 /* packed the results to signed words */
335 movq %mm6,%mm7 /* save the 8 y-16 pixels */
336 punpcklbw %mm0,%mm6 /* mm6 has 4 low y-16 unsign */
338 punpckhbw %mm0,%mm7 /* mm7 has 4 high y-16 unsign */
341 movq %mm1,temp_mmx+8(%esp) /* save 4 chroma G values */
342 punpcklwd %mm1,%mm1 /* chroma G replicate low 2 */
343 movq %mm6,%mm0 /* low y */
344 punpckhwd %mm4,%mm4 /* chroma G replicate high 2 */
345 movq %mm7,%mm3 /* high y */
346 psubw %mm1,%mm6 /* 4 low G */
347 psraw GRightShift(%esp),%mm6
348 psubw %mm4,%mm7 /* 4 high G values in signed 16 bit */
350 punpcklwd %mm5,%mm5 /* replicate the 2 low u pixels */
353 psraw GRightShift(%esp),%mm7
355 packuswb %mm7,%mm6 /* mm6: G7 G6 G5 G4 G3 G2 G1 G0 */
356 movq %mm5,temp_mmx+16(%esp) /* low chroma B */
357 paddw %mm0,%mm5 /* 4 low B values in signed 16 bit */
358 movq %mm2,temp_mmx+40(%esp) /* high chroma B */
359 paddw %mm3,%mm2 /* 4 high B values in signed 16 bit */
360 psraw BRightShift(%esp),%mm5 /* low B scaled down by 6+(8-5) */
361 psraw BRightShift(%esp),%mm2 /* high B scaled down by 6+(8-5) */
362 packuswb %mm2,%mm5 /* mm5: B7 B6 B5 B4 B3 B2 B1 B0 */
364 movq temp_mmx(%esp),%mm2 /* 4 v values */
365 movq %mm5,%mm1 /* save B */
367 punpcklwd %mm2,%mm2 /* replicate the 2 low v pixels */
/*
 * Clamp idiom used for B, G and R below: paddusb with a limit saturates
 * each byte at 0xFF, and the matching psubusb with the same limit then
 * leaves min(value, 0xFF-limit) in each byte.
 */
371 paddusb BUpperLimit(%esp),%mm1 /* mm1: saturate B+0FF-15 */
372 movq %mm2,temp_mmx+24(%esp) /* low chroma R */
373 paddw %mm0,%mm2 /* 4 low R values in signed 16 bit */
374 psraw RRightShift(%esp),%mm2 /* low R scaled down by 6+(8-5) */
375 pxor %mm4,%mm4 /* mm4=0 for 8->16 conversion */
376 movq %mm7,temp_mmx+32(%esp) /* high chroma R */
377 paddw %mm3,%mm7 /* 4 high R values in signed 16 bit */
378 psraw RRightShift(%esp),%mm7 /* high R scaled down by 6+(8-5) */
379 psubusb BUpperLimit(%esp),%mm1
380 packuswb %mm7,%mm2 /* mm2: R7 R6 R5 R4 R3 R2 R1 R0 */
381 paddusb GUpperLimit(%esp),%mm6 /* G fast patch ih */
382 psubusb GUpperLimit(%esp),%mm6 /* fast patch ih */
383 paddusb RUpperLimit(%esp),%mm2 /* R */
384 psubusb RUpperLimit(%esp),%mm2
/*
 * Packing step: R bytes are shifted left by RLeftShift, B and R bytes are
 * interleaved into words (B low byte, R high byte), and G is widened to
 * words, shifted left by GLeftShift and OR-ed in.
 * NOTE(review): the banner lines below have lost their comment opener in
 * this excerpt, and the trailing comment on the psllq line is not
 * terminated (its continuation line is missing).
 */
387 * here we are packing from RGB24 to RGB16
389 * mm6: G7 G6 G5 G4 G3 G2 G1 G0
390 * mm1: B7 B6 B5 B4 B3 B2 B1 B0
391 * mm2: R7 R6 R5 R4 R3 R2 R1 R0
392 * assuming 8 original pixels in 0-H representation on mm6, mm5, mm2
393 * when H=2**xBITS-1 (x is for R G B)
395 * mm1- result: 4 low RGB16
396 * mm7- result: 4 high RGB16
397 * using: mm0- zero register
398 * mm3- temporary results
400 * for (i=0; i<8; i++) {
401 * RGB[i]=256*(R[i]<<(8-5))+(G[i]<<5)+B[i];
405 psllq RLeftShift(%esp),%mm2 /* position R in the most significant
407 movq %mm1,%mm7 /* mm1: Save B */
410 * note: no need for shift to place B on the least significant part of the byte
411 * R in left position, B in the right position so they can be combined
414 punpcklbw %mm2,%mm1 /* mm1: 4 low 16 bit RB */
415 pxor %mm0,%mm0 /* mm0: 0 */
416 punpckhbw %mm2,%mm7 /* mm7: 4 high 16 bit RB */
417 movq %mm6,%mm3 /* mm3: G */
418 punpcklbw %mm0,%mm6 /* mm6: low 4 G 16 bit */
419 psllw GLeftShift(%esp),%mm6 /* shift low G 5 positions */
420 punpckhbw %mm0,%mm3 /* mm3: high 4 G 16 bit */
421 por %mm6,%mm1 /* mm1: low RBG16 */
422 psllw GLeftShift(%esp),%mm3 /* shift high G 5 positions */
423 por %mm3,%mm7 /* mm7: high RBG16 */
/* Switch %ebp to the odd luma row, then store the 4 low even-line pixels. */
425 movl tmpYCursorOdd(%esp),%ebp /* moved to here to save cycles
427 movq %mm1,(%edi) /* !! aligned */
/*
 * Odd output line: reads the next luma row through %ebp (set to
 * tmpYCursorOdd at the end of the even-line code) and adds the chroma
 * terms saved in temp_mmx+8..+40 by the even line.  Results are written
 * tmpCCOPitch bytes after the even-line pixels.
 * NOTE(review): the embedded original numbering skips lines here (e.g.
 * 430->432->434, 438->440->442, 453->455), so several registers used below
 * (%mm2 as the second unpack operand, %mm5, %mm0, %mm6, %mm4) are set up
 * by instructions that are not visible in this excerpt.
 */
429 /*- start odd line */
430 movq (%ebp,%ebx,2),%mm1 /* mm1 has 8 y pixels */
432 psubusb Yadd,%mm1 /* mm1 has 8 pixels y-16 */
434 punpcklbw %mm2,%mm1 /* get 4 low y-16 unsign pixels word */
435 pmullw Ymul,%mm1 /* low 4 luminance contribution */
436 punpckhbw %mm2,%mm5 /* 4 high y-16 */
437 pmullw Ymul,%mm5 /* high 4 luminance contribution */
/* Store of the 4 high even-line pixels, interleaved with the odd-line work. */
438 movq %mm7,8(%edi) /* !! aligned */
440 paddw temp_mmx+24(%esp),%mm0 /* low 4 R */
442 psraw RRightShift(%esp),%mm0 /* low R scaled down by 6+(8-5) */
443 paddw temp_mmx+32(%esp),%mm5 /* high 4 R */
445 psraw RRightShift(%esp),%mm5 /* high R scaled down by 6+(8-5) */
446 paddw temp_mmx+16(%esp),%mm2 /* low 4 B */
447 packuswb %mm5,%mm0 /* mm0: R7 R6 R5 R4 R3 R2 R1 R0 */
448 psraw BRightShift(%esp),%mm2 /* low B scaled down by 6+(8-5) */
450 paddw temp_mmx+40(%esp),%mm6 /* high 4 B */
451 psraw BRightShift(%esp),%mm6 /* high B scaled down by 6+(8-5) */
452 movq temp_mmx+8(%esp),%mm3 /* chroma G low 4 */
453 packuswb %mm6,%mm2 /* mm2: B7 B6 B5 B4 B3 B2 B1 B0 */
455 punpcklwd %mm3,%mm3 /* replicate low 2 */
456 punpckhwd %mm4,%mm4 /* replicate high 2 */
457 psubw %mm3,%mm1 /* 4 low G */
458 psraw GRightShift(%esp),%mm1 /* low G scaled down by 6+(8-5) */
459 psubw %mm4,%mm5 /* 4 high G values in signed 16 bit */
460 psraw GRightShift(%esp),%mm5 /* high G scaled down by 6+(8-5) */
/* Same clamp idiom as the even line: paddusb then psubusb with one limit yields min(value, 0xFF-limit). */
461 paddusb BUpperLimit(%esp),%mm2 /* mm2: saturate B+0FF-15 */
462 packuswb %mm5,%mm1 /*mm1: G7 G6 G5 G4 G3 G2 G1 G0 */
463 psubusb BUpperLimit(%esp),%mm2
464 paddusb GUpperLimit(%esp),%mm1 /* G */
465 psubusb GUpperLimit(%esp),%mm1
466 paddusb RUpperLimit(%esp),%mm0 /* R */
/* %eax = byte offset from the even output line to the odd one, used by the stores below. */
467 movl tmpCCOPitch(%esp),%eax
468 psubusb RUpperLimit(%esp),%mm0
/*
 * Packing step, same scheme as the even line with different registers.
 * NOTE(review): the banner lines below have lost their comment opener in
 * this excerpt, and the trailing comment on the psllq line is not
 * terminated (its continuation line is missing).
 */
471 * here we are packing from RGB24 to RGB16
472 * mm1: G7 G6 G5 G4 G3 G2 G1 G0
473 * mm2: B7 B6 B5 B4 B3 B2 B1 B0
474 * mm0: R7 R6 R5 R4 R3 R2 R1 R0
476 * mm2- result: 4 low RGB16
477 * mm7- result: 4 high RGB16
478 * using: mm4- zero register
479 * mm3- temporary results
482 psllq RLeftShift(%esp),%mm0 /* position R in the most significant
484 movq %mm2,%mm7 /* mm7: Save B */
487 * note: no need for shift to place B on the least significant part of the byte
488 * R in left position, B in the right position so they can be combined
491 punpcklbw %mm0,%mm2 /* mm2: 4 low 16 bit RB */
492 pxor %mm4,%mm4 /* mm4: 0 */
493 movq %mm1,%mm3 /* mm3: G */
494 punpckhbw %mm0,%mm7 /* mm7: 4 high 16 bit RB */
495 punpcklbw %mm4,%mm1 /* mm1: low 4 G 16 bit */
496 punpckhbw %mm4,%mm3 /* mm3: high 4 G 16 bit */
497 psllw GLeftShift(%esp),%mm1 /* shift low G 5 positions */
498 por %mm1,%mm2 /* mm2: low RBG16 */
499 psllw GLeftShift(%esp),%mm3 /* shift high G 5 positions */
500 por %mm3,%mm7 /* mm7: high RBG16 */
/*
 * NOTE(review): the four stores at 510-513 are followed directly by stores
 * of %mm2/%mm7 to the same first 16 bytes.  Presumably the two groups
 * belong to different arms of a conditional (cf. the disabled DOUBLE
 * define) whose directives are missing from this excerpt -- TODO confirm.
 */
510 movq %mm0,(%edi,%eax,)
511 movq %mm1,8(%edi,%eax,)
512 movq %mm3,16(%edi,%eax,)
513 movq %mm5,24(%edi,%eax,)
518 movq %mm2,(%edi,%eax,)
519 movq %mm7,8(%edi,%eax,) /* aligned */
/*
 * Loop tail: advance the output cursor and the chroma index after one
 * group of 8 pixels, then step the output, luma and chroma cursors to the
 * next pair of lines and decrement the remaining height by two.
 * NOTE(review): the loop labels and conditional branches are not visible
 * in this excerpt (the embedded numbering skips e.g. 521->526 and
 * 562->565->569), and the trailing comment on the "addl $4,%ebx" line is
 * not terminated (its continuation line is missing).
 */
520 addl $16,%edi /* ih take 16 bytes (8 pixels-16 bit) */
521 addl $4,%ebx /* ? to take 4 pixels together
526 addl CCOSkipDistance(%esp),%edi /* go to begin of next line */
527 addl tmpCCOPitch(%esp),%edi /* skip odd line (if it is needed) */
529 // Lebp CCOPitch ; skip odd line
534 // Addeax AspectBaseCount
/* New even cursor = old odd cursor + YPitch; new odd cursor = new even cursor + YPitch. */
541 movl YPitch(%esp),%eax
542 movl tmpYCursorOdd(%esp),%ebp
543 addl %eax,%ebp /* skip one line */
544 // lea ebp, [ebp+2*eax] /* skip two lines */
545 movl %ebp,tmpYCursorEven(%esp)
546 // Sebp tmpYCursorOdd
548 addl %eax,%ebp /* skip one line */
549 movl %ebp,tmpYCursorOdd(%esp)
550 // Lebp tmpYCursorEven
551 // lea ebp, [ebp+2*eax]
552 // Sebp tmpYCursorEven
/* Advance the V cursor (%esi) and the U base (%edx) by one chroma line. */
555 addl ChromaPitch(%esp),%esi
556 addl ChromaPitch(%esp),%edx
559 // Leax YLimit /* Done with last line? */
561 // jbe PrepareChromaLine
/*
 * Two lines were produced, so the height counter drops by 2.  This is a
 * 16-bit subtract: only the low word of the FrameHeight argument slot is
 * updated and reflected in the flags.
 * NOTE(review): confirm that 16 bits is intended for this 4-byte slot; the
 * branch that presumably tests the result is not visible here.
 */
562 subw $2,FrameHeight(%esp)
565 /******************************************************************************/
/* Release the local frame reserved at function entry. */
569 addl $LocalFrameSize,%esp