1 /*****************************************************************************
2 * video_yuv_mmx.S: YUV transformation, optimized for MMX processors
4 *****************************************************************************
5 * Following functions are defined:
7 * This function performs YUV12-to-RGB16 color conversion for H26x.
8 * It handles any format in which there are three fields, the low
9 * order field being B and fully contained in the low order byte, the
10 * second field being G and being somewhere in bits 4 through 11,
11 * and the high order field being R and fully contained in the high order byte.
14 * The YUV12 input is planar, 8 bits per pel. The Y plane may have
15 * a pitch of up to 768. It may have a width less than or equal
16 * to the pitch. It must be DWORD aligned, and preferably QWORD
17 * aligned. Pitch and Width must be a multiple of four. For best
18 * performance, Pitch should not be 4 more than a multiple of 32.
19 * Height may be any amount, but must be a multiple of two. The U
20 * and V planes may have a different pitch than the Y plane, subject
21 * to the same limitations.
22 *****************************************************************************/
/* Constant operands for the YUV-to-RGB arithmetic. Every entry is 8 bytes
 * (two .long values or one .quad), i.e. the width of one MMX register.
 * NOTE(review): the word-sized coefficients are consistent with the usual
 * BT.601 factors scaled by 64 (e.g. 74/64 ~= 1.16) - confirm. */
/* 0x0080 in each of the four 16-bit words; subtracted with psubw from the
 * unpacked U and V samples to obtain u-128 / v-128. */
36 Minusg: .long 0x00800080, 0x00800080
/* 0x10 in each of the eight bytes; subtracted with psubusb (unsigned
 * saturation) from the Y samples to obtain y-16. */
37 Yadd: .long 0x10101010, 0x10101010
/* Per-word coefficients 0x66 = 102, 0x34 = 52, 0x19 = 25, 0x81 = 129.
 * Not referenced by the instructions visible in this chunk; judging by the
 * names, presumably the V->R, V->G, U->G and U->B weights - TODO confirm. */
38 VtR: .long 0x00660066, 0x00660066
39 VtG: .long 0x00340034, 0x00340034
40 UtG: .long 0x00190019, 0x00190019
41 UtB: .long 0x00810081, 0x00810081
/* 0x004a = 74 in each word; the luma multiplier applied with pmullw to the
 * y-16 words. */
42 Ymul: .long 0x004a004a, 0x004a004a
/* Each 32-bit lane holds the word pair 0x0019 (low) / 0x0034 (high), the same
 * values as UtG and VtG. Presumably multiplied against interleaved (u, v)
 * word pairs; the instruction that uses it is not visible here. */
43 UVtG: .long 0x00340019, 0x00340019
/* Each 32-bit lane holds the word pair 0x0205 = 517 (low) / 0x0199 = 409
 * (high). Not referenced by the visible instructions. */
44 VtRUtB: .long 0x01990205, 0x01990205
/* Byte-replicated constants 0xf0 = 0xff-15, 0xe0 = 0xff-31, 0xc0 = 0xff-63.
 * Presumably the values stored into RUpperLimit/GUpperLimit/BUpperLimit for
 * the paddusb/psubusb clamp to a 4-, 5- or 6-bit range; the loads into %mm0
 * are not visible in this chunk - TODO confirm. */
45 fourbitu: .quad 0xf0f0f0f0f0f0f0f0
46 fivebitu: .quad 0xe0e0e0e0e0e0e0e0
47 sixbitu: .quad 0xc0c0c0c0c0c0c0c0
/* Number of bytes ConvertYUV420RGB16MMX reserves on the stack for its locals
 * (subl $LocalFrameSize,%esp on entry, addl $LocalFrameSize,%esp on exit). */
51 #define LocalFrameSize 156
/* Bytes lying between the local frame and the return address. Presumably
 * four saved 32-bit registers; the pushes are not visible in this chunk. */
52 #define RegisterStorageSize 16
54 //#define DOUBLE /* doubles the number of columns */
/* Offsets of the incoming arguments, used as displacements from %esp once the
 * local frame has been allocated (e.g. CCType(%esp)). Each offset skips the
 * local frame and the register storage area; the first argument sits 4 bytes
 * further up (presumably past the return address). Arguments are spaced
 * 4 bytes apart. The macros expand to unparenthesised sums, so use them only
 * as plain displacements. */
57 #define YPlane LocalFrameSize + RegisterStorageSize + 4
58 #define UPlane LocalFrameSize + RegisterStorageSize + 8
59 #define VPlane LocalFrameSize + RegisterStorageSize + 12
60 #define FrameWidth LocalFrameSize + RegisterStorageSize + 16
61 #define FrameHeight LocalFrameSize + RegisterStorageSize + 20
62 #define YPitch LocalFrameSize + RegisterStorageSize + 24
63 #define ChromaPitch LocalFrameSize + RegisterStorageSize + 28
64 #define AspectAdjustmentCount LocalFrameSize + RegisterStorageSize + 32
65 #define ColorConvertedFrame LocalFrameSize + RegisterStorageSize + 36
66 #define DCIOffset LocalFrameSize + RegisterStorageSize + 40
67 #define CCOffsetToLine0 LocalFrameSize + RegisterStorageSize + 44
68 #define CCOPitch LocalFrameSize + RegisterStorageSize + 48
69 #define CCType LocalFrameSize + RegisterStorageSize + 52
/* Offset just past the last argument (CCType + 4). */
70 #define EndOfArgList LocalFrameSize + RegisterStorageSize + 56
72 /* Locals (on local stack frame) */
/* Offsets from %esp into the LocalFrameSize-byte local frame.
 * The code also references RLeftShift, CCOCursor, YCursor and temp_mmx; their
 * definitions are not visible in this chunk. */
/* 32-bit scalar slots. */
74 #define CCOSkipDistance 4
75 #define ChromaLineLen 8
77 #define DistanceFromVToU 16
78 #define EndOfChromaLine 20
79 #define AspectCount 24
80 #define AspectBaseCount 28
81 #define tmpYCursorEven 32
82 #define tmpYCursorOdd 36
83 #define tmpCCOPitch 40
/* 8-byte slots: shift counts (written as two 32-bit halves, read as MMX shift
 * operands) and the per-channel saturation limits (written with movq).
 * BUpperLimit + 8 equals LocalFrameSize, so these occupy the top of the frame. */
86 #define GLeftShift 100
87 #define RRightShift 108
88 #define GRightShift 116
89 #define BRightShift 124
90 #define RUpperLimit 132
91 #define GUpperLimit 140
92 #define BUpperLimit 148
95 * extern void C ConvertYUV420RGB16MMX (
103 * UN AspectAdjustmentCount,
104 * U8* ColorConvertedFrame,
106 * U32 CCOffsetToLine0,
110 * The local variables are on the stack,
111 * The tables are in the one and only data segment.
113 * CCOffsetToLine0 is relative to ColorConvertedFrame.
114 * CCType used by RGB color convertors to determine the exact conversion type.
121 .globl ConvertYUV420RGB16MMX
122 ConvertYUV420RGB16MMX:
129 subl $LocalFrameSize,%esp
130 movl CCType(%esp),%eax
134 jmp *RGB_formats(,%eax,4)
138 movl $2,%ebx /* 10-8 for byte shift */
139 movl %ebx,RLeftShift(%esp)
140 movl %eax,RLeftShift+4(%esp)
142 movl %ebx,GLeftShift(%esp)
143 movl %eax,GLeftShift+4(%esp)
145 movl %ebx,RRightShift(%esp)
146 movl %eax,RRightShift+4(%esp)
147 movl %ebx,GRightShift(%esp)
148 movl %eax,GRightShift+4(%esp)
149 movl %ebx,BRightShift(%esp)
150 movl %eax,BRightShift+4(%esp)
152 movq %mm0,RUpperLimit(%esp)
153 movq %mm0,GUpperLimit(%esp)
154 movq %mm0,BUpperLimit(%esp)
159 movl $2,%ebx /* 8-6 */
160 movl %ebx,RLeftShift(%esp)
161 movl %eax,RLeftShift+4(%esp)
163 movl %ebx,GLeftShift(%esp)
164 movl %eax,GLeftShift+4(%esp)
166 movl %ebx,RRightShift(%esp)
167 movl %eax,RRightShift+4(%esp)
168 movl %ebx,GRightShift(%esp)
169 movl %eax,GRightShift+4(%esp)
171 movl %ebx,BRightShift(%esp)
172 movl %eax,BRightShift+4(%esp)
174 movq %mm0,RUpperLimit(%esp)
175 movq %mm0,GUpperLimit(%esp)
177 movq %mm0,BUpperLimit(%esp)
182 movl $2,%ebx /* 8-6 */
183 movl %ebx,RLeftShift(%esp)
184 movl %eax,RLeftShift+4(%esp)
186 movl %ebx,GLeftShift(%esp)
187 movl %eax,GLeftShift+4(%esp)
189 movl %ebx,RRightShift(%esp)
190 movl %eax,RRightShift+4(%esp)
192 movl %ebx,GRightShift(%esp)
193 movl %eax,GRightShift+4(%esp)
194 movl %ebx,BRightShift(%esp)
195 movl %eax,BRightShift+4(%esp)
197 movq %mm0,RUpperLimit(%esp)
199 movq %mm0,GUpperLimit(%esp)
200 movq %mm0,BUpperLimit(%esp)
205 movl $3,%ebx /* 8-5 */
206 movl %ebx,RLeftShift(%esp)
207 movl %eax,RLeftShift+4(%esp)
209 movl %ebx,GLeftShift(%esp)
210 movl %eax,GLeftShift+4(%esp)
212 movl %ebx,RRightShift(%esp)
213 movl %eax,RRightShift+4(%esp)
214 movl %ebx,BRightShift(%esp)
215 movl %eax,BRightShift+4(%esp)
217 movl %ebx,GRightShift(%esp)
218 movl %eax,GRightShift+4(%esp)
220 movq %mm0,RUpperLimit(%esp)
221 movq %mm0,BUpperLimit(%esp)
223 movq %mm0,GUpperLimit(%esp)
227 movl VPlane(%esp),%ebx
228 movl UPlane(%esp),%ecx
230 movl %ecx,DistanceFromVToU(%esp)
232 movl ColorConvertedFrame(%esp),%eax
233 addl DCIOffset(%esp),%eax
234 addl CCOffsetToLine0(%esp),%eax
235 movl %eax,CCOCursor(%esp)
238 movl YPitch(%esp),%ecx
239 movl FrameWidth(%esp),%ebx
240 movl CCOPitch(%esp),%eax
241 subl %ebx,%eax /* CCOPitch-FrameWidth */
242 subl %ebx,%eax /* CCOPitch-2*FrameWidth */
243 sarl %ebx /* FrameWidth/2 */
244 movl YPlane(%esp),%esi /* Fetch cursor over luma plane. */
245 movl %ebx,ChromaLineLen(%esp) /* FrameWidth/2 */
246 movl %eax,CCOSkipDistance(%esp) /* CCOPitch-2*FrameWidth */
247 movl %esi,YCursor(%esp)
248 movl AspectAdjustmentCount(%esp),%edx
249 movl VPlane(%esp),%esi
253 movl %edx,AspectCount(%esp)
254 movl %edx,AspectBaseCount(%esp)
257 movl ChromaLineLen(%esp),%edi
258 movl %edi,EndOfChromaLine(%esp)
259 movl CCOCursor(%esp),%edi
261 movl DistanceFromVToU(%esp),%edx
262 movl YCursor(%esp),%ebp /* Fetch Y cursor. */
263 movl FrameWidth(%esp),%ebx
266 movl %ebp,tmpYCursorEven(%esp)
267 movl YPitch(%esp),%eax
269 movl %ebp,tmpYCursorOdd(%esp)
275 movl %ebx,FrameWidth(%esp)
282 movl AspectCount(%esp),%ebp
283 movl FrameWidth(%esp),%ebx
285 movl CCOPitch(%esp),%eax
286 movl %eax,tmpCCOPitch(%esp)
290 addl AspectAdjustmentCount(%esp),%ebp
291 movl %eax,tmpCCOPitch(%esp)
293 movl %ebp,AspectCount(%esp)
296 movl tmpYCursorEven(%esp),%ebp
297 /* here is even line */
298 movd (%edx,%ebx,),%mm1 /* 4 u values */
299 pxor %mm0,%mm0 /* mm0=0 */
300 movd (%esi,%ebx,),%mm2 /* 4 v values */
301 punpcklbw %mm0,%mm1 /* get 4 unsign u */
302 psubw Minusg,%mm1 /* get 4 unsign u-128 */
303 punpcklbw %mm0,%mm2 /* get unsign v */
304 psubw Minusg,%mm2 /* get unsign v-128 */
305 movq %mm1,%mm3 /* save the u-128 unsign */
306 movq %mm1,%mm5 /* save u-128 unsign */
307 punpcklwd %mm2,%mm1 /* get 2 low u, v unsign pairs */
309 punpckhwd %mm2,%mm3 /* create high 2 unsign uv pairs */
311 movq %mm2,temp_mmx(%esp) /* save v-128 */
312 movq (%ebp,%ebx,2),%mm6 /* mm6 has 8 y pixels */
313 psubusb Yadd,%mm6 /* mm6 has 8 y-16 pixels */
314 packssdw %mm3,%mm1 /* packed the results to signed words */
315 movq %mm6,%mm7 /* save the 8 y-16 pixels */
316 punpcklbw %mm0,%mm6 /* mm6 has 4 low y-16 unsign */
318 punpckhbw %mm0,%mm7 /* mm7 has 4 high y-16 unsign */
321 movq %mm1,temp_mmx+8(%esp) /* save 4 chroma G values */
322 punpcklwd %mm1,%mm1 /* chroma G replicate low 2 */
323 movq %mm6,%mm0 /* low y */
324 punpckhwd %mm4,%mm4 /* chroma G replicate high 2 */
325 movq %mm7,%mm3 /* high y */
326 psubw %mm1,%mm6 /* 4 low G */
327 psraw GRightShift(%esp),%mm6
328 psubw %mm4,%mm7 /* 4 high G values in signed 16 bit */
330 punpcklwd %mm5,%mm5 /* replicate the 2 low u pixels */
333 psraw GRightShift(%esp),%mm7
335 packuswb %mm7,%mm6 /* mm6: G7 G6 G5 G4 G3 G2 G1 G0 */
336 movq %mm5,temp_mmx+16(%esp) /* low chroma B */
337 paddw %mm0,%mm5 /* 4 low B values in signed 16 bit */
338 movq %mm2,temp_mmx+40(%esp) /* high chroma B */
339 paddw %mm3,%mm2 /* 4 high B values in signed 16 bit */
340 psraw BRightShift(%esp),%mm5 /* low B scaled down by 6+(8-5) */
341 psraw BRightShift(%esp),%mm2 /* high B scaled down by 6+(8-5) */
342 packuswb %mm2,%mm5 /* mm5: B7 B6 B5 B4 B3 B2 B1 B0 */
344 movq temp_mmx(%esp),%mm2 /* 4 v values */
345 movq %mm5,%mm1 /* save B */
347 punpcklwd %mm2,%mm2 /* replicate the 2 low v pixels */
351 paddusb BUpperLimit(%esp),%mm1 /* mm1: saturate B+0FF-15 */
352 movq %mm2,temp_mmx+24(%esp) /* low chroma R */
353 paddw %mm0,%mm2 /* 4 low R values in signed 16 bit */
354 psraw RRightShift(%esp),%mm2 /* low R scaled down by 6+(8-5) */
355 pxor %mm4,%mm4 /* mm4=0 for 8->16 conversion */
356 movq %mm7,temp_mmx+32(%esp) /* high chroma R */
357 paddw %mm3,%mm7 /* 4 high R values in signed 16 bit */
358 psraw RRightShift(%esp),%mm7 /* high R scaled down by 6+(8-5) */
359 psubusb BUpperLimit(%esp),%mm1
360 packuswb %mm7,%mm2 /* mm2: R7 R6 R5 R4 R3 R2 R1 R0 */
361 paddusb GUpperLimit(%esp),%mm6 /* G fast patch ih */
362 psubusb GUpperLimit(%esp),%mm6 /* fast patch ih */
363 paddusb RUpperLimit(%esp),%mm2 /* R */
364 psubusb RUpperLimit(%esp),%mm2
367 * here we are packing from RGB24 to RGB16
369 * mm6: G7 G6 G5 G4 G3 G2 G1 G0
370 * mm1: B7 B6 B5 B4 B3 B2 B1 B0
371 * mm2: R7 R6 R5 R4 R3 R2 R1 R0
372 * assuming 8 original pixels in 0-H representation on mm6, mm5, mm2
373 * when H=2**xBITS-1 (x is for R G B)
375 * mm1- result: 4 low RGB16
376 * mm7- result: 4 high RGB16
377 * using: mm0- zero register
378 * mm3- temporary results
380 * for (i=0; i<8; i++) {
381 * RGB[i]=256*(R[i]<<(8-5))+(G[i]<<5)+B[i];
385 psllq RLeftShift(%esp),%mm2 /* position R in the most significant part of the byte */
387 movq %mm1,%mm7 /* mm1: Save B */
390 * note: no need for shift to place B on the least significant part of the byte
391 * R in left position, B in the right position so they can be combined
394 punpcklbw %mm2,%mm1 /* mm1: 4 low 16 bit RB */
395 pxor %mm0,%mm0 /* mm0: 0 */
396 punpckhbw %mm2,%mm7 /* mm7: 4 high 16 bit RB */
397 movq %mm6,%mm3 /* mm3: G */
398 punpcklbw %mm0,%mm6 /* mm6: low 4 G 16 bit */
399 psllw GLeftShift(%esp),%mm6 /* shift low G 5 positions */
400 punpckhbw %mm0,%mm3 /* mm3: high 4 G 16 bit */
401 por %mm6,%mm1 /* mm1: low RBG16 */
402 psllw GLeftShift(%esp),%mm3 /* shift high G 5 positions */
403 por %mm3,%mm7 /* mm7: high RBG16 */
405 movl tmpYCursorOdd(%esp),%ebp /* moved to here to save cycles */
407 movq %mm1,(%edi) /* !! aligned */
409 /*- start odd line */
410 movq (%ebp,%ebx,2),%mm1 /* mm1 has 8 y pixels */
412 psubusb Yadd,%mm1 /* mm1 has 8 pixels y-16 */
414 punpcklbw %mm2,%mm1 /* get 4 low y-16 unsign pixels word */
415 pmullw Ymul,%mm1 /* low 4 luminance contribution */
416 punpckhbw %mm2,%mm5 /* 4 high y-16 */
417 pmullw Ymul,%mm5 /* high 4 luminance contribution */
418 movq %mm7,8(%edi) /* !! aligned */
420 paddw temp_mmx+24(%esp),%mm0 /* low 4 R */
422 psraw RRightShift(%esp),%mm0 /* low R scaled down by 6+(8-5) */
423 paddw temp_mmx+32(%esp),%mm5 /* high 4 R */
425 psraw RRightShift(%esp),%mm5 /* high R scaled down by 6+(8-5) */
426 paddw temp_mmx+16(%esp),%mm2 /* low 4 B */
427 packuswb %mm5,%mm0 /* mm0: R7 R6 R5 R4 R3 R2 R1 R0 */
428 psraw BRightShift(%esp),%mm2 /* low B scaled down by 6+(8-5) */
430 paddw temp_mmx+40(%esp),%mm6 /* high 4 B */
431 psraw BRightShift(%esp),%mm6 /* high B scaled down by 6+(8-5) */
432 movq temp_mmx+8(%esp),%mm3 /* chroma G low 4 */
433 packuswb %mm6,%mm2 /* mm2: B7 B6 B5 B4 B3 B2 B1 B0 */
435 punpcklwd %mm3,%mm3 /* replicate low 2 */
436 punpckhwd %mm4,%mm4 /* replicate high 2 */
437 psubw %mm3,%mm1 /* 4 low G */
438 psraw GRightShift(%esp),%mm1 /* low G scaled down by 6+(8-5) */
439 psubw %mm4,%mm5 /* 4 high G values in signed 16 bit */
440 psraw GRightShift(%esp),%mm5 /* high G scaled down by 6+(8-5) */
441 paddusb BUpperLimit(%esp),%mm2 /* mm2: saturate B+0FF-15 */
442 packuswb %mm5,%mm1 /*mm1: G7 G6 G5 G4 G3 G2 G1 G0 */
443 psubusb BUpperLimit(%esp),%mm2
444 paddusb GUpperLimit(%esp),%mm1 /* G */
445 psubusb GUpperLimit(%esp),%mm1
446 paddusb RUpperLimit(%esp),%mm0 /* R */
447 movl tmpCCOPitch(%esp),%eax
448 psubusb RUpperLimit(%esp),%mm0
451 * here we are packing from RGB24 to RGB16
452 * mm1: G7 G6 G5 G4 G3 G2 G1 G0
453 * mm2: B7 B6 B5 B4 B3 B2 B1 B0
454 * mm0: R7 R6 R5 R4 R3 R2 R1 R0
456 * mm2- result: 4 low RGB16
457 * mm7- result: 4 high RGB16
458 * using: mm4- zero register
459 * mm3- temporary results
462 psllq RLeftShift(%esp),%mm0 /* position R in the most significant part of the byte */
464 movq %mm2,%mm7 /* mm7: Save B */
467 * note: no need for shift to place B on the least significant part of the byte
468 * R in left position, B in the right position so they can be combined
471 punpcklbw %mm0,%mm2 /* mm2: 4 low 16 bit RB */
472 pxor %mm4,%mm4 /* mm4: 0 */
473 movq %mm1,%mm3 /* mm3: G */
474 punpckhbw %mm0,%mm7 /* mm7: 4 high 16 bit RB */
475 punpcklbw %mm4,%mm1 /* mm1: low 4 G 16 bit */
476 punpckhbw %mm4,%mm3 /* mm3: high 4 G 16 bit */
477 psllw GLeftShift(%esp),%mm1 /* shift low G 5 positions */
478 por %mm1,%mm2 /* mm2: low RBG16 */
479 psllw GLeftShift(%esp),%mm3 /* shift high G 5 positions */
480 por %mm3,%mm7 /* mm7: high RBG16 */
490 movq %mm0,(%edi,%eax,)
491 movq %mm1,8(%edi,%eax,)
492 movq %mm3,16(%edi,%eax,)
493 movq %mm5,24(%edi,%eax,)
498 movq %mm2,(%edi,%eax,)
499 movq %mm7,8(%edi,%eax,) /* aligned */
500 addl $16,%edi /* ih take 16 bytes (8 pixels-16 bit) */
501 addl $4,%ebx /* ? to take 4 pixels together */
506 addl CCOSkipDistance(%esp),%edi /* go to begin of next line */
507 addl tmpCCOPitch(%esp),%edi /* skip odd line (if it is needed) */
509 // Lebp CCOPitch ; skip odd line
514 // Addeax AspectBaseCount
521 movl YPitch(%esp),%eax
522 movl tmpYCursorOdd(%esp),%ebp
523 addl %eax,%ebp /* skip one line */
524 // lea ebp, [ebp+2*eax] /* skip two lines */
525 movl %ebp,tmpYCursorEven(%esp)
526 // Sebp tmpYCursorOdd
528 addl %eax,%ebp /* skip one line */
529 movl %ebp,tmpYCursorOdd(%esp)
530 // Lebp tmpYCursorEven
531 // lea ebp, [ebp+2*eax]
532 // Sebp tmpYCursorEven
535 addl ChromaPitch(%esp),%esi
536 addl ChromaPitch(%esp),%edx
539 // Leax YLimit /* Done with last line? */
541 // jbe PrepareChromaLine
542 subw $2,FrameHeight(%esp)
545 /******************************************************************************/
549 addl $LocalFrameSize,%esp