;
; TinyPTC x11 v0.7.3 MMX-Optimized YV12 converter
; Copyright (C) 2002 Fred Howell
;
; http://www.sourceforge.net/projects/tinyptc/
;
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.
;
; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with this library; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
;
; Provenance: recovered from the gitweb "blobdiff_plain" patch that added
; bigscreen/tinyptc/yv12.s (blob aa158c2) to the ccbs repository.
;

bits 32

global convert_yv12_mmx

section .data

align 16

; Conversion tables.  Every table is four 16-bit words so that it can be used
; directly as a PMADDWD / PADDW / PAND memory operand.
;
; Weights are signed fixed-point values that are shifted right by 15 after
; the multiply-add.  Two lane layouts are used, matching the two word
; vectors the code builds from each pair of source pixels:
;
;   "gr0b" tables weight the vector [G0, R0, X0, B1]
;   "b0gr" tables weight the vector [B0,  0, G1, R1]
;
;   Y weights:  G = 33058/2   R = 16390/2   B =  6405/2
;   U weights:  G = -19068/2  R = -9713/2   B = 28781/2
;   V weights:  G = -24110/2  R = 28781/2   B = -4671/2

thezero     dw 0,0,0,0
ygr0bcoff   dw 0x4091,0x2003,0x0000,0x0c82
ugr0bcoff   dw 0xdac2,0xed08,0x0000,0x3836
vgr0bcoff   dw 0xd0e9,0x3836,0x0000,0xf6e1
yb0grcoff   dw 0x0c82,0x0000,0x4091,0x2003
ub0grcoff   dw 0x3836,0x0000,0xdac2,0xed08
vb0grcoff   dw 0xf6e1,0x0000,0xd0e9,0x3836
yoff        dw 0x0010,0x0010,0x0010,0x0010      ; +16 luma bias
uvoff       dw 0x0080,0x0080,0x0080,0x0080      ; +128 chroma bias
add2w       dw 1,1,1,1                          ; PMADDWD by this sums adjacent words
grmask      dw 0,0xffff,0,0                     ; keeps bytes 2-3 of a quadword
bmask       db 0xff,0,0,0, 0,0,0,0              ; keeps byte 0 of a quadword


; The code must live in the standard ".text" section: NASM gives sections
; with non-standard names (such as plain "text") non-executable attributes
; in ELF output.
section .text

; ---------------------------------------------------------------------------
; Helper macros.  Register roles inside the loop:
;   mm3 / mm4  current pixel pair split into the gr0b / b0gr word vectors
;   mm5 / mm2  pending luma for the top / bottom row
;   mm6 / mm7  U / V accumulators for the four 2x2 blocks
;   mm0 / mm1  scratch
; ---------------------------------------------------------------------------

; LOAD_PAIR addr
;   Reads 8 bytes (two 32-bit pixels) from [addr] and unpacks them to
;   mm3 = [byte1, byte2, byte3, byte4] and mm4 = [byte0, 0, byte5, byte6],
;   each byte zero-extended to a word.  Clobbers mm1.
%macro LOAD_PAIR 1
    movq        mm3, [%1]
    movq        mm1, mm3
    psrlq       mm3, 8
    movq        mm4, mm1
    psrlq       mm1, 24
    pand        mm4, [bmask]
    pand        mm1, [grmask]
    por         mm4, mm1
    punpcklbw   mm3, [thezero]
    punpcklbw   mm4, [thezero]
%endmacro

; WEIGH dst, gr0b_table, b0gr_table
;   dst = two signed dwords, one weighted sum per pixel of the current pair
;   (already shifted right by 15).  Preserves mm3/mm4, clobbers mm1.
%macro WEIGH 3
    movq        %1, mm3
    pmaddwd     %1, [%2]
    movq        mm1, mm4
    pmaddwd     mm1, [%3]
    paddd       %1, mm1
    psrad       %1, 15
%endmacro

; WEIGH_V
;   Same as WEIGH with the V tables, but computed in place: the result is
;   left in mm3 and the unpacked pixel data in mm3/mm4 is destroyed, so this
;   must be the last use of the current pair.
%macro WEIGH_V 0
    pmaddwd     mm3, [vgr0bcoff]
    pmaddwd     mm4, [vb0grcoff]
    paddd       mm3, mm4
    psrad       mm3, 15
%endmacro

; EMIT_LUMA acc, addr
;   acc holds the luma of the previous pair on this row.  Computes the luma
;   of the current pair, adds the +16 bias, saturates to bytes and stores the
;   four resulting luma bytes at [addr].  Clobbers mm0, mm1 and acc.
%macro EMIT_LUMA 2
    WEIGH       mm0, ygr0bcoff, yb0grcoff
    packssdw    %1, mm0
    paddw       %1, [yoff]
    packuswb    %1, %1
    movd        [%2], %1
%endmacro

; FOLD_PAIRS acc
;   words [a, b, c, d]  ->  words [a+b, c+d, 0, 0]
%macro FOLD_PAIRS 1
    pmaddwd     %1, [add2w]
    packssdw    %1, [thezero]
%endmacro

; ADD_UPPER_PAIR acc, samples
;   samples holds two dword chroma samples; they are narrowed to words and
;   added into words 2 and 3 of acc.  Destroys samples.
%macro ADD_UPPER_PAIR 2
    packssdw    %2, %2
    psllq       %2, 32
    paddw       %1, %2
%endmacro

; PAIR_SUM_TO_LANE reg, lane
;   reg holds two dword chroma samples.  On exit reg is all zero except for
;   word <lane> (2 or 3), which holds the low 16 bits of their sum.
;   Lane 2 is produced by shifting up to word 3 and back down by one word so
;   that the sign-extension half of the 32-bit sum can never spill into
;   word 3, which belongs to the next 2x2 block.
%macro PAIR_SUM_TO_LANE 2
    packssdw    %1, %1
    pmaddwd     %1, [add2w]
    psllq       %1, 48
  %if %2 = 2
    psrlq       %1, 16
  %endif
%endmacro

; ACCUM_CHROMA lane
;   Adds the U and V sums of the current pixel pair into word <lane> of the
;   mm6 (U) and mm7 (V) accumulators.  Destroys mm0, mm1, mm3, mm4.
%macro ACCUM_CHROMA 1
    WEIGH       mm0, ugr0bcoff, ub0grcoff
    PAIR_SUM_TO_LANE mm0, %1
    paddw       mm6, mm0
    WEIGH_V
    PAIR_SUM_TO_LANE mm3, %1
    paddw       mm7, mm3
%endmacro

align 16

; ---------------------------------------------------------------------------
; convert_yv12_mmx
;
; Converts two adjacent rows of 32-bit pixels to planar luma plus 2x2
; subsampled chroma.  Arguments are read from the stack (32-bit, caller
; cleans up); presumably the C prototype is
;     void convert_yv12_mmx(src, y, v, u, width)   -- confirm against caller.
;
;   arg1  source pixels; the second row is read at arg1 + width*4
;   arg2  luma output; the second row is written at arg2 + width
;   arg3  receives the bytes computed with the V weight tables (width/2 bytes)
;   arg4  receives the bytes computed with the U weight tables (width/2 bytes)
;   arg5  width in pixels; processed in groups of 8, any remainder
;         (width mod 8) is left untouched
;
; Byte 0 of each pixel is weighted as blue, byte 1 as green, byte 2 as red
; and byte 3 is ignored (its weight is zero).
;
; All general-purpose registers are preserved; MMX state is released with
; EMMS before returning.
; ---------------------------------------------------------------------------
convert_yv12_mmx:

    push    ebp
    push    eax
    push    ebx
    push    ecx
    push    edx
    push    edi
    push    esi

%assign _P 7*4                          ; bytes pushed above the return address
    mov     edx, [esp+_P+ 4]            ; source pixels
    mov     ebx, [esp+_P+ 8]            ; luma destination
    mov     ebp, [esp+_P+12]            ; destination for the V-weighted bytes
    mov     eax, [esp+_P+16]            ; destination for the U-weighted bytes
    mov     ecx, [esp+_P+20]            ; width in pixels
    mov     esi, ecx                    ; luma row stride   = width
    mov     edi, ecx
    shl     edi, 2                      ; source row stride = width * 4
    shr     ecx, 3                      ; number of 8-pixel groups
    jz      .done                       ; fewer than 8 pixels: nothing to do

.next_block:
    ; ---- pixel pair 0, top row ------------------------------------------
    LOAD_PAIR   edx
    WEIGH       mm5, ygr0bcoff, yb0grcoff       ; luma kept until pair 1
    WEIGH       mm6, ugr0bcoff, ub0grcoff
    WEIGH_V
    movq        mm7, mm3

    ; ---- pixel pair 0, bottom row ---------------------------------------
    LOAD_PAIR   edx+edi
    WEIGH       mm2, ygr0bcoff, yb0grcoff       ; luma kept until pair 1
    WEIGH       mm0, ugr0bcoff, ub0grcoff
    packssdw    mm6, mm0                        ; [top0, top1, bot0, bot1]
    FOLD_PAIRS  mm6                             ; [top sum, bottom sum, 0, 0]
    WEIGH_V
    packssdw    mm7, mm3
    FOLD_PAIRS  mm7

    ; ---- pixel pair 1, top row ------------------------------------------
    LOAD_PAIR   edx+8
    EMIT_LUMA   mm5, ebx                        ; luma 0..3, top row
    WEIGH       mm0, ugr0bcoff, ub0grcoff
    ADD_UPPER_PAIR mm6, mm0
    WEIGH_V
    ADD_UPPER_PAIR mm7, mm3

    ; ---- pixel pair 1, bottom row ---------------------------------------
    LOAD_PAIR   edx+edi+8
    EMIT_LUMA   mm2, ebx+esi                    ; luma 0..3, bottom row
    WEIGH       mm0, ugr0bcoff, ub0grcoff
    ADD_UPPER_PAIR mm6, mm0
    FOLD_PAIRS  mm6                             ; [block0, block1, 0, 0]
    WEIGH_V
    ADD_UPPER_PAIR mm7, mm3
    FOLD_PAIRS  mm7

    ; ---- pixel pair 2, top row ------------------------------------------
    LOAD_PAIR   edx+16
    WEIGH       mm5, ygr0bcoff, yb0grcoff       ; luma kept until pair 3
    ACCUM_CHROMA 2

    ; ---- pixel pair 2, bottom row ---------------------------------------
    LOAD_PAIR   edx+edi+16
    WEIGH       mm2, ygr0bcoff, yb0grcoff       ; luma kept until pair 3
    ACCUM_CHROMA 2

    ; ---- pixel pair 3, top row ------------------------------------------
    LOAD_PAIR   edx+24
    EMIT_LUMA   mm5, ebx+4                      ; luma 4..7, top row
    ACCUM_CHROMA 3

    ; ---- pixel pair 3, bottom row ---------------------------------------
    LOAD_PAIR   edx+edi+24
    EMIT_LUMA   mm2, ebx+esi+4                  ; luma 4..7, bottom row
    ACCUM_CHROMA 3

    ; Each accumulator word now holds the sum over one 2x2 block:
    ; average, bias by +128, saturate to bytes and store four samples.
    psraw       mm6, 2
    paddw       mm6, [uvoff]
    packuswb    mm6, mm6
    movd        [eax], mm6

    psraw       mm7, 2
    paddw       mm7, [uvoff]
    packuswb    mm7, mm7
    movd        [ebp], mm7

    ; ---- advance to the next group of four pixel pairs ------------------
    lea         edx, [edx + 32]
    lea         ebx, [ebx + 8]
    lea         eax, [eax + 4]
    lea         ebp, [ebp + 4]

    dec         ecx
    jnz         .next_block

.done:
    emms

    pop     esi
    pop     edi
    pop     edx
    pop     ecx
    pop     ebx
    pop     eax
    pop     ebp

    ret