; X-Git-Url: https://git.sesse.net/?p=ccbs;a=blobdiff_plain;f=bigscreen%2Ftinyptc%2Fmmx.s;fp=bigscreen%2Ftinyptc%2Fmmx.s;h=6509c4759c9564ff6516ae22e43b69de3bf29e56;hp=0000000000000000000000000000000000000000;hb=140a0aae7299d15459fe9ec74ad5020887e0a960;hpb=fd56777b1051147ba14850b72f1f959cd82a1827
; diff --git a/bigscreen/tinyptc/mmx.s b/bigscreen/tinyptc/mmx.s
; new file mode 100644
; index 0000000..6509c47
; --- /dev/null
; +++ b/bigscreen/tinyptc/mmx.s
; @@ -0,0 +1,582 @@
;
; TinyPTC x11 v0.7.3 MMX-Optimized pixelformat converters
; Copyright (C) 2000-2002 Alessandro Gatti
; Copyright (C) 2000-2001 Glenn Fiedler
;
; http://www.sourceforge.net/projects/tinyptc/
;
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.
;
; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with this library; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
;

%include "mmx.i"

bits 32

;-----------------------------------------------------------------------
; Syntax: NASM (Intel operand order), 32-bit code.
;
; Every routine in this file reads three stack arguments after setting
; up an ebp frame and returns with a plain `ret` (the caller removes the
; arguments):
;       [ebp+8]   destination pointer
;       [ebp+12]  source pointer
;       [ebp+16]  count  - bytes for mmx_memcpy,
;                          32-bit source pixels for the converters
; All general-purpose registers are saved/restored with pushad/popad and
; the MMX state is released with emms before returning.
;
; Register-layout comments such as "A R G B" list the bytes of a source
; pixel from most to least significant (B is the byte at the lowest
; address).
;-----------------------------------------------------------------------

%ifdef __PTC_MMX__
global mmx_memcpy
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_32_BGR888
global mmx_convert_32_to_32_bgr888
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_24_RGB888
global mmx_convert_32_to_24_rgb888
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_24_BGR888
global mmx_convert_32_to_24_bgr888
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB565
global mmx_convert_32_to_16_rgb565
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR565
global mmx_convert_32_to_16_bgr565
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
global mmx_convert_32_to_16_rgb555
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555
global mmx_convert_32_to_16_bgr555
%endif

section .data

align 16

; Each constant is a 64-bit value holding the same 32-bit mask twice, so
; one MMX operation treats two source pixels at once.

mmx_rgb888_mask dd 00ffffffh,00ffffffh          ; keep the low three bytes, drop the top byte

mmx_rgb565_b    dd 000000f8h, 000000f8h         ; top 5 bits of byte 0
mmx_rgb565_g    dd 0000fc00h, 0000fc00h         ; top 6 bits of byte 1
mmx_rgb565_r    dd 00f80000h, 00f80000h         ; top 5 bits of byte 2

mmx_rgb555_rb   dd 00f800f8h,00f800f8h          ; top 5 bits of bytes 0 and 2
mmx_rgb555_g    dd 0000f800h,0000f800h          ; top 5 bits of byte 1
mmx_rgb555_mul  dd 20000008h,20000008h          ; pmaddwd factors: low word * 8, high word * 2000h
mmx_bgr555_mul  dd 00082000h,00082000h          ; pmaddwd factors: low word * 2000h, high word * 8

section .text

%ifdef __PTC_MMX__

;-----------------------------------------------------------------------
; mmx_memcpy(destination, source, bytes)
;
; Copies `bytes` bytes from source to destination: whole 64-byte blocks
; go through mm0-mm7, the remaining 0..63 bytes through rep movsb.
;
; Fix: when `bytes` is below 64 there are no whole blocks; the block loop
; is now skipped instead of being entered with a zero counter (which
; wrapped on the first `dec` and ran far past both buffers).
;-----------------------------------------------------------------------

align 16

mmx_memcpy:

        push    ebp
        mov     ebp, esp

        pushad

        mov     edi, [ebp+8]            ; destination
        mov     esi, [ebp+12]           ; source
        mov     ecx, [ebp+16]           ; bytes

        mov     eax, ecx
        and     eax, BYTE 63            ; eax = bytes left after the 64-byte blocks
        shr     ecx, 6                  ; ecx = number of whole 64-byte blocks
        jz      .tail                   ; no whole block: go straight to the byte copy

align 16

.loop:

        movq    mm0, [esi]
        movq    mm1, [esi+8]
        movq    mm2, [esi+16]
        movq    mm3, [esi+24]
        movq    mm4, [esi+32]
        movq    mm5, [esi+40]
        movq    mm6, [esi+48]
        movq    mm7, [esi+56]
        movq    [edi], mm0
        movq    [edi+8], mm1
        movq    [edi+16], mm2
        movq    [edi+24], mm3
        movq    [edi+32], mm4
        movq    [edi+40], mm5
        movq    [edi+48], mm6
        movq    [edi+56], mm7
        add     esi, 8*8
        add     edi, 8*8
        dec     ecx
        jnz     .loop

.tail:

        mov     ecx, eax                ; remaining 0..63 bytes
        rep movsb

        emms

        popad

        pop     ebp
        ret

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_32_BGR888

;-----------------------------------------------------------------------
; mmx_convert_32_to_32_bgr888
;
; Not implemented: returns immediately without touching either buffer.
;-----------------------------------------------------------------------

align 16

mmx_convert_32_to_32_bgr888:

        ret

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_24_RGB888

;-----------------------------------------------------------------------
; mmx_convert_32_to_24_rgb888(destination, source, pixels)
;
; Copies the low three bytes of every 4-byte source pixel to a packed
; 3-byte destination pixel.  Four pixels (16 source bytes -> 12
; destination bytes) are handled per MMX iteration; the remaining 0..3
; pixels are copied byte by byte.
;-----------------------------------------------------------------------

align 16

mmx_convert_32_to_24_rgb888:

        push    ebp
        mov     ebp, esp

        pushad

        mov     edi, [ebp+8]            ; destination
        mov     esi, [ebp+12]           ; source
        mov     ecx, [ebp+16]           ; pixel count

        ; set up mm6 as the mask, mm7 as zero
        movq    mm6, qword [mmx_rgb888_mask]
        pxor    mm7, mm7

        mov     edx, ecx                ; keep the full count for the tail
        and     ecx, 0fffffffch         ; pixels handled four at a time
        jz      .tail

.quad:

        movq    mm0, [esi]              ; A R G B a r g b   (pixels 1,0)
        pand    mm0, mm6                ; 0 R G B 0 r g b
        movq    mm1, [esi+8]            ; A R G B a r g b   (pixels 3,2)
        pand    mm1, mm6                ; 0 R G B 0 r g b

        movq    mm2, mm0                ; 0 R G B 0 r g b
        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
        psllq   mm2, 24                 ; 0 0 R G B 0 0 0
        por     mm0, mm2                ; 0 0 R G B r g b

        movq    mm3, mm1                ; 0 R G B 0 r g b
        psllq   mm3, 48                 ; g b 0 0 0 0 0 0
        por     mm0, mm3                ; g b R G B r g b   (first 8 output bytes)

        movq    mm4, mm1                ; 0 R G B 0 r g b
        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
        psrlq   mm1, 16                 ; 0 0 0 0 0 0 0 r
        psllq   mm4, 8                  ; 0 0 0 0 R G B 0
        por     mm1, mm4                ; 0 0 0 0 R G B r   (last 4 output bytes)

        movq    [edi], mm0
        add     esi, BYTE 16
        movd    [edi+8], mm1
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0..3 pixels left over
        jz      .done

.single:
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]             ; edx is free here: the count already lives in ecx
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .single

.done:

        emms

        popad

        pop     ebp
        ret

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_24_BGR888

;-----------------------------------------------------------------------
; mmx_convert_32_to_24_bgr888
;
; Not implemented: returns immediately without touching either buffer.
;-----------------------------------------------------------------------

align 16

mmx_convert_32_to_24_bgr888:

        ret

%endif


%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB565

;-----------------------------------------------------------------------
; mmx_convert_32_to_16_rgb565(destination, source, pixels)
;
; Packs each 4-byte source pixel into 16 bits: byte 2 (top 5 bits) in
; bits 15..11, byte 1 (top 6 bits) in bits 10..5, byte 0 (top 5 bits) in
; bits 4..0.  Four pixels per MMX iteration, then a scalar loop for the
; remaining 0..3 pixels.
;-----------------------------------------------------------------------

align 16

mmx_convert_32_to_16_rgb565:

        push    ebp
        mov     ebp, esp

        pushad

        mov     edi, [ebp+8]            ; destination
        mov     esi, [ebp+12]           ; source
        mov     ecx, [ebp+16]           ; pixel count

        ; set up masks
        movq    mm5, qword [mmx_rgb565_b]
        movq    mm6, qword [mmx_rgb565_g]
        movq    mm7, qword [mmx_rgb565_r]

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; number of four-pixel groups
        jz      .tail

.quad:
        movq    mm0, [esi]              ; argb
        movq    mm1, mm0                ; argb
        pand    mm0, mm6                ; 00g0
        movq    mm3, mm1                ; argb
        pand    mm1, mm5                ; 000b
        pand    mm3, mm7                ; 0r00
        pslld   mm1, 2                  ; 0 0 000000bb bbb00000
        por     mm0, mm1                ; 0 0 ggggggbb bbb00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggbbbbb

        movq    mm4, [esi+8]            ; argb
        movq    mm2, mm4                ; argb
        pand    mm4, mm6                ; 00g0
        movq    mm1, mm2                ; argb
        pand    mm2, mm5                ; 000b
        pand    mm1, mm7                ; 0r00
        pslld   mm2, 2                  ; 0 0 000000bb bbb00000
        por     mm4, mm2                ; 0 0 ggggggbb bbb00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggbbbbb

        packuswb mm3, mm1               ; red lands in the high byte of each 16-bit lane
        packssdw mm0, mm4               ; green/blue narrowed to one 16-bit lane per pixel
        por     mm0, mm3                ; done.
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0..3 pixels left over
        jz      .done

.single:
        mov     al, [esi]
        mov     bh, [esi+1]             ; other bits of ebx are masked off below
        mov     ah, [esi+2]
        shr     al, 3
        and     eax, 0F81Fh             ; bits 15..11 from byte 2, bits 4..0 from byte 0
        shr     ebx, 5
        and     ebx, 07E0h              ; bits 10..5 from byte 1
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .single

.done:

        emms

        popad

        pop     ebp
        ret

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR565

;-----------------------------------------------------------------------
; mmx_convert_32_to_16_bgr565(destination, source, pixels)
;
; Same packing as the rgb565 converter with source bytes 0 and 2
; exchanged: byte 0 (top 5 bits) in bits 15..11, byte 1 (top 6 bits) in
; bits 10..5, byte 2 (top 5 bits) in bits 4..0.
;-----------------------------------------------------------------------

align 16

mmx_convert_32_to_16_bgr565:

        push    ebp
        mov     ebp, esp

        pushad

        mov     edi, [ebp+8]            ; destination
        mov     esi, [ebp+12]           ; source
        mov     ecx, [ebp+16]           ; pixel count

        movq    mm5, qword [mmx_rgb565_r]
        movq    mm6, qword [mmx_rgb565_g]
        movq    mm7, qword [mmx_rgb565_b]

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; number of four-pixel groups
        jz      .tail

.quad:
        movq    mm0, [esi]              ; a r g b
        movq    mm1, mm0                ; a r g b
        pand    mm0, mm6                ; 0 0 g 0
        movq    mm3, mm1                ; a r g b
        pand    mm1, mm5                ; 0 r 0 0
        pand    mm3, mm7                ; 0 0 0 b

        psllq   mm3, 16                 ; 0 b 0 0
        psrld   mm1, 14                 ; 0 0 000000rr rrr00000
        por     mm0, mm1                ; 0 0 ggggggrr rrr00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggrrrrr

        movq    mm4, [esi+8]            ; a r g b
        movq    mm2, mm4                ; a r g b
        pand    mm4, mm6                ; 0 0 g 0
        movq    mm1, mm2                ; a r g b
        pand    mm2, mm5                ; 0 r 0 0
        pand    mm1, mm7                ; 0 0 0 b

        psllq   mm1, 16                 ; 0 b 0 0
        psrld   mm2, 14                 ; 0 0 000000rr rrr00000
        por     mm4, mm2                ; 0 0 ggggggrr rrr00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggrrrrr

        packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
        packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
        por     mm0, mm3                ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad

.tail:
        and     edx, BYTE 3             ; 0..3 pixels left over; edx itself is the counter
        jz      .done

.single:
        mov     al, [esi+2]
        mov     bh, [esi+1]             ; other bits of ebx are masked off below
        mov     ah, [esi]
        shr     al, 3
        and     eax, 0F81Fh             ; bits 15..11 from byte 0, bits 4..0 from byte 2
        shr     ebx, 5
        and     ebx, 07E0h              ; bits 10..5 from byte 1
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .single

.done:

        emms

        popad

        pop     ebp
        ret

%endif

;-----------------------------------------------------------------------
; mmx_convert_32_to_16_bgr555(destination, source, pixels)
; mmx_convert_32_to_16_rgb555(destination, source, pixels)
;
; Both entry points only load their pmaddwd multiplier into mm7 and then
; run the shared body at _convert_bgr555_cheat.
;
; Output per pixel, rgb555: byte 2 (top 5 bits) in bits 14..10, byte 1
; (top 5 bits) in bits 9..5, byte 0 (top 5 bits) in bits 4..0.  The
; bgr555 multiplier exchanges the positions of bytes 0 and 2.
;
; Fixes relative to the previous revision:
;   * the shared body is assembled when EITHER converter is enabled
;     (it used to require both, leaving a lone entry point without code);
;   * the 0..7 leftover pixels go through the same pmaddwd arithmetic as
;     the main loop, so bgr555 output keeps its channel order there too
;     (the scalar tail always produced rgb555 order);
;   * the main loop no longer preloads the next 16 source bytes, which
;     read past the end of the source buffer on the final iteration.
;-----------------------------------------------------------------------

%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555

align 16

mmx_convert_32_to_16_bgr555:

        ; the 16BGR555 converter is identical to the RGB555 one,
        ; except it uses a different multiplier for the pmaddwd
        ; instruction. cool huh.

        movq    mm7, qword [mmx_bgr555_mul]

%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
        jmp     _convert_bgr555_cheat   ; hop over the rgb555 entry point
%endif

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555

        ; This is the same as the Intel version.. they obviously went to
        ; much more trouble to expand/coil the loop than I did, so theirs
        ; would almost certainly be faster, even if only a little.
        ; I did rename 'mmx_rgb555_add' to 'mmx_rgb555_mul', which is
        ; (I think) a more accurate name..

align 16

mmx_convert_32_to_16_rgb555:

        movq    mm7, qword [mmx_rgb555_mul]

%endif

; The shared body is needed as soon as one of the two entry points exists.
%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
%define __PTC_MMX_CONVERT_32_TO_16_X555_BODY
%endif
%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555
%define __PTC_MMX_CONVERT_32_TO_16_X555_BODY
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_X555_BODY

; In:  mm7 = pmaddwd multiplier chosen by the entry point;
;      stack arguments as for every converter in this file.
; Uses mm5 = red/blue mask, mm6 = green mask, mm0-mm3 as scratch.

_convert_bgr555_cheat:

        movq    mm6, qword [mmx_rgb555_g]
        push    ebp
        mov     ebp, esp

        pushad

        mov     edi, [ebp+8]            ; destination
        mov     esi, [ebp+12]           ; source
        mov     ecx, [ebp+16]           ; pixel count

        movq    mm5, qword [mmx_rgb555_rb]

        mov     edx, ecx                ; keep the full count for the tail

        and     ecx, BYTE -8            ; pixels handled eight at a time
        jz      .tail

.block:
        ; pixels 0..3 -> [edi]
        movq    mm0, [esi]
        movq    mm2, [esi+8]
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5                ; bytes 0 and 2, one per 16-bit word
        pand    mm3, mm5
        pmaddwd mm1, mm7                ; scale both words and add them into one dword
        pmaddwd mm3, mm7
        pand    mm0, mm6                ; byte 1 (top 5 bits), still at bits 15..11
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6                  ; drop the 6 spare low bits -> 15-bit pixel
        psrld   mm3, 6
        packssdw mm1, mm3               ; four dwords -> four 16-bit pixels
        movq    [edi], mm1

        ; pixels 4..7 -> [edi+8]
        movq    mm0, [esi+16]
        movq    mm2, [esi+24]
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5
        pand    mm3, mm5
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6
        psrld   mm3, 6
        packssdw mm1, mm3
        movq    [edi+8], mm1

        add     esi, BYTE 32
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     .block

.tail:
        mov     ecx, edx

        and     ecx, BYTE 7             ; 0..7 pixels left over
        jz      .done

.single:
        movd    mm0, [esi]              ; exactly one 4-byte pixel, upper half zeroed
        add     esi, BYTE 4

        movq    mm1, mm0
        pand    mm1, mm5
        pmaddwd mm1, mm7                ; same multiplier as the main loop keeps the channel order
        pand    mm0, mm6
        por     mm1, mm0
        psrld   mm1, 6

        movd    eax, mm1
        mov     [edi], ax
        add     edi, BYTE 2

        dec     ecx
        jnz     .single

.done:

        emms

        popad

        pop     ebp
        ret

%endif