+++ /dev/null
-;
-; TinyPTC x11 v0.7.3 MMX-Optimized pixelformat converters
-; Copyright (C) 2000-2002 Alessandro Gatti <a.gatti@tiscali.it>
-; Copyright (C) 2000-2001 Glenn Fiedler <gaffer@gaffer.org>
-;
-; http://www.sourceforge.net/projects/tinyptc/
-;
-; This library is free software; you can redistribute it and/or
-; modify it under the terms of the GNU Lesser General Public
-; License as published by the Free Software Foundation; either
-; version 2 of the License, or (at your option) any later version.
-;
-; This library is distributed in the hope that it will be useful,
-; but WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-; Lesser General Public License for more details.
-;
-; You should have received a copy of the GNU Lesser General Public
-; License along with this library; if not, write to the Free Software
-; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-;
-
-%include "mmx.i"
-
-bits 32
-
-; Exported symbols. Each entry point is declared global only when the
-; matching build-time feature macro is defined, mirroring the %ifdef that
-; guards its implementation further down in this file.
-%ifdef __PTC_MMX__
-global mmx_memcpy
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_32_BGR888
-global mmx_convert_32_to_32_bgr888
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_24_RGB888
-global mmx_convert_32_to_24_rgb888
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_24_BGR888
-global mmx_convert_32_to_24_bgr888
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB565
-global mmx_convert_32_to_16_rgb565
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR565
-global mmx_convert_32_to_16_bgr565
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
-global mmx_convert_32_to_16_rgb555
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555
-global mmx_convert_32_to_16_bgr555
-%endif
-
-; Constant operands for the converters below. Every entry is two identical
-; dwords (8 bytes) so it can be used as one 64-bit MMX operand that applies
-; the same value to both 32-bit pixels held in a register.
-section .data
-
-align 16
-
-; keeps the low 24 bits of each dword (clears the top byte)
-mmx_rgb888_mask dd 00ffffffh,00ffffffh
-
-; 5-6-5 field masks: top 5 bits of byte 0, top 6 bits of byte 1,
-; top 5 bits of byte 2 of each dword
-mmx_rgb565_b dd 000000f8h, 000000f8h
-mmx_rgb565_g dd 0000fc00h, 0000fc00h
-mmx_rgb565_r dd 00f80000h, 00f80000h
-
-; 5-5-5 field masks: top 5 bits of bytes 0 and 2 together, and of byte 1
-mmx_rgb555_rb dd 00f800f8h,00f800f8h
-mmx_rgb555_g dd 0000f800h,0000f800h
-; pmaddwd factor pairs (one 16-bit factor per word of each dword):
-; rgb555: low word * 0008h + high word * 2000h
-; bgr555: low word * 2000h + high word * 0008h
-mmx_rgb555_mul dd 20000008h,20000008h
-mmx_bgr555_mul dd 00082000h,00082000h
-
-section .text
-
-%ifdef __PTC_MMX__
-
-align 16
-
-; mmx_memcpy(destination, source, bytes)
-;
-; Copies `bytes` bytes from source to destination: whole 64-byte blocks go
-; through the eight MMX registers, the remaining 0..63 bytes through
-; `rep movsb`.
-;
-; In:    [ebp+8]  = destination pointer
-;        [ebp+12] = source pointer
-;        [ebp+16] = byte count
-; Out:   nothing; all general registers are restored by pushad/popad
-; Notes: arguments are read from the stack and left there by the plain
-;        `ret`. MMX state is released with emms before returning.
-;        `rep movsb` copies upwards only if the direction flag is clear --
-;        assumed from the caller's ABI, TODO confirm.
-mmx_memcpy:
-
- push ebp
- mov ebp,esp
-
- pushad
-
- mov edi,[ebp+8] ; destination
- mov esi,[ebp+12] ; source
- mov ecx,[ebp+16] ; bytes
-
- mov eax,ecx
- and eax,BYTE 63 ; eax = tail length (bytes mod 64)
- shr ecx,6 ; ecx = number of whole 64-byte blocks
- jz .tail ; none: entering the loop with ecx = 0 would wrap the counter
-
-align 16
-
-.loop:
-
- ; load one 64-byte block ...
- movq mm0,[esi]
- movq mm1,[esi+8]
- movq mm2,[esi+16]
- movq mm3,[esi+24]
- movq mm4,[esi+32]
- movq mm5,[esi+40]
- movq mm6,[esi+48]
- movq mm7,[esi+56]
- ; ... and store it
- movq [edi],mm0
- movq [edi+8],mm1
- movq [edi+16],mm2
- movq [edi+24],mm3
- movq [edi+32],mm4
- movq [edi+40],mm5
- movq [edi+48],mm6
- movq [edi+56],mm7
- add esi,8*8
- add edi,8*8
- dec ecx
- jnz .loop
-
-.tail:
-
- mov ecx,eax ; remaining 0..63 bytes (rep with ecx = 0 copies nothing)
- rep movsb
-
- emms
-
- popad
-
- pop ebp
- ret
-
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_32_BGR888
-
-align 16
-
-; Unimplemented stub: returns immediately without reading its arguments
-; or writing anything to the destination.
-mmx_convert_32_to_32_bgr888:
-
- ret
-
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_24_RGB888
-
-align 16
-
-; mmx_convert_32_to_24_rgb888(destination, source, count)
-;
-; Packs 32-bit pixels into 24-bit pixels: the top byte of every source
-; dword is dropped and the three low bytes are stored in their original
-; order.
-;
-; In:    [ebp+8]  = destination pointer (3 bytes written per pixel)
-;        [ebp+12] = source pointer (4 bytes read per pixel)
-;        [ebp+16] = count; one unit is one 32-bit source pixel
-; Out:   nothing; general registers are restored by pushad/popad and the
-;        MMX state is released with emms
-;
-; Lane diagrams list the eight bytes of a register from most to least
-; significant; the digit is the pixel index inside the current group.
-mmx_convert_32_to_24_rgb888:
-
- push ebp
- mov ebp,esp
- pushad
-
- mov edi,[ebp+8] ; edi = destination
- mov esi,[ebp+12] ; esi = source
- mov ecx,[ebp+16] ; ecx = pixel count
-
- pxor mm7,mm7 ; mm7 = 0, zero filler for the unpacks
- movq mm6,qword [mmx_rgb888_mask] ; mm6 = 00ffffffh in both dwords
-
- mov edx,ecx ; edx keeps the full count for the tail
- and ecx,0fffffffch ; ecx = count rounded down to a multiple of 4
- jz .tail ; no complete group of four pixels
-
-.quad:
-
- ; four pixels (16 bytes) in, 12 bytes out
- movq mm0,[esi] ; A1 R1 G1 B1 A0 R0 G0 B0
- movq mm1,[esi+8] ; A3 R3 G3 B3 A2 R2 G2 B2
- pand mm0,mm6 ; 0 R1 G1 B1 0 R0 G0 B0
- pand mm1,mm6 ; 0 R3 G3 B3 0 R2 G2 B2
-
- ; output bytes 0..5: pixel 0 followed by pixel 1
- movq mm2,mm0
- punpckhdq mm2,mm7 ; 0 0 0 0 0 R1 G1 B1
- punpckldq mm0,mm7 ; 0 0 0 0 0 R0 G0 B0
- psllq mm2,24 ; 0 0 R1 G1 B1 0 0 0
- por mm0,mm2 ; 0 0 R1 G1 B1 R0 G0 B0
-
- ; output bytes 6..7: the two low bytes of pixel 2
- movq mm3,mm1
- psllq mm3,48 ; G2 B2 0 0 0 0 0 0
- por mm0,mm3 ; G2 B2 R1 G1 B1 R0 G0 B0
-
- ; output bytes 8..11: last byte of pixel 2, then pixel 3
- movq mm4,mm1
- punpckhdq mm4,mm7 ; 0 0 0 0 0 R3 G3 B3
- punpckldq mm1,mm7 ; 0 0 0 0 0 R2 G2 B2
- psrlq mm1,16 ; 0 0 0 0 0 0 0 R2
- psllq mm4,8 ; 0 0 0 0 R3 G3 B3 0
- por mm1,mm4 ; 0 0 0 0 R3 G3 B3 R2
-
- movq [edi],mm0
- movd [edi+8],mm1
- add esi,BYTE 16
- add edi,BYTE 12
- sub ecx,BYTE 4
- jnz .quad
-
-.tail:
-
- mov ecx,edx
- and ecx,BYTE 3 ; 0..3 pixels left over
- jz .done
-
-.single:
-
- ; copy three bytes, skip the fourth; edx is free to clobber here
- mov al,[esi]
- mov bl,[esi+1]
- mov dl,[esi+2]
- mov [edi],al
- mov [edi+1],bl
- mov [edi+2],dl
- add esi,BYTE 4
- add edi,BYTE 3
- dec ecx
- jnz .single
-
-.done:
-
- emms
-
- popad
-
- pop ebp
- ret
-
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_24_BGR888
-
-align 16
-
-; Unimplemented stub: returns immediately without reading its arguments
-; or writing anything to the destination.
-mmx_convert_32_to_24_bgr888:
-
- ret
-
-%endif
-
-
-%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB565
-
-align 16
-
-; mmx_convert_32_to_16_rgb565(destination, source, count)
-;
-; Converts 32-bit pixels to 16-bit 5-6-5 words. For each source dword the
-; result is
-;   (byte2 >> 3) << 11  |  (byte1 >> 2) << 5  |  (byte0 >> 3)
-; (byte0 is the least significant byte; the top byte is ignored).
-;
-; In:    [ebp+8]  = destination pointer (2 bytes written per pixel)
-;        [ebp+12] = source pointer (4 bytes read per pixel)
-;        [ebp+16] = count; one unit is one 32-bit source pixel
-; Out:   nothing; general registers are restored by pushad/popad and the
-;        MMX state is released with emms
-mmx_convert_32_to_16_rgb565:
-
- push ebp
- mov ebp,esp
- pushad
-
- mov edi,[ebp+8] ; edi = destination
- mov esi,[ebp+12] ; esi = source
- mov ecx,[ebp+16] ; ecx = pixel count
-
- movq mm5,[mmx_rgb565_b] ; byte 0 field mask
- movq mm6,[mmx_rgb565_g] ; byte 1 field mask
- movq mm7,[mmx_rgb565_r] ; byte 2 field mask
-
- mov edx,ecx ; edx keeps the full count for the tail
- shr ecx,2 ; ecx = number of 4-pixel groups
- jz .tail
-
-.quad:
-
- ; pixels 0 and 1: low 11 bits into mm0, byte 2 field kept in mm3
- movq mm0,[esi]
- movq mm1,mm0
- movq mm3,mm0
- pand mm0,mm6 ; byte 1 field, bits 10..15
- pand mm1,mm5 ; byte 0 field, bits 3..7
- pand mm3,mm7 ; byte 2 field, bits 19..23
- pslld mm1,2 ; byte 0 field -> bits 5..9
- por mm0,mm1
- psrld mm0,5 ; per dword: 00000ggg gggbbbbb
-
- ; pixels 2 and 3: low 11 bits into mm4, byte 2 field kept in mm1
- movq mm4,[esi+8]
- movq mm2,mm4
- movq mm1,mm4
- pand mm4,mm6
- pand mm2,mm5
- pand mm1,mm7
- pslld mm2,2
- por mm4,mm2
- psrld mm4,5
-
- ; squeeze the four dwords into four words and merge the top field
- packuswb mm3,mm1 ; each word: rrrrr000 00000000
- packssdw mm0,mm4 ; each word: 00000ggg gggbbbbb
- por mm0,mm3
- movq [edi],mm0
-
- add esi,BYTE 16
- add edi,BYTE 8
- dec ecx
- jnz .quad
-
-.tail:
-
- mov ecx,edx
- and ecx,BYTE 3 ; 0..3 pixels left over
- jz .done
-
-.single:
-
- mov al,[esi] ; byte 0
- mov bh,[esi+1] ; byte 1 in bits 8..15 of ebx
- mov ah,[esi+2] ; byte 2 in bits 8..15 of eax
- shr al,3
- and eax,0F81Fh ; keep bits 11..15 and 0..4
- shr ebx,5
- and ebx,07E0h ; keep bits 5..10 (stale ebx bits are masked off)
- or eax,ebx ; fields do not overlap
- mov [edi],ax
- add esi,BYTE 4
- add edi,BYTE 2
- dec ecx
- jnz .single
-
-.done:
-
- emms
-
- popad
-
- pop ebp
- ret
-
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR565
-
-align 16
-
-; mmx_convert_32_to_16_bgr565(destination, source, count)
-;
-; Converts 32-bit pixels to 16-bit 5-6-5 words with the outer fields
-; swapped relative to the rgb565 routine. For each source dword the
-; result is
-;   (byte0 >> 3) << 11  |  (byte1 >> 2) << 5  |  (byte2 >> 3)
-; (byte0 is the least significant byte; the top byte is ignored).
-;
-; In:    [ebp+8]  = destination pointer (2 bytes written per pixel)
-;        [ebp+12] = source pointer (4 bytes read per pixel)
-;        [ebp+16] = count; one unit is one 32-bit source pixel
-; Out:   nothing; general registers are restored by pushad/popad and the
-;        MMX state is released with emms
-mmx_convert_32_to_16_bgr565:
-
- push ebp
- mov ebp,esp
- pushad
-
- mov edi,[ebp+8] ; edi = destination
- mov esi,[ebp+12] ; esi = source
- mov ecx,[ebp+16] ; ecx = pixel count
-
- movq mm5,[mmx_rgb565_r] ; byte 2 field mask
- movq mm6,[mmx_rgb565_g] ; byte 1 field mask
- movq mm7,[mmx_rgb565_b] ; byte 0 field mask
-
- mov edx,ecx ; edx keeps the full count for the tail
- shr ecx,2 ; ecx = number of 4-pixel groups
- jz .tail
-
-.quad:
-
- ; pixels 0 and 1: low 11 bits into mm0, byte 0 field kept in mm3
- movq mm0,[esi]
- movq mm1,mm0
- movq mm3,mm0
- pand mm0,mm6 ; byte 1 field, bits 10..15
- pand mm1,mm5 ; byte 2 field, bits 19..23
- pand mm3,mm7 ; byte 0 field, bits 3..7
- psllq mm3,16 ; byte 0 field -> bits 19..23 of its dword
- psrld mm1,14 ; byte 2 field -> bits 5..9
- por mm0,mm1
- psrld mm0,5 ; per dword: 00000ggg gggrrrrr
-
- ; pixels 2 and 3: low 11 bits into mm4, byte 0 field kept in mm1
- movq mm4,[esi+8]
- movq mm2,mm4
- movq mm1,mm4
- pand mm4,mm6
- pand mm2,mm5
- pand mm1,mm7
- psllq mm1,16
- psrld mm2,14
- por mm4,mm2
- psrld mm4,5
-
- ; squeeze the four dwords into four words and merge the top field
- packuswb mm3,mm1 ; each word: bbbbb000 00000000
- packssdw mm0,mm4 ; each word: 00000ggg gggrrrrr
- por mm0,mm3
- movq [edi],mm0
-
- add esi,BYTE 16
- add edi,BYTE 8
- dec ecx
- jnz .quad
-
-.tail:
-
- and edx,BYTE 3 ; 0..3 pixels left over, counted in edx
- jz .done
-
-.single:
-
- mov al,[esi+2] ; byte 2
- mov bh,[esi+1] ; byte 1 in bits 8..15 of ebx
- mov ah,[esi] ; byte 0 in bits 8..15 of eax
- shr al,3
- and eax,0F81Fh ; keep bits 11..15 and 0..4
- shr ebx,5
- and ebx,07E0h ; keep bits 5..10 (stale ebx bits are masked off)
- or eax,ebx ; fields do not overlap
- mov [edi],ax
- add esi,BYTE 4
- add edi,BYTE 2
- dec edx
- jnz .single
-
-.done:
-
- emms
-
- popad
-
- pop ebp
- ret
-
-%endif
-
-; 32-bit -> 16-bit 5-5-5 converters.
-;
-; mmx_convert_32_to_16_bgr555(destination, source, count)
-; mmx_convert_32_to_16_rgb555(destination, source, count)
-;
-; Both entry points share one body; they differ only in the pmaddwd
-; multiplier they place in mm7 before reaching it. With byte0 the least
-; significant byte of a source dword, the 16-bit result is
-;   rgb555: (byte2 >> 3) << 10 | (byte1 >> 3) << 5 | (byte0 >> 3)
-;   bgr555: (byte0 >> 3) << 10 | (byte1 >> 3) << 5 | (byte2 >> 3)
-;
-; In:    [ebp+8]  = destination pointer (2 bytes written per pixel)
-;        [ebp+12] = source pointer (4 bytes read per pixel)
-;        [ebp+16] = count; one unit is one 32-bit source pixel
-; Out:   nothing; general registers are restored by pushad/popad and the
-;        MMX state is released with emms
-;
-; The shared body is assembled whenever at least one of the two feature
-; macros is defined, so either converter can be built on its own.
-
-%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555
-
-align 16
-
-mmx_convert_32_to_16_bgr555:
-
- movq mm7,qword [mmx_bgr555_mul]
-
-%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
- jmp _convert_bgr555_cheat ; step over the rgb555 entry point below
-%endif
-
-%define __PTC_MMX_CONVERT_555_SHARED
-
-%endif
-
-%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
-
-align 16
-
-mmx_convert_32_to_16_rgb555:
-
- movq mm7,qword [mmx_rgb555_mul]
-
-%ifndef __PTC_MMX_CONVERT_555_SHARED
-%define __PTC_MMX_CONVERT_555_SHARED
-%endif
-
-%endif
-
-%ifdef __PTC_MMX_CONVERT_555_SHARED
-
-; Shared body. On entry mm7 holds the multiplier selected by the entry
-; point; execution arrives by fall-through or by the jmp above, so the
-; stack still looks exactly as it did at the public entry point.
-_convert_bgr555_cheat:
-
- movq mm6,qword [mmx_rgb555_g] ; mm6 = byte 1 field mask
- movq mm5,qword [mmx_rgb555_rb] ; mm5 = byte 0 / byte 2 field mask
-
- push ebp
- mov ebp,esp
-
- pushad
-
- mov edi,[ebp+8] ; destination
- mov esi,[ebp+12] ; source
- mov ecx,[ebp+16] ; pixel count
-
- mov edx,ecx ; edx keeps the full count for the tail
- and ecx,0fffffffch ; ecx = count rounded down to a multiple of 4
- jz .tail
-
-.quad:
-
- ; Four pixels per pass. Nothing beyond the 16 bytes being converted is
- ; loaded, so the source is never read past its last pixel.
- movq mm0,[esi] ; pixels 0,1
- movq mm2,[esi+8] ; pixels 2,3
- movq mm1,mm0
- movq mm3,mm2
-
- pand mm1,mm5 ; outer fields, one per 16-bit word
- pand mm3,mm5
- pmaddwd mm1,mm7 ; one field -> bits 6..10, the other -> bits 16..20
- pmaddwd mm3,mm7
-
- pand mm0,mm6 ; middle field stays at bits 11..15
- pand mm2,mm6
- por mm1,mm0
- por mm3,mm2
-
- psrld mm1,6 ; 15-bit pixel in the low word of each dword
- psrld mm3,6
- packssdw mm1,mm3 ; values <= 7fffh, so the pack never saturates
-
- movq [edi],mm1
-
- add esi,BYTE 16
- add edi,BYTE 8
- sub ecx,BYTE 4
- jnz .quad
-
-.tail:
-
- mov ecx,edx
- and ecx,BYTE 3 ; 0..3 pixels left over
- jz .done
-
-.single:
-
- ; Same arithmetic as the main loop on a single pixel, so the layout
- ; follows mm7 for the remainder pixels as well.
- movd mm0,[esi]
- movq mm1,mm0
- pand mm1,mm5
- pmaddwd mm1,mm7
- pand mm0,mm6
- por mm1,mm0
- psrld mm1,6
- movd eax,mm1
- mov [edi],ax
-
- add esi,BYTE 4
- add edi,BYTE 2
- dec ecx
- jnz .single
-
-.done:
-
- emms
-
- popad
-
- pop ebp
- ret
-
-%endif