X-Git-Url: https://git.sesse.net/?p=ccbs;a=blobdiff_plain;f=bigscreen%2Ftinyptc%2Fmmx.s;fp=bigscreen%2Ftinyptc%2Fmmx.s;h=6509c4759c9564ff6516ae22e43b69de3bf29e56;hp=0000000000000000000000000000000000000000;hb=140a0aae7299d15459fe9ec74ad5020887e0a960;hpb=fd56777b1051147ba14850b72f1f959cd82a1827

diff --git a/bigscreen/tinyptc/mmx.s b/bigscreen/tinyptc/mmx.s
new file mode 100644
index 0000000..6509c47
--- /dev/null
+++ b/bigscreen/tinyptc/mmx.s
@@ -0,0 +1,582 @@
+;
+; TinyPTC x11 v0.7.3 MMX-Optimized pixelformat converters
+; Copyright (C) 2000-2002 Alessandro Gatti <a.gatti@tiscali.it>
+; Copyright (C) 2000-2001 Glenn Fiedler <gaffer@gaffer.org>
+;
+; http://www.sourceforge.net/projects/tinyptc/
+;
+; This library is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2 of the License, or (at your option) any later version.
+;
+; This library is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public
+; License along with this library; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+; 
+
+%include "mmx.i"
+
+bits 32                         ; assemble everything below as 32-bit code
+; Each entry point is exported only when its feature macro is defined (presumably by mmx.i or the build flags -- confirm).
+%ifdef __PTC_MMX__
+global mmx_memcpy
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_32_BGR888
+global mmx_convert_32_to_32_bgr888
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_24_RGB888
+global mmx_convert_32_to_24_rgb888
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_24_BGR888
+global mmx_convert_32_to_24_bgr888
+%endif 
+
+%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB565
+global mmx_convert_32_to_16_rgb565
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR565
+global mmx_convert_32_to_16_bgr565
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
+global mmx_convert_32_to_16_rgb555
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555
+global mmx_convert_32_to_16_bgr555
+%endif
+
+section .data
+; Each constant below is one 64-bit MMX operand: the same 32-bit pattern repeated for two pixels.
+align 16
+
+mmx_rgb888_mask dd 00ffffffh,00ffffffh    ; keeps bytes 0-2 of each pixel, clears byte 3
+; 565 channel masks, used by the rgb565 and bgr565 converters
+mmx_rgb565_b dd 000000f8h, 000000f8h      ; top 5 bits of byte 0
+mmx_rgb565_g dd 0000fc00h, 0000fc00h      ; top 6 bits of byte 1
+mmx_rgb565_r dd 00f80000h, 00f80000h      ; top 5 bits of byte 2
+; 555 masks and the pmaddwd word multipliers used by the rgb555/bgr555 converters
+mmx_rgb555_rb dd 00f800f8h,00f800f8h      ; top 5 bits of bytes 0 and 2
+mmx_rgb555_g dd 0000f800h,0000f800h       ; top 5 bits of byte 1
+mmx_rgb555_mul dd 20000008h,20000008h     ; low word 8, high word 2000h
+mmx_bgr555_mul dd 00082000h,00082000h     ; low word 2000h, high word 8
+
+section .text
+
+%ifdef __PTC_MMX__
+; mmx_memcpy: [ebp+8] = destination, [ebp+12] = source, [ebp+16] = byte count.
+; Copies whole 64-byte blocks through mm0-mm7, then the remaining 0..63 bytes
+; with rep movsb.  pushad/popad preserve every general register; emms releases
+; the MMX state before returning.
+align 16
+
+mmx_memcpy:
+
+    push ebp
+    mov ebp,esp
+    pushad
+
+    mov edi,[ebp+8]       ; destination
+    mov esi,[ebp+12]      ; source
+    mov ecx,[ebp+16]      ; bytes
+
+    mov eax,ecx
+    and eax,63            ; eax = bytes left over after the 64-byte blocks
+    shr ecx,6             ; ecx = number of whole 64-byte blocks, ZF set if none
+    jz .tail              ; skip the block loop: entering it with ecx = 0 would
+                          ; copy 64 bytes and let dec ecx wrap around
+align 16
+
+    .loop:
+        movq mm0,[esi]
+        movq mm1,[esi+8]
+        movq mm2,[esi+16]
+        movq mm3,[esi+24]
+        movq mm4,[esi+32]
+        movq mm5,[esi+40]
+        movq mm6,[esi+48]
+        movq mm7,[esi+56]
+        movq [edi],mm0
+        movq [edi+8],mm1
+        movq [edi+16],mm2
+        movq [edi+24],mm3
+        movq [edi+32],mm4
+        movq [edi+40],mm5
+        movq [edi+48],mm6
+        movq [edi+56],mm7
+        add esi,8*8
+        add edi,8*8
+        dec ecx
+        jnz .loop
+
+    .tail:
+    mov ecx,eax
+    rep movsb             ; trailing bytes (does nothing when ecx = 0)
+
+    emms
+
+    popad
+    pop ebp
+    ret
+
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_32_BGR888
+; Placeholder for the 32-bit -> 32-bit BGR888 converter: no conversion code is present.
+align 16
+
+mmx_convert_32_to_32_bgr888:
+
+    ret                   ; returns immediately; no argument is read and nothing is written
+
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_24_RGB888
+
+; mmx_convert_32_to_24_rgb888
+; Stack arguments: [ebp+8] = destination, [ebp+12] = source, [ebp+16] = count.
+; The count is consumed once per 4-byte source pixel, i.e. it is a pixel count.
+; Every pixel is written as its three low bytes; the fourth byte is dropped.
+; pushad/popad preserve the general registers; emms runs before returning.
+
+align 16
+
+mmx_convert_32_to_24_rgb888:
+
+        push    ebp
+        mov     ebp, esp
+        pushad
+
+        mov     edi, [ebp+8]            ; destination, 3 bytes per pixel
+        mov     esi, [ebp+12]           ; source, 4 bytes per pixel
+        mov     ecx, [ebp+16]           ; pixel count
+
+        pxor    mm7, mm7                ; mm7 = 0, zero filler for the unpacks
+        movq    mm6, qword [mmx_rgb888_mask] ; mm6 clears byte 3 of both pixels
+
+        mov     edx, ecx                ; full count, needed for the remainder
+        and     ecx, 0fffffffch         ; pixels that form complete groups of 4
+        jz      .remainder              ; no complete group at all
+
+        ; Byte diagrams list the most significant byte of the register first.
+.quad_loop:
+        movq    mm0, [esi]              ; A R G B a r g b   pixels 1 and 0
+        movq    mm1, [esi+8]            ; A R G B a r g b   pixels 3 and 2
+        pand    mm0, mm6                ; 0 R G B 0 r g b
+        pand    mm1, mm6                ; 0 R G B 0 r g b
+
+        movq    mm2, mm0
+        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B   pixel 1
+        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b   pixel 0
+        psllq   mm2, 24                 ; 0 0 R G B 0 0 0
+        por     mm0, mm2                ; 0 0 R G B r g b
+
+        movq    mm3, mm1
+        psllq   mm3, 48                 ; g b 0 0 0 0 0 0   low two bytes of pixel 2
+        por     mm0, mm3                ; g b R G B r g b   output bytes 0-7
+
+        movq    mm4, mm1
+        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B   pixel 3
+        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b   pixel 2
+        psrlq   mm1, 16                 ; 0 0 0 0 0 0 0 r   third byte of pixel 2
+        psllq   mm4, 8                  ; 0 0 0 0 R G B 0
+        por     mm1, mm4                ; 0 0 0 0 R G B r   output bytes 8-11
+
+        movq    [edi], mm0
+        movd    [edi+8], mm1
+        add     esi, BYTE 16            ; four source pixels consumed
+        add     edi, BYTE 12            ; twelve destination bytes produced
+        sub     ecx, BYTE 4
+        jnz     .quad_loop
+
+.remainder:
+        and     edx, BYTE 3             ; 0..3 pixels are left
+        jz      .done
+
+.single_loop:
+        mov     ax, [esi]               ; source bytes 0 and 1
+        mov     bl, [esi+2]             ; source byte 2
+        mov     [edi], ax
+        mov     [edi+2], bl
+        add     esi, BYTE 4
+        add     edi, BYTE 3
+        dec     edx
+        jnz     .single_loop
+
+.done:
+
+        emms
+
+        popad
+        pop     ebp
+        ret
+
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_24_BGR888
+; Placeholder for the 32-bit -> 24-bit BGR888 converter: no conversion code is present.
+align 16
+
+mmx_convert_32_to_24_bgr888:
+
+    ret                   ; returns immediately; no argument is read and nothing is written
+
+%endif
+
+
+%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB565
+; mmx_convert_32_to_16_rgb565
+; Stack arguments: [ebp+8] = destination, [ebp+12] = source, [ebp+16] = count
+; (a pixel count: one 4-byte source pixel yields one 2-byte destination pixel).
+; Output word: bits 11-15 = top 5 bits of source byte 2, bits 5-10 = top 6 bits
+; of byte 1, bits 0-4 = top 5 bits of byte 0.  General registers are preserved.
+align 16
+
+mmx_convert_32_to_16_rgb565:
+
+        push    ebp
+        mov     ebp, esp
+        pushad
+
+        mov     edi, [ebp+8]            ; destination
+        mov     esi, [ebp+12]           ; source
+        mov     ecx, [ebp+16]           ; pixel count
+
+        movq    mm5, [mmx_rgb565_b]     ; byte 0 mask (blue)
+        movq    mm6, [mmx_rgb565_g]     ; byte 1 mask (green)
+        movq    mm7, [mmx_rgb565_r]     ; byte 2 mask (red)
+
+        mov     edx, ecx                ; full count, needed for the remainder
+        shr     ecx, 2                  ; number of 4-pixel groups
+        jz      .remainder
+
+.quad_loop:
+        movq    mm0, [esi]              ; pixels 0 and 1
+        movq    mm4, [esi+8]            ; pixels 2 and 3
+        movq    mm1, mm0
+        movq    mm3, mm0
+        pand    mm0, mm6                ; green, bits 10-15
+        pand    mm1, mm5                ; blue, bits 3-7
+        pand    mm3, mm7                ; red, bits 19-23
+        pslld   mm1, 2                  ; blue -> bits 5-9
+        por     mm0, mm1                ; green and blue adjacent in bits 5-15
+        psrld   mm0, 5                  ; green in bits 5-10, blue in bits 0-4
+
+        movq    mm2, mm4
+        movq    mm1, mm4
+        pand    mm4, mm6                ; same steps for pixels 2 and 3
+        pand    mm2, mm5
+        pand    mm1, mm7                ; red of pixels 2 and 3
+        pslld   mm2, 2
+        por     mm4, mm2
+        psrld   mm4, 5
+
+        packuswb mm3, mm1               ; red bytes land in the high byte of each word
+        packssdw mm0, mm4               ; four green/blue words
+        por     mm0, mm3                ; four finished 565 pixels
+        movq    [edi], mm0
+
+        add     esi, 16
+        add     edi, 8
+        dec     ecx
+        jnz     .quad_loop
+
+.remainder:
+        mov     ecx, edx
+        and     ecx, BYTE 3             ; 0..3 pixels are left
+        jz      .done
+
+.single_loop:
+        movzx   eax, byte [esi+2]       ; red
+        movzx   ebx, byte [esi+1]       ; green
+        movzx   edx, byte [esi]         ; blue
+        and     eax, 0F8h               ; top 5 bits of red
+        and     ebx, 0FCh               ; top 6 bits of green
+        shl     eax, 8                  ; red   -> bits 11-15
+        shl     ebx, 3                  ; green -> bits 5-10
+        shr     edx, 3                  ; blue  -> bits 0-4
+        or      eax, ebx
+        or      eax, edx
+        mov     [edi], ax
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+        dec     ecx
+        jnz     .single_loop
+
+.done:
+        emms
+        popad
+        pop     ebp
+        ret
+
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR565
+; mmx_convert_32_to_16_bgr565
+; Stack arguments: [ebp+8] = destination, [ebp+12] = source, [ebp+16] = count
+; (a pixel count: one 4-byte source pixel yields one 2-byte destination pixel).
+; Output word: bits 11-15 = top 5 bits of source byte 0, bits 5-10 = top 6 bits
+; of byte 1, bits 0-4 = top 5 bits of byte 2.  General registers are preserved.
+align 16
+
+mmx_convert_32_to_16_bgr565:
+
+        push    ebp
+        mov     ebp, esp
+        pushad
+
+        mov     edi, [ebp+8]            ; destination
+        mov     esi, [ebp+12]           ; source
+        mov     ecx, [ebp+16]           ; pixel count
+
+        movq    mm5, [mmx_rgb565_r]     ; byte 2 mask (red)
+        movq    mm6, [mmx_rgb565_g]     ; byte 1 mask (green)
+        movq    mm7, [mmx_rgb565_b]     ; byte 0 mask (blue)
+
+        mov     edx, ecx                ; full count, needed for the remainder
+        shr     ecx, 2                  ; number of 4-pixel groups
+        jz      .remainder
+
+.quad_loop:
+        movq    mm0, [esi]              ; pixels 0 and 1
+        movq    mm4, [esi+8]            ; pixels 2 and 3
+        movq    mm1, mm0
+        movq    mm3, mm0
+        pand    mm0, mm6                ; green, bits 10-15
+        pand    mm1, mm5                ; red, bits 19-23
+        pand    mm3, mm7                ; blue, bits 3-7
+        psllq   mm3, 16                 ; blue -> bits 19-23 of its own dword
+        psrld   mm1, 14                 ; red -> bits 5-9
+        por     mm0, mm1                ; green and red adjacent in bits 5-15
+        psrld   mm0, 5                  ; green in bits 5-10, red in bits 0-4
+
+        movq    mm2, mm4
+        movq    mm1, mm4
+        pand    mm4, mm6                ; same steps for pixels 2 and 3
+        pand    mm2, mm5
+        pand    mm1, mm7                ; blue of pixels 2 and 3
+        psllq   mm1, 16
+        psrld   mm2, 14
+        por     mm4, mm2
+        psrld   mm4, 5
+
+        packuswb mm3, mm1               ; blue bytes land in the high byte of each word
+        packssdw mm0, mm4               ; four green/red words
+        por     mm0, mm3                ; four finished 565 pixels
+        movq    [edi], mm0
+
+        add     esi, BYTE 16
+        add     edi, BYTE 8
+        dec     ecx
+        jnz     .quad_loop
+
+.remainder:
+        mov     ecx, edx
+        and     ecx, BYTE 3             ; 0..3 pixels are left
+        jz      .done
+
+.single_loop:
+        movzx   eax, byte [esi]         ; blue
+        movzx   ebx, byte [esi+1]       ; green
+        movzx   edx, byte [esi+2]       ; red
+        and     eax, 0F8h               ; top 5 bits of blue
+        and     ebx, 0FCh               ; top 6 bits of green
+        shl     eax, 8                  ; blue  -> bits 11-15
+        shl     ebx, 3                  ; green -> bits 5-10
+        shr     edx, 3                  ; red   -> bits 0-4
+        or      eax, ebx
+        or      eax, edx
+        mov     [edi], ax
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+        dec     ecx
+        jnz     .single_loop
+
+.done:
+        emms
+        popad
+        pop     ebp
+        ret
+
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555
+
+align 16
+
+mmx_convert_32_to_16_bgr555:
+
+    ; The BGR555 and RGB555 converters share one body.  The two entry
+    ; points differ only in the pmaddwd multiplier they leave in mm7, which
+    ; decides whether source byte 0 or source byte 2 lands in bits 10-14.
+
+    movq mm7, qword [mmx_bgr555_mul]
+
+%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
+    jmp _convert_bgr555_cheat       ; hop over the RGB555 entry stub
+%endif
+
+%endif
+
+%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
+
+align 16
+
+mmx_convert_32_to_16_rgb555:
+
+    movq mm7, qword [mmx_rgb555_mul]
+
+%endif
+
+; The shared body must be assembled whenever at least one entry point
+; exists.  It used to require both macros, so a build that enabled a single
+; 555 converter got an entry point that ran off into whatever followed it.
+
+%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
+%define PTC_MMX_CONVERT_555_BODY
+%elifdef __PTC_MMX_CONVERT_32_TO_16_BGR555
+%define PTC_MMX_CONVERT_555_BODY
+%endif
+
+%ifdef PTC_MMX_CONVERT_555_BODY
+
+; Shared 555 conversion body.
+;
+;   in:   mm7 = pmaddwd multiplier chosen by the entry point
+;         stack arguments, addressed after the frame is set up:
+;         [ebp+8] = destination, [ebp+12] = source, [ebp+16] = pixel count
+;   out:  one 16-bit pixel per 4-byte source pixel
+;
+;   Per pixel (32-bit value p, multiplier words lo/hi):
+;       t = (byte0(p) & f8h) * lo + (byte2(p) & f8h) * hi
+;       result = (t | (p & f800h)) >> 6
+;
+;   pushad/popad preserve the general registers; emms runs before returning.
+
+_convert_bgr555_cheat:
+
+    movq mm6, qword [mmx_rgb555_g]  ; mm6 = top 5 bits of byte 1
+
+    push ebp
+    mov ebp, esp
+
+    pushad
+
+    mov edi, [ebp+8]                ; destination
+    mov esi, [ebp+12]               ; source
+    mov ecx, [ebp+16]               ; pixel count
+
+    mov edx, ecx                    ; full count, needed for the remainder
+    and ecx, 0fffffff8h             ; pixels that form complete groups of 8
+    jz .L2
+
+    ; Every pass loads only the 32 source bytes it converts.  The former
+    ; software-pipelined loop preloaded the next group and so read up to 16
+    ; bytes past the pixels it had been asked to convert on its last pass.
+
+.L1:
+    ; pixels 0-3 -> [edi]
+    movq mm0, [esi]
+    movq mm1, [esi+8]
+
+    movq mm2, mm0
+    movq mm3, mm1
+
+    pand mm0, qword [mmx_rgb555_rb] ; top 5 bits of bytes 0 and 2
+    pand mm1, qword [mmx_rgb555_rb]
+
+    pmaddwd mm0, mm7                ; moves both fields into place at once
+    pmaddwd mm1, mm7
+
+    pand mm2, mm6                   ; byte 1 field, bits 11-15
+    pand mm3, mm6
+
+    por mm0, mm2
+    por mm1, mm3
+
+    psrld mm0, 6                    ; 15-bit pixel in the low word of each dword
+    psrld mm1, 6
+
+    packssdw mm0, mm1               ; four 16-bit pixels
+    movq [edi], mm0
+
+    ; pixels 4-7 -> [edi+8], same steps
+    movq mm2, [esi+16]
+    movq mm3, [esi+24]
+
+    movq mm4, mm2
+    movq mm5, mm3
+
+    pand mm2, qword [mmx_rgb555_rb]
+    pand mm3, qword [mmx_rgb555_rb]
+
+    pmaddwd mm2, mm7
+    pmaddwd mm3, mm7
+
+    pand mm4, mm6
+    pand mm5, mm6
+
+    por mm2, mm4
+    por mm3, mm5
+
+    psrld mm2, 6
+    psrld mm3, 6
+
+    packssdw mm2, mm3
+    movq [edi+8], mm2
+
+    add esi, BYTE 32
+    add edi, BYTE 16
+
+    sub ecx, BYTE 8
+    jnz .L1
+
+.L2:
+    mov ecx, edx
+    and ecx, BYTE 7                 ; 0..7 pixels are left
+    jz .L4
+
+    ; The remainder uses the same mask/pmaddwd sequence, so it honours the
+    ; multiplier in mm7.  The former scalar loop hard-coded the RGB555 layout
+    ; and emitted swapped fields when entered through the BGR555 entry point.
+
+.L3:
+    movd mm0, [esi]                 ; one pixel; the upper dword is zero
+    movq mm1, mm0
+
+    pand mm0, qword [mmx_rgb555_rb]
+    pand mm1, mm6
+
+    pmaddwd mm0, mm7
+    por mm0, mm1
+
+    psrld mm0, 6
+    movd eax, mm0
+    mov [edi], ax
+
+    add esi, BYTE 4
+    add edi, BYTE 2
+
+    dec ecx
+    jnz .L3
+
+.L4:
+
+    emms
+
+    popad
+
+    pop ebp
+    ret
+
+%endif