;
; TinyPTC x11 v0.7.3 MMX-Optimized pixelformat converters
; Copyright (C) 2000-2002 Alessandro Gatti <a.gatti@tiscali.it>
; Copyright (C) 2000-2001 Glenn Fiedler <gaffer@gaffer.org>
;
; http://www.sourceforge.net/projects/tinyptc/
;
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.
;
; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with this library; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; 

%include "mmx.i"

bits 32

%ifdef __PTC_MMX__
global mmx_memcpy
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_32_BGR888
global mmx_convert_32_to_32_bgr888
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_24_RGB888
global mmx_convert_32_to_24_rgb888
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_24_BGR888
global mmx_convert_32_to_24_bgr888
%endif 

%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB565
global mmx_convert_32_to_16_rgb565
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR565
global mmx_convert_32_to_16_bgr565
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
global mmx_convert_32_to_16_rgb555
%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555
global mmx_convert_32_to_16_bgr555
%endif

section .data

align 16

mmx_rgb888_mask dd 00ffffffh,00ffffffh

mmx_rgb565_b dd 000000f8h, 000000f8h
mmx_rgb565_g dd 0000fc00h, 0000fc00h
mmx_rgb565_r dd 00f80000h, 00f80000h

mmx_rgb555_rb dd 00f800f8h,00f800f8h
mmx_rgb555_g dd 0000f800h,0000f800h
mmx_rgb555_mul dd 20000008h,20000008h
mmx_bgr555_mul dd 00082000h,00082000h

section .text

%ifdef __PTC_MMX__

align 16

mmx_memcpy:

    push ebp
    mov ebp,esp

    pushad

    mov edi,[ebp+8]       ; destination
    mov esi,[ebp+12]      ; source
    mov ecx,[ebp+16]      ; bytes

    mov eax,ecx
    shr ecx,6
    mov ebx,ecx
    shl ebx,6
    sub eax,ebx

align 16
             
    .loop

        movq mm0,[esi]
        movq mm1,[esi+8]
        movq mm2,[esi+16]
        movq mm3,[esi+24]
        movq mm4,[esi+32]
        movq mm5,[esi+40]
        movq mm6,[esi+48]
        movq mm7,[esi+56]
        movq [edi],mm0
        movq [edi+8],mm1
        movq [edi+16],mm2
        movq [edi+24],mm3
        movq [edi+32],mm4
        movq [edi+40],mm5
        movq [edi+48],mm6
        movq [edi+56],mm7
        add esi,8*8
        add edi,8*8
        dec ecx
        jnz .loop

    mov ecx,eax
    rep movsb

    emms

    popad
    
    pop ebp
    ret

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_32_BGR888

align 16

mmx_convert_32_to_32_bgr888:

    ret

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_24_RGB888


align 16

mmx_convert_32_to_24_rgb888:

    push ebp
    mov ebp,esp

    pushad

    mov edi,[ebp+8]       ; destination
    mov esi,[ebp+12]      ; source
    mov ecx,[ebp+16]      ; bytes

    ; set up mm6 as the mask, mm7 as zero
    movq mm6, qword [mmx_rgb888_mask]
    pxor mm7, mm7

    mov edx, ecx                    ; save ecx
    and ecx, 0fffffffch             ; clear lower two bits
    jnz .L1
    jmp .L2

.L1:

    movq mm0, [esi]                 ; A R G B a r g b
    pand mm0, mm6                   ; 0 R G B 0 r g b
    movq mm1, [esi+8]               ; A R G B a r g b
    pand mm1, mm6                   ; 0 R G B 0 r g b

    movq mm2, mm0                   ; 0 R G B 0 r g b
    punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
    punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
    psllq mm2, 24                   ; 0 0 R G B 0 0 0
    por mm0, mm2                    ; 0 0 R G B r g b

    movq mm3, mm1                   ; 0 R G B 0 r g b
    psllq mm3, 48                   ; g b 0 0 0 0 0 0
    por mm0, mm3                    ; g b R G B r g b

    movq mm4, mm1                   ; 0 R G B 0 r g b
    punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
    punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
    psrlq mm1, 16                   ; 0 0 0 R G B 0 r
    psllq mm4, 8                    ; 0 0 0 0 R G B 0
    por mm1, mm4                    ; 0 0 0 0 R G B r

    movq [edi], mm0
    add esi, BYTE 16
    movd [edi+8], mm1
    add edi, BYTE 12
    sub ecx, BYTE 4
    jnz .L1

.L2:
    mov ecx, edx
    and ecx, BYTE 3
    jz .L4
.L3:
    mov al, [esi]
    mov bl, [esi+1]
    mov dl, [esi+2]
    mov [edi], al
    mov [edi+1], bl
    mov [edi+2], dl
    add esi, BYTE 4
    add edi, BYTE 3
    dec ecx
    jnz .L3
.L4:

    emms

    popad
    
    pop ebp
    ret

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_24_BGR888

align 16

mmx_convert_32_to_24_bgr888:

    ret

%endif


%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB565

align 16

mmx_convert_32_to_16_rgb565:

    push ebp
    mov ebp,esp

    pushad

    mov edi,[ebp+8]       ; destination
    mov esi,[ebp+12]      ; source
    mov ecx,[ebp+16]      ; bytes

    ; set up masks
    movq mm5, [mmx_rgb565_b]
    movq mm6, [mmx_rgb565_g]
    movq mm7, [mmx_rgb565_r]

    mov edx, ecx
    shr ecx, 2
    jnz .L1
    jmp .L2         ; not necessary at the moment, but doesn't hurt (much)

.L1:
    movq mm0, [esi]         ; argb
    movq mm1, mm0           ; argb
    pand mm0, mm6           ; 00g0
    movq mm3, mm1           ; argb
    pand mm1, mm5           ; 000b
    pand mm3, mm7           ; 0r00
    pslld mm1, 2            ; 0 0 000000bb bbb00000
    por mm0, mm1            ; 0 0 ggggggbb bbb00000
    psrld mm0, 5            ; 0 0 00000ggg gggbbbbb

    movq mm4, [esi+8]       ; argb
    movq mm2, mm4           ; argb
    pand mm4, mm6           ; 00g0
    movq mm1, mm2           ; argb
    pand mm2, mm5           ; 000b
    pand mm1, mm7           ; 0r00
    pslld mm2, 2            ; 0 0 000000bb bbb00000
    por mm4, mm2            ; 0 0 ggggggbb bbb00000
    psrld mm4, 5            ; 0 0 00000ggg gggbbbbb

    packuswb mm3, mm1       ; R 0 r 0
    packssdw mm0, mm4       ; as above.. ish
    por mm0, mm3            ; done.
    movq [edi], mm0

    add esi, 16
    add edi, 8
    dec ecx
    jnz .L1

.L2:
    mov ecx, edx
    and ecx, BYTE 3
    jz .L4
.L3:
    mov al, [esi]
    mov bh, [esi+1]
    mov ah, [esi+2]
    shr al, 3
    and eax, 0F81Fh            ; BYTE?
    shr ebx, 5
    and ebx, 07E0h             ; BYTE?
    add eax, ebx
    mov [edi], al
    mov [edi+1], ah
    add esi, BYTE 4
    add edi, BYTE 2
    dec ecx
    jnz .L3

.L4:

    emms

    popad
    
    pop ebp
    ret

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR565

align 16

mmx_convert_32_to_16_bgr565:

    push ebp
    mov ebp,esp

    pushad

    mov edi,[ebp+8]       ; destination
    mov esi,[ebp+12]      ; source
    mov ecx,[ebp+16]      ; bytes

    movq mm5, [mmx_rgb565_r]
    movq mm6, [mmx_rgb565_g]
    movq mm7, [mmx_rgb565_b]

    mov edx, ecx
    shr ecx, 2
    jnz .L1
    jmp .L2

.L1:
    movq mm0, [esi]                 ; a r g b
    movq mm1, mm0                   ; a r g b
    pand mm0, mm6                   ; 0 0 g 0
    movq mm3, mm1                   ; a r g b
    pand mm1, mm5                   ; 0 r 0 0
    pand mm3, mm7                   ; 0 0 0 b

    psllq mm3, 16                   ; 0 b 0 0
    psrld mm1, 14                   ; 0 0 000000rr rrr00000
    por mm0, mm1                    ; 0 0 ggggggrr rrr00000
    psrld mm0, 5                    ; 0 0 00000ggg gggrrrrr

    movq mm4, [esi+8]               ; a r g b
    movq mm2, mm4                   ; a r g b
    pand mm4, mm6                   ; 0 0 g 0
    movq mm1, mm2                   ; a r g b
    pand mm2, mm5                   ; 0 r 0 0
    pand mm1, mm7                   ; 0 0 0 b

    psllq mm1, 16                   ; 0 b 0 0
    psrld mm2, 14                   ; 0 0 000000rr rrr00000
    por mm4, mm2                    ; 0 0 ggggggrr rrr00000
    psrld mm4, 5                    ; 0 0 00000ggg gggrrrrr

    packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
    packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
    por mm0, mm3                    ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
    movq [edi], mm0

    add esi, BYTE 16
    add edi, BYTE 8
    dec ecx
    jnz .L1

.L2:
    and edx, BYTE 3
    jz .L4
.L3:
    mov al, [esi+2]
    mov bh, [esi+1]
    mov ah, [esi]
    shr al, 3
    and eax, 0F81Fh                    ; BYTE ?
    shr ebx, 5
    and ebx, 07E0h                     ; BYTE ?
    add eax, ebx
    mov [edi], al
    mov [edi+1], ah
    add esi, BYTE 4
    add edi, BYTE 2
    dec edx
    jnz .L3

.L4:

    emms

    popad
    
    pop ebp
    ret

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555

align 16

mmx_convert_32_to_16_bgr555:

    ; the 16BGR555 converter is identical to the RGB555 one,
    ; except it uses a different multiplier for the pmaddwd
    ; instruction.  cool huh.

    movq mm7, qword [mmx_bgr555_mul]

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555
    jmp _convert_bgr555_cheat
%endif

    ; This is the same as the Intel version.. they obviously went to
    ; much more trouble to expand/coil the loop than I did, so theirs
    ; would almost certainly be faster, even if only a little.
    ; I did rename 'mmx_rgb555_add' to 'mmx_rgb555_mul', which is
    ; (I think) a more accurate name..

align 16

mmx_convert_32_to_16_rgb555:

    movq mm7,qword [mmx_rgb555_mul]

%endif

%ifdef __PTC_MMX_CONVERT_32_TO_16_RGB555
%ifdef __PTC_MMX_CONVERT_32_TO_16_BGR555

_convert_bgr555_cheat:

    movq mm6,qword [mmx_rgb555_g]
    push ebp
    mov ebp,esp

    pushad

    mov edi,[ebp+8]       ; destination
    mov esi,[ebp+12]      ; source
    mov ecx,[ebp+16]      ; bytes
        
	mov edx,ecx		           ; Save ecx 

    and ecx,BYTE 0fffffff8h            ; clear lower three bits
	jnz .L_OK
    jmp .L2 

.L_OK:
	
	movq mm2,[esi+8]

	movq mm0,[esi]
	movq mm3,mm2

	pand mm3,qword [mmx_rgb555_rb]
	movq mm1,mm0

	pand mm1,qword [mmx_rgb555_rb]
	pmaddwd mm3,mm7

	pmaddwd mm1,mm7
	pand mm2,mm6

.L1:
	movq mm4,[esi+24]
	pand mm0,mm6

	movq mm5,[esi+16]
	por mm3,mm2

	psrld mm3,6
	por mm1,mm0

	movq mm0,mm4
	psrld mm1,6

	pand mm0,qword [mmx_rgb555_rb]
	packssdw mm1,mm3

	movq mm3,mm5
	pmaddwd mm0,mm7

	pand mm3,qword [mmx_rgb555_rb]
	pand mm4,mm6

	movq [edi],mm1			
	pmaddwd mm3,mm7

        add esi,BYTE 32
	por mm4,mm0

	pand mm5,mm6
	psrld mm4,6

	movq mm2,[esi+8]
	por mm5,mm3

	movq mm0,[esi]
	psrld mm5,6

	movq mm3,mm2
	movq mm1,mm0

	pand mm3,qword [mmx_rgb555_rb]
	packssdw mm5,mm4

	pand mm1,qword [mmx_rgb555_rb]
	pand mm2,mm6

	movq [edi+8],mm5
	pmaddwd mm3,mm7

	pmaddwd mm1,mm7
    add edi,BYTE 16

    sub ecx,BYTE 8
	jz .L2
    jmp .L1

.L2:	
	mov ecx,edx
	
    and ecx,BYTE 7
	jz .L4
	
.L3:	
	mov ebx,[esi]
    add esi,BYTE 4
	
    mov eax,ebx
    mov edx,ebx

    shr eax,3
    shr edx,6

    and eax,BYTE 0000000000011111b
    and edx,     0000001111100000b

    shr ebx,9

    or eax,edx

    and ebx,     0111110000000000b

    or eax,ebx

    mov [edi],ax
    add edi,BYTE 2

    dec ecx
    jnz .L3	

.L4:

    emms

    popad
    
    pop ebp
    ret

%endif
%endif