;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

; 32-bit x86 code, NASM syntax. The routines in this file read all of their
; arguments from the stack (relative to esp).
BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

; cglobal <symbol>
;   Exports <symbol>. When PREFIX is defined the exported name gets a leading
;   underscore and <symbol> is redefined to that decorated name, so later uses
;   of the plain name (including the label that defines it) pick up the
;   underscore automatically.
%macro cglobal 1
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
%endmacro

;=============================================================================
; Read only data
;=============================================================================

SECTION .rodata data align=16

ALIGN 16
; Four words of 16: the rounding term added before the ">> 5" shifts.
; (Despite the name, the stored value is 16, not 1.)
mmx_dw_one:
    times 4 dw 16
; Two dwords of 512: the rounding term added before the ">> 10" shifts.
; (Despite the name, the stored value is 512, not 1.)
mmx_dd_one:
    times 2 dd 512
; Four words of 20: the weight of the two centre taps.
mmx_dw_20:
    times 4 dw 20
; Four words of -5: the weight of the taps next to the centre pair.
; (The name says 5, the stored value is negative.)
mmx_dw_5:
    times 4 dw -5

; Byte offsets, relative to esp, of the local frame that
; x264_center_filter_mmxext builds on the stack.
; twidth / theight : copies of i_width and the remaining row count
%assign twidth  0
%assign theight 4
; tdstp1 / tdstp2  : i_dst1_stride and i_dst2_stride
%assign tdstp1  8
%assign tdstp2  12
; tdst1 / tdst2    : current row pointers into dst1 and dst2
%assign tdst1   16
%assign tdst2   20
; tsrc             : src pointer for the current row, already moved up by
;                    two rows (src - 2 * i_src_stride)
%assign tsrc    24
; tsrcp            : i_src_stride (written by the prologue; not read back by
;                    the code visible in this file)
%assign tsrcp   28
; toffset          : total frame size, added back to esp in the epilogue
%assign toffset 32
; tbuffer          : first byte of the scratch line of 16-bit intermediates
%assign tbuffer 36


;=============================================================================
; Macros
;=============================================================================

; LOAD_4 d0, d1, d2, d3, m0, m1, m2, m3, zero
;   Loads 4 bytes from each of the memory operands m0..m3 into d0..d3 and
;   widens them to four 16-bit words by interleaving with `zero`
;   (the callers in this file pass a register that holds 0).
;   The four loads are issued first, then the four unpacks.
%macro LOAD_4 9
    movd %1, %5
    movd %2, %6
    movd %3, %7
    movd %4, %8
    punpcklbw %1, %9
    punpcklbw %2, %9
    punpcklbw %3, %9
    punpcklbw %4, %9
%endmacro

; FILT_2 acc, t
;   acc = acc - 5 * t   (per 16-bit word, computed as acc - t - 4*t)
;   Clobbers t: it is left holding 4 * t.
%macro FILT_2 2
    psubw %1, %2
    psllw %2, 2
    psubw %1, %2
%endmacro

; FILT_4 acc, a, b
;   acc = acc + 20 * (a + b)   (per 16-bit word, computed as 4*s + 16*s
;   with s = a + b)
;   Clobbers a: it is left holding 16 * (a + b). b is only read.
%macro FILT_4 3
    paddw %2, %3
    psllw %2, 2
    paddw %1, %2
    psllw %2, 2
    paddw %1, %2
%endmacro

; FILT_6 acc, t, u, rnd
;   acc = (acc - 5 * t + u + rnd) >> 5   (per 16-bit word, arithmetic shift)
;   Clobbers t: it is left holding 4 * t. u and rnd are only read.
%macro FILT_6 4
    psubw %1, %2
    psllw %2, 2
    psubw %1, %2
    paddw %1, %3
    paddw %1, %4
    psraw %1, 5
%endmacro

; FILT_ALL base
;   Vertical 6-tap sum for 4 adjacent pixels. With r0..r5 being the 4 bytes
;   at base, base+ecx, base+2*ecx, base+ebx, base+4*ecx and base+edx,
;   widened to words:
;       mm1 = r0 - 5*r1 + 20*r2 + 20*r3 - 5*r4 + r5
;   No rounding and no shift are applied here.
;   Expects: mm0 = 0; ecx, ebx and edx hold the row offsets used above
;            (the caller in this file sets them to 1x, 3x and 5x the source
;            stride).
;   Clobbers: mm2, mm3, mm4, mm5, mm6.
%macro FILT_ALL 1
    LOAD_4      mm1, mm2, mm3, mm4, [%1], [%1 + ecx], [%1 + 2 * ecx], [%1 + ebx], mm0
    FILT_2      mm1, mm2
    movd        mm5, [%1 + 4 * ecx]
    movd        mm6, [%1 + edx]
    FILT_4      mm1, mm3, mm4
    punpcklbw   mm5, mm0
    punpcklbw   mm6, mm0
    psubw       mm1, mm5
    psllw       mm5, 2
    psubw       mm1, mm5
    paddw       mm1, mm6
%endmacro


;=============================================================================
; Code
;=============================================================================

SECTION .text

; Exported entry points (see the cglobal macro for the optional underscore
; prefix).
; NOTE(review): x264_vertical_filter_mmxext is exported here but no definition
; is visible in this part of the file - confirm that it is defined elsewhere.
cglobal x264_vertical_filter_mmxext
cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext

;-----------------------------------------------------------------------------
;
; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
;                                 uint8_t *dst2, int i_dst2_stride,
;                                 uint8_t *src, int i_src_stride,
;                                 int i_width, int i_height );
;
; Two filter passes per row, 4 pixels at a time:
;   1. Vertical 6-tap filter (1, -5, 20, 20, -5, 1) over the source rows
;      y-2 .. y+3. The raw 16-bit sums are kept in a scratch line on the
;      stack; (sum + 16) >> 5, saturated to 8 bits, is written to dst1.
;   2. The same 6-tap kernel applied horizontally to the scratch line with
;      32-bit accumulation; (sum + 512) >> 10, saturated to 8 bits, is
;      written to dst2.
;
; Scratch line layout (16-bit words, ebp = esp + tbuffer):
;   [ebp + 0 .. 7]             4 words of left padding  (copy of column 0)
;   [ebp + 8 + 2 * x]          vertical sum of column x
;   [ebp + 8 + 2 * i_width ..] 4 words of right padding (copy of the last
;                              column)
;
; Arguments are read from the stack. edi, esi, ebx and ebp are saved and
; restored; eax, ecx, edx and mm0-mm7 are overwritten. No emms is issued
; here.
;
; Requirements that follow from the loop structure:
;   * i_width is a multiple of 4 and at least 12 (one head block, at least
;     one middle block, one tail block; the loops exit on equality).
;   * i_height is at least 1.
;   * the frame reserves 2 * i_src_stride + 18 bytes for the scratch line,
;     which needs 2 * i_width + 16, so i_width must not exceed
;     i_src_stride + 1.
;
;-----------------------------------------------------------------------------

ALIGN 16
x264_center_filter_mmxext :

    push        edi
    push        esi
    push        ebx
    push        ebp

    ; Four registers pushed: return address at [esp + 16], arguments from
    ; [esp + 20] upwards.
    mov         edx,      [esp + 40]         ; src_stride
    lea         edx,      [edx + edx + 18 + tbuffer]
    sub         esp,      edx                ; locals + scratch line
    mov         [esp + toffset] ,edx         ; remembered for the epilogue

    ; Copy the arguments into the local frame; [esp + edx] is the old esp.
    mov         eax,      [esp + edx + 20]   ; dst1
    mov         [esp + tdst1]   ,eax

    mov         eax,      [esp + edx + 28]   ; dst2
    mov         [esp + tdst2]   ,eax

    mov         eax,      [esp + edx + 44]   ; width
    mov         [esp + twidth]  ,eax

    mov         eax,      [esp + edx + 48]   ; height
    mov         [esp + theight] ,eax

    mov         eax,      [esp + edx + 24]   ; dst1_stride
    mov         [esp + tdstp1]  ,eax

    mov         eax,      [esp + edx + 32]   ; dst2_stride
    mov         [esp + tdstp2]  ,eax

    mov         ecx,      [esp + edx + 40]   ; src_stride
    mov         [esp + tsrcp]   ,ecx

    mov         eax,      [esp + edx + 36]   ; src
    sub         eax,      ecx
    sub         eax,      ecx
    mov         [esp + tsrc]    ,eax         ; src - 2 * src_stride

    ; Row offsets expected by FILT_ALL (together with ecx = 1 * src_stride).
    lea         ebx,      [ecx + ecx * 2]    ; 3 * src_stride
    lea         edx,      [ecx + ecx * 4]    ; 5 * src_stride

    pxor        mm0,      mm0                ; 0 ---> mm0


loopcy:

    xor         eax,    eax
    mov         edi,    [esp + tdst1]
    lea         ebp,    [esp + tbuffer]
    mov         esi,    [esp + tsrc]

    ; Head block: columns 0..3 plus the left padding of the scratch line.
    FILT_ALL    esi

    pshufw      mm2,    mm1, 0               ; broadcast the sum of column 0
    movq        [ebp + 8],  mm1
    movq        [ebp],  mm2
    paddw       mm1,    [mmx_dw_one]
    psraw       mm1,    5

    packuswb    mm1,    mm1
    movd        [edi],  mm1

    add         eax,    8                    ; from here on eax = column + 4
    add         esi,    4

loopcx1:

    ; Middle blocks: columns eax-4 .. eax-1.
    FILT_ALL    esi

    movq        [ebp + 2 * eax],  mm1
    paddw       mm1,    [mmx_dw_one]
    psraw       mm1,    5
    packuswb    mm1,    mm1
    movd        [edi + eax - 4],  mm1

    add         esi,    4
    add         eax,    4
    cmp         eax,    [esp + twidth]
    jnz         loopcx1

    ; Tail block: the last 4 columns plus the right padding.
    FILT_ALL    esi

    ; Broadcast the sum of the last column (word 3) into the padding, which
    ; mirrors the left edge above. An immediate of 7 would select words
    ; 3,1,0,0 instead, and the horizontal pass reads the first three padding
    ; words for the last pixels of the row.
    pshufw      mm2,    mm1,  0FFh
    movq        [ebp + 2 * eax],  mm1
    movq        [ebp + 2 * eax + 8],  mm2
    paddw       mm1,    [mmx_dw_one]
    psraw       mm1,    5
    packuswb    mm1,    mm1
    movd        [edi + eax - 4],  mm1

    ; Advance the source and dst1 row pointers for the next row.
    mov         esi,    [esp + tsrc]
    add         esi,    ecx
    mov         [esp + tsrc],  esi

    add         edi,    [esp + tdstp1]
    mov         [esp + tdst1], edi

    mov         edi,    [esp + tdst2]
    xor         eax,    eax

loopcx2:

    ; Horizontal pass over the scratch line. With x = eax the six loads
    ; start at columns x-2 .. x+3 (4 words each); they are paired into the
    ; three symmetric tap sums.
    movq        mm2,    [esp + 2 * eax + 2  + 4 + tbuffer]
    movq        mm3,    [esp + 2 * eax + 4  + 4 + tbuffer]
    movq        mm4,    [esp + 2 * eax + 6  + 4 + tbuffer]
    movq        mm5,    [esp + 2 * eax + 8  + 4 + tbuffer]
    movq        mm1,    [esp + 2 * eax      + 4 + tbuffer]
    movq        mm6,    [esp + 2 * eax + 10 + 4 + tbuffer]
    paddw       mm2,    mm5                  ; pair weighted -5
    paddw       mm3,    mm4                  ; pair weighted 20
    paddw       mm1,    mm6                  ; pair weighted 1

    movq        mm5,    [mmx_dw_20]
    movq        mm4,    [mmx_dw_5]
    movq        mm6,    mm1
    pxor        mm7,    mm7

    ; Interleave sums with their weights so that each pmaddwd lane yields
    ; -5 * (mm2 word) + 20 * (mm3 word) as a 32-bit value.
    punpckhwd   mm5,    mm2
    punpcklwd   mm4,    mm3
    punpcklwd   mm2,    [mmx_dw_20]
    punpckhwd   mm3,    [mmx_dw_5]

    pcmpgtw     mm7,    mm1                  ; sign mask of the weight-1 sums

    pmaddwd     mm2,    mm4                  ; pixels 0 and 1
    pmaddwd     mm3,    mm5                  ; pixels 2 and 3

    punpcklwd   mm1,    mm7                  ; sign-extend to 32 bits
    punpckhwd   mm6,    mm7

    paddd       mm2,    mm1
    paddd       mm3,    mm6

    paddd       mm2,    [mmx_dd_one]         ; rounding
    paddd       mm3,    [mmx_dd_one]

    psrad       mm2,    10
    psrad       mm3,    10

    packssdw    mm2,    mm3
    packuswb    mm2,    mm0

    movd        [edi + eax], mm2

    add         eax,    4
    cmp         eax,    [esp + twidth]
    jnz         loopcx2

    add         edi,    [esp + tdstp2]
    mov         [esp + tdst2], edi

    dec         dword [esp + theight]        ; one row done
    jnz         loopcy

    add         esp,    [esp + toffset]      ; drop locals + scratch line

    pop         ebp
    pop         ebx
    pop         esi
    pop         edi

    ret

;-----------------------------------------------------------------------------
;
; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
;                                     uint8_t *src, int i_src_stride,
;                                     int i_width, int i_height );
;
; Horizontal 6-tap filter, 8 output pixels per inner iteration:
;
;   dst[x] = sat_u8( ( src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
;                      - 5*src[x+2] + src[x+3] + 16 ) >> 5 )
;
; The arithmetic is done in 16-bit words. Arguments are read from the stack.
; edi and esi are saved and restored; eax, ecx and mm0-mm7 are overwritten.
; No emms is issued here.
;
; The inner loop exits when the column counter equals i_width, so i_width
; has to be a non-zero multiple of 8; i_height has to be at least 1.
; Each row reads source bytes src[-2] .. src[i_width + 2].
;
;-----------------------------------------------------------------------------

ALIGN 16
x264_horizontal_filter_mmxext :
    push        edi
    push        esi

    ; Two registers pushed: return address at [esp + 8], arguments from
    ; [esp + 12] upwards.
    mov         esi,    [esp + 20]           ; src
    mov         edi,    [esp + 12]           ; dst
    mov         ecx,    [esp + 32]           ; rows still to do

    movq        mm7,    [mmx_dw_one]         ; rounding term for FILT_6
    pxor        mm0,    mm0                  ; zero, used to widen bytes

    sub         esi,    2                    ; leftmost tap sits at x - 2

.next_row:

    xor         eax,    eax                  ; eax = column offset in the row

.next_block:

    prefetchnta [esi + eax + 48]

    ; First 4 pixels: taps at byte offsets 0..5 from esi + eax.
    LOAD_4      mm1,    mm2, mm3, mm4, [esi + eax], [esi + eax + 1], [esi + eax + 2], [esi + eax + 3], mm0
    FILT_2      mm1,    mm2
    movd        mm5,    [esi + eax + 4]
    movd        mm6,    [esi + eax + 5]
    FILT_4      mm1,    mm3, mm4
    ; The loads for the second 4 pixels (taps at offsets 4..9) are mixed in
    ; from here on.
    movd        mm2,    [esi + eax + 4]
    movd        mm3,    [esi + eax + 6]
    punpcklbw   mm5,    mm0
    punpcklbw   mm6,    mm0
    FILT_6      mm1,    mm5, mm6, mm7
    movd        mm4,    [esi + eax + 7]
    movd        mm5,    [esi + eax + 8]
    punpcklbw   mm2,    mm0
    punpcklbw   mm3,    mm0                  ; mm2 (x1), mm3 (x20), mm6 (x-5) widened
    FILT_2      mm2,    mm6                  ; mm6 still holds offset 5 from above
    movd        mm6,    [esi + eax + 9]
    punpcklbw   mm4,    mm0
    punpcklbw   mm5,    mm0                  ; mm4 (x20), mm5 (x-5) widened
    FILT_4      mm2,    mm3, mm4
    punpcklbw   mm6,    mm0
    FILT_6      mm2,    mm5, mm6, mm7

    packuswb    mm1,    mm2                  ; 8 results, saturated to bytes
    movq        [edi + eax],  mm1

    add         eax,    8
    cmp         eax,    [esp + 28]           ; width
    jne         .next_block

    add         edi,    [esp + 16]           ; dst stride
    add         esi,    [esp + 24]           ; src stride

    dec         ecx
    jnz         .next_row

    pop         esi
    pop         edi

    ret