Phenom CPU optimizations

author Fiona Glaser <fiona@x264.com>
Fri, 21 Nov 2008 11:39:11 +0000 (03:39 -0800)

committer Fiona Glaser <fiona@x264.com>
Sun, 23 Nov 2008 03:36:03 +0000 (19:36 -0800)
diff --git a/Makefile b/Makefile

index 5693fdbcdcc438afff09af0150cd439372d131cb..fb97d2529608be82c11d795b56531f51731ebf42 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -23,7 +23,7 @@ endif
  ifneq ($(AS),)
  X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
            pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
-          cpu-32.asm dct-32.asm
+          cpu-a.asm dct-32.asm
  X86SRC = $(X86SRC0:%=common/x86/%)
  
  ifeq ($(ARCH),X86)
diff --git a/common/cpu.c b/common/cpu.c

index 2d722c690449e773ff06a61b607b642c481d92b2..d576276473b4aeca16c280c943fc6a6ea6edb637 100644 (file)
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -52,12 +52,15 @@ const x264_cpu_name_t x264_cpu_names[] = {
      {"SSE4.2",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
      {"Cache32", X264_CPU_CACHELINE_32},
      {"Cache64", X264_CPU_CACHELINE_64},
+    {"SSEMisalign", X264_CPU_SSE_MISALIGN},
      {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
      {"", 0},
  };
  
+
  #ifdef HAVE_MMX
  extern int  x264_cpu_cpuid_test( void );
+extern void x264_cpu_mask_misalign_sse( void );
  extern uint32_t  x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
  
  uint32_t x264_cpu_detect( void )
@@ -111,7 +114,11 @@ uint32_t x264_cpu_detect( void )
          if( cpu & X264_CPU_SSE2 )
          {
              if( ecx&0x00000040 ) /* SSE4a */
+            {
                  cpu |= X264_CPU_SSE2_IS_FAST;
+                cpu |= X264_CPU_SSE_MISALIGN;
+                x264_cpu_mask_misalign_sse();
+            }
              else
                  cpu |= X264_CPU_SSE2_IS_SLOW;
          }
diff --git a/common/pixel.c b/common/pixel.c

index 808eaceacccdf00effcd517a68a3082d0d51f627..48f45d5bb484d299a0f3dc0bba7bb174bc1d64c0 100644 (file)
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -689,6 +689,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
              INIT2( sad_x4, _cache64_sse2 );
          }
  #endif
+        if( cpu&X264_CPU_SSE_MISALIGN )
+        {
+            INIT2( sad_x3, _sse2_misalign );
+            INIT2( sad_x4, _sse2_misalign );
+        }
      }
      if( cpu&X264_CPU_SSE2 )
      {
diff --git a/common/x86/cpu-64.asm b/common/x86/cpu-64.asm

deleted file mode 100644 (file)

index 8f8a838..0000000
--- a/common/x86/cpu-64.asm
+++ /dev/null
@@ -1,51 +0,0 @@
-;*****************************************************************************
-;* cpu-64.asm: h264 encoder library
-;*****************************************************************************
-;* Copyright (C) 2003-2008 x264 project
-;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
-;*          Loren Merritt <lorenm@u.washington.edu>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*****************************************************************************
-
-%include "x86inc.asm"
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
-;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid
-    push    rbx
-    mov     r10,   r3
-    mov     r11,   r2
-    mov     r9,    r1
-    mov     eax,   r0d
-    cpuid
-    mov     [r9],  eax
-    mov     [r11], ebx
-    mov     [r10], ecx
-    mov     [r8],  edx
-    pop     rbx
-    ret
-
-;-----------------------------------------------------------------------------
-; void x264_emms( void )
-;-----------------------------------------------------------------------------
-cglobal x264_emms
-    emms
-    ret
-
diff --git a/common/x86/cpu-32.asm b/common/x86/cpu-a.asm

similarity index 76%

rename from common/x86/cpu-32.asm

rename to common/x86/cpu-a.asm

index 090b9483aadeb351467f0b74b54851824fed3554..13266fa11dc273ed2da67fa51180c78977e85ca8 100644 (file)
--- a/common/x86/cpu-32.asm
+++ b/common/x86/cpu-a.asm
@@ -25,6 +25,26 @@
  
  SECTION .text
  
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
+;-----------------------------------------------------------------------------
+cglobal x264_cpu_cpuid
+    push    rbx
+    mov     r10,   r3
+    mov     r11,   r2
+    mov     r9,    r1
+    mov     eax,   r0d
+    cpuid
+    mov     [r9],  eax
+    mov     [r11], ebx
+    mov     [r10], ecx
+    mov     [r8],  edx
+    pop     rbx
+    ret
+
+%else
+
  ;-----------------------------------------------------------------------------
  ; int x264_cpu_cpuid_test( void )
  ; return 0 if unsupported
@@ -67,13 +87,6 @@ cglobal x264_cpu_cpuid, 0,6
      mov     [esi],  edx
      RET
  
-;-----------------------------------------------------------------------------
-; void x264_emms( void )
-;-----------------------------------------------------------------------------
-cglobal x264_emms
-    emms
-    ret
-
  ;-----------------------------------------------------------------------------
  ; void x264_stack_align( void (*func)(void*), void *arg );
  ;-----------------------------------------------------------------------------
@@ -88,4 +101,22 @@ cglobal x264_stack_align
      call ecx
      leave
      ret
+%endif
+
+;-----------------------------------------------------------------------------
+; void x264_emms( void )
+;-----------------------------------------------------------------------------
+cglobal x264_emms
+    emms
+    ret
  
+;-----------------------------------------------------------------------------
+; void x264_cpu_mask_misalign_sse(void)
+;-----------------------------------------------------------------------------
+cglobal x264_cpu_mask_misalign_sse
+    sub   rsp, 4
+    stmxcsr [rsp]
+    or dword [rsp], 1<<17
+    ldmxcsr [rsp]
+    add   rsp, 4
+    ret
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm

index 8fb1145c1ae36c33dd14ecdc753ba054e53fc0ae..29a3f855266a9f7b07cfdf9ed3735138de5bfc71 100644 (file)
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -386,18 +386,24 @@ cglobal x264_pixel_avg2_w16_sse2, 6,7
      jg     .height_loop
      REP_RET
  
-cglobal x264_pixel_avg2_w20_sse2, 6,7
+%macro AVG2_W20 1
+cglobal x264_pixel_avg2_w20_%1, 6,7
      sub    r4, r2
      lea    r6, [r4+r3]
  .height_loop:
      movdqu xmm0, [r2]
      movdqu xmm2, [r2+r3]
-    movdqu xmm1, [r2+r4]
-    movdqu xmm3, [r2+r6]
      movd   mm4,  [r2+16]
      movd   mm5,  [r2+r3+16]
+%ifidn %1, sse2_misalign
+    pavgb  xmm0, [r2+r4]
+    pavgb  xmm2, [r2+r6]
+%else
+    movdqu xmm1, [r2+r4]
+    movdqu xmm3, [r2+r6]
      pavgb  xmm0, xmm1
      pavgb  xmm2, xmm3
+%endif
      pavgb  mm4,  [r2+r4+16]
      pavgb  mm5,  [r2+r6+16]
      movdqa [r0], xmm0
@@ -409,6 +415,10 @@ cglobal x264_pixel_avg2_w20_sse2, 6,7
      sub    r5d, 2
      jg     .height_loop
      REP_RET
+%endmacro
+
+AVG2_W20 sse2
+AVG2_W20 sse2_misalign
  
  ; Cacheline split code for processors with high latencies for loads
  ; split over cache lines.  See sad-a.asm for a more detailed explanation.
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm

index 845806d1d2578ab8f3d2302e06fdd1728e0c91f7..d58cc4e66b84d83ec7b03b1776450f1ca28c5d72 100644 (file)
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -249,6 +249,14 @@ cglobal x264_hpel_filter_c_%1, 3,3
      %define tpw_32 [pw_32 GLOBAL]
  %endif
  .loop:
+%ifidn %1,sse2_misalign
+    movu    m0, [src-4]
+    movu    m1, [src-2]
+    mova    m2, [src]
+    paddw   m0, [src+6]
+    paddw   m1, [src+4]
+    paddw   m2, [src+2]
+%else
      mova    m6, [src-16]
      mova    m2, [src]
      mova    m3, [src+16]
@@ -264,6 +272,7 @@ cglobal x264_hpel_filter_c_%1, 3,3
      paddw   m2, m3
      paddw   m1, m4
      paddw   m0, m5
+%endif
      FILT_H  m0, m1, m2
      paddw   m0, tpw_32
      psraw   m0, 6
@@ -322,6 +331,7 @@ cglobal x264_hpel_filter_h_sse2, 3,3
      jl .loop
      REP_RET
  
+%ifndef ARCH_X86_64
  ;-----------------------------------------------------------------------------
  ; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
  ;-----------------------------------------------------------------------------
@@ -387,11 +397,14 @@ cglobal x264_hpel_filter_h_ssse3, 3,3
  
      jl .loop
      REP_RET
-
+%endif
  
  %define PALIGNR PALIGNR_MMX
-HPEL_V sse2
+%ifndef ARCH_X86_64
  HPEL_C sse2
+%endif
+HPEL_V sse2
+HPEL_C sse2_misalign
  %define PALIGNR PALIGNR_SSSE3
  HPEL_C ssse3
  
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c

index d6278918fe7e477a77a4ec5185ab65861ecce02f..0ec7adef074bfa25326d87b821e5c7605703478e 100644 (file)
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -83,6 +83,7 @@ PIXEL_AVG_WALL(cache32_mmxext)
  PIXEL_AVG_WALL(cache64_mmxext)
  PIXEL_AVG_WALL(cache64_sse2)
  PIXEL_AVG_WALL(sse2)
+PIXEL_AVG_WALL(sse2_misalign)
  
  #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
  static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
@@ -98,13 +99,15 @@ static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *,
  /* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
  #define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
  #define x264_pixel_avg2_w12_sse3         x264_pixel_avg2_w16_sse3
+#define x264_pixel_avg2_w12_sse2         x264_pixel_avg2_w16_sse2
  
  PIXEL_AVG_WTAB(mmxext, mmxext, mmxext, mmxext, mmxext, mmxext)
  #ifdef ARCH_X86
  PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext)
  PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
  #endif
-PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
+PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
+PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
  PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
  
  #define MC_COPY_WTAB(instr, name1, name2, name3)\
@@ -184,6 +187,7 @@ GET_REF(cache32_mmxext)
  GET_REF(cache64_mmxext)
  #endif
  GET_REF(sse2)
+GET_REF(sse2_misalign)
  GET_REF(cache64_sse2)
  
  #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
@@ -225,6 +229,7 @@ void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_
  HPEL(16, sse2, sse2, sse2, sse2)
  HPEL(16, ssse3, sse2, ssse3, ssse3)
  #endif
+HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
  
  void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
  {
@@ -293,6 +298,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->avg[PIXEL_8x8]  = x264_pixel_avg_8x8_sse2;
      pf->avg[PIXEL_8x4]  = x264_pixel_avg_8x4_sse2;
      pf->hpel_filter = x264_hpel_filter_sse2;
+    if( cpu&X264_CPU_SSE_MISALIGN )
+        pf->hpel_filter = x264_hpel_filter_sse2_misalign;
      pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
      pf->mc_chroma = x264_mc_chroma_sse2;
  
@@ -305,6 +312,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
              pf->mc_luma = mc_luma_cache64_sse2;
              pf->get_ref = get_ref_cache64_sse2;
          }
+        if( cpu&X264_CPU_SSE_MISALIGN )
+            pf->get_ref = get_ref_sse2_misalign;
      }
  
      if( !(cpu&X264_CPU_SSSE3) )
diff --git a/common/x86/pixel.h b/common/x86/pixel.h

index f8b469a10c7a6ba5c181444197a56d30fc22b2b9..c27b84946d8ee04fd57d26f93b6eee6dfa295807 100644 (file)
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -42,6 +42,7 @@
  
  DECL_X1( sad, mmxext )
  DECL_X1( sad, sse2 )
+DECL_X4( sad, sse2_misalign )
  DECL_X1( sad, sse3 )
  DECL_X1( sad, sse2_aligned )
  DECL_X4( sad, mmxext )
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm

index 1069ef717f38a306b9586774e8104d56481ccc54..1106baa405d8fcf3448ce867f91cd8ae53dd5eb9 100644 (file)
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -830,6 +830,80 @@ SAD_X 4,  4,  4
      RET
  %endmacro
  
+%macro SAD_X3_START_1x16P_SSE2_MISALIGN 0
+    movdqa xmm2, [r0]
+    movdqu xmm0, [r1]
+    movdqu xmm1, [r2]
+    psadbw xmm0, xmm2
+    psadbw xmm1, xmm2
+    psadbw xmm2, [r3]
+%endmacro
+
+%macro SAD_X3_1x16P_SSE2_MISALIGN 2
+    movdqa xmm3, [r0+%1]
+    movdqu xmm4, [r1+%2]
+    movdqu xmm5, [r2+%2]
+    psadbw xmm4, xmm3
+    psadbw xmm5, xmm3
+    psadbw xmm3, [r3+%2]
+    paddw  xmm0, xmm4
+    paddw  xmm1, xmm5
+    paddw  xmm2, xmm3
+%endmacro
+
+%macro SAD_X4_START_1x16P_SSE2_MISALIGN 0
+    movdqa xmm3, [r0]
+    movdqu xmm0, [r1]
+    movdqu xmm1, [r2]
+    movdqu xmm2, [r3]
+    psadbw xmm0, xmm3
+    psadbw xmm1, xmm3
+    psadbw xmm2, xmm3
+    psadbw xmm3, [r4]
+%endmacro
+
+%macro SAD_X4_1x16P_SSE2_MISALIGN 2
+    movdqa xmm7, [r0+%1]
+    movdqu xmm4, [r1+%2]
+    movdqu xmm5, [r2+%2]
+    movdqu xmm6, [r3+%2]
+    psadbw xmm4, xmm7
+    psadbw xmm5, xmm7
+    psadbw xmm6, xmm7
+    psadbw xmm7, [r4+%2]
+    paddw  xmm0, xmm4
+    paddw  xmm1, xmm5
+    paddw  xmm2, xmm6
+    paddw  xmm3, xmm7
+%endmacro
+
+%macro SAD_X3_2x16P_SSE2_MISALIGN 1
+%if %1
+    SAD_X3_START_1x16P_SSE2_MISALIGN
+%else
+    SAD_X3_1x16P_SSE2_MISALIGN 0, 0
+%endif
+    SAD_X3_1x16P_SSE2_MISALIGN FENC_STRIDE, r4
+    add  r0, 2*FENC_STRIDE
+    lea  r1, [r1+2*r4]
+    lea  r2, [r2+2*r4]
+    lea  r3, [r3+2*r4]
+%endmacro
+
+%macro SAD_X4_2x16P_SSE2_MISALIGN 1
+%if %1
+    SAD_X4_START_1x16P_SSE2_MISALIGN
+%else
+    SAD_X4_1x16P_SSE2_MISALIGN 0, 0
+%endif
+    SAD_X4_1x16P_SSE2_MISALIGN FENC_STRIDE, r5
+    add  r0, 2*FENC_STRIDE
+    lea  r1, [r1+2*r5]
+    lea  r2, [r2+2*r5]
+    lea  r3, [r3+2*r5]
+    lea  r4, [r4+2*r5]
+%endmacro
+
  ;-----------------------------------------------------------------------------
  ; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
  ;                                    uint8_t *pix2, int i_stride, int scores[3] )
@@ -843,6 +917,15 @@ cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
      SAD_X%1_END_SSE2
  %endmacro
  
+%macro SAD_X_SSE2_MISALIGN 4
+cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1
+    SAD_X%1_2x%2P_SSE2_MISALIGN 1
+%rep %3/2-1
+    SAD_X%1_2x%2P_SSE2_MISALIGN 0
+%endrep
+    SAD_X%1_END_SSE2
+%endmacro
+
  SAD_X_SSE2 3, 16, 16, sse2
  SAD_X_SSE2 3, 16,  8, sse2
  SAD_X_SSE2 3,  8, 16, sse2
@@ -854,6 +937,11 @@ SAD_X_SSE2 4,  8, 16, sse2
  SAD_X_SSE2 4,  8,  8, sse2
  SAD_X_SSE2 4,  8,  4, sse2
  
+SAD_X_SSE2_MISALIGN 3, 16, 16, sse2
+SAD_X_SSE2_MISALIGN 3, 16,  8, sse2
+SAD_X_SSE2_MISALIGN 4, 16, 16, sse2
+SAD_X_SSE2_MISALIGN 4, 16,  8, sse2
+
  %define movdqu lddqu
  SAD_X_SSE2 3, 16, 16, sse3
  SAD_X_SSE2 3, 16,  8, sse3
@@ -869,8 +957,8 @@ SAD_X_SSE2 4, 16,  8, sse3
  
  ; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
  ; unless the unaligned data spans the border between 2 cachelines, in which
-; case it's really slow. The exact numbers may differ, but all Intel cpus
-; have a large penalty for cacheline splits.
+; case it's really slow. The exact numbers may differ, but all Intel cpus prior
+; to Nehalem have a large penalty for cacheline splits.
  ; (8-byte alignment exactly half way between two cachelines is ok though.)
  ; LDDQU was supposed to fix this, but it only works on Pentium 4.
  ; So in the split case we load aligned data and explicitly perform the
diff --git a/tools/checkasm.c b/tools/checkasm.c

index fb33a604d60390befd189fb5d9afca1e53ddab75..b4472eb9c9f891992066c9b36b44b3c0b54ba2c1 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -154,7 +154,8 @@ static void print_bench(void)
                      b->cpu&X264_CPU_SSE2 ? "sse2" :
                      b->cpu&X264_CPU_MMX ? "mmx" : "c",
                      b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
-                    b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : "",
+                    b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
+                    b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : "",
                      ((int64_t)10*b->cycles/b->den - nop_time)/4 );
          }
  }
@@ -1262,6 +1263,12 @@ static int check_all_flags( void )
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
      }
+    if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
+    {
+        cpu1 &= ~X264_CPU_CACHELINE_64;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
+        cpu1 &= ~X264_CPU_SSE_MISALIGN;
+    }
      if( x264_cpu_detect() & X264_CPU_SSE3 )
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
      if( x264_cpu_detect() & X264_CPU_SSSE3 )
diff --git a/x264.h b/x264.h

index 323f9bbc4376f9edaf622a3e74d532073d36aebf..51be79eeba38da4e338e4009442cebe45500278f 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -61,6 +61,7 @@ typedef struct x264_t x264_t;
  #define X264_CPU_STACK_MOD4     0x001000  /* if stack is only mod4 and not mod16 */
  #define X264_CPU_SSE4           0x002000  /* SSE4.1 */
  #define X264_CPU_SSE42          0x004000  /* SSE4.2 */
+#define X264_CPU_SSE_MISALIGN   0x008000  /* Phenom support for misaligned SSE instruction arguments */
  
  /* Analyse flags
   */
author	Fiona Glaser <fiona@x264.com>
	Fri, 21 Nov 2008 11:39:11 +0000 (03:39 -0800)
committer	Fiona Glaser <fiona@x264.com>
	Sun, 23 Nov 2008 03:36:03 +0000 (19:36 -0800)
Makefile		patch | blob | history
common/cpu.c		patch | blob | history
common/pixel.c		patch | blob | history
common/x86/cpu-64.asm	[deleted file]	patch | blob | history
common/x86/cpu-a.asm	[moved from common/x86/cpu-32.asm with 76% similarity]	patch | blob | history
common/x86/mc-a.asm		patch | blob | history
common/x86/mc-a2.asm		patch | blob | history
common/x86/mc-c.c		patch | blob | history
common/x86/pixel.h		patch | blob | history
common/x86/sad-a.asm		patch | blob | history
tools/checkasm.c		patch | blob | history
x264.h		patch | blob | history