x264 will now terminate gracefully rather than SIGILL when run on a machine with no MMXEXT support.
A configure option is now available to build x264 without assembly optimizations, for use on old CPUs such as the Pentium 2, K6, etc.
return sum;
}
-#ifdef HAVE_MMX
-#include "x86/util.h"
-#endif
-
/****************************************************************************
*
****************************************************************************/
// included at the end because it needs x264_t
#include "macroblock.h"
+#ifdef HAVE_MMX
+#include "x86/util.h"
+#endif
+
#endif
int8_t *cache = &h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+x+8*y];
cache[0] = cache[1] = cache[8] = cache[9] = i_mode;
}
+/* Test whether an array contains any non-zero element.
+ * array_non_zero(a) requires `a` to be a true array (sizeof yields its
+ * byte size); that size is assumed to be a multiple of 8 bytes, as the
+ * array is read as 64-bit words. */
+#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
+#define array_non_zero_int array_non_zero_int_c
+/* C fallback: OR the words together; the common sizes (8/16/32 bytes)
+ * are special-cased so they reduce to a few branch-free ORs. */
+static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
+{
+ uint64_t *x = v;
+ if(i_count == 8)
+ return !!x[0];
+ else if(i_count == 16)
+ return !!(x[0]|x[1]);
+ else if(i_count == 32)
+ return !!(x[0]|x[1]|x[2]|x[3]);
+ else
+ {
+ /* generic case: loop over the array one 64-bit word at a time */
+ int i;
+ i_count /= sizeof(uint64_t);
+ for( i = 0; i < i_count; i++ )
+ if( x[i] ) return 1;
+ return 0;
+ }
+}
+/* This function and its MMX version only work on arrays of size 16 */
+/* Returns the number of non-zero coefficients in v[0..15]. */
+static ALWAYS_INLINE int array_non_zero_count( int16_t *v )
+{
+ int i;
+ int i_nz;
+
+ for( i = 0, i_nz = 0; i < 16; i++ )
+ if( v[i] )
+ i_nz++;
+
+ return i_nz;
+}
#endif
"paddusw %%mm0, %%mm4 \n"
"jg 1b \n"
"movq %%mm4, %0 \n"
- :"=m"(output), "+r"(i_mvc), "+r"(mvc)
+ :"=m"(output), "+r"(i_mvc)
+ :"r"(mvc)
);
sum += output[0] + output[1] + output[2] + output[3];
return sum;
}
+#define array_non_zero_count array_non_zero_count_mmx
+/* MMX version of array_non_zero_count; valid only for 16-element arrays.
+ * Packs the 16 int16 coeffs into 16 bytes, compares each byte against
+ * zero (0xFF for zero, 0x00 for non-zero), folds the two halves with
+ * paddb so each byte is -(number of zero coeffs it covers), biases by 2
+ * so each byte becomes the non-zero count it covers, then sums the
+ * bytes horizontally with psadbw.
+ * NOTE(review): no emms here -- presumably MMX state is handled by the
+ * surrounding code, as elsewhere in x264; confirm at the call sites. */
+static inline int array_non_zero_count_mmx( int16_t *v )
+{
+ static const uint64_t pw_2 = 0x0202020202020202ULL;
+ int count;
+ asm(
+ "pxor %%mm7, %%mm7 \n"
+ "movq (%1), %%mm0 \n"
+ "movq 16(%1), %%mm1 \n"
+ "packsswb 8(%1), %%mm0 \n"
+ "packsswb 24(%1), %%mm1 \n"
+ "pcmpeqb %%mm7, %%mm0 \n"
+ "pcmpeqb %%mm7, %%mm1 \n"
+ "paddb %%mm0, %%mm1 \n"
+ "paddb %2, %%mm1 \n"
+ "psadbw %%mm7, %%mm1 \n"
+ "movd %%mm1, %0 \n"
+ :"=r"(count)
+ :"r"(v), "m"(pw_2)
+ );
+ return count;
+}
+#undef array_non_zero_int
+#define array_non_zero_int array_non_zero_int_mmx
+/* MMX version of array_non_zero_int: special-cases the 128-byte case
+ * by ORing all 16 qwords together; any other size falls back to the
+ * C implementation. */
+static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count )
+{
+ if(i_count == 128)
+ {
+ int nonzero;
+ asm(
+ "movq (%1), %%mm0 \n"
+ "por 8(%1), %%mm0 \n"
+ "por 16(%1), %%mm0 \n"
+ "por 24(%1), %%mm0 \n"
+ "por 32(%1), %%mm0 \n"
+ "por 40(%1), %%mm0 \n"
+ "por 48(%1), %%mm0 \n"
+ "por 56(%1), %%mm0 \n"
+ "por 64(%1), %%mm0 \n"
+ "por 72(%1), %%mm0 \n"
+ "por 80(%1), %%mm0 \n"
+ "por 88(%1), %%mm0 \n"
+ "por 96(%1), %%mm0 \n"
+ "por 104(%1), %%mm0 \n"
+ /* packsswb narrows the 64-bit OR result to 32 bits for movd;
+ * signed saturation maps non-zero words to non-zero bytes, so
+ * zero-ness is preserved. */
+ "por 112(%1), %%mm0 \n"
+ "por 120(%1), %%mm0 \n"
+ "packsswb %%mm0, %%mm0 \n"
+ "movd %%mm0, %0 \n"
+ :"=r"(nonzero)
+ :"r"(v)
+ );
+ return !!nonzero;
+ }
+ else return array_non_zero_int_c( v, i_count );
+}
#endif
#endif
echo "available options:"
echo ""
echo " --help print this message"
-echo " --enable-avis-input enables avisynth input (win32 only)"
-echo " --enable-mp4-output enables mp4 output (using gpac)"
+echo " --disable-avis-input disables avisynth input (win32 only)"
+echo " --disable-mp4-output disables mp4 output (using gpac)"
+echo " --disable-pthread disables multithreaded encoding"
+echo " --disable-asm disables assembly optimizations on x86"
echo " --enable-gtk build GTK+ interface"
-echo " --enable-pthread enables multithreaded encoding"
echo " --enable-debug adds -g, doesn't strip"
echo " --enable-gprof adds -pg, doesn't strip"
echo " --enable-visualize enables visualization (X11 only)"
avis_input="auto"
mp4_output="auto"
pthread="auto"
+asm="yes"
debug="no"
gprof="no"
pic="no"
--includedir=*)
includedir="$optarg"
;;
+ --enable-asm)
+ asm="yes"
+ ;;
+ --disable-asm)
+ asm="no"
+ ;;
--enable-avis-input)
avis_input="yes"
;;
pic="yes"
fi
-if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
+if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
if [ $ARCH = X86 -a $pic = yes -a x$AS = xyasm -a\
"`yasm --version 2>$DEVNULL | head -n 1`" "<" "yasm 0.6.2" ] ; then
echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..."
if as_check "pabsw xmm0, xmm0" ; then
CFLAGS="$CFLAGS -DHAVE_MMX"
else
- echo "No suitable assembler found. x264 will be several times slower."
- echo "Please install 'yasm' to get MMX/SSE optimized code."
- AS=""
+ echo "No suitable assembler found. Install 'yasm' to get MMX/SSE optimized code."
+ echo "If you really want to compile without asm, configure with --disable-asm."
+ exit 1
fi
+else
+ AS=""
fi
CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS"
echo "Platform: $ARCH"
echo "System: $SYS"
+echo "asm: $asm"
echo "avis input: $avis_input"
echo "mp4 output: $mp4_output"
echo "pthread: $pthread"
if( h->mb.i_cbp_luma & (1 << i8) )
for( i4 = 0; i4 < 4; i4++ )
{
- h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4], 16 );
+ h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
block_residual_write_cavlc( h, s, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
}
}
if( h->mb.i_cbp_luma != 0 )
for( i = 0; i < 16; i++ )
{
- h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+ h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
}
}
if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
for( i = 16; i < 24; i++ )
{
- h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+ h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
}
}
for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
{
x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
- h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8]+1, 15 );
+ h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8] );
block_residual_write_cavlc( h, &s, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
- h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8]+1, 15 );
+ h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8] );
block_residual_write_cavlc( h, &s, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
i8 += x264_pixel_size[i_pixel].h >> 3;
}
for( i = 0; i < 16; i++ )
h->dct.luma4x4[i4+i8*4][i] = h->dct.luma8x8[i8][i4+i*4];
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] =
- array_non_zero_count( h->dct.luma4x4[i4+i8*4], 16 );
+ array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
block_residual_write_cavlc( h, &h->out.bs, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
}
return h->out.bs.i_bits_encoded;
int i;
for( i = 16; i < 24; i++ )
{
- h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+ h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, &h->out.bs, i, h->dct.luma4x4[i]+1, 15 );
}
}
static int x264_validate_parameters( x264_t *h )
{
+#ifdef HAVE_MMX
+ if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+ {
+ x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+ x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
+ return -1;
+ }
+#endif
if( h->param.i_width <= 0 || h->param.i_height <= 0 )
{
x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
void x264_noise_reduction_update( x264_t *h );
void x264_denoise_dct( x264_t *h, int16_t *dct );
-#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
-static inline int array_non_zero_int( void *v, int i_count )
-{
- int i;
- uint64_t *x = v;
- i_count /= sizeof(uint64_t);
- for( i = 0; i < i_count; i++ )
- if( x[i] ) return 1;
- return 0;
-}
-
-static inline int array_non_zero_count( int16_t *v, int i_count )
-{
- int i;
- int i_nz;
-
- for( i = 0, i_nz = 0; i < i_count; i++ )
- if( v[i] )
- i_nz++;
-
- return i_nz;
-}
-
-
#endif