/*
 * swab2() comes from the patch "implement SSE optimized swab function"
 * (Maksym Veremeyenko, 13 Feb 2014) against src/modules/decklink/common.cpp.
 */

/**
 * Copy n bytes from `from` to `to`, exchanging the two bytes of every
 * 16-bit pair (same contract as swab(), which handles the tail).
 *
 * When USE_SSE is defined, whole groups of 16 bytes are swapped with SSE2
 * instructions and only the remaining n % 16 bytes are passed to swab().
 * Without USE_SSE the call is forwarded to swab() unchanged.
 *
 * @param from source buffer; no alignment requirement
 * @param to   destination buffer; no alignment requirement
 * @param n    number of bytes to process; values <= 0 never enter the SSE loop
 */
void swab2( const void *from, void *to, int n )
{
#if defined(USE_SSE)
	/* bytes consumed per loop iteration: one XMM register */
	enum { SWAB2_STEP = 16 };

	int blocks = n / SWAB2_STEP;

	/*
	 * The loop below is a do/while: it must not be entered with a zero (or
	 * negative) count, otherwise the counter wraps and the loop runs far
	 * past the end of both buffers.
	 */
	if ( blocks > 0 )
	{
		const unsigned char *src = (const unsigned char*) from;
		unsigned char *dst = (unsigned char*) to;

		__asm__ volatile
		(
			/* numeric local label: safe if this asm is emitted more than once */
			"1:                              \n\t"

			/* load (unaligned-safe) */
			"movdqu   (%[src]), %%xmm0       \n\t"
			"add      $0x10, %[src]          \n\t"

			/* duplicate to temp register */
			"movdqa   %%xmm0, %%xmm1         \n\t"

			/* high byte of each word moves down ... */
			"psrlw    $8, %%xmm1             \n\t"

			/* ... low byte of each word moves up */
			"psllw    $8, %%xmm0             \n\t"

			/* compose them back */
			"por      %%xmm0, %%xmm1         \n\t"

			/* save (unaligned-safe) */
			"movdqu   %%xmm1, (%[dst])       \n\t"
			"add      $0x10, %[dst]          \n\t"

			"dec      %[cnt]                 \n\t"
			"jnz      1b                     \n\t"

			/* all three registers are modified, so they are read-write operands */
			: [src]"+r"(src), [dst]"+r"(dst), [cnt]"+r"(blocks)
			:
			/* buffers are read/written behind the compiler's back; add/dec change flags */
			: "xmm0", "xmm1", "memory", "cc"
		);

		/* src/dst now point just past the bytes handled above */
		from = src;
		to = dst;
		n %= SWAB2_STEP;
	}
#endif
	/* scalar path: everything when SSE is off, otherwise the n % 16 tail */
	swab( from, to, n );
}
a/src/modules/decklink/consumer_decklink.cpp +++ b/src/modules/decklink/consumer_decklink.cpp @@ -416,9 +416,9 @@ public: // Normal non-keyer playout - needs byte swapping if ( !progressive && m_displayMode->GetFieldDominance() == bmdUpperFieldFirst ) // convert lower field first to top field first - swab( (char*) image, (char*) buffer + stride, stride * ( height - 1 ) ); + swab2( (char*) image, (char*) buffer + stride, stride * ( height - 1 ) ); else - swab( (char*) image, (char*) buffer, stride * height ); + swab2( (char*) image, (char*) buffer, stride * height ); } else if ( !mlt_properties_get_int( MLT_FRAME_PROPERTIES( frame ), "test_image" ) ) { diff --git a/src/modules/decklink/producer_decklink.cpp b/src/modules/decklink/producer_decklink.cpp index 94348938..6801fad8 100644 --- a/src/modules/decklink/producer_decklink.cpp +++ b/src/modules/decklink/producer_decklink.cpp @@ -432,7 +432,7 @@ public: for ( int i = 1; i < m_vancLines + 1; i++ ) { if ( vanc->GetBufferForVerticalBlankingLine( i, &buffer ) == S_OK ) - swab( (char*) buffer, (char*) image + ( i - 1 ) * video->GetRowBytes(), video->GetRowBytes() ); + swab2( (char*) buffer, (char*) image + ( i - 1 ) * video->GetRowBytes(), video->GetRowBytes() ); else mlt_log_debug( getProducer(), "failed capture vanc line %d\n", i ); } @@ -445,7 +445,7 @@ public: if ( image && buffer ) { size = video->GetRowBytes() * video->GetHeight(); - swab( (char*) buffer, (char*) image + m_vancLines * video->GetRowBytes(), size ); + swab2( (char*) buffer, (char*) image + m_vancLines * video->GetRowBytes(), size ); mlt_frame_set_image( frame, (uint8_t*) image, size, mlt_pool_release ); } else if ( image )