From: Steinar H. Gunderson Date: Sun, 17 Sep 2017 09:50:04 +0000 (+0200) Subject: Go down to 4 rANS streams instead of 8. X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=0d22d7fb73e0c14431a15c7bb0522d0691452a31;p=narabu Go down to 4 rANS streams instead of 8. Costs approx 0.8% bitrate, but reduces GPU cost from 1.3 to 1.2 ms (~8%) due to less L1 cache pressure. --- diff --git a/coded.dat b/coded.dat index 02a6c03..4ecb22c 100644 Binary files a/coded.dat and b/coded.dat differ diff --git a/decoder.shader b/decoder.shader index 8b772c7..0902791 100644 --- a/decoder.shader +++ b/decoder.shader @@ -40,6 +40,16 @@ const uint ff_zigzag_direct[64] = { 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 }; +const uint stream_mapping[64] = { + 0, 0, 1, 1, 2, 2, 3, 3, + 0, 0, 1, 2, 2, 2, 3, 3, + 1, 1, 2, 2, 2, 3, 3, 3, + 1, 1, 2, 2, 2, 3, 3, 3, + 1, 2, 2, 2, 2, 3, 3, 3, + 2, 2, 2, 2, 3, 3, 3, 3, + 2, 2, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, +}; layout(std430, binding = 9) buffer layoutName { @@ -204,7 +214,7 @@ void main() //const uint coeff_num = ff_zigzag_direct[thread_num]; const uint coeff_num = thread_num; const uint stream_num = coeff_num * num_blocks + block_row; - const uint model_num = min((coeff_num % 8) + (coeff_num / 8), 7); + const uint model_num = stream_mapping[coeff_num]; const uint sign_bias = sign_bias_per_model[model_num]; // Initialize rANS decoder. diff --git a/narabu.cpp b/narabu.cpp index ba1e967..1419ff0 100644 --- a/narabu.cpp +++ b/narabu.cpp @@ -23,7 +23,7 @@ using namespace std::chrono; const unsigned prob_bits = 12; const unsigned prob_scale = 1 << prob_bits; const unsigned NUM_SYMS = 256; -const unsigned NUM_TABLES = 16; +const unsigned NUM_TABLES = 8; struct RansDecSymbol { unsigned sym_start; diff --git a/qdc.cpp b/qdc.cpp index b5594b7..9519152 100644 --- a/qdc.cpp +++ b/qdc.cpp @@ -25,7 +25,7 @@ // of coefficients to rANS probability distributions. This is randomized, // so you might want to run it a few times. 
#define FIND_OPTIMAL_STREAM_ASSIGNMENT 0 -#define NUM_CLUSTERS 8 +#define NUM_CLUSTERS 4 static constexpr uint32_t prob_bits = 12; static constexpr uint32_t prob_scale = 1 << prob_bits; @@ -154,12 +154,37 @@ SymbolStats stats[128]; float kl_dist[64][64]; #endif +const int luma_mapping[64] = { + 0, 0, 1, 1, 2, 2, 3, 3, + 0, 0, 1, 2, 2, 2, 3, 3, + 1, 1, 2, 2, 2, 3, 3, 3, + 1, 1, 2, 2, 2, 3, 3, 3, + 1, 2, 2, 2, 2, 3, 3, 3, + 2, 2, 2, 2, 3, 3, 3, 3, + 2, 2, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, +}; +const int chroma_mapping[64] = { + 0, 1, 1, 2, 2, 2, 3, 3, + 1, 1, 2, 2, 2, 3, 3, 3, + 2, 2, 2, 2, 3, 3, 3, 3, + 2, 2, 2, 3, 3, 3, 3, 3, + 2, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, +}; + int pick_stats_for(int x, int y, bool is_chroma) { #if FIND_OPTIMAL_STREAM_ASSIGNMENT return y * 8 + x + is_chroma * 64; #else - return std::min(x + y, 7) + is_chroma * 8; + if (is_chroma) { + return chroma_mapping[y * 8 + x] + 4; + } else { + return luma_mapping[y * 8 + x]; + } #endif } diff --git a/qdd.cpp b/qdd.cpp index a4fa907..522957d 100644 --- a/qdd.cpp +++ b/qdd.cpp @@ -29,7 +29,8 @@ struct RansDecodeTable { int cum2sym[prob_scale]; RansDecSymbol dsyms[NUM_SYMS]; }; -RansDecodeTable decode_tables[16]; +#define NUM_TABLES 8 +RansDecodeTable decode_tables[NUM_TABLES]; static const unsigned char std_luminance_quant_tbl[64] = { #if 0 @@ -55,10 +56,34 @@ static const unsigned char std_luminance_quant_tbl[64] = { }; -int pick_stats_for(int y, int x) +const int luma_mapping[64] = { + 0, 0, 1, 1, 2, 2, 3, 3, + 0, 0, 1, 2, 2, 2, 3, 3, + 1, 1, 2, 2, 2, 3, 3, 3, + 1, 1, 2, 2, 2, 3, 3, 3, + 1, 2, 2, 2, 2, 3, 3, 3, + 2, 2, 2, 2, 3, 3, 3, 3, + 2, 2, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, +}; +const int chroma_mapping[64] = { + 0, 1, 1, 2, 2, 2, 3, 3, + 1, 1, 2, 2, 2, 3, 3, 3, + 2, 2, 2, 2, 3, 3, 3, 3, + 2, 2, 2, 3, 3, 3, 3, 3, + 2, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 
3, +}; + +int pick_stats_for(int x, int y, bool is_chroma) { - if (x + y >= 7) return 7; - return x + y; + if (is_chroma) { + return chroma_mapping[y * 8 + x] + 4; + } else { + return luma_mapping[y * 8 + x]; + } } uint32_t read_varint(FILE *fp) @@ -103,8 +128,8 @@ int main(void) exit(1); } - uint32_t sign_bias[16]; - for (unsigned table = 0; table < 16; ++table) { + uint32_t sign_bias[NUM_TABLES]; + for (unsigned table = 0; table < NUM_TABLES; ++table) { uint32_t cum_freq = 0; for (unsigned sym = 0; sym < NUM_SYMS; ++sym) { uint32_t freq = read_varint(fp); @@ -123,7 +148,7 @@ int main(void) // loop over all coefficients for (unsigned y = 0; y < 8; ++y) { for (unsigned x = 0; x < 8; ++x) { - unsigned tbl = pick_stats_for(x, y); + unsigned tbl = pick_stats_for(x, y, false); RansState rans = 0;