X-Git-Url: https://git.sesse.net/?p=fjl;a=blobdiff_plain;f=dehuff.h;h=0f86c3b9c4f4e6703e13751dbb9d5306267ed264;hp=b79a79ace3fe976b2f38d8bf375b980b90ccafe7;hb=bbe665e5ef6a103657f4d921ddd4b540f38daf3d;hpb=47de6c270a336574dce220cde780a802a513d113 diff --git a/dehuff.h b/dehuff.h index b79a79a..0f86c3b 100644 --- a/dehuff.h +++ b/dehuff.h @@ -5,10 +5,23 @@ #include #include -// A function to read bytes from some input source. The bytes should be -// already unstuffed (and thus without markers). -// A return value of -1 indicates error, a return value of 0 indicates EOF. -typedef ssize_t (raw_input_func_t)(void*, uint8_t*, size_t); +#include "bytesource.h" +#include "bitsource.h" + +// About 99% of all Huffman codes are <= 8 bits long (see codelen.txt), +// and it's what libjpeg uses. Thus, it seems like a reasonable size. +#define DEHUF_TABLE_BITS 8 +#define DEHUF_TABLE_SIZE (1 << DEHUF_TABLE_BITS) +static const int DEHUF_SLOW_PATH = -1; + +// About 98% of all AC coefficients (control byte + coefficient) are <= 10 bits +// long; again, see codelen.txt. This will cost us about 6 kB of data to store +// in L1 cache. +#define DEHUF_AC_TABLE_BITS 10 +#define DEHUF_AC_TABLE_SIZE (1 << DEHUF_AC_TABLE_BITS) +static const int AC_DEHUF_SLOW_PATH = 0xf0000000; +static const int AC_END_OF_BLOCK = 0xf0000001; +static const int AC_SIXTEEN_ZEROS = 0xf0000002; struct huffman_table { unsigned num_codes[17]; // BITS @@ -20,6 +33,24 @@ struct huffman_table { int maxcode[16]; int mincode[16]; unsigned valptr[16]; + + // Lookup table for fast decoding; given eight bits, + // return the symbol and length in bits. For longer codes, + // DEHUF_SLOW_PATH is returned. + + // Note that the codes we return are 8-bit, but the type of + // the lookup tables is int to avoid extra zero extending. + int lookup_table_codes[DEHUF_TABLE_SIZE]; + int lookup_table_length[DEHUF_TABLE_SIZE]; + + // Further lookup tables for decoding AC coefficients. 
+ // (Generated but obviously not used for DC coefficients.) + // Maps from 10-bit lookahead values to the signed coeffient (_codes), + // number of bits to skip (_length) and the number of zero coefficients + // after this one (_skip). + int ac_table_codes[DEHUF_AC_TABLE_SIZE]; + uint8_t ac_table_length[DEHUF_AC_TABLE_SIZE]; + uint8_t ac_table_skip[DEHUF_AC_TABLE_SIZE]; }; enum coefficient_class { @@ -29,6 +60,63 @@ enum coefficient_class { }; typedef struct huffman_table huffman_tables_t[NUM_COEFF_CLASSES][4]; -void read_huffman_tables(huffman_tables_t* dst, raw_input_func_t* input_func, void* userdata); +// Read Huffman tables from a stream, and compute the derived values. +void read_huffman_tables(huffman_tables_t* dst, input_func_t* input_func, void* userdata); + +unsigned read_huffman_symbol_slow_path(const struct huffman_table* table, + struct bit_source* source); + +static inline unsigned read_huffman_symbol_no_refill( + const struct huffman_table* table, + struct bit_source* source) +{ + assert(source->bits_available >= DEHUF_TABLE_BITS); + unsigned lookup = peek_bits(source, DEHUF_TABLE_BITS); + int code = table->lookup_table_codes[lookup]; + int length = table->lookup_table_length[lookup]; + + if (code == DEHUF_SLOW_PATH) { + return read_huffman_symbol_slow_path(table, source); + } + + read_bits(source, length); + return code; +} + +static inline unsigned read_huffman_symbol(const struct huffman_table* table, + struct bit_source* source) +{ + possibly_refill(source, DEHUF_TABLE_BITS); + return read_huffman_symbol_no_refill(table, source); +} + +// procedure EXTEND (figure F.12) + +// Fast lookup table for (1 << (bits - 1)). +// The table actually helps, since the load can go in parallel with the shift +// operation below. 
// Threshold table for procedure EXTEND (figure F.12):
// bit_thresholds[n] == 1 << (n - 1) for n in 1..15, and 0 for n == 0.
// A raw n-bit value strictly below its threshold encodes a negative number.
// Kept as a table (rather than computed) so the load can overlap with the
// shift done in extend().
static const int bit_thresholds[16] = {
	0,
	1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4,
	1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9,
	1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14
};

// Procedure EXTEND (figure F.12): turn a raw magnitude of `bits` bits into
// the value it encodes.
//
//   val  - the raw bits as read from the stream, in [0, (1 << bits) - 1].
//   bits - number of bits that were read; must be < 16 (size of the table).
//
// Returns val unchanged if val >= 1 << (bits - 1); otherwise returns
// val - (1 << bits) + 1, i.e. a negative number. The return type is unsigned
// for interface compatibility, so negative results come back as their
// unsigned (modulo 2^N) representation; callers presumably convert back to a
// signed type.
static inline unsigned extend(int val, unsigned bits)
{
	assert(bits < 16);

	// Written as a subtraction instead of "val + (-1 << bits) + 1":
	// left-shifting a negative value is undefined behaviour in C.
	const int negative = val - (1 << bits) + 1;

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	// GCC should ideally figure out on its own that a conditional move is
	// better here, but it doesn't, and this is important for speed, so it
	// is hardcoded. __asm__ (not asm) so this also builds with -std=c99/c11.
	__asm__("cmp %1, %0 ; cmovl %2, %0"
	        : "+r" (val)
	        : "g" (bit_thresholds[bits]),
	          "r" (negative)
	        : "cc");
	return val;
#else
	if (val < bit_thresholds[bits]) {
		return negative;
	} else {
		return val;
	}
#endif
}