#define DEHUF_TABLE_SIZE (1 << DEHUF_TABLE_BITS)

// Sentinel stored in lookup_table_codes[] for lookahead values whose code is
// longer than DEHUF_TABLE_BITS; the decoder then falls back to the
// bit-by-bit slow path.
static const int DEHUF_SLOW_PATH = -1;

// About 98% of all AC coefficients (control byte + coefficient) are <= 10 bits
// long; again, see codelen.txt. This will cost us about 6 kB of data to store
// in L1 cache.
#define DEHUF_AC_TABLE_BITS 10
#define DEHUF_AC_TABLE_SIZE (1 << DEHUF_AC_TABLE_BITS)

// Sentinel in ac_table_codes[] marking entries that need the slow path.
// 0xf0000000 does not fit in a signed int, so convert explicitly rather than
// relying on the implementation-defined unsigned-to-int conversion.
static const int AC_DEHUF_SLOW_PATH = (int)0xf0000000;

struct huffman_table {
unsigned num_codes[17]; // BITS
unsigned char codes[256]; // HUFFVAL
// the lookup tables is int to avoid extra zero extending.
int lookup_table_codes[DEHUF_TABLE_SIZE];
int lookup_table_length[DEHUF_TABLE_SIZE];
+
+ // Further lookup tables for decoding AC coefficients.
+ // (Generated but obviously not used for DC coefficients.)
+ // Maps from 10-bit lookahead values to the signed coeffient (_codes),
+ // number of bits to skip (_length) and the number of zero coefficients
+ // after this one (_skip).
+ int ac_table_codes[DEHUF_AC_TABLE_SIZE];
+ uint8_t ac_table_length[DEHUF_AC_TABLE_SIZE];
+ uint8_t ac_table_skip[DEHUF_AC_TABLE_SIZE];
};
enum coefficient_class {
unsigned read_huffman_symbol_slow_path(const struct huffman_table* table,
struct bit_source* source);
-static inline unsigned read_huffman_symbol(const struct huffman_table* table,
- struct bit_source* source)
+static inline unsigned read_huffman_symbol_no_refill(
+ const struct huffman_table* table,
+ struct bit_source* source)
{
- // FIXME: We can read past the end of the stream here in some edge
- // cases. We need to define some guarantees in the layers above.
- possibly_refill(source, DEHUF_TABLE_BITS);
+ assert(source->bits_available >= DEHUF_TABLE_BITS);
unsigned lookup = peek_bits(source, DEHUF_TABLE_BITS);
int code = table->lookup_table_codes[lookup];
int length = table->lookup_table_length[lookup];
if (code == DEHUF_SLOW_PATH) {
return read_huffman_symbol_slow_path(table, source);
}
-
+
read_bits(source, length);
return code;
}
+static inline unsigned read_huffman_symbol(const struct huffman_table* table,
+ struct bit_source* source)
+{
+ possibly_refill(source, DEHUF_TABLE_BITS);
+ return read_huffman_symbol_no_refill(table, source);
+}
+
// procedure EXTEND (figure F.12)
// Fast lookup table for (1 << (bits - 1)).
// The table actually helps, since the load can go in parallel with the shift
// operation below.
static const int bit_thresholds[16] = {
- 0, 1 << 0, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15
+ 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14
};
-static inline unsigned extend(int val, int bits)
+static inline unsigned extend(int val, unsigned bits)
{
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+ // GCC should ideally be able to figure out that the conditional move is better, but
+ // it doesn't for various reasons, and this is pretty important for speed, so we hardcode.
+ asm("cmp %1, %0 ; cmovl %2, %0"
+ : "+r" (val)
+ : "g" (bit_thresholds[bits]),
+ "r" (val + (-1 << bits) + 1)
+ : "cc");
+ return val;
+#else
if (val < bit_thresholds[bits]) {
return val + (-1 << bits) + 1;
} else {
return val;
}
+#endif
}
-
#endif /* !defined(_DEHUFF_H) */