git.sesse.net Git - ffmpeg/blobdiff - libavcodec/vp3.c
Don't check if the AC esc 3 could have been stored as VLC, as this detects only very...
[ffmpeg] / libavcodec / vp3.c
index 409e7923ad16655559b4eae8a0d9f15b4ac53635..c72c7fc16479d7297b507e046c46a314e6937feb 100644 (file)
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  * VP3 Video Decoder by Mike Melanson (melanson@pcisys.net)
+ * For more information about the VP3 coding process, visit:
+ *   http://www.pcisys.net/~melanson/codecs/
+ *
+ * Theora decoder by Alex Beregszaszi
  *
  */
 
@@ -35,7 +39,6 @@
 #include "dsputil.h"
 #include "mpegvideo.h"
 #include "dsputil.h"
-#include "bswap.h"
 
 #include "vp3data.h"
 
@@ -140,9 +143,6 @@ typedef struct Vp3Fragment {
     int last_coeff;
     int motion_x;
     int motion_y;
-    /* this indicates which ffmpeg put_pixels() function to use:
-     * 00b = no halfpel, 01b = x halfpel, 10b = y halfpel, 11b = both halfpel */
-    int motion_halfpel_index;
     /* address of first pixel taking into account which plane the fragment
      * lives on as well as the plane stride */
     int first_pixel;
@@ -215,6 +215,7 @@ static int ModeAlphabet[7][CODING_MODE_COUNT] =
 
 typedef struct Vp3DecodeContext {
     AVCodecContext *avctx;
+    int theora, theora_tables;
     int width, height;
     AVFrame golden_frame;
     AVFrame last_frame;
@@ -247,6 +248,13 @@ typedef struct Vp3DecodeContext {
     Vp3Fragment *all_fragments;
     int u_fragment_start;
     int v_fragment_start;
+    
+    /* tables */
+    uint16_t coded_dc_scale_factor[64];
+    uint32_t coded_quality_threshold[64];
+    uint16_t coded_intra_y_dequant[64];
+    uint16_t coded_intra_c_dequant[64];
+    uint16_t coded_inter_dequant[64];
 
     /* this is a list of indices into the all_fragments array indicating
      * which of the fragments are coded */
@@ -289,8 +297,311 @@ typedef struct Vp3DecodeContext {
     int last_coded_y_fragment;
     int last_coded_c_fragment;
 
+    uint8_t edge_emu_buffer[9*2048]; //FIXME dynamic alloc
+    uint8_t qscale_table[2048]; //FIXME dynamic alloc (width+15)/16
 } Vp3DecodeContext;
 
+/************************************************************************
+ * VP3 I/DCT
+ ************************************************************************/
+
+#define IdctAdjustBeforeShift 8
+#define xC1S7 64277
+#define xC2S6 60547
+#define xC3S5 54491
+#define xC4S4 46341
+#define xC5S3 36410
+#define xC6S2 25080
+#define xC7S1 12785
+/*
+ * Dequantize one block of 64 coefficients and run an 8x8 inverse DCT on it.
+ *
+ * input_data:     64 coefficients; element i is multiplied by
+ *                 dequant_matrix[i] and stored at dezigzag_index[i]
+ * dequant_matrix: 64 dequantizers, indexed the same way as input_data
+ * output_data:    receives the 64 transformed samples, row after row
+ *
+ * The transform is a row pass done in place on a 32-bit scratch block,
+ * followed by a column pass that adds IdctAdjustBeforeShift, shifts right
+ * by 4 and writes into output_data. Every product with an xCnSm constant
+ * is shifted right by 16 before it is used. The order of those shifts and
+ * of the int16_t truncations fixes the exact output values, so the
+ * arithmetic must not be reassociated.
+ */
+void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix, 
+    int16_t *output_data)
+{
+    int32_t intermediate_data[64]; /* dequantized block; row pass works in place */
+    int32_t *ip = intermediate_data;
+    int16_t *op = output_data;
+
+    int32_t A_, B_, C_, D_, _Ad, _Bd, _Cd, _Dd, E_, F_, G_, H_;
+    int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
+    int32_t t1, t2; /* scratch for the constant * value products */
+
+    int i, j;
+
+    debug_idct("raw coefficient block:\n");
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            debug_idct(" %5d", input_data[i * 8 + j]);
+        }
+        debug_idct("\n");
+    }
+    debug_idct("\n");
+    /* dequantize: product i lands at position dezigzag_index[i].
+     * NOTE(review): dezigzag_index is defined outside this function;
+     * presumably a permutation of 0..63 so that every scratch entry gets
+     * written here -- confirm */
+    for (i = 0; i < 64; i++) {
+        j = dezigzag_index[i];
+        intermediate_data[j] = dequant_matrix[i] * input_data[i];
+    }
+
+    debug_idct("dequantized block:\n");
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            debug_idct(" %5d", intermediate_data[i * 8 + j]);
+        }
+        debug_idct("\n");
+    }
+    debug_idct("\n");
+
+    /* Inverse DCT on the rows now; ip points at the first element of the
+     * current row and the 8 results replace the 8 inputs */
+    for (i = 0; i < 8; i++) {
+        /* Check for non-zero values; an all-zero row is skipped and
+         * therefore stays all-zero */
+        if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] ) {
+            t1 = (int32_t)(xC1S7 * ip[1]);
+            t2 = (int32_t)(xC7S1 * ip[7]);
+            t1 >>= 16;
+            t2 >>= 16;
+            A_ = t1 + t2; /* odd part, from inputs 1 and 7 */
+
+            t1 = (int32_t)(xC7S1 * ip[1]);
+            t2 = (int32_t)(xC1S7 * ip[7]);
+            t1 >>= 16;
+            t2 >>= 16;
+            B_ = t1 - t2; /* odd part, from inputs 1 and 7 */
+
+            t1 = (int32_t)(xC3S5 * ip[3]);
+            t2 = (int32_t)(xC5S3 * ip[5]);
+            t1 >>= 16;
+            t2 >>= 16;
+            C_ = t1 + t2; /* odd part, from inputs 3 and 5 */
+
+            t1 = (int32_t)(xC3S5 * ip[5]);
+            t2 = (int32_t)(xC5S3 * ip[3]);
+            t1 >>= 16;
+            t2 >>= 16;
+            D_ = t1 - t2; /* odd part, from inputs 5 and 3 */
+
+            /* differences of the odd terms, scaled by xC4S4 */
+            t1 = (int32_t)(xC4S4 * (A_ - C_));
+            t1 >>= 16;
+            _Ad = t1;
+
+            t1 = (int32_t)(xC4S4 * (B_ - D_));
+            t1 >>= 16;
+            _Bd = t1;
+
+            /* sums of the odd terms are used unscaled */
+            _Cd = A_ + C_;
+            _Dd = B_ + D_;
+
+            t1 = (int32_t)(xC4S4 * (ip[0] + ip[4]));
+            t1 >>= 16;
+            E_ = t1; /* even part, from inputs 0 and 4 */
+
+            t1 = (int32_t)(xC4S4 * (ip[0] - ip[4]));
+            t1 >>= 16;
+            F_ = t1; /* even part, from inputs 0 and 4 */
+
+            t1 = (int32_t)(xC2S6 * ip[2]);
+            t2 = (int32_t)(xC6S2 * ip[6]);
+            t1 >>= 16;
+            t2 >>= 16;
+            G_ = t1 + t2; /* even part, from inputs 2 and 6 */
+
+            t1 = (int32_t)(xC6S2 * ip[2]);
+            t2 = (int32_t)(xC2S6 * ip[6]);
+            t1 >>= 16;
+            t2 >>= 16;
+            H_ = t1 - t2; /* even part, from inputs 2 and 6 */
+
+            /* combine the partial terms into the values used for output */
+            _Ed = E_ - G_;
+            _Gd = E_ + G_;
+
+            _Add = F_ + _Ad;
+            _Bdd = _Bd - H_;
+
+            _Fd = F_ - _Ad;
+            _Hd = _Bd + H_;
+
+            /*  Final sequence of operations over-write original inputs.
+             *  Each result is truncated to int16_t before it is stored
+             *  back into the 32-bit scratch row; no rounding or scaling
+             *  is applied in this pass (shift by 0). */
+            ip[0] = (int16_t)((_Gd + _Cd )   >> 0);
+            ip[7] = (int16_t)((_Gd - _Cd )   >> 0);
+
+            ip[1] = (int16_t)((_Add + _Hd )  >> 0);
+            ip[2] = (int16_t)((_Add - _Hd )  >> 0);
+
+            ip[3] = (int16_t)((_Ed + _Dd )   >> 0);
+            ip[4] = (int16_t)((_Ed - _Dd )   >> 0);
+
+            ip[5] = (int16_t)((_Fd + _Bdd )  >> 0);
+            ip[6] = (int16_t)((_Fd - _Bdd )  >> 0);
+
+        }
+
+        ip += 8;            /* next row */
+    }
+
+    ip = intermediate_data; /* rewind for the column pass */
+    /* column pass: same arithmetic as the row pass, with inputs read at a
+     * step of 8 elements and results written to output_data */
+    for ( i = 0; i < 8; i++) {
+        /* Check for non-zero values (bitwise or faster than ||) */
+        if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
+             ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
+
+            t1 = (int32_t)(xC1S7 * ip[1*8]);
+            t2 = (int32_t)(xC7S1 * ip[7*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            A_ = t1 + t2;
+
+            t1 = (int32_t)(xC7S1 * ip[1*8]);
+            t2 = (int32_t)(xC1S7 * ip[7*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            B_ = t1 - t2;
+
+            t1 = (int32_t)(xC3S5 * ip[3*8]);
+            t2 = (int32_t)(xC5S3 * ip[5*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            C_ = t1 + t2;
+
+            t1 = (int32_t)(xC3S5 * ip[5*8]);
+            t2 = (int32_t)(xC5S3 * ip[3*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            D_ = t1 - t2;
+
+
+            t1 = (int32_t)(xC4S4 * (A_ - C_));
+            t1 >>= 16;
+            _Ad = t1;
+
+            t1 = (int32_t)(xC4S4 * (B_ - D_));
+            t1 >>= 16;
+            _Bd = t1;
+
+
+            _Cd = A_ + C_;
+            _Dd = B_ + D_;
+
+            t1 = (int32_t)(xC4S4 * (ip[0*8] + ip[4*8]));
+            t1 >>= 16;
+            E_ = t1;
+
+            t1 = (int32_t)(xC4S4 * (ip[0*8] - ip[4*8]));
+            t1 >>= 16;
+            F_ = t1;
+
+            t1 = (int32_t)(xC2S6 * ip[2*8]);
+            t2 = (int32_t)(xC6S2 * ip[6*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            G_ = t1 + t2;
+
+            t1 = (int32_t)(xC6S2 * ip[2*8]);
+            t2 = (int32_t)(xC2S6 * ip[6*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            H_ = t1 - t2;
+
+
+            _Ed = E_ - G_;
+            _Gd = E_ + G_;
+
+            _Add = F_ + _Ad;
+            _Bdd = _Bd - H_;
+
+            _Fd = F_ - _Ad;
+            _Hd = _Bd + H_;
+            /* each output below contains exactly one of these four terms,
+             * so adding the constant here biases every output once before
+             * the >> 4 */
+            _Gd += IdctAdjustBeforeShift;
+            _Add += IdctAdjustBeforeShift;
+            _Ed += IdctAdjustBeforeShift;
+            _Fd += IdctAdjustBeforeShift;
+
+            /* Final sequence of operations over-write original inputs.
+             * (In this pass the results go to output_data via op, scaled
+             * down by >> 4; the scratch block is no longer modified.) */
+            op[0*8] = (int16_t)((_Gd + _Cd )   >> 4);
+            op[7*8] = (int16_t)((_Gd - _Cd )   >> 4);
+
+            op[1*8] = (int16_t)((_Add + _Hd )  >> 4);
+            op[2*8] = (int16_t)((_Add - _Hd )  >> 4);
+
+            op[3*8] = (int16_t)((_Ed + _Dd )   >> 4);
+            op[4*8] = (int16_t)((_Ed - _Dd )   >> 4);
+
+            op[5*8] = (int16_t)((_Fd + _Bdd )  >> 4);
+            op[6*8] = (int16_t)((_Fd - _Bdd )  >> 4);
+
+        } else {
+            /* all-zero input column: the output column is written as zeros */
+            op[0*8] = 0;
+            op[7*8] = 0;
+            op[1*8] = 0;
+            op[2*8] = 0;
+            op[3*8] = 0;
+            op[4*8] = 0;
+            op[5*8] = 0;
+            op[6*8] = 0;
+        }
+
+        ip++;            /* next column */
+        op++;
+    }
+}
+
+/*
+ * Dequantize and inverse-transform a block with vp3_idct_c(), then store
+ * it as pixels: each sample is offset by +128 and saturated to 0..255.
+ *
+ * input_data, dequant_matrix: passed through to vp3_idct_c()
+ * dest:   top-left pixel of the 8x8 destination block
+ * stride: distance between the starts of consecutive destination rows
+ */
+void vp3_idct_put(int16_t *input_data, int16_t *dequant_matrix, 
+    uint8_t *dest, int stride)
+{
+    int16_t block[64];
+    const int16_t *src = block;
+    int row, col;
+
+    vp3_idct_c(input_data, dequant_matrix, block);
+
+    /* walk the transformed samples in order, one destination row at a time */
+    for (row = 0; row < 8; row++, dest += stride) {
+        for (col = 0; col < 8; col++) {
+            /* sample < -128 and sample > 127 map to biased < 0 and
+             * biased > 255 respectively */
+            int biased = *src++ + 128;
+
+            if (biased < 0)
+                dest[col] = 0;
+            else if (biased > 255)
+                dest[col] = 255;
+            else
+                dest[col] = (uint8_t)biased;
+        }
+    }
+}
+
+/*
+ * Dequantize and inverse-transform a block with vp3_idct_c(), then add the
+ * result to the 8x8 pixel block at dest, saturating each sum to 0..255.
+ *
+ * input_data, dequant_matrix: passed through to vp3_idct_c()
+ * dest:   top-left pixel of the block; it is read and then overwritten
+ * stride: distance between the starts of consecutive destination rows
+ */
+void vp3_idct_add(int16_t *input_data, int16_t *dequant_matrix, 
+    uint8_t *dest, int stride)
+{
+    int16_t transformed_data[64];
+    int16_t *op;
+    int i, j;
+    /* int, not int16_t: a pixel (0..255) plus a residual (up to 32767) can
+     * exceed the int16_t range; narrowing it first could turn a large
+     * positive sum negative and clamp it to 0 instead of 255 */
+    int sample;
+
+    vp3_idct_c(input_data, dequant_matrix, transformed_data);
+
+    /* add to the pixels already present in the output */
+    op = transformed_data;
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            sample = *dest + *op;
+            if (sample < 0)
+                *dest = 0;
+            else if (sample > 255)
+                *dest = 255;
+            else
+                *dest = (uint8_t)sample;
+            op++;
+            dest++;
+        }
+        /* 8 pixels were consumed above; move to the start of the next row */
+        dest += (stride - 8);
+    }
+}
+
 /************************************************************************
  * VP3 specific functions
  ************************************************************************/
@@ -817,6 +1128,8 @@ static void init_frame(Vp3DecodeContext *s, GetBitContext *gb)
         memset(s->all_fragments[i].coeffs, 0, 64 * sizeof(DCTELEM));
         s->all_fragments[i].coeff_count = 0;
         s->all_fragments[i].last_coeff = 0;
+s->all_fragments[i].motion_x = 0xbeef;
+s->all_fragments[i].motion_y = 0xbeef;
     }
 }
 
@@ -827,8 +1140,8 @@ static void init_frame(Vp3DecodeContext *s, GetBitContext *gb)
 static void init_dequantizer(Vp3DecodeContext *s)
 {
 
-    int quality_scale = vp31_quality_threshold[s->quality_index];
-    int dc_scale_factor = vp31_dc_scale_factor[s->quality_index];
+    int quality_scale = s->coded_quality_threshold[s->quality_index];
+    int dc_scale_factor = s->coded_dc_scale_factor[s->quality_index];
     int i, j;
 
     debug_vp3("  vp3: initializing dequantization tables\n");
@@ -845,20 +1158,20 @@ static void init_dequantizer(Vp3DecodeContext *s)
      *
      * Then, saturate the result to a lower limit of MIN_DEQUANT_VAL.
      */
-#define SCALER 1
+#define SCALER 4
 
     /* scale DC quantizers */
-    s->intra_y_dequant[0] = vp31_intra_y_dequant[0] * dc_scale_factor / 100;
+    s->intra_y_dequant[0] = s->coded_intra_y_dequant[0] * dc_scale_factor / 100;
     if (s->intra_y_dequant[0] < MIN_DEQUANT_VAL * 2)
         s->intra_y_dequant[0] = MIN_DEQUANT_VAL * 2;
     s->intra_y_dequant[0] *= SCALER;
 
-    s->intra_c_dequant[0] = vp31_intra_c_dequant[0] * dc_scale_factor / 100;
+    s->intra_c_dequant[0] = s->coded_intra_c_dequant[0] * dc_scale_factor / 100;
     if (s->intra_c_dequant[0] < MIN_DEQUANT_VAL * 2)
         s->intra_c_dequant[0] = MIN_DEQUANT_VAL * 2;
     s->intra_c_dequant[0] *= SCALER;
 
-    s->inter_dequant[0] = vp31_inter_dequant[0] * dc_scale_factor / 100;
+    s->inter_dequant[0] = s->coded_inter_dequant[0] * dc_scale_factor / 100;
     if (s->inter_dequant[0] < MIN_DEQUANT_VAL * 4)
         s->inter_dequant[0] = MIN_DEQUANT_VAL * 4;
     s->inter_dequant[0] *= SCALER;
@@ -869,21 +1182,23 @@ static void init_dequantizer(Vp3DecodeContext *s)
 
         j = zigzag_index[i];
 
-        s->intra_y_dequant[j] = vp31_intra_y_dequant[i] * quality_scale / 100;
+        s->intra_y_dequant[j] = s->coded_intra_y_dequant[i] * quality_scale / 100;
         if (s->intra_y_dequant[j] < MIN_DEQUANT_VAL)
             s->intra_y_dequant[j] = MIN_DEQUANT_VAL;
         s->intra_y_dequant[j] *= SCALER;
 
-        s->intra_c_dequant[j] = vp31_intra_c_dequant[i] * quality_scale / 100;
+        s->intra_c_dequant[j] = s->coded_intra_c_dequant[i] * quality_scale / 100;
         if (s->intra_c_dequant[j] < MIN_DEQUANT_VAL)
             s->intra_c_dequant[j] = MIN_DEQUANT_VAL;
         s->intra_c_dequant[j] *= SCALER;
 
-        s->inter_dequant[j] = vp31_inter_dequant[i] * quality_scale / 100;
+        s->inter_dequant[j] = s->coded_inter_dequant[i] * quality_scale / 100;
         if (s->inter_dequant[j] < MIN_DEQUANT_VAL * 2)
             s->inter_dequant[j] = MIN_DEQUANT_VAL * 2;
         s->inter_dequant[j] *= SCALER;
     }
+    
+    memset(s->qscale_table, (FFMAX(s->intra_y_dequant[1], s->intra_c_dequant[1])+8)/16, 512); //FIXME finetune
 
     /* print debug information as requested */
     debug_dequantizers("intra Y dequantizers:\n");
@@ -1132,6 +1447,7 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
     int current_run = 0;
     int decode_fully_flags = 0;
     int decode_partial_blocks = 0;
+    int first_c_fragment_seen;
 
     int i, j;
     int current_fragment;
@@ -1161,14 +1477,14 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
 
                 /* if any of the superblocks are not partially coded, flag
                  * a boolean to decode the list of fully-coded superblocks */
-                if (bit == 0)
+                if (bit == 0) {
                     decode_fully_flags = 1;
-            } else {
-
-                /* make a note of the fact that there are partially coded
-                 * superblocks */
-                decode_partial_blocks = 1;
+                } else {
 
+                    /* make a note of the fact that there are partially coded
+                     * superblocks */
+                    decode_partial_blocks = 1;
+                }
             }
             s->superblock_coding[current_superblock++] = 
                 (bit) ? SB_PARTIALLY_CODED : SB_NOT_CODED;
@@ -1223,6 +1539,7 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
     s->coded_fragment_list_index = 0;
     s->first_coded_y_fragment = s->first_coded_c_fragment = 0;
     s->last_coded_y_fragment = s->last_coded_c_fragment = -1;
+    first_c_fragment_seen = 0;
     memset(s->macroblock_coding, MODE_COPY, s->macroblock_count);
     for (i = 0; i < s->superblock_count; i++) {
 
@@ -1253,15 +1570,18 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
                     }
 
                     if (bit) {
-                        /* mode will be decoded in the next phase */
+                        /* default mode; actual mode will be decoded in 
+                         * the next phase */
                         s->all_fragments[current_fragment].coding_method = 
                             MODE_INTER_NO_MV;
                         s->coded_fragment_list[s->coded_fragment_list_index] = 
                             current_fragment;
                         if ((current_fragment >= s->u_fragment_start) &&
-                            (s->last_coded_y_fragment == -1)) {
+                            (s->last_coded_y_fragment == -1) &&
+                            (!first_c_fragment_seen)) {
                             s->first_coded_c_fragment = s->coded_fragment_list_index;
                             s->last_coded_y_fragment = s->first_coded_c_fragment - 1;
+                            first_c_fragment_seen = 1;
                         }
                         s->coded_fragment_list_index++;
                         s->macroblock_coding[s->all_fragments[current_fragment].macroblock] = MODE_INTER_NO_MV;
@@ -1286,9 +1606,11 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
                     s->coded_fragment_list[s->coded_fragment_list_index] = 
                         current_fragment;
                     if ((current_fragment >= s->u_fragment_start) &&
-                        (s->last_coded_y_fragment == -1)) {
+                        (s->last_coded_y_fragment == -1) &&
+                        (!first_c_fragment_seen)) {
                         s->first_coded_c_fragment = s->coded_fragment_list_index;
                         s->last_coded_y_fragment = s->first_coded_c_fragment - 1;
+                        first_c_fragment_seen = 1;
                     }
                     s->coded_fragment_list_index++;
                     s->macroblock_coding[s->all_fragments[current_fragment].macroblock] = MODE_INTER_NO_MV;
@@ -1299,11 +1621,13 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
         }
     }
 
-    if (s->first_coded_c_fragment == 0)
-        /* no C fragments coded */
+    if (!first_c_fragment_seen)
+        /* only Y fragments coded in this frame */
         s->last_coded_y_fragment = s->coded_fragment_list_index - 1;
-    else
+    else 
+        /* end the list of coded C fragments */
         s->last_coded_c_fragment = s->coded_fragment_list_index - 1;
+
     debug_block_coding("    %d total coded fragments, y: %d -> %d, c: %d -> %d\n",
         s->coded_fragment_list_index,
         s->first_coded_y_fragment,
@@ -1398,55 +1722,6 @@ static int unpack_modes(Vp3DecodeContext *s, GetBitContext *gb)
     return 0;
 }
 
-/*
- * This function adjusts the components of a motion vector for the halfpel
- * motion grid. c_plane indicates whether the vector applies to the U or V
- * plane. The function returns the halfpel function index to be used in
- * ffmpeg's put_pixels[]() array of functions.
- */
-static inline int adjust_vector(int *x, int *y, int c_plane)
-{
-    int motion_halfpel_index = 0;
-    int x_halfpel;
-    int y_halfpel;
-
-    if (!c_plane) {
-
-        x_halfpel = *x & 1;
-        motion_halfpel_index |= x_halfpel;
-        if (*x >= 0)
-            *x >>= 1;
-        else
-            *x = -( (-(*x) >> 1) + x_halfpel);
-
-        y_halfpel = *y & 1;
-        motion_halfpel_index |= (y_halfpel << 1);
-        if (*y >= 0)
-            *y >>= 1;
-        else
-            *y = -( (-(*y) >> 1) + y_halfpel);
-
-    } else {
-
-        x_halfpel = ((*x & 0x03) != 0);
-        motion_halfpel_index |= x_halfpel;
-        if (*x >= 0)
-            *x >>= 2;
-        else
-            *x = -( (-(*x) >> 2) + x_halfpel);
-
-        y_halfpel = ((*y & 0x03) != 0);
-        motion_halfpel_index |= (y_halfpel << 1);
-        if (*y >= 0)
-            *y >>= 2;
-        else
-            *y = -( (-(*y) >> 2) + y_halfpel);
-
-    }
-
-    return motion_halfpel_index;
-}
-
 /*
  * This function unpacks all the motion vectors for the individual
  * macroblocks from the bitstream.
@@ -1465,7 +1740,6 @@ static int unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
     int current_fragment;
 
     debug_vp3("  vp3: unpacking motion vectors\n");
-
     if (s->keyframe) {
 
         debug_vp3("    keyframe-- there are no motion vectors\n");
@@ -1519,7 +1793,7 @@ static int unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
                     }
 
                     /* vector maintenance, only on MODE_INTER_PLUS_MV */
-                    if (s->all_fragments[current_fragment].coding_method ==
+                    if (s->macroblock_coding[current_macroblock] ==
                         MODE_INTER_PLUS_MV) {
                         prior_last_motion_x = last_motion_x;
                         prior_last_motion_y = last_motion_y;
@@ -1606,7 +1880,7 @@ static int unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
                 /* assign the motion vectors to the correct fragments */
                 debug_vectors("    vectors for macroblock starting @ fragment %d (coding method %d):\n",
                     current_fragment,
-                    s->all_fragments[current_fragment].coding_method);
+                    s->macroblock_coding[current_macroblock]);
                 for (k = 0; k < 6; k++) {
                     current_fragment = 
                         s->macroblock_fragments[current_macroblock * 6 + k];
@@ -1617,14 +1891,10 @@ static int unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
                             current_fragment, s->fragment_count);
                         return 1;
                     }
-                    s->all_fragments[current_fragment].motion_halfpel_index =
-                        adjust_vector(&motion_x[k], &motion_y[k],
-                        ((k == 4) || (k == 5)));
                     s->all_fragments[current_fragment].motion_x = motion_x[k];
                     s->all_fragments[current_fragment].motion_y = motion_y[k];
-                    debug_vectors("    vector %d: fragment %d = (%d, %d), index %d\n",
-                        k, current_fragment, motion_x[k], motion_y[k],
-                        s->all_fragments[current_fragment].motion_halfpel_index);
+                    debug_vectors("    vector %d: fragment %d = (%d, %d)\n",
+                        k, current_fragment, motion_x[k], motion_y[k]);
                 }
             }
         }
@@ -1656,14 +1926,12 @@ static int unpack_vlcs(Vp3DecodeContext *s, GetBitContext *gb,
     DCTELEM coeff;
     Vp3Fragment *fragment;
 
-    if ((first_fragment < 0) ||
-        (first_fragment >= s->fragment_count) ||
-        (last_fragment < 0) ||
+    if ((first_fragment >= s->fragment_count) ||
         (last_fragment >= s->fragment_count)) {
 
         printf ("  vp3:unpack_vlcs(): bad fragment number (%d -> %d ?)\n",
             first_fragment, last_fragment);
-        return 1;
+        return 0;
     }
 
     for (i = first_fragment; i <= last_fragment; i++) {
@@ -2078,10 +2346,7 @@ static void render_fragments(Vp3DecodeContext *s,
     int x, y;
     int m, n;
     int i = first_fragment;
-    int j;
     int16_t *dequantizer;
-    DCTELEM dequant_block[64];
-    DCTELEM dequant_block_permuted[64];
     unsigned char *output_plane;
     unsigned char *last_plane;
     unsigned char *golden_plane;
@@ -2089,7 +2354,7 @@ static void render_fragments(Vp3DecodeContext *s,
     int motion_x, motion_y;
     int upper_motion_limit, lower_motion_limit;
     int motion_halfpel_index;
-    unsigned int motion_source;
+    uint8_t *motion_source;
 
     debug_vp3("  vp3: rendering final fragments for %s\n",
         (plane == 0) ? "Y plane" : (plane == 1) ? "U plane" : "V plane");
@@ -2135,40 +2400,55 @@ static void render_fragments(Vp3DecodeContext *s,
             /* transform if this block was coded */
             if (s->all_fragments[i].coding_method != MODE_COPY) {
 
-                /* sort out the motion vector */
-                motion_x = s->all_fragments[i].motion_x;
-                motion_y = s->all_fragments[i].motion_y;
-                motion_halfpel_index = s->all_fragments[i].motion_halfpel_index;
-
-                motion_source = s->all_fragments[i].first_pixel;
-                motion_source += motion_x;
-                motion_source += (motion_y * stride);
-
-                /* if the are any problems with a motion vector, refuse
-                 * to render the block */
-                if ((motion_source < upper_motion_limit) ||
-                    (motion_source > lower_motion_limit)) {
-//                    printf ("  vp3: help! motion source (%d) out of range (%d..%d)\n",
-//                        motion_source, upper_motion_limit, lower_motion_limit);
-                    continue;
+                if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) ||
+                    (s->all_fragments[i].coding_method == MODE_GOLDEN_MV))
+                    motion_source= golden_plane;
+                else 
+                    motion_source= last_plane;
+
+                motion_source += s->all_fragments[i].first_pixel;
+                motion_halfpel_index = 0;
+
+                /* sort out the motion vector if this fragment is coded
+                 * using a motion vector method */
+                if ((s->all_fragments[i].coding_method > MODE_INTRA) &&
+                    (s->all_fragments[i].coding_method != MODE_USING_GOLDEN)) {
+                    int src_x, src_y;
+                    motion_x = s->all_fragments[i].motion_x;
+                    motion_y = s->all_fragments[i].motion_y;
+                    if(plane){
+                        motion_x= (motion_x>>1) | (motion_x&1);
+                        motion_y= (motion_y>>1) | (motion_y&1);
+                    }
+
+                    src_x= (motion_x>>1) + x;
+                    src_y= (motion_y>>1) + y;
+if ((motion_x == 0xbeef) || (motion_y == 0xbeef))
+printf (" help! got beefy vector! (%X, %X)\n", motion_x, motion_y);
+
+                    motion_halfpel_index = motion_x & 0x01;
+                    motion_source += (motion_x >> 1);
+
+//                    motion_y = -motion_y;
+                    motion_halfpel_index |= (motion_y & 0x01) << 1;
+                    motion_source += ((motion_y >> 1) * stride);
+
+                    if(src_x<0 || src_y<0 || src_x + 9 >= width || src_y + 9 >= height){
+                        uint8_t *temp= s->edge_emu_buffer;
+                        if(stride<0) temp -= 9*stride;
+
+                        ff_emulated_edge_mc(temp, motion_source, stride, 9, 9, src_x, src_y, width, height);
+                        motion_source= temp;
+                    }
                 }
 
                 /* first, take care of copying a block from either the
                  * previous or the golden frame */
-                if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) ||
-                    (s->all_fragments[i].coding_method == MODE_GOLDEN_MV)) {
-
-                    s->dsp.put_pixels_tab[1][motion_halfpel_index](
-                        output_plane + s->all_fragments[i].first_pixel,
-                        golden_plane + motion_source,
-                        stride, 8);
-
-                } else 
                 if (s->all_fragments[i].coding_method != MODE_INTRA) {
 
-                    s->dsp.put_pixels_tab[1][motion_halfpel_index](
+                    s->dsp.put_no_rnd_pixels_tab[1][motion_halfpel_index](
                         output_plane + s->all_fragments[i].first_pixel,
-                        last_plane + motion_source,
+                        motion_source,
                         stride, 8);
                 }
 
@@ -2176,34 +2456,16 @@ static void render_fragments(Vp3DecodeContext *s,
                 debug_idct("fragment %d, coding mode %d, DC = %d, dequant = %d:\n", 
                     i, s->all_fragments[i].coding_method, 
                     s->all_fragments[i].coeffs[0], dequantizer[0]);
-                for (j = 0; j < 64; j++)
-                    dequant_block[dezigzag_index[j]] =
-                        s->all_fragments[i].coeffs[j] *
-                        dequantizer[j];
-                for (j = 0; j < 64; j++)
-                    dequant_block_permuted[s->dsp.idct_permutation[j]] =
-                        dequant_block[j];
-
-                debug_idct("dequantized block:\n");
-                for (m = 0; m < 8; m++) {
-                    for (n = 0; n < 8; n++) {
-                        debug_idct(" %5d", dequant_block[m * 8 + n]);
-                    }
-                    debug_idct("\n");
-                }
-                debug_idct("\n");
 
                 /* invert DCT and place (or add) in final output */
-
                 if (s->all_fragments[i].coding_method == MODE_INTRA) {
-                    dequant_block_permuted[0] += 1024;
-                    s->dsp.idct_put(
+                    vp3_idct_put(s->all_fragments[i].coeffs, dequantizer,
                         output_plane + s->all_fragments[i].first_pixel,
-                        stride, dequant_block_permuted);
+                        stride);
                 } else {
-                    s->dsp.idct_add(
+                    vp3_idct_add(s->all_fragments[i].coeffs, dequantizer,
                         output_plane + s->all_fragments[i].first_pixel,
-                        stride, dequant_block_permuted);
+                        stride);
                 }
 
                 debug_idct("block after idct_%s():\n",
@@ -2298,8 +2560,13 @@ static int vp3_decode_init(AVCodecContext *avctx)
     int c_superblock_count;
 
     s->avctx = avctx;
+#if 0
     s->width = avctx->width;
     s->height = avctx->height;
+#else
+    s->width = (avctx->width + 15) & 0xFFFFFFF0;
+    s->height = (avctx->height + 15) & 0xFFFFFFF0;
+#endif
     avctx->pix_fmt = PIX_FMT_YUV420P;
     avctx->has_b_frames = 0;
     dsputil_init(&s->dsp, avctx);
@@ -2357,6 +2624,20 @@ static int vp3_decode_init(AVCodecContext *avctx)
     s->coded_fragment_list = av_malloc(s->fragment_count * sizeof(int));
     s->pixel_addresses_inited = 0;
 
+    if (!s->theora_tables)
+    {
+       for (i = 0; i < 64; i++)
+           s->coded_dc_scale_factor[i] = vp31_dc_scale_factor[i];
+       for (i = 0; i < 64; i++)
+           s->coded_quality_threshold[i] = vp31_quality_threshold[i];
+       for (i = 0; i < 64; i++)
+           s->coded_intra_y_dequant[i] = vp31_intra_y_dequant[i];
+       for (i = 0; i < 64; i++)
+           s->coded_intra_c_dequant[i] = vp31_intra_c_dequant[i];
+       for (i = 0; i < 64; i++)
+           s->coded_inter_dequant[i] = vp31_inter_dequant[i];
+    }
+
     /* init VLC tables */
     for (i = 0; i < 16; i++) {
 
@@ -2420,22 +2701,39 @@ static int vp3_decode_frame(AVCodecContext *avctx,
     *data_size = 0;
 
     init_get_bits(&gb, buf, buf_size * 8);
+    
+    if (s->theora && get_bits1(&gb))
+    {
+       printf("Theora: bad frame indicator\n");
+       return -1;
+    }
 
-    s->keyframe = get_bits(&gb, 1);
-    s->keyframe ^= 1;
-    skip_bits(&gb, 1);
+    s->keyframe = !get_bits1(&gb);
+    if (s->theora && s->keyframe)
+    {
+       if (get_bits1(&gb))
+           printf("Theora: warning, unsupported keyframe coding type?!\n");
+       skip_bits(&gb, 2); /* reserved? */
+    }
+    else
+       skip_bits(&gb, 1);
     s->last_quality_index = s->quality_index;
     s->quality_index = get_bits(&gb, 6);
-    if (s->quality_index != s->last_quality_index)
-        init_dequantizer(s);
 
-    debug_vp3(" VP3 frame #%d: Q index = %d", counter, s->quality_index);
+    debug_vp3(" VP3 %sframe #%d: Q index = %d\n",
+       s->keyframe?"key":"", counter, s->quality_index);
     counter++;
 
+    if (s->quality_index != s->last_quality_index)
+        init_dequantizer(s);
+
     if (s->keyframe) {
+        /* skip the other 2 header bytes for now */
+        if (!s->theora) skip_bits(&gb, 16);
         if (s->last_frame.data[0] == s->golden_frame.data[0]) {
             if (s->golden_frame.data[0])
                 avctx->release_buffer(avctx, &s->golden_frame);
+            s->last_frame= s->golden_frame; /* ensure that we catch any access to this released frame */
         } else {
             if (s->golden_frame.data[0])
                 avctx->release_buffer(avctx, &s->golden_frame);
@@ -2443,7 +2741,7 @@ static int vp3_decode_frame(AVCodecContext *avctx,
                 avctx->release_buffer(avctx, &s->last_frame);
         }
 
-        s->golden_frame.reference = 0;
+        s->golden_frame.reference = 3;
         if(avctx->get_buffer(avctx, &s->golden_frame) < 0) {
             printf("vp3: get_buffer() failed\n");
             return -1;
@@ -2457,22 +2755,16 @@ static int vp3_decode_frame(AVCodecContext *avctx,
             vp3_calculate_pixel_addresses(s);
 
     } else {
-
         /* allocate a new current frame */
-        s->current_frame.reference = 0;
+        s->current_frame.reference = 3;
         if(avctx->get_buffer(avctx, &s->current_frame) < 0) {
             printf("vp3: get_buffer() failed\n");
             return -1;
         }
-
     }
 
-    if (s->keyframe) {
-      debug_vp3(", keyframe\n");
-      /* skip the other 2 header bytes for now */
-      skip_bits(&gb, 16);
-    } else
-      debug_vp3("\n");
+    s->current_frame.qscale_table= s->qscale_table; //FIXME allocate individual tables per AVFrame
+    s->current_frame.qstride= 0;
 
     init_frame(s, &gb);
 
@@ -2499,14 +2791,19 @@ if (!s->keyframe) {
     }
 
     reverse_dc_prediction(s, 0, s->fragment_width, s->fragment_height);
-    reverse_dc_prediction(s, s->u_fragment_start,
-        s->fragment_width / 2, s->fragment_height / 2);
-    reverse_dc_prediction(s, s->v_fragment_start,
-        s->fragment_width / 2, s->fragment_height / 2);
-
     render_fragments(s, 0, s->width, s->height, 0);
-    render_fragments(s, s->u_fragment_start, s->width / 2, s->height / 2, 1);
-    render_fragments(s, s->v_fragment_start, s->width / 2, s->height / 2, 2);
+
+    if ((avctx->flags & CODEC_FLAG_GRAY) == 0) {
+        reverse_dc_prediction(s, s->u_fragment_start,
+            s->fragment_width / 2, s->fragment_height / 2);
+        reverse_dc_prediction(s, s->v_fragment_start,
+            s->fragment_width / 2, s->fragment_height / 2);
+        render_fragments(s, s->u_fragment_start, s->width / 2, s->height / 2, 1);
+        render_fragments(s, s->v_fragment_start, s->width / 2, s->height / 2, 2);
+    } else {
+        memset(s->current_frame.data[1], 0x80, s->width * s->height / 4);
+        memset(s->current_frame.data[2], 0x80, s->width * s->height / 4);
+    }
 
 #if KEYFRAMES_ONLY
 }
@@ -2523,6 +2820,7 @@ if (!s->keyframe) {
 
     /* shuffle frames (last = current) */
     memcpy(&s->last_frame, &s->current_frame, sizeof(AVFrame));
+    s->current_frame.data[0]= NULL; /* ensure that we catch any access to this released frame */
 
     return buf_size;
 }
@@ -2540,9 +2838,9 @@ static int vp3_decode_end(AVCodecContext *avctx)
     av_free(s->superblock_macroblocks);
     av_free(s->macroblock_fragments);
     av_free(s->macroblock_coding);
-
+    
     /* release all frames */
-    if (s->golden_frame.data[0])
+    if (s->golden_frame.data[0] && s->golden_frame.data[0] != s->last_frame.data[0])
         avctx->release_buffer(avctx, &s->golden_frame);
     if (s->last_frame.data[0])
         avctx->release_buffer(avctx, &s->last_frame);
@@ -2552,6 +2850,133 @@ static int vp3_decode_end(AVCodecContext *avctx)
     return 0;
 }
 
+/* current version is 3.2.0 */
+
+/*
+ * Parse the Theora info header (packet type 0x80) and initialise the
+ * decoder for the coded frame size it announces.
+ *
+ * avctx - codec context; avctx->width and avctx->height are overwritten
+ *         with the coded dimensions read from the header
+ * gb    - bit reader; theora_decode_init() hands it over positioned just
+ *         after the packet type byte and the "theora" magic. It is passed
+ *         by value, so the caller's read position is not advanced.
+ *
+ * Returns whatever vp3_decode_init() returns, so that a failed decoder
+ * setup is no longer reported as success.
+ */
+static int theora_decode_header(AVCodecContext *avctx, GetBitContext gb)
+{
+    Vp3DecodeContext *s = avctx->priv_data;
+
+    skip_bits(&gb, 8); /* version major */
+    skip_bits(&gb, 8); /* version minor */
+    skip_bits(&gb, 8); /* version micro */
+
+    /* the two 16-bit size fields are scaled by 16 to get pixels */
+    s->width = get_bits(&gb, 16) << 4;
+    s->height = get_bits(&gb, 16) << 4;
+
+    skip_bits(&gb, 24); /* frame width */
+    skip_bits(&gb, 24); /* frame height */
+
+    skip_bits(&gb, 8); /* offset x */
+    skip_bits(&gb, 8); /* offset y */
+
+    skip_bits(&gb, 32); /* fps numerator */
+    skip_bits(&gb, 32); /* fps denominator */
+    skip_bits(&gb, 24); /* aspect numerator */
+    skip_bits(&gb, 24); /* aspect denominator */
+
+    skip_bits(&gb, 5); /* keyframe frequency force */
+    skip_bits(&gb, 8); /* colorspace */
+    skip_bits(&gb, 24); /* bitrate */
+
+    skip_bits(&gb, 6); /* last(?) quality index */
+
+    /* anything left in the packet after this point is not read */
+
+    avctx->width = s->width;
+    avctx->height = s->height;
+
+    /* NOTE(review): assumes vp3_decode_init() returns int like the other
+     * init callback in this file - confirm against its definition */
+    return vp3_decode_init(avctx);
+}
+
+/*
+ * Walk over the Theora comment header (packet type 0x81) without keeping
+ * any of it: one length-prefixed field (presumably the vendor string),
+ * then a 32-bit comment count followed by that many length-prefixed
+ * fields.
+ *
+ * avctx - codec context (unused)
+ * gb    - bit reader; passed by value, so nothing consumed here is
+ *         visible to the caller
+ *
+ * Always returns 0.
+ *
+ * NOTE(review): for a well-formed length L (positive multiple of 8) each
+ * loop below skips L/8 - 1 bytes, exactly as the previous code did.
+ * Whether that count and the bit (rather than byte) unit are intended
+ * should be checked against the bitstream description.
+ * NOTE(review): confirm that get_bits() in this tree supports 32-bit
+ * reads.
+ */
+static int theora_decode_comments(AVCodecContext *avctx, GetBitContext gb)
+{
+    int nb_comments, i, tmp;
+
+    tmp = get_bits(&gb, 32);
+    /* Terminate on <= 0 instead of == 0: a length of zero, a negative
+     * value or one that is not a multiple of 8 used to step past zero
+     * and keep skipping until the counter wrapped around. */
+    while ((tmp -= 8) > 0)
+        skip_bits(&gb, 8);
+
+    nb_comments = get_bits(&gb, 32);
+    for (i = 0; i < nb_comments; i++)
+    {
+        tmp = get_bits(&gb, 32);
+        /* same termination rule as for the leading field */
+        while ((tmp -= 8) > 0)
+            skip_bits(&gb, 8);
+    }
+
+    return 0;
+}
+
+/*
+ * Read the five 64-entry tables carried by the Theora tables header
+ * (packet type 0x82) into the decoder context, in bitstream order, and
+ * set s->theora_tables to record that they have been loaded.
+ *
+ * avctx - codec context whose priv_data is the Vp3DecodeContext to fill
+ * gb    - bit reader; passed by value, so the caller's read position is
+ *         not advanced
+ *
+ * Always returns 0.
+ */
+static int theora_decode_tables(AVCodecContext *avctx, GetBitContext gb)
+{
+    Vp3DecodeContext *s = avctx->priv_data;
+    int entry;
+
+    /* 64 quality thresholds, 16 bits each */
+    for (entry = 0; entry < 64; ++entry) {
+        s->coded_quality_threshold[entry] = get_bits(&gb, 16);
+    }
+
+    /* 64 DC scale factors, 16 bits each */
+    for (entry = 0; entry < 64; ++entry) {
+        s->coded_dc_scale_factor[entry] = get_bits(&gb, 16);
+    }
+
+    /* 64 intra Y (luma) dequantizer coefficients, 8 bits each */
+    for (entry = 0; entry < 64; ++entry) {
+        s->coded_intra_y_dequant[entry] = get_bits(&gb, 8);
+    }
+
+    /* 64 intra UV (chroma) dequantizer coefficients, 8 bits each */
+    for (entry = 0; entry < 64; ++entry) {
+        s->coded_intra_c_dequant[entry] = get_bits(&gb, 8);
+    }
+
+    /* 64 inter dequantizer coefficients, 8 bits each */
+    for (entry = 0; entry < 64; ++entry) {
+        s->coded_inter_dequant[entry] = get_bits(&gb, 8);
+    }
+
+    s->theora_tables = 1;
+
+    return 0;
+}
+
+/*
+ * Init callback of the Theora decoder: marks the context as Theora and
+ * parses the single header packet supplied in avctx->extradata.
+ *
+ * The packet starts with a type byte (top bit set for header packets)
+ * followed by the 6-byte "theora" magic; the type selects the parser:
+ *   0x80  info header (also initialises the decoder)
+ *   0x81  comment header
+ *   0x82  tables header
+ * Other header types are accepted and ignored.
+ *
+ * Returns -1 if extradata is missing, too short to hold the type byte
+ * and magic, or is not a header packet; otherwise the result of the
+ * selected parser (0 for ignored types).
+ */
+static int theora_decode_init(AVCodecContext *avctx)
+{
+    Vp3DecodeContext *s = avctx->priv_data;
+    GetBitContext gb;
+    int ptype;
+
+    s->theora = 1;
+
+    /* the type byte plus the magic are read unconditionally below */
+    if (!avctx->extradata || avctx->extradata_size < 7)
+        return -1;
+
+    /* init_get_bits() takes the buffer size in bits (vp3_decode_frame()
+     * passes buf_size * 8), so convert the byte count */
+    init_get_bits(&gb, avctx->extradata, avctx->extradata_size * 8);
+
+    ptype = get_bits(&gb, 8);
+    debug_vp3("Theora headerpacket type: %x\n", ptype);
+
+    if (!(ptype & 0x80))
+        return -1;
+
+    skip_bits(&gb, 6*8); /* "theora" */
+
+    switch(ptype)
+    {
+        case 0x80:
+            /* theora_decode_header() already calls vp3_decode_init();
+             * calling it a second time here would set the decoder up
+             * twice for the same stream */
+            return theora_decode_header(avctx, gb);
+        case 0x81:
+            return theora_decode_comments(avctx, gb);
+        case 0x82:
+            return theora_decode_tables(avctx, gb);
+        default:
+            /* unknown header packet: nothing to parse */
+            break;
+    }
+
+    return 0;
+}
+
 AVCodec vp3_decoder = {
     "vp3",
     CODEC_TYPE_VIDEO,
@@ -2564,3 +2989,16 @@ AVCodec vp3_decoder = {
     0,
     NULL
 };
+
+AVCodec theora_decoder = {
+    "theora",                 /* codec name */
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_THEORA,
+    sizeof(Vp3DecodeContext), /* presumably the size of priv_data; the callbacks treat it as a Vp3DecodeContext */
+    theora_decode_init,       /* Theora-specific init: parses the header packet in extradata */
+    NULL,                     /* NOTE(review): presumably the encode callback slot (decoder only) - confirm against AVCodec */
+    vp3_decode_end,           /* teardown shared with the VP3 decoder */
+    vp3_decode_frame,         /* frame decoding shared with the VP3 decoder; it branches on s->theora */
+    0,
+    NULL
+};