Don't check if the AC esc 3 could have been stored as VLC, as this detects only very...

[ffmpeg] / libavcodec / vp3.c
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c

index 409e7923ad16655559b4eae8a0d9f15b4ac53635..c72c7fc16479d7297b507e046c46a314e6937feb 100644 (file)
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -17,6 +17,10 @@
   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   *
   * VP3 Video Decoder by Mike Melanson (melanson@pcisys.net)
+ * For more information about the VP3 coding process, visit:
+ *   http://www.pcisys.net/~melanson/codecs/
+ *
+ * Theora decoder by Alex Beregszaszi
   *
   */
  
@@ -35,7 +39,6 @@
  #include "dsputil.h"
  #include "mpegvideo.h"
  #include "dsputil.h"
-#include "bswap.h"
  
  #include "vp3data.h"
  
@@ -140,9 +143,6 @@ typedef struct Vp3Fragment {
      int last_coeff;
      int motion_x;
      int motion_y;
-    /* this indicates which ffmpeg put_pixels() function to use:
-     * 00b = no halfpel, 01b = x halfpel, 10b = y halfpel, 11b = both halfpel */
-    int motion_halfpel_index;
      /* address of first pixel taking into account which plane the fragment
       * lives on as well as the plane stride */
      int first_pixel;
@@ -215,6 +215,7 @@ static int ModeAlphabet[7][CODING_MODE_COUNT] =
  
  typedef struct Vp3DecodeContext {
      AVCodecContext *avctx;
+    int theora, theora_tables;
      int width, height;
      AVFrame golden_frame;
      AVFrame last_frame;
@@ -247,6 +248,13 @@ typedef struct Vp3DecodeContext {
      Vp3Fragment *all_fragments;
      int u_fragment_start;
      int v_fragment_start;
+    
+    /* tables */
+    uint16_t coded_dc_scale_factor[64];
+    uint32_t coded_quality_threshold[64];
+    uint16_t coded_intra_y_dequant[64];
+    uint16_t coded_intra_c_dequant[64];
+    uint16_t coded_inter_dequant[64];
  
      /* this is a list of indices into the all_fragments array indicating
       * which of the fragments are coded */
@@ -289,8 +297,311 @@ typedef struct Vp3DecodeContext {
      int last_coded_y_fragment;
      int last_coded_c_fragment;
  
+    uint8_t edge_emu_buffer[9*2048]; //FIXME dynamic alloc
+    uint8_t qscale_table[2048]; //FIXME dynamic alloc (width+15)/16
  } Vp3DecodeContext;
  
+/************************************************************************
+ * VP3 I/DCT
+ ************************************************************************/
+
+#define IdctAdjustBeforeShift 8
+#define xC1S7 64277
+#define xC2S6 60547
+#define xC3S5 54491
+#define xC4S4 46341
+#define xC5S3 36410
+#define xC6S2 25080
+#define xC7S1 12785
+
+void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix, 
+    int16_t *output_data)
+{ /* dequantize input_data, then run a row pass and a column pass of the inverse DCT into output_data */
+    int32_t intermediate_data[64]; /* dequantized coefficients; the row pass works on it in place */
+    int32_t *ip = intermediate_data;
+    int16_t *op = output_data;
+
+    int32_t A_, B_, C_, D_, _Ad, _Bd, _Cd, _Dd, E_, F_, G_, H_;
+    int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
+    int32_t t1, t2;
+
+    int i, j;
+
+    debug_idct("raw coefficient block:\n");
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            debug_idct(" %5d", input_data[i * 8 + j]);
+        }
+        debug_idct("\n");
+    }
+    debug_idct("\n");
+
+    for (i = 0; i < 64; i++) { /* dequantize; dezigzag_index[i] selects the destination slot */
+        j = dezigzag_index[i];
+        intermediate_data[j] = dequant_matrix[i] * input_data[i];
+    }
+
+    debug_idct("dequantized block:\n");
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            debug_idct(" %5d", intermediate_data[i * 8 + j]);
+        }
+        debug_idct("\n");
+    }
+    debug_idct("\n");
+
+    /* Pass 1: inverse DCT on each of the 8 rows, in place in intermediate_data */
+    for (i = 0; i < 8; i++) {
+        /* Skip rows whose 8 values are all zero; such a row is left untouched */
+        if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] ) {
+            t1 = (int32_t)(xC1S7 * ip[1]); /* each product is scaled back down by the >> 16 below */
+            t2 = (int32_t)(xC7S1 * ip[7]);
+            t1 >>= 16;
+            t2 >>= 16;
+            A_ = t1 + t2;
+
+            t1 = (int32_t)(xC7S1 * ip[1]);
+            t2 = (int32_t)(xC1S7 * ip[7]);
+            t1 >>= 16;
+            t2 >>= 16;
+            B_ = t1 - t2;
+
+            t1 = (int32_t)(xC3S5 * ip[3]);
+            t2 = (int32_t)(xC5S3 * ip[5]);
+            t1 >>= 16;
+            t2 >>= 16;
+            C_ = t1 + t2;
+
+            t1 = (int32_t)(xC3S5 * ip[5]);
+            t2 = (int32_t)(xC5S3 * ip[3]);
+            t1 >>= 16;
+            t2 >>= 16;
+            D_ = t1 - t2;
+
+
+            t1 = (int32_t)(xC4S4 * (A_ - C_));
+            t1 >>= 16;
+            _Ad = t1;
+
+            t1 = (int32_t)(xC4S4 * (B_ - D_));
+            t1 >>= 16;
+            _Bd = t1;
+
+
+            _Cd = A_ + C_;
+            _Dd = B_ + D_;
+
+            t1 = (int32_t)(xC4S4 * (ip[0] + ip[4]));
+            t1 >>= 16;
+            E_ = t1;
+
+            t1 = (int32_t)(xC4S4 * (ip[0] - ip[4]));
+            t1 >>= 16;
+            F_ = t1;
+
+            t1 = (int32_t)(xC2S6 * ip[2]);
+            t2 = (int32_t)(xC6S2 * ip[6]);
+            t1 >>= 16;
+            t2 >>= 16;
+            G_ = t1 + t2;
+
+            t1 = (int32_t)(xC6S2 * ip[2]);
+            t2 = (int32_t)(xC2S6 * ip[6]);
+            t1 >>= 16;
+            t2 >>= 16;
+            H_ = t1 - t2;
+
+
+            _Ed = E_ - G_;
+            _Gd = E_ + G_;
+
+            _Add = F_ + _Ad;
+            _Bdd = _Bd - H_;
+
+            _Fd = F_ - _Ad;
+            _Hd = _Bd + H_;
+
+            /* Row results overwrite the row's inputs; each is narrowed to int16_t first */
+            ip[0] = (int16_t)((_Gd + _Cd )   >> 0);
+            ip[7] = (int16_t)((_Gd - _Cd )   >> 0);
+
+            ip[1] = (int16_t)((_Add + _Hd )  >> 0);
+            ip[2] = (int16_t)((_Add - _Hd )  >> 0);
+
+            ip[3] = (int16_t)((_Ed + _Dd )   >> 0);
+            ip[4] = (int16_t)((_Ed - _Dd )   >> 0);
+
+            ip[5] = (int16_t)((_Fd + _Bdd )  >> 0);
+            ip[6] = (int16_t)((_Fd - _Bdd )  >> 0);
+
+        }
+
+        ip += 8;            /* next row */
+    }
+
+    ip = intermediate_data; /* Pass 2: same butterfly down each column, written to output_data */
+
+    for ( i = 0; i < 8; i++) {
+        /* Check the column for non-zero values (bitwise or faster than ||) */
+        if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
+             ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
+
+            t1 = (int32_t)(xC1S7 * ip[1*8]);
+            t2 = (int32_t)(xC7S1 * ip[7*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            A_ = t1 + t2;
+
+            t1 = (int32_t)(xC7S1 * ip[1*8]);
+            t2 = (int32_t)(xC1S7 * ip[7*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            B_ = t1 - t2;
+
+            t1 = (int32_t)(xC3S5 * ip[3*8]);
+            t2 = (int32_t)(xC5S3 * ip[5*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            C_ = t1 + t2;
+
+            t1 = (int32_t)(xC3S5 * ip[5*8]);
+            t2 = (int32_t)(xC5S3 * ip[3*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            D_ = t1 - t2;
+
+
+            t1 = (int32_t)(xC4S4 * (A_ - C_));
+            t1 >>= 16;
+            _Ad = t1;
+
+            t1 = (int32_t)(xC4S4 * (B_ - D_));
+            t1 >>= 16;
+            _Bd = t1;
+
+
+            _Cd = A_ + C_;
+            _Dd = B_ + D_;
+
+            t1 = (int32_t)(xC4S4 * (ip[0*8] + ip[4*8]));
+            t1 >>= 16;
+            E_ = t1;
+
+            t1 = (int32_t)(xC4S4 * (ip[0*8] - ip[4*8]));
+            t1 >>= 16;
+            F_ = t1;
+
+            t1 = (int32_t)(xC2S6 * ip[2*8]);
+            t2 = (int32_t)(xC6S2 * ip[6*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            G_ = t1 + t2;
+
+            t1 = (int32_t)(xC6S2 * ip[2*8]);
+            t2 = (int32_t)(xC2S6 * ip[6*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            H_ = t1 - t2;
+
+
+            _Ed = E_ - G_;
+            _Gd = E_ + G_;
+
+            _Add = F_ + _Ad;
+            _Bdd = _Bd - H_;
+
+            _Fd = F_ - _Ad;
+            _Hd = _Bd + H_;
+
+            _Gd += IdctAdjustBeforeShift; /* bias added to every term that feeds the >> 4 below */
+            _Add += IdctAdjustBeforeShift;
+            _Ed += IdctAdjustBeforeShift;
+            _Fd += IdctAdjustBeforeShift;
+
+            /* Column results go to the output block, scaled down by >> 4 */
+            op[0*8] = (int16_t)((_Gd + _Cd )   >> 4);
+            op[7*8] = (int16_t)((_Gd - _Cd )   >> 4);
+
+            op[1*8] = (int16_t)((_Add + _Hd )  >> 4);
+            op[2*8] = (int16_t)((_Add - _Hd )  >> 4);
+
+            op[3*8] = (int16_t)((_Ed + _Dd )   >> 4);
+            op[4*8] = (int16_t)((_Ed - _Dd )   >> 4);
+
+            op[5*8] = (int16_t)((_Fd + _Bdd )  >> 4);
+            op[6*8] = (int16_t)((_Fd - _Bdd )  >> 4);
+
+        } else { /* all-zero column: write zeros to the output column */
+
+            op[0*8] = 0;
+            op[7*8] = 0;
+            op[1*8] = 0;
+            op[2*8] = 0;
+            op[3*8] = 0;
+            op[4*8] = 0;
+            op[5*8] = 0;
+            op[6*8] = 0;
+        }
+
+        ip++;            /* next column */
+        op++;
+    }
+}
+
+/* Run vp3_idct_c() on one block and store the result into dest: each
+ * sample is biased by +128 and saturated to the 0..255 range. */
+void vp3_idct_put(int16_t *input_data, int16_t *dequant_matrix,
+    uint8_t *dest, int stride)
+{
+    int16_t block[64];
+    int row, col;
+
+    vp3_idct_c(input_data, dequant_matrix, block);
+
+    /* consecutive output rows are stride bytes apart */
+    for (row = 0; row < 8; row++) {
+        const int16_t *src = &block[row * 8];
+        uint8_t *out = dest + row * stride;
+
+        for (col = 0; col < 8; col++) {
+            int value = src[col];
+
+            /* clamp to -128..127 first so the biased value fits a byte */
+            if (value < -128) value = -128;
+            if (value > 127) value = 127;
+            out[col] = (uint8_t)(value + 128);
+        }
+    }
+}
+
+/* IDCT one block and add it to the pixels in dest, saturating to 0..255. */
+void vp3_idct_add(int16_t *input_data, int16_t *dequant_matrix,
+    uint8_t *dest, int stride)
+{
+    int16_t transformed_data[64];
+    int16_t *op = transformed_data;
+    int i, j;
+    /* int, not int16_t: pixel (<= 255) plus residual (<= 32767) can exceed
+     * the int16_t range; a narrowed sum could go negative and clamp to 0 */
+    int sample;
+
+    vp3_idct_c(input_data, dequant_matrix, transformed_data);
+
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            sample = *dest + *op++;
+            if (sample < 0)
+                *dest = 0;
+            else if (sample > 255)
+                *dest = 255;
+            else
+                *dest = (uint8_t)sample;
+            dest++;
+        }
+        dest += (stride - 8);   /* 8 pixels consumed; step to the next row */
+    }
+}
+
  /************************************************************************
   * VP3 specific functions
   ************************************************************************/
@@ -817,6 +1128,8 @@ static void init_frame(Vp3DecodeContext *s, GetBitContext *gb)
          memset(s->all_fragments[i].coeffs, 0, 64 * sizeof(DCTELEM));
          s->all_fragments[i].coeff_count = 0;
          s->all_fragments[i].last_coeff = 0;
+s->all_fragments[i].motion_x = 0xbeef;
+s->all_fragments[i].motion_y = 0xbeef;
      }
  }
  
@@ -827,8 +1140,8 @@ static void init_frame(Vp3DecodeContext *s, GetBitContext *gb)
  static void init_dequantizer(Vp3DecodeContext *s)
  {
  
-    int quality_scale = vp31_quality_threshold[s->quality_index];
-    int dc_scale_factor = vp31_dc_scale_factor[s->quality_index];
+    int quality_scale = s->coded_quality_threshold[s->quality_index];
+    int dc_scale_factor = s->coded_dc_scale_factor[s->quality_index];
      int i, j;
  
      debug_vp3("  vp3: initializing dequantization tables\n");
@@ -845,20 +1158,20 @@ static void init_dequantizer(Vp3DecodeContext *s)
       *
       * Then, saturate the result to a lower limit of MIN_DEQUANT_VAL.
       */
-#define SCALER 1
+#define SCALER 4
  
      /* scale DC quantizers */
-    s->intra_y_dequant[0] = vp31_intra_y_dequant[0] * dc_scale_factor / 100;
+    s->intra_y_dequant[0] = s->coded_intra_y_dequant[0] * dc_scale_factor / 100;
      if (s->intra_y_dequant[0] < MIN_DEQUANT_VAL * 2)
          s->intra_y_dequant[0] = MIN_DEQUANT_VAL * 2;
      s->intra_y_dequant[0] *= SCALER;
  
-    s->intra_c_dequant[0] = vp31_intra_c_dequant[0] * dc_scale_factor / 100;
+    s->intra_c_dequant[0] = s->coded_intra_c_dequant[0] * dc_scale_factor / 100;
      if (s->intra_c_dequant[0] < MIN_DEQUANT_VAL * 2)
          s->intra_c_dequant[0] = MIN_DEQUANT_VAL * 2;
      s->intra_c_dequant[0] *= SCALER;
  
-    s->inter_dequant[0] = vp31_inter_dequant[0] * dc_scale_factor / 100;
+    s->inter_dequant[0] = s->coded_inter_dequant[0] * dc_scale_factor / 100;
      if (s->inter_dequant[0] < MIN_DEQUANT_VAL * 4)
          s->inter_dequant[0] = MIN_DEQUANT_VAL * 4;
      s->inter_dequant[0] *= SCALER;
@@ -869,21 +1182,23 @@ static void init_dequantizer(Vp3DecodeContext *s)
  
          j = zigzag_index[i];
  
-        s->intra_y_dequant[j] = vp31_intra_y_dequant[i] * quality_scale / 100;
+        s->intra_y_dequant[j] = s->coded_intra_y_dequant[i] * quality_scale / 100;
          if (s->intra_y_dequant[j] < MIN_DEQUANT_VAL)
              s->intra_y_dequant[j] = MIN_DEQUANT_VAL;
          s->intra_y_dequant[j] *= SCALER;
  
-        s->intra_c_dequant[j] = vp31_intra_c_dequant[i] * quality_scale / 100;
+        s->intra_c_dequant[j] = s->coded_intra_c_dequant[i] * quality_scale / 100;
          if (s->intra_c_dequant[j] < MIN_DEQUANT_VAL)
              s->intra_c_dequant[j] = MIN_DEQUANT_VAL;
          s->intra_c_dequant[j] *= SCALER;
  
-        s->inter_dequant[j] = vp31_inter_dequant[i] * quality_scale / 100;
+        s->inter_dequant[j] = s->coded_inter_dequant[i] * quality_scale / 100;
          if (s->inter_dequant[j] < MIN_DEQUANT_VAL * 2)
              s->inter_dequant[j] = MIN_DEQUANT_VAL * 2;
          s->inter_dequant[j] *= SCALER;
      }
+    
+    memset(s->qscale_table, (FFMAX(s->intra_y_dequant[1], s->intra_c_dequant[1])+8)/16, 512); //FIXME finetune
  
      /* print debug information as requested */
      debug_dequantizers("intra Y dequantizers:\n");
@@ -1132,6 +1447,7 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
      int current_run = 0;
      int decode_fully_flags = 0;
      int decode_partial_blocks = 0;
+    int first_c_fragment_seen;
  
      int i, j;
      int current_fragment;
@@ -1161,14 +1477,14 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
  
                  /* if any of the superblocks are not partially coded, flag
                   * a boolean to decode the list of fully-coded superblocks */
-                if (bit == 0)
+                if (bit == 0) {
                      decode_fully_flags = 1;
-            } else {
-
-                /* make a note of the fact that there are partially coded
-                 * superblocks */
-                decode_partial_blocks = 1;
+                } else {
  
+                    /* make a note of the fact that there are partially coded
+                     * superblocks */
+                    decode_partial_blocks = 1;
+                }
              }
              s->superblock_coding[current_superblock++] = 
                  (bit) ? SB_PARTIALLY_CODED : SB_NOT_CODED;
@@ -1223,6 +1539,7 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
      s->coded_fragment_list_index = 0;
      s->first_coded_y_fragment = s->first_coded_c_fragment = 0;
      s->last_coded_y_fragment = s->last_coded_c_fragment = -1;
+    first_c_fragment_seen = 0;
      memset(s->macroblock_coding, MODE_COPY, s->macroblock_count);
      for (i = 0; i < s->superblock_count; i++) {
  
@@ -1253,15 +1570,18 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
                      }
  
                      if (bit) {
-                        /* mode will be decoded in the next phase */
+                        /* default mode; actual mode will be decoded in 
+                         * the next phase */
                          s->all_fragments[current_fragment].coding_method = 
                              MODE_INTER_NO_MV;
                          s->coded_fragment_list[s->coded_fragment_list_index] = 
                              current_fragment;
                          if ((current_fragment >= s->u_fragment_start) &&
-                            (s->last_coded_y_fragment == -1)) {
+                            (s->last_coded_y_fragment == -1) &&
+                            (!first_c_fragment_seen)) {
                              s->first_coded_c_fragment = s->coded_fragment_list_index;
                              s->last_coded_y_fragment = s->first_coded_c_fragment - 1;
+                            first_c_fragment_seen = 1;
                          }
                          s->coded_fragment_list_index++;
                          s->macroblock_coding[s->all_fragments[current_fragment].macroblock] = MODE_INTER_NO_MV;
@@ -1286,9 +1606,11 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
                      s->coded_fragment_list[s->coded_fragment_list_index] = 
                          current_fragment;
                      if ((current_fragment >= s->u_fragment_start) &&
-                        (s->last_coded_y_fragment == -1)) {
+                        (s->last_coded_y_fragment == -1) &&
+                        (!first_c_fragment_seen)) {
                          s->first_coded_c_fragment = s->coded_fragment_list_index;
                          s->last_coded_y_fragment = s->first_coded_c_fragment - 1;
+                        first_c_fragment_seen = 1;
                      }
                      s->coded_fragment_list_index++;
                      s->macroblock_coding[s->all_fragments[current_fragment].macroblock] = MODE_INTER_NO_MV;
@@ -1299,11 +1621,13 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
          }
      }
  
-    if (s->first_coded_c_fragment == 0)
-        /* no C fragments coded */
+    if (!first_c_fragment_seen)
+        /* only Y fragments coded in this frame */
          s->last_coded_y_fragment = s->coded_fragment_list_index - 1;
-    else
+    else 
+        /* end the list of coded C fragments */
          s->last_coded_c_fragment = s->coded_fragment_list_index - 1;
+
      debug_block_coding("    %d total coded fragments, y: %d -> %d, c: %d -> %d\n",
          s->coded_fragment_list_index,
          s->first_coded_y_fragment,
@@ -1398,55 +1722,6 @@ static int unpack_modes(Vp3DecodeContext *s, GetBitContext *gb)
      return 0;
  }
  
-/*
- * This function adjusts the components of a motion vector for the halfpel
- * motion grid. c_plane indicates whether the vector applies to the U or V
- * plane. The function returns the halfpel function index to be used in
- * ffmpeg's put_pixels[]() array of functions.
- */
-static inline int adjust_vector(int *x, int *y, int c_plane)
-{
-    int motion_halfpel_index = 0;
-    int x_halfpel;
-    int y_halfpel;
-
-    if (!c_plane) {
-
-        x_halfpel = *x & 1;
-        motion_halfpel_index |= x_halfpel;
-        if (*x >= 0)
-            *x >>= 1;
-        else
-            *x = -( (-(*x) >> 1) + x_halfpel);
-
-        y_halfpel = *y & 1;
-        motion_halfpel_index |= (y_halfpel << 1);
-        if (*y >= 0)
-            *y >>= 1;
-        else
-            *y = -( (-(*y) >> 1) + y_halfpel);
-
-    } else {
-
-        x_halfpel = ((*x & 0x03) != 0);
-        motion_halfpel_index |= x_halfpel;
-        if (*x >= 0)
-            *x >>= 2;
-        else
-            *x = -( (-(*x) >> 2) + x_halfpel);
-
-        y_halfpel = ((*y & 0x03) != 0);
-        motion_halfpel_index |= (y_halfpel << 1);
-        if (*y >= 0)
-            *y >>= 2;
-        else
-            *y = -( (-(*y) >> 2) + y_halfpel);
-
-    }
-
-    return motion_halfpel_index;
-}
-
  /*
   * This function unpacks all the motion vectors for the individual
   * macroblocks from the bitstream.
@@ -1465,7 +1740,6 @@ static int unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
      int current_fragment;
  
      debug_vp3("  vp3: unpacking motion vectors\n");
-
      if (s->keyframe) {
  
          debug_vp3("    keyframe-- there are no motion vectors\n");
@@ -1519,7 +1793,7 @@ static int unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
                      }
  
                      /* vector maintenance, only on MODE_INTER_PLUS_MV */
-                    if (s->all_fragments[current_fragment].coding_method ==
+                    if (s->macroblock_coding[current_macroblock] ==
                          MODE_INTER_PLUS_MV) {
                          prior_last_motion_x = last_motion_x;
                          prior_last_motion_y = last_motion_y;
@@ -1606,7 +1880,7 @@ static int unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
                  /* assign the motion vectors to the correct fragments */
                  debug_vectors("    vectors for macroblock starting @ fragment %d (coding method %d):\n",
                      current_fragment,
-                    s->all_fragments[current_fragment].coding_method);
+                    s->macroblock_coding[current_macroblock]);
                  for (k = 0; k < 6; k++) {
                      current_fragment = 
                          s->macroblock_fragments[current_macroblock * 6 + k];
@@ -1617,14 +1891,10 @@ static int unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
                              current_fragment, s->fragment_count);
                          return 1;
                      }
-                    s->all_fragments[current_fragment].motion_halfpel_index =
-                        adjust_vector(&motion_x[k], &motion_y[k],
-                        ((k == 4) || (k == 5)));
                      s->all_fragments[current_fragment].motion_x = motion_x[k];
                      s->all_fragments[current_fragment].motion_y = motion_y[k];
-                    debug_vectors("    vector %d: fragment %d = (%d, %d), index %d\n",
-                        k, current_fragment, motion_x[k], motion_y[k],
-                        s->all_fragments[current_fragment].motion_halfpel_index);
+                    debug_vectors("    vector %d: fragment %d = (%d, %d)\n",
+                        k, current_fragment, motion_x[k], motion_y[k]);
                  }
              }
          }
@@ -1656,14 +1926,12 @@ static int unpack_vlcs(Vp3DecodeContext *s, GetBitContext *gb,
      DCTELEM coeff;
      Vp3Fragment *fragment;
  
-    if ((first_fragment < 0) ||
-        (first_fragment >= s->fragment_count) ||
-        (last_fragment < 0) ||
+    if ((first_fragment >= s->fragment_count) ||
          (last_fragment >= s->fragment_count)) {
  
          printf ("  vp3:unpack_vlcs(): bad fragment number (%d -> %d ?)\n",
              first_fragment, last_fragment);
-        return 1;
+        return 0;
      }
  
      for (i = first_fragment; i <= last_fragment; i++) {
@@ -2078,10 +2346,7 @@ static void render_fragments(Vp3DecodeContext *s,
      int x, y;
      int m, n;
      int i = first_fragment;
-    int j;
      int16_t *dequantizer;
-    DCTELEM dequant_block[64];
-    DCTELEM dequant_block_permuted[64];
      unsigned char *output_plane;
      unsigned char *last_plane;
      unsigned char *golden_plane;
@@ -2089,7 +2354,7 @@ static void render_fragments(Vp3DecodeContext *s,
      int motion_x, motion_y;
      int upper_motion_limit, lower_motion_limit;
      int motion_halfpel_index;
-    unsigned int motion_source;
+    uint8_t *motion_source;
  
      debug_vp3("  vp3: rendering final fragments for %s\n",
          (plane == 0) ? "Y plane" : (plane == 1) ? "U plane" : "V plane");
@@ -2135,40 +2400,55 @@ static void render_fragments(Vp3DecodeContext *s,
              /* transform if this block was coded */
              if (s->all_fragments[i].coding_method != MODE_COPY) {
  
-                /* sort out the motion vector */
-                motion_x = s->all_fragments[i].motion_x;
-                motion_y = s->all_fragments[i].motion_y;
-                motion_halfpel_index = s->all_fragments[i].motion_halfpel_index;
-
-                motion_source = s->all_fragments[i].first_pixel;
-                motion_source += motion_x;
-                motion_source += (motion_y * stride);
-
-                /* if the are any problems with a motion vector, refuse
-                 * to render the block */
-                if ((motion_source < upper_motion_limit) ||
-                    (motion_source > lower_motion_limit)) {
-//                    printf ("  vp3: help! motion source (%d) out of range (%d..%d)\n",
-//                        motion_source, upper_motion_limit, lower_motion_limit);
-                    continue;
+                if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) ||
+                    (s->all_fragments[i].coding_method == MODE_GOLDEN_MV))
+                    motion_source= golden_plane;
+                else 
+                    motion_source= last_plane;
+
+                motion_source += s->all_fragments[i].first_pixel;
+                motion_halfpel_index = 0;
+
+                /* sort out the motion vector if this fragment is coded
+                 * using a motion vector method */
+                if ((s->all_fragments[i].coding_method > MODE_INTRA) &&
+                    (s->all_fragments[i].coding_method != MODE_USING_GOLDEN)) {
+                    int src_x, src_y;
+                    motion_x = s->all_fragments[i].motion_x;
+                    motion_y = s->all_fragments[i].motion_y;
+                    if(plane){
+                        motion_x= (motion_x>>1) | (motion_x&1);
+                        motion_y= (motion_y>>1) | (motion_y&1);
+                    }
+
+                    src_x= (motion_x>>1) + x;
+                    src_y= (motion_y>>1) + y;
+if ((motion_x == 0xbeef) || (motion_y == 0xbeef))
+printf (" help! got beefy vector! (%X, %X)\n", motion_x, motion_y);
+
+                    motion_halfpel_index = motion_x & 0x01;
+                    motion_source += (motion_x >> 1);
+
+//                    motion_y = -motion_y;
+                    motion_halfpel_index |= (motion_y & 0x01) << 1;
+                    motion_source += ((motion_y >> 1) * stride);
+
+                    if(src_x<0 || src_y<0 || src_x + 9 >= width || src_y + 9 >= height){
+                        uint8_t *temp= s->edge_emu_buffer;
+                        if(stride<0) temp -= 9*stride;
+
+                        ff_emulated_edge_mc(temp, motion_source, stride, 9, 9, src_x, src_y, width, height);
+                        motion_source= temp;
+                    }
                  }
  
                  /* first, take care of copying a block from either the
                   * previous or the golden frame */
-                if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) ||
-                    (s->all_fragments[i].coding_method == MODE_GOLDEN_MV)) {
-
-                    s->dsp.put_pixels_tab[1][motion_halfpel_index](
-                        output_plane + s->all_fragments[i].first_pixel,
-                        golden_plane + motion_source,
-                        stride, 8);
-
-                } else 
                  if (s->all_fragments[i].coding_method != MODE_INTRA) {
  
-                    s->dsp.put_pixels_tab[1][motion_halfpel_index](
+                    s->dsp.put_no_rnd_pixels_tab[1][motion_halfpel_index](
                          output_plane + s->all_fragments[i].first_pixel,
-                        last_plane + motion_source,
+                        motion_source,
                          stride, 8);
                  }
  
@@ -2176,34 +2456,16 @@ static void render_fragments(Vp3DecodeContext *s,
                  debug_idct("fragment %d, coding mode %d, DC = %d, dequant = %d:\n", 
                      i, s->all_fragments[i].coding_method, 
                      s->all_fragments[i].coeffs[0], dequantizer[0]);
-                for (j = 0; j < 64; j++)
-                    dequant_block[dezigzag_index[j]] =
-                        s->all_fragments[i].coeffs[j] *
-                        dequantizer[j];
-                for (j = 0; j < 64; j++)
-                    dequant_block_permuted[s->dsp.idct_permutation[j]] =
-                        dequant_block[j];
-
-                debug_idct("dequantized block:\n");
-                for (m = 0; m < 8; m++) {
-                    for (n = 0; n < 8; n++) {
-                        debug_idct(" %5d", dequant_block[m * 8 + n]);
-                    }
-                    debug_idct("\n");
-                }
-                debug_idct("\n");
  
                  /* invert DCT and place (or add) in final output */
-
                  if (s->all_fragments[i].coding_method == MODE_INTRA) {
-                    dequant_block_permuted[0] += 1024;
-                    s->dsp.idct_put(
+                    vp3_idct_put(s->all_fragments[i].coeffs, dequantizer,
                          output_plane + s->all_fragments[i].first_pixel,
-                        stride, dequant_block_permuted);
+                        stride);
                  } else {
-                    s->dsp.idct_add(
+                    vp3_idct_add(s->all_fragments[i].coeffs, dequantizer,
                          output_plane + s->all_fragments[i].first_pixel,
-                        stride, dequant_block_permuted);
+                        stride);
                  }
  
                  debug_idct("block after idct_%s():\n",
@@ -2298,8 +2560,13 @@ static int vp3_decode_init(AVCodecContext *avctx)
      int c_superblock_count;
  
      s->avctx = avctx;
+#if 0
      s->width = avctx->width;
      s->height = avctx->height;
+#else
+    s->width = (avctx->width + 15) & 0xFFFFFFF0;
+    s->height = (avctx->height + 15) & 0xFFFFFFF0;
+#endif
      avctx->pix_fmt = PIX_FMT_YUV420P;
      avctx->has_b_frames = 0;
      dsputil_init(&s->dsp, avctx);
@@ -2357,6 +2624,20 @@ static int vp3_decode_init(AVCodecContext *avctx)
      s->coded_fragment_list = av_malloc(s->fragment_count * sizeof(int));
      s->pixel_addresses_inited = 0;
  
+    if (!s->theora_tables)
+    {
+       for (i = 0; i < 64; i++)
+           s->coded_dc_scale_factor[i] = vp31_dc_scale_factor[i];
+       for (i = 0; i < 64; i++)
+           s->coded_quality_threshold[i] = vp31_quality_threshold[i];
+       for (i = 0; i < 64; i++)
+           s->coded_intra_y_dequant[i] = vp31_intra_y_dequant[i];
+       for (i = 0; i < 64; i++)
+           s->coded_intra_c_dequant[i] = vp31_intra_c_dequant[i];
+       for (i = 0; i < 64; i++)
+           s->coded_inter_dequant[i] = vp31_inter_dequant[i];
+    }
+
      /* init VLC tables */
      for (i = 0; i < 16; i++) {
  
@@ -2420,22 +2701,39 @@ static int vp3_decode_frame(AVCodecContext *avctx,
      *data_size = 0;
  
      init_get_bits(&gb, buf, buf_size * 8);
+    
+    if (s->theora && get_bits1(&gb))
+    {
+       printf("Theora: bad frame indicator\n");
+       return -1;
+    }
  
-    s->keyframe = get_bits(&gb, 1);
-    s->keyframe ^= 1;
-    skip_bits(&gb, 1);
+    s->keyframe = !get_bits1(&gb);
+    if (s->theora && s->keyframe)
+    {
+       if (get_bits1(&gb))
+           printf("Theora: warning, unsupported keyframe coding type?!\n");
+       skip_bits(&gb, 2); /* reserved? */
+    }
+    else
+       skip_bits(&gb, 1);
      s->last_quality_index = s->quality_index;
      s->quality_index = get_bits(&gb, 6);
-    if (s->quality_index != s->last_quality_index)
-        init_dequantizer(s);
  
-    debug_vp3(" VP3 frame #%d: Q index = %d", counter, s->quality_index);
+    debug_vp3(" VP3 %sframe #%d: Q index = %d\n",
+       s->keyframe?"key":"", counter, s->quality_index);
      counter++;
  
+    if (s->quality_index != s->last_quality_index)
+        init_dequantizer(s);
+
      if (s->keyframe) {
+        /* skip the other 2 header bytes for now */
+        if (!s->theora) skip_bits(&gb, 16);
          if (s->last_frame.data[0] == s->golden_frame.data[0]) {
              if (s->golden_frame.data[0])
                  avctx->release_buffer(avctx, &s->golden_frame);
+            s->last_frame= s->golden_frame; /* ensure that we catch any access to this released frame */
          } else {
              if (s->golden_frame.data[0])
                  avctx->release_buffer(avctx, &s->golden_frame);
@@ -2443,7 +2741,7 @@ static int vp3_decode_frame(AVCodecContext *avctx,
                  avctx->release_buffer(avctx, &s->last_frame);
          }
  
-        s->golden_frame.reference = 0;
+        s->golden_frame.reference = 3;
          if(avctx->get_buffer(avctx, &s->golden_frame) < 0) {
              printf("vp3: get_buffer() failed\n");
              return -1;
@@ -2457,22 +2755,16 @@ static int vp3_decode_frame(AVCodecContext *avctx,
              vp3_calculate_pixel_addresses(s);
  
      } else {
-
          /* allocate a new current frame */
-        s->current_frame.reference = 0;
+        s->current_frame.reference = 3;
          if(avctx->get_buffer(avctx, &s->current_frame) < 0) {
              printf("vp3: get_buffer() failed\n");
              return -1;
          }
-
      }
  
-    if (s->keyframe) {
-      debug_vp3(", keyframe\n");
-      /* skip the other 2 header bytes for now */
-      skip_bits(&gb, 16);
-    } else
-      debug_vp3("\n");
+    s->current_frame.qscale_table= s->qscale_table; //FIXME allocate individual tables per AVFrame
+    s->current_frame.qstride= 0;
  
      init_frame(s, &gb);
  
@@ -2499,14 +2791,19 @@ if (!s->keyframe) {
      }
  
      reverse_dc_prediction(s, 0, s->fragment_width, s->fragment_height);
-    reverse_dc_prediction(s, s->u_fragment_start,
-        s->fragment_width / 2, s->fragment_height / 2);
-    reverse_dc_prediction(s, s->v_fragment_start,
-        s->fragment_width / 2, s->fragment_height / 2);
-
      render_fragments(s, 0, s->width, s->height, 0);
-    render_fragments(s, s->u_fragment_start, s->width / 2, s->height / 2, 1);
-    render_fragments(s, s->v_fragment_start, s->width / 2, s->height / 2, 2);
+
+    if ((avctx->flags & CODEC_FLAG_GRAY) == 0) {
+        reverse_dc_prediction(s, s->u_fragment_start,
+            s->fragment_width / 2, s->fragment_height / 2);
+        reverse_dc_prediction(s, s->v_fragment_start,
+            s->fragment_width / 2, s->fragment_height / 2);
+        render_fragments(s, s->u_fragment_start, s->width / 2, s->height / 2, 1);
+        render_fragments(s, s->v_fragment_start, s->width / 2, s->height / 2, 2);
+    } else {
+        memset(s->current_frame.data[1], 0x80, s->width * s->height / 4);
+        memset(s->current_frame.data[2], 0x80, s->width * s->height / 4);
+    }
  
  #if KEYFRAMES_ONLY
  }
@@ -2523,6 +2820,7 @@ if (!s->keyframe) {
  
      /* shuffle frames (last = current) */
      memcpy(&s->last_frame, &s->current_frame, sizeof(AVFrame));
+    s->current_frame.data[0]= NULL; /* ensure that we catch any access to this released frame */
  
      return buf_size;
  }
@@ -2540,9 +2838,9 @@ static int vp3_decode_end(AVCodecContext *avctx)
      av_free(s->superblock_macroblocks);
      av_free(s->macroblock_fragments);
      av_free(s->macroblock_coding);
-
+    
      /* release all frames */
-    if (s->golden_frame.data[0])
+    if (s->golden_frame.data[0] && s->golden_frame.data[0] != s->last_frame.data[0])
          avctx->release_buffer(avctx, &s->golden_frame);
      if (s->last_frame.data[0])
          avctx->release_buffer(avctx, &s->last_frame);
@@ -2552,6 +2850,133 @@ static int vp3_decode_end(AVCodecContext *avctx)
      return 0;
  }
  
+/* current version is 3.2.0 */
+/* Theora info header: skips the version and most fields, keeps only the coded size, then runs vp3_decode_init(). gb is taken by value. Always returns 0. */
+static int theora_decode_header(AVCodecContext *avctx, GetBitContext gb)
+{
+    Vp3DecodeContext *s = avctx->priv_data;
+
+    skip_bits(&gb, 8); /* version major */
+    skip_bits(&gb, 8); /* version minor */
+    skip_bits(&gb, 8); /* version micro */
+    /* the two 16-bit size fields are scaled by 16 (<< 4) to get the decoder's width/height */
+    s->width = get_bits(&gb, 16) << 4;
+    s->height = get_bits(&gb, 16) << 4;
+    
+    skip_bits(&gb, 24); /* frame width */
+    skip_bits(&gb, 24); /* frame height */
+
+    skip_bits(&gb, 8); /* offset x */
+    skip_bits(&gb, 8); /* offset y */
+
+    skip_bits(&gb, 32); /* fps numerator */
+    skip_bits(&gb, 32); /* fps denominator */
+    skip_bits(&gb, 24); /* aspect numerator */
+    skip_bits(&gb, 24); /* aspect denominator */
+    
+    skip_bits(&gb, 5); /* keyframe frequency force */
+    skip_bits(&gb, 8); /* colorspace */
+    skip_bits(&gb, 24); /* bitrate */
+
+    skip_bits(&gb, 6); /* last(?) quality index */
+    
+//    align_get_bits(&gb);
+    /* publish the coded size to the codec context before the decoder is initialized */
+    avctx->width = s->width;
+    avctx->height = s->height;
+
+    vp3_decode_init(avctx);
+
+    return 0;
+}
+
+/* Skip the vendor string and every user comment of a Theora comment header;
+ * nothing is stored. gb is a by-value copy, so the caller's reader is untouched. */
+static int theora_decode_comments(AVCodecContext *avctx, GetBitContext gb)
+{
+    int nb_comments, i, tmp;
+
+    /* "tmp > 8" skips exactly what "while(tmp-=8)" did for positive multiples
+     * of 8, but also terminates when the length is 0 or not a multiple of 8 */
+    for (tmp = get_bits(&gb, 32); tmp > 8; tmp -= 8)
+       skip_bits(&gb, 8);
+
+    nb_comments = get_bits(&gb, 32);
+    for (i = 0; i < nb_comments; i++)
+       for (tmp = get_bits(&gb, 32); tmp > 8; tmp -= 8)
+           skip_bits(&gb, 8);
+
+    return 0;
+}
+
+static int theora_decode_tables(AVCodecContext *avctx, GetBitContext gb)
+{
+    Vp3DecodeContext *ctx = avctx->priv_data;
+    int n;
+
+    /* Five 64-entry tables are read one after another from the bitstream, so
+     * each one has to be consumed completely before the next begins. */
+
+    /* quality thresholds: 16 bits per entry */
+    for (n = 0; n != 64; ++n)
+        ctx->coded_quality_threshold[n] = get_bits(&gb, 16);
+    /* DC scale factors: 16 bits per entry */
+    for (n = 0; n != 64; ++n)
+        ctx->coded_dc_scale_factor[n] = get_bits(&gb, 16);
+    /* intra Y dequantizers: 8 bits per entry */
+    for (n = 0; n != 64; ++n)
+        ctx->coded_intra_y_dequant[n] = get_bits(&gb, 8);
+    /* intra chroma (uv) dequantizers: 8 bits per entry */
+    for (n = 0; n != 64; ++n)
+        ctx->coded_intra_c_dequant[n] = get_bits(&gb, 8);
+    /* inter dequantizers: 8 bits per entry */
+    for (n = 0; n != 64; ++n)
+        ctx->coded_inter_dequant[n] = get_bits(&gb, 8);
+
+    /* record that the tables were filled from a Theora tables packet */
+    ctx->theora_tables = 1;
+
+    return 0;
+}
+
+/* Theora entry point: parse the header packet stored in avctx->extradata.
+ * Returns 0 on success, -1 when extradata is empty or not a header packet. */
+static int theora_decode_init(AVCodecContext *avctx)
+{
+    Vp3DecodeContext *s = avctx->priv_data;
+    GetBitContext gb;
+    int ptype;
+
+    s->theora = 1;
+
+    if (!avctx->extradata_size)
+       return -1;
+
+    /* init_get_bits() takes the size in bits (cf. vp3_decode_frame), not bytes */
+    init_get_bits(&gb, avctx->extradata, avctx->extradata_size * 8);
+
+    ptype = get_bits(&gb, 8);
+    debug_vp3("Theora headerpacket type: %x\n", ptype);
+
+    if (!(ptype & 0x80)) /* only packet types with the top bit set are accepted here */
+       return -1;
+    skip_bits(&gb, 6*8); /* "theora" */
+
+    switch(ptype)
+    {
+        case 0x80: /* theora_decode_header() calls vp3_decode_init() itself; do not initialize twice */
+            theora_decode_header(avctx, gb);
+           break;
+       case 0x81:
+           theora_decode_comments(avctx, gb);
+           break;
+       case 0x82:
+           theora_decode_tables(avctx, gb);
+           break;
+    }
+    return 0;
+}
+
  AVCodec vp3_decoder = {
      "vp3",
      CODEC_TYPE_VIDEO,
@@ -2564,3 +2989,16 @@ AVCodec vp3_decoder = {
      0,
      NULL
  };
+
+AVCodec theora_decoder = {
+    "theora",                 /* codec name */
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_THEORA,
+    sizeof(Vp3DecodeContext), /* same private context struct the vp3_* functions use */
+    theora_decode_init,       /* Theora-specific init: parses the extradata header packet */
+    NULL,                     /* NOTE(review): presumably the encode callback slot -- confirm against AVCodec */
+    vp3_decode_end,           /* teardown is reused from the VP3 decoder */
+    vp3_decode_frame,         /* frame decoding is reused; it branches on s->theora */
+    0,
+    NULL
+};