* patch for compile problem by "Steven M. Schultz" sms at 2BSD dot COM

[ffmpeg] / libavcodec / vp3.c
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c

index a1ada883ca4afb5dff71f7e9f7efe48e5ae59900..00007788b29405cc0e449e0295754637e9a4a936 100644 (file)
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -17,6 +17,8 @@
   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   *
   * VP3 Video Decoder by Mike Melanson (melanson@pcisys.net)
+ * For more information about the VP3 coding process, visit:
+ *   http://www.pcisys.net/~melanson/codecs/
   *
   */
  
@@ -35,7 +37,6 @@
  #include "dsputil.h"
  #include "mpegvideo.h"
  #include "dsputil.h"
-#include "bswap.h"
  
  #include "vp3data.h"
  
@@ -47,6 +48,7 @@
   * Define one or more of the following compile-time variables to 1 to obtain
   * elaborate information about certain aspects of the decoding process.
   *
+ * KEYFRAMES_ONLY: set this to 1 to only see keyframes (VP3 slideshow mode)
   * DEBUG_VP3: high-level decoding flow
   * DEBUG_INIT: initialization parameters
   * DEBUG_DEQUANTIZERS: display how the dequanization tables are built
@@ -59,6 +61,8 @@
   * DEBUG_IDCT: show every detail of the IDCT process
   */
  
+#define KEYFRAMES_ONLY 0
+
  #define DEBUG_VP3 0
  #define DEBUG_INIT 0
  #define DEBUG_DEQUANTIZERS 0
@@ -210,8 +214,6 @@ static int ModeAlphabet[7][CODING_MODE_COUNT] =
  typedef struct Vp3DecodeContext {
      AVCodecContext *avctx;
      int width, height;
-    unsigned char *current_picture[3]; /* picture structure */
-    int linesize[3];
      AVFrame golden_frame;
      AVFrame last_frame;
      AVFrame current_frame;
@@ -224,6 +226,10 @@ typedef struct Vp3DecodeContext {
      int superblock_count;
      int superblock_width;
      int superblock_height;
+    int y_superblock_width;
+    int y_superblock_height;
+    int c_superblock_width;
+    int c_superblock_height;
      int u_superblock_start;
      int v_superblock_start;
      unsigned char *superblock_coding;
@@ -272,12 +278,320 @@ typedef struct Vp3DecodeContext {
       * numbers corresponds to the fragment indices 0..5 which comprise
       * the macroblock (4 Y fragments and 2 C fragments). */
      int *macroblock_fragments;
-    /* This is an array of flags indicating whether a particular 
-     * macroblock is coded. */
-    unsigned char *macroblock_coded;
+    /* This is an array that indicates how a particular macroblock 
+     * is coded. */
+    unsigned char *macroblock_coding;
+
+    int first_coded_y_fragment;
+    int first_coded_c_fragment;
+    int last_coded_y_fragment;
+    int last_coded_c_fragment;
  
+    uint8_t edge_emu_buffer[9*2048]; //FIXME dynamic alloc
+    uint8_t qscale_table[2048]; //FIXME dynamic alloc (width+15)/16
  } Vp3DecodeContext;
  
+/************************************************************************
+ * VP3 I/DCT
+ ************************************************************************/
+
+#define IdctAdjustBeforeShift 8
+#define xC1S7 64277
+#define xC2S6 60547
+#define xC3S5 54491
+#define xC4S4 46341
+#define xC5S3 36410
+#define xC6S2 25080
+#define xC7S1 12785
+/* Dequantizes 64 coefficients, reorders them via dezigzag_index, then runs a row pass and a column pass of the integer IDCT into output_data[64]. */
+void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix, 
+    int16_t *output_data)
+{
+    int32_t intermediate_data[64];
+    int32_t *ip = intermediate_data;
+    int16_t *op = output_data;
+
+    int32_t A_, B_, C_, D_, _Ad, _Bd, _Cd, _Dd, E_, F_, G_, H_;
+    int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
+    int32_t t1, t2;
+
+    int i, j;
+
+    debug_idct("raw coefficient block:\n");
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            debug_idct(" %5d", input_data[i * 8 + j]);
+        }
+        debug_idct("\n");
+    }
+    debug_idct("\n");
+    /* dequantize: multiply coefficient i by dequant_matrix[i] and store it at position dezigzag_index[i] */
+    for (i = 0; i < 64; i++) {
+        j = dezigzag_index[i];
+        intermediate_data[j] = dequant_matrix[i] * input_data[i];
+    }
+
+    debug_idct("dequantized block:\n");
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            debug_idct(" %5d", intermediate_data[i * 8 + j]);
+        }
+        debug_idct("\n");
+    }
+    debug_idct("\n");
+
+    /* Pass 1: 1-D inverse DCT on each row of intermediate_data, in place */
+    for (i = 0; i < 8; i++) {
+        /* Skip all-zero rows: they are left untouched (still zero) */
+        if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] ) {
+            t1 = (int32_t)(xC1S7 * ip[1]);
+            t2 = (int32_t)(xC7S1 * ip[7]);
+            t1 >>= 16;
+            t2 >>= 16;
+            A_ = t1 + t2;
+
+            t1 = (int32_t)(xC7S1 * ip[1]);
+            t2 = (int32_t)(xC1S7 * ip[7]);
+            t1 >>= 16;
+            t2 >>= 16;
+            B_ = t1 - t2;
+
+            t1 = (int32_t)(xC3S5 * ip[3]);
+            t2 = (int32_t)(xC5S3 * ip[5]);
+            t1 >>= 16;
+            t2 >>= 16;
+            C_ = t1 + t2;
+
+            t1 = (int32_t)(xC3S5 * ip[5]);
+            t2 = (int32_t)(xC5S3 * ip[3]);
+            t1 >>= 16;
+            t2 >>= 16;
+            D_ = t1 - t2;
+
+
+            t1 = (int32_t)(xC4S4 * (A_ - C_));
+            t1 >>= 16;
+            _Ad = t1;
+
+            t1 = (int32_t)(xC4S4 * (B_ - D_));
+            t1 >>= 16;
+            _Bd = t1;
+
+
+            _Cd = A_ + C_;
+            _Dd = B_ + D_;
+
+            t1 = (int32_t)(xC4S4 * (ip[0] + ip[4]));
+            t1 >>= 16;
+            E_ = t1;
+
+            t1 = (int32_t)(xC4S4 * (ip[0] - ip[4]));
+            t1 >>= 16;
+            F_ = t1;
+
+            t1 = (int32_t)(xC2S6 * ip[2]);
+            t2 = (int32_t)(xC6S2 * ip[6]);
+            t1 >>= 16;
+            t2 >>= 16;
+            G_ = t1 + t2;
+
+            t1 = (int32_t)(xC6S2 * ip[2]);
+            t2 = (int32_t)(xC2S6 * ip[6]);
+            t1 >>= 16;
+            t2 >>= 16;
+            H_ = t1 - t2;
+
+
+            _Ed = E_ - G_;
+            _Gd = E_ + G_;
+
+            _Add = F_ + _Ad;
+            _Bdd = _Bd - H_;
+
+            _Fd = F_ - _Ad;
+            _Hd = _Bd + H_;
+
+            /* Write the row result back over the inputs, truncated to 16 bits (>> 0: no scaling in this pass) */
+            ip[0] = (int16_t)((_Gd + _Cd )   >> 0);
+            ip[7] = (int16_t)((_Gd - _Cd )   >> 0);
+
+            ip[1] = (int16_t)((_Add + _Hd )  >> 0);
+            ip[2] = (int16_t)((_Add - _Hd )  >> 0);
+
+            ip[3] = (int16_t)((_Ed + _Dd )   >> 0);
+            ip[4] = (int16_t)((_Ed - _Dd )   >> 0);
+
+            ip[5] = (int16_t)((_Fd + _Bdd )  >> 0);
+            ip[6] = (int16_t)((_Fd - _Bdd )  >> 0);
+
+        }
+
+        ip += 8;            /* next row */
+    }
+    /* Pass 2: 1-D inverse DCT on each column, reading intermediate_data and writing output_data */
+    ip = intermediate_data;
+
+    for ( i = 0; i < 8; i++) {
+        /* Check for non-zero values (bitwise or faster than ||) */
+        if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
+             ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
+
+            t1 = (int32_t)(xC1S7 * ip[1*8]);
+            t2 = (int32_t)(xC7S1 * ip[7*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            A_ = t1 + t2;
+
+            t1 = (int32_t)(xC7S1 * ip[1*8]);
+            t2 = (int32_t)(xC1S7 * ip[7*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            B_ = t1 - t2;
+
+            t1 = (int32_t)(xC3S5 * ip[3*8]);
+            t2 = (int32_t)(xC5S3 * ip[5*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            C_ = t1 + t2;
+
+            t1 = (int32_t)(xC3S5 * ip[5*8]);
+            t2 = (int32_t)(xC5S3 * ip[3*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            D_ = t1 - t2;
+
+
+            t1 = (int32_t)(xC4S4 * (A_ - C_));
+            t1 >>= 16;
+            _Ad = t1;
+
+            t1 = (int32_t)(xC4S4 * (B_ - D_));
+            t1 >>= 16;
+            _Bd = t1;
+
+
+            _Cd = A_ + C_;
+            _Dd = B_ + D_;
+
+            t1 = (int32_t)(xC4S4 * (ip[0*8] + ip[4*8]));
+            t1 >>= 16;
+            E_ = t1;
+
+            t1 = (int32_t)(xC4S4 * (ip[0*8] - ip[4*8]));
+            t1 >>= 16;
+            F_ = t1;
+
+            t1 = (int32_t)(xC2S6 * ip[2*8]);
+            t2 = (int32_t)(xC6S2 * ip[6*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            G_ = t1 + t2;
+
+            t1 = (int32_t)(xC6S2 * ip[2*8]);
+            t2 = (int32_t)(xC2S6 * ip[6*8]);
+            t1 >>= 16;
+            t2 >>= 16;
+            H_ = t1 - t2;
+
+
+            _Ed = E_ - G_;
+            _Gd = E_ + G_;
+
+            _Add = F_ + _Ad;
+            _Bdd = _Bd - H_;
+
+            _Fd = F_ - _Ad;
+            _Hd = _Bd + H_;
+            /* add IdctAdjustBeforeShift (8 = half of 1 << 4) so the >> 4 below rounds to nearest */
+            _Gd += IdctAdjustBeforeShift;
+            _Add += IdctAdjustBeforeShift;
+            _Ed += IdctAdjustBeforeShift;
+            _Fd += IdctAdjustBeforeShift;
+
+            /* Shift right by 4 and store the column to the output block (the inputs are not overwritten here) */
+            op[0*8] = (int16_t)((_Gd + _Cd )   >> 4);
+            op[7*8] = (int16_t)((_Gd - _Cd )   >> 4);
+
+            op[1*8] = (int16_t)((_Add + _Hd )  >> 4);
+            op[2*8] = (int16_t)((_Add - _Hd )  >> 4);
+
+            op[3*8] = (int16_t)((_Ed + _Dd )   >> 4);
+            op[4*8] = (int16_t)((_Ed - _Dd )   >> 4);
+
+            op[5*8] = (int16_t)((_Fd + _Bdd )  >> 4);
+            op[6*8] = (int16_t)((_Fd - _Bdd )  >> 4);
+
+        } else {
+            /* all-zero input column: the output column is written as zeros */
+            op[0*8] = 0;
+            op[7*8] = 0;
+            op[1*8] = 0;
+            op[2*8] = 0;
+            op[3*8] = 0;
+            op[4*8] = 0;
+            op[5*8] = 0;
+            op[6*8] = 0;
+        }
+
+        ip++;            /* next column */
+        op++;
+    }
+}
+/* Runs vp3_idct_c() on one block and stores the 64 results to dest as bytes, 8 per row, rows stride bytes apart. */
+void vp3_idct_put(int16_t *input_data, int16_t *dequant_matrix, 
+    uint8_t *dest, int stride)
+{
+    int16_t transformed_data[64];
+    int16_t *op;
+    int i, j;
+
+    vp3_idct_c(input_data, dequant_matrix, transformed_data);
+
+    /* samples in [-128, 127] are biased by +128 into [0, 255]; anything outside saturates to 0 or 255 */
+    op = transformed_data;
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            if (*op < -128)
+                *dest = 0;
+            else if (*op > 127)
+                *dest = 255;
+            else
+                *dest = (uint8_t)(*op + 128);
+            op++;
+            dest++;
+        }
+        dest += (stride - 8);  /* 8 bytes were written; skip to the start of the next row */
+    }
+}
+/* Runs vp3_idct_c() on one block and adds the 64 results to the bytes already at dest (8 per row, rows stride bytes apart), saturating to [0, 255]. */
+void vp3_idct_add(int16_t *input_data, int16_t *dequant_matrix, 
+    uint8_t *dest, int stride)
+{
+    int16_t transformed_data[64];
+    int16_t *op;
+    int i, j;
+    int sample;  /* int, not int16_t: *dest + *op can exceed 32767 and must not be narrowed before clamping */
+
+    vp3_idct_c(input_data, dequant_matrix, transformed_data);
+
+    /* add each transformed sample to the existing byte and clamp the sum */
+    op = transformed_data;
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            sample = *dest + *op;
+            if (sample < 0)
+                *dest = 0;
+            else if (sample > 255)
+                *dest = 255;
+            else
+                *dest = (uint8_t)(sample & 0xFF);
+            op++;
+            dest++;
+        }
+        dest += (stride - 8);  /* 8 bytes were written; skip to the start of the next row */
+    }
+}
+
  /************************************************************************
   * VP3 specific functions
   ************************************************************************/
@@ -286,8 +600,10 @@ typedef struct Vp3DecodeContext {
   * This function sets up all of the various blocks mappings:
   * superblocks <-> fragments, macroblocks <-> fragments,
   * superblocks <-> macroblocks
+ *
+ * Returns 0 if successful; returns 1 if *anything* went wrong.
   */
-static void init_block_mapping(Vp3DecodeContext *s) 
+static int init_block_mapping(Vp3DecodeContext *s) 
  {
      int i, j;
      signed int hilbert_walk_y[16];
@@ -381,9 +697,10 @@ static void init_block_mapping(Vp3DecodeContext *s)
              /* start of Y superblocks */
              right_edge = s->fragment_width;
              bottom_edge = s->fragment_height;
-            current_width = 0;
+            current_width = -1;
              current_height = 0;
-            superblock_row_inc = 3 * s->fragment_width;
+            superblock_row_inc = 3 * s->fragment_width - 
+                (s->y_superblock_width * 4 - s->fragment_width);
              hilbert = hilbert_walk_y;
  
              /* the first operation for this variable is to advance by 1 */
@@ -394,9 +711,10 @@ static void init_block_mapping(Vp3DecodeContext *s)
              /* start of U superblocks */
              right_edge = s->fragment_width / 2;
              bottom_edge = s->fragment_height / 2;
-            current_width = 0;
+            current_width = -1;
              current_height = 0;
-            superblock_row_inc = 3 * (s->fragment_width / 2);
+            superblock_row_inc = 3 * (s->fragment_width / 2) - 
+                (s->c_superblock_width * 4 - s->fragment_width / 2);
              hilbert = hilbert_walk_c;
  
              /* the first operation for this variable is to advance by 1 */
@@ -407,9 +725,10 @@ static void init_block_mapping(Vp3DecodeContext *s)
              /* start of V superblocks */
              right_edge = s->fragment_width / 2;
              bottom_edge = s->fragment_height / 2;
-            current_width = 0;
+            current_width = -1;
              current_height = 0;
-            superblock_row_inc = 3 * (s->fragment_width / 2);
+            superblock_row_inc = 3 * (s->fragment_width / 2) - 
+                (s->c_superblock_width * 4 - s->fragment_width / 2);
              hilbert = hilbert_walk_c;
  
              /* the first operation for this variable is to advance by 1 */
@@ -417,9 +736,9 @@ static void init_block_mapping(Vp3DecodeContext *s)
  
          }
  
-        if (current_width >= right_edge) {
+        if (current_width >= right_edge - 1) {
              /* reset width and move to next superblock row */
-            current_width = 0;
+            current_width = -1;
              current_height += 4;
  
              /* fragment is now at the start of a new superblock row */
@@ -429,21 +748,23 @@ static void init_block_mapping(Vp3DecodeContext *s)
          /* iterate through all 16 fragments in a superblock */
          for (j = 0; j < 16; j++) {
              current_fragment += hilbert[j];
+            current_width += travel_width[j];
              current_height += travel_height[j];
  
              /* check if the fragment is in bounds */
-            if ((current_width <= right_edge) &&
+            if ((current_width < right_edge) &&
                  (current_height < bottom_edge)) {
                  s->superblock_fragments[mapping_index] = current_fragment;
-                debug_init("    mapping fragment %d to superblock %d, position %d\n", 
-                    s->superblock_fragments[mapping_index], i, j);
+                debug_init("    mapping fragment %d to superblock %d, position %d (%d/%d x %d/%d)\n", 
+                    s->superblock_fragments[mapping_index], i, j,
+                    current_width, right_edge, current_height, bottom_edge);
              } else {
                  s->superblock_fragments[mapping_index] = -1;
-                debug_init("    superblock %d, position %d has no fragment\n", 
-                    i, j);
+                debug_init("    superblock %d, position %d has no fragment (%d/%d x %d/%d)\n", 
+                    i, j,
+                    current_width, right_edge, current_height, bottom_edge);
              }
  
-            current_width += travel_width[j];
              mapping_index++;
          }
      }
@@ -452,17 +773,18 @@ static void init_block_mapping(Vp3DecodeContext *s)
       * all of the Y plane superblocks to build this mapping */
      right_edge = s->macroblock_width;
      bottom_edge = s->macroblock_height;
-    current_width = 0;
+    current_width = -1;
      current_height = 0;
-    superblock_row_inc = s->macroblock_width;
+    superblock_row_inc = s->macroblock_width -
+        (s->y_superblock_width * 2 - s->macroblock_width);;
      hilbert = hilbert_walk_mb;
      mapping_index = 0;
      current_macroblock = -1;
      for (i = 0; i < s->u_superblock_start; i++) {
  
-        if (current_width >= right_edge) {
+        if (current_width >= right_edge - 1) {
              /* reset width and move to next superblock row */
-            current_width = 0;
+            current_width = -1;
              current_height += 2;
  
              /* macroblock is now at the start of a new superblock row */
@@ -472,21 +794,23 @@ static void init_block_mapping(Vp3DecodeContext *s)
          /* iterate through each potential macroblock in the superblock */
          for (j = 0; j < 4; j++) {
              current_macroblock += hilbert_walk_mb[j];
+            current_width += travel_width_mb[j];
              current_height += travel_height_mb[j];
  
              /* check if the macroblock is in bounds */
-            if ((current_width <= right_edge) &&
+            if ((current_width < right_edge) &&
                  (current_height < bottom_edge)) {
                  s->superblock_macroblocks[mapping_index] = current_macroblock;
-                debug_init("    mapping macroblock %d to superblock %d, position %d\n",
-                    s->superblock_macroblocks[mapping_index], i, j);
+                debug_init("    mapping macroblock %d to superblock %d, position %d (%d/%d x %d/%d)\n",
+                    s->superblock_macroblocks[mapping_index], i, j,
+                    current_width, right_edge, current_height, bottom_edge);
              } else {
                  s->superblock_macroblocks[mapping_index] = -1;
-                debug_init("    superblock %d, position %d has no macroblock\n",
-                    i, j);
+                debug_init("    superblock %d, position %d has no macroblock (%d/%d x %d/%d)\n",
+                    i, j,
+                    current_width, right_edge, current_height, bottom_edge);
              }
  
-            current_width += travel_width_mb[j];
              mapping_index++;
          }
      }
@@ -532,13 +856,13 @@ static void init_block_mapping(Vp3DecodeContext *s)
              /* C planes */
              c_fragment = s->u_fragment_start + 
                  (i * s->fragment_width / 4) + (j / 2);
-        s->all_fragments[c_fragment].macroblock = s->macroblock_count;
+            s->all_fragments[c_fragment].macroblock = s->macroblock_count;
              s->macroblock_fragments[mapping_index++] = c_fragment;
              debug_init("%d ", c_fragment);
  
              c_fragment = s->v_fragment_start + 
                  (i * s->fragment_width / 4) + (j / 2);
-        s->all_fragments[c_fragment].macroblock = s->macroblock_count;
+            s->all_fragments[c_fragment].macroblock = s->macroblock_count;
              s->macroblock_fragments[mapping_index++] = c_fragment;
              debug_init("%d ", c_fragment);
  
@@ -553,6 +877,8 @@ static void init_block_mapping(Vp3DecodeContext *s)
  
          current_fragment += s->fragment_width;
      }
+
+    return 0;  /* successful path out */
  }
  
  /*
@@ -792,6 +1118,8 @@ static void init_frame(Vp3DecodeContext *s, GetBitContext *gb)
          memset(s->all_fragments[i].coeffs, 0, 64 * sizeof(DCTELEM));
          s->all_fragments[i].coeff_count = 0;
          s->all_fragments[i].last_coeff = 0;
+s->all_fragments[i].motion_x = 0xbeef;
+s->all_fragments[i].motion_y = 0xbeef;
      }
  }
  
@@ -820,7 +1148,7 @@ static void init_dequantizer(Vp3DecodeContext *s)
       *
       * Then, saturate the result to a lower limit of MIN_DEQUANT_VAL.
       */
-#define SCALER 1
+#define SCALER 4
  
      /* scale DC quantizers */
      s->intra_y_dequant[0] = vp31_intra_y_dequant[0] * dc_scale_factor / 100;
@@ -842,7 +1170,7 @@ static void init_dequantizer(Vp3DecodeContext *s)
       * the dequantization phase */
      for (i = 1; i < 64; i++) {
  
-        j = quant_index[i];
+        j = zigzag_index[i];
  
          s->intra_y_dequant[j] = vp31_intra_y_dequant[i] * quality_scale / 100;
          if (s->intra_y_dequant[j] < MIN_DEQUANT_VAL)
@@ -859,6 +1187,8 @@ static void init_dequantizer(Vp3DecodeContext *s)
              s->inter_dequant[j] = MIN_DEQUANT_VAL * 2;
          s->inter_dequant[j] *= SCALER;
      }
+    
+    memset(s->qscale_table, (FFMAX(s->intra_y_dequant[1], s->intra_c_dequant[1])+8)/16, 512); //FIXME finetune
  
      /* print debug information as requested */
      debug_dequantizers("intra Y dequantizers:\n");
@@ -1100,13 +1430,14 @@ static int get_motion_vector_fixed(GetBitContext *gb)
   * This function unpacks all of the superblock/macroblock/fragment coding 
   * information from the bitstream.
   */
-static void unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
+static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
  {
      int bit = 0;
      int current_superblock = 0;
      int current_run = 0;
      int decode_fully_flags = 0;
      int decode_partial_blocks = 0;
+    int first_c_fragment_seen;
  
      int i, j;
      int current_fragment;
@@ -1136,14 +1467,14 @@ static void unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
  
                  /* if any of the superblocks are not partially coded, flag
                   * a boolean to decode the list of fully-coded superblocks */
-                if (bit == 0)
+                if (bit == 0) {
                      decode_fully_flags = 1;
-            } else {
-
-                /* make a note of the fact that there are partially coded
-                 * superblocks */
-                decode_partial_blocks = 1;
+                } else {
  
+                    /* make a note of the fact that there are partially coded
+                     * superblocks */
+                    decode_partial_blocks = 1;
+                }
              }
              s->superblock_coding[current_superblock++] = 
                  (bit) ? SB_PARTIALLY_CODED : SB_NOT_CODED;
@@ -1196,7 +1527,10 @@ static void unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
      /* figure out which fragments are coded; iterate through each
       * superblock (all planes) */
      s->coded_fragment_list_index = 0;
-    memset(s->macroblock_coded, 0, s->macroblock_count);
+    s->first_coded_y_fragment = s->first_coded_c_fragment = 0;
+    s->last_coded_y_fragment = s->last_coded_c_fragment = -1;
+    first_c_fragment_seen = 0;
+    memset(s->macroblock_coding, MODE_COPY, s->macroblock_count);
      for (i = 0; i < s->superblock_count; i++) {
  
          /* iterate through all 16 fragments in a superblock */
@@ -1204,6 +1538,11 @@ static void unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
  
              /* if the fragment is in bounds, check its coding status */
              current_fragment = s->superblock_fragments[i * 16 + j];
+            if (current_fragment >= s->fragment_count) {
+                printf ("  vp3:unpack_superblocks(): bad fragment number (%d >= %d)\n",
+                    current_fragment, s->fragment_count);
+                return 1;
+            }
              if (current_fragment != -1) {
                  if (s->superblock_coding[i] == SB_NOT_CODED) {
  
@@ -1221,12 +1560,21 @@ static void unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
                      }
  
                      if (bit) {
-                        /* mode will be decoded in the next phase */
+                        /* default mode; actual mode will be decoded in 
+                         * the next phase */
                          s->all_fragments[current_fragment].coding_method = 
                              MODE_INTER_NO_MV;
-                        s->coded_fragment_list[s->coded_fragment_list_index++] = 
+                        s->coded_fragment_list[s->coded_fragment_list_index] = 
                              current_fragment;
-                        s->macroblock_coded[s->all_fragments[current_fragment].macroblock] = 1;
+                        if ((current_fragment >= s->u_fragment_start) &&
+                            (s->last_coded_y_fragment == -1) &&
+                            (!first_c_fragment_seen)) {
+                            s->first_coded_c_fragment = s->coded_fragment_list_index;
+                            s->last_coded_y_fragment = s->first_coded_c_fragment - 1;
+                            first_c_fragment_seen = 1;
+                        }
+                        s->coded_fragment_list_index++;
+                        s->macroblock_coding[s->all_fragments[current_fragment].macroblock] = MODE_INTER_NO_MV;
                          debug_block_coding("      superblock %d is partially coded, fragment %d is coded\n",
                              i, current_fragment);
                      } else {
@@ -1245,22 +1593,46 @@ static void unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
                       * coding will be determined in next step */
                      s->all_fragments[current_fragment].coding_method = 
                          MODE_INTER_NO_MV;
-                    s->coded_fragment_list[s->coded_fragment_list_index++] = 
+                    s->coded_fragment_list[s->coded_fragment_list_index] = 
                          current_fragment;
-                    s->macroblock_coded[s->all_fragments[current_fragment].macroblock] = 1;
+                    if ((current_fragment >= s->u_fragment_start) &&
+                        (s->last_coded_y_fragment == -1) &&
+                        (!first_c_fragment_seen)) {
+                        s->first_coded_c_fragment = s->coded_fragment_list_index;
+                        s->last_coded_y_fragment = s->first_coded_c_fragment - 1;
+                        first_c_fragment_seen = 1;
+                    }
+                    s->coded_fragment_list_index++;
+                    s->macroblock_coding[s->all_fragments[current_fragment].macroblock] = MODE_INTER_NO_MV;
                      debug_block_coding("      superblock %d is fully coded, fragment %d is coded\n",
                          i, current_fragment);
                  }
              }
          }
      }
+
+    if (!first_c_fragment_seen)
+        /* only Y fragments coded in this frame */
+        s->last_coded_y_fragment = s->coded_fragment_list_index - 1;
+    else 
+        /* end the list of coded C fragments */
+        s->last_coded_c_fragment = s->coded_fragment_list_index - 1;
+
+    debug_block_coding("    %d total coded fragments, y: %d -> %d, c: %d -> %d\n",
+        s->coded_fragment_list_index,
+        s->first_coded_y_fragment,
+        s->last_coded_y_fragment,
+        s->first_coded_c_fragment,
+        s->last_coded_c_fragment);
+
+    return 0;
  }
  
  /*
   * This function unpacks all the coding mode data for individual macroblocks
   * from the bitstream.
   */
-static void unpack_modes(Vp3DecodeContext *s, GetBitContext *gb)
+static int unpack_modes(Vp3DecodeContext *s, GetBitContext *gb)
  {
      int i, j, k;
      int scheme;
@@ -1286,7 +1658,7 @@ static void unpack_modes(Vp3DecodeContext *s, GetBitContext *gb)
          if (scheme == 0) {
              debug_modes("    custom mode alphabet ahead:\n");
              for (i = 0; i < 8; i++)
-                ModeAlphabet[0][i] = get_bits(gb, 3);
+                ModeAlphabet[scheme][get_bits(gb, 3)] = i;
          }
  
          for (i = 0; i < 8; i++)
@@ -1300,8 +1672,13 @@ static void unpack_modes(Vp3DecodeContext *s, GetBitContext *gb)
              for (j = 0; j < 4; j++) {
                  current_macroblock = s->superblock_macroblocks[i * 4 + j];
                  if ((current_macroblock == -1) ||
-                    (!s->macroblock_coded[current_macroblock]))
+                    (s->macroblock_coding[current_macroblock] == MODE_COPY))
                      continue;
+                if (current_macroblock >= s->macroblock_count) {
+                    printf ("  vp3:unpack_modes(): bad macroblock number (%d >= %d)\n",
+                        current_macroblock, s->macroblock_count);
+                    return 1;
+                }
  
                  /* mode 7 means get 3 bits for each coding mode */
                  if (scheme == 7)
@@ -1309,9 +1686,17 @@ static void unpack_modes(Vp3DecodeContext *s, GetBitContext *gb)
                  else
                      coding_mode = ModeAlphabet[scheme][get_mode_code(gb)];
  
+                s->macroblock_coding[current_macroblock] = coding_mode;
                  for (k = 0; k < 6; k++) {
                      current_fragment = 
                          s->macroblock_fragments[current_macroblock * 6 + k];
+                    if (current_fragment == -1)
+                        continue;
+                    if (current_fragment >= s->fragment_count) {
+                        printf ("  vp3:unpack_modes(): bad fragment number (%d >= %d)\n",
+                            current_fragment, s->fragment_count);
+                        return 1;
+                    }
                      if (s->all_fragments[current_fragment].coding_method != 
                          MODE_COPY)
                          s->all_fragments[current_fragment].coding_method =
@@ -1324,13 +1709,14 @@ static void unpack_modes(Vp3DecodeContext *s, GetBitContext *gb)
          }
      }
  
+    return 0;
  }
  
  /*
   * This function unpacks all the motion vectors for the individual
   * macroblocks from the bitstream.
   */
-static void unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
+static int unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
  {
      int i, j, k;
      int coding_mode;
@@ -1344,7 +1730,6 @@ static void unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
      int current_fragment;
  
      debug_vp3("  vp3: unpacking motion vectors\n");
-
      if (s->keyframe) {
  
          debug_vp3("    keyframe-- there are no motion vectors\n");
@@ -1366,11 +1751,21 @@ static void unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
              for (j = 0; j < 4; j++) {
                  current_macroblock = s->superblock_macroblocks[i * 4 + j];
                  if ((current_macroblock == -1) ||
-                    (!s->macroblock_coded[current_macroblock]))
+                    (s->macroblock_coding[current_macroblock] == MODE_COPY))
                      continue;
+                if (current_macroblock >= s->macroblock_count) {
+                    printf ("  vp3:unpack_vectors(): bad macroblock number (%d >= %d)\n",
+                        current_macroblock, s->macroblock_count);
+                    return 1;
+                }
  
                  current_fragment = s->macroblock_fragments[current_macroblock * 6];
-                switch (s->all_fragments[current_fragment].coding_method) {
+                if (current_fragment >= s->fragment_count) {
+                    printf ("  vp3:unpack_vectors(): bad fragment number (%d >= %d\n",
+                        current_fragment, s->fragment_count);
+                    return 1;
+                }
+                switch (s->macroblock_coding[current_macroblock]) {
  
                  case MODE_INTER_PLUS_MV:
                  case MODE_GOLDEN_MV:
@@ -1388,7 +1783,7 @@ static void unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
                      }
  
                      /* vector maintenance, only on MODE_INTER_PLUS_MV */
-                    if (s->all_fragments[current_fragment].coding_method ==
+                    if (s->macroblock_coding[current_macroblock] ==
                          MODE_INTER_PLUS_MV) {
                          prior_last_motion_x = last_motion_x;
                          prior_last_motion_y = last_motion_y;
@@ -1462,23 +1857,40 @@ static void unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
                      last_motion_x = motion_x[0];
                      last_motion_y = motion_y[0];
                      break;
+
+                default:
+                    /* covers intra, inter without MV, golden without MV */
+                    memset(motion_x, 0, 6 * sizeof(int));
+                    memset(motion_y, 0, 6 * sizeof(int));
+
+                    /* no vector maintenance */
+                    break;
                  }
  
                  /* assign the motion vectors to the correct fragments */
                  debug_vectors("    vectors for macroblock starting @ fragment %d (coding method %d):\n",
                      current_fragment,
-                    s->all_fragments[current_fragment].coding_method);
+                    s->macroblock_coding[current_macroblock]);
                  for (k = 0; k < 6; k++) {
                      current_fragment = 
                          s->macroblock_fragments[current_macroblock * 6 + k];
+                    if (current_fragment == -1)
+                        continue;
+                    if (current_fragment >= s->fragment_count) {
+                        printf ("  vp3:unpack_vectors(): bad fragment number (%d >= %d)\n",
+                            current_fragment, s->fragment_count);
+                        return 1;
+                    }
                      s->all_fragments[current_fragment].motion_x = motion_x[k];
-                    s->all_fragments[current_fragment].motion_x = motion_y[k];
+                    s->all_fragments[current_fragment].motion_y = motion_y[k];
                      debug_vectors("    vector %d: fragment %d = (%d, %d)\n",
                          k, current_fragment, motion_x[k], motion_y[k]);
                  }
              }
          }
      }
+
+    return 0;
  }
  
  /* 
@@ -1504,7 +1916,15 @@ static int unpack_vlcs(Vp3DecodeContext *s, GetBitContext *gb,
      DCTELEM coeff;
      Vp3Fragment *fragment;
  
-    for (i = first_fragment; i < last_fragment; i++) {
+    if ((first_fragment >= s->fragment_count) ||
+        (last_fragment >= s->fragment_count)) {
+
+        printf ("  vp3:unpack_vlcs(): bad fragment number (%d -> %d ?)\n",
+            first_fragment, last_fragment);
+        return 0;
+    }
+
+    for (i = first_fragment; i <= last_fragment; i++) {
  
          fragment = &s->all_fragments[s->coded_fragment_list[i]];
          if (fragment->coeff_count > coeff_index)
@@ -1540,7 +1960,7 @@ static int unpack_vlcs(Vp3DecodeContext *s, GetBitContext *gb,
   * This function unpacks all of the DCT coefficient data from the
   * bitstream.
   */
-static void unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
+static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
  {
      int i;
      int dc_y_table;
@@ -1549,42 +1969,6 @@ static void unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
      int ac_c_table;
      int residual_eob_run = 0;
  
-    /* for the binary search */
-    int left, middle, right, found;
-    /* this indicates the first fragment of the color plane data */
-    int plane_split = 0;
-
-    debug_vp3("  vp3: unpacking DCT coefficients\n");
-
-    /* find the plane split (the first color plane fragment) using a binary 
-     * search; test the boundaries first */
-    if (s->coded_fragment_list_index == 0)
-        return;
-    if (s->u_fragment_start <= s->coded_fragment_list[0])
-        plane_split = 0;  /* this means no Y fragments */
-    else if (s->coded_fragment_list[s->coded_fragment_list_index - 1] >
-        s->u_fragment_start) {
-
-        left = 0;
-        right = s->coded_fragment_list_index - 1;
-        found = 0;
-        do {
-            middle = (left + right + 1) / 2;
-            if ((s->coded_fragment_list[middle] >= s->u_fragment_start) &&
-                (s->coded_fragment_list[middle - 1] < s->u_fragment_start))
-                found = 1;
-            else if (s->coded_fragment_list[middle] < s->u_fragment_start)
-                left = middle;
-            else
-                right = middle;
-        } while (!found);
-
-        plane_split = middle;
-    }
-
-    debug_vp3("  plane split @ index %d (fragment %d)\n", plane_split,
-        s->coded_fragment_list[plane_split]);
-
      /* fetch the DC table indices */
      dc_y_table = get_bits(gb, 4);
      dc_c_table = get_bits(gb, 4);
@@ -1593,73 +1977,75 @@ static void unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
      debug_vp3("  vp3: unpacking Y plane DC coefficients using table %d\n",
          dc_y_table);
      residual_eob_run = unpack_vlcs(s, gb, &s->dc_vlc[dc_y_table], 0, 
-        0, plane_split, residual_eob_run);
+        s->first_coded_y_fragment, s->last_coded_y_fragment, residual_eob_run);
  
      /* unpack the C plane DC coefficients */
      debug_vp3("  vp3: unpacking C plane DC coefficients using table %d\n",
          dc_c_table);
      residual_eob_run = unpack_vlcs(s, gb, &s->dc_vlc[dc_c_table], 0,
-        plane_split, s->coded_fragment_list_index, residual_eob_run);
+        s->first_coded_c_fragment, s->last_coded_c_fragment, residual_eob_run);
  
-    /* fetch the level 1 AC table indices */
+    /* fetch the AC table indices */
      ac_y_table = get_bits(gb, 4);
      ac_c_table = get_bits(gb, 4);
  
-    /* unpack the level 1 AC coefficients (coeffs 1-5) */
+    /* unpack the group 1 AC coefficients (coeffs 1-5) */
      for (i = 1; i <= 5; i++) {
  
          debug_vp3("  vp3: unpacking level %d Y plane AC coefficients using table %d\n",
              i, ac_y_table);
          residual_eob_run = unpack_vlcs(s, gb, &s->ac_vlc_1[ac_y_table], i, 
-            0, plane_split, residual_eob_run);
+            s->first_coded_y_fragment, s->last_coded_y_fragment, residual_eob_run);
  
          debug_vp3("  vp3: unpacking level %d C plane AC coefficients using table %d\n",
              i, ac_c_table);
          residual_eob_run = unpack_vlcs(s, gb, &s->ac_vlc_1[ac_c_table], i, 
-            plane_split, s->coded_fragment_list_index, residual_eob_run);
+            s->first_coded_c_fragment, s->last_coded_c_fragment, residual_eob_run);
      }
  
-    /* unpack the level 2 AC coefficients (coeffs 6-14) */
+    /* unpack the group 2 AC coefficients (coeffs 6-14) */
      for (i = 6; i <= 14; i++) {
  
          debug_vp3("  vp3: unpacking level %d Y plane AC coefficients using table %d\n",
              i, ac_y_table);
          residual_eob_run = unpack_vlcs(s, gb, &s->ac_vlc_2[ac_y_table], i, 
-            0, plane_split, residual_eob_run);
+            s->first_coded_y_fragment, s->last_coded_y_fragment, residual_eob_run);
  
          debug_vp3("  vp3: unpacking level %d C plane AC coefficients using table %d\n",
              i, ac_c_table);
          residual_eob_run = unpack_vlcs(s, gb, &s->ac_vlc_2[ac_c_table], i, 
-            plane_split, s->coded_fragment_list_index, residual_eob_run);
+            s->first_coded_c_fragment, s->last_coded_c_fragment, residual_eob_run);
      }
  
-    /* unpack the level 3 AC coefficients (coeffs 15-27) */
+    /* unpack the group 3 AC coefficients (coeffs 15-27) */
      for (i = 15; i <= 27; i++) {
  
          debug_vp3("  vp3: unpacking level %d Y plane AC coefficients using table %d\n",
              i, ac_y_table);
          residual_eob_run = unpack_vlcs(s, gb, &s->ac_vlc_3[ac_y_table], i, 
-            0, plane_split, residual_eob_run);
+            s->first_coded_y_fragment, s->last_coded_y_fragment, residual_eob_run);
  
          debug_vp3("  vp3: unpacking level %d C plane AC coefficients using table %d\n",
              i, ac_c_table);
          residual_eob_run = unpack_vlcs(s, gb, &s->ac_vlc_3[ac_c_table], i, 
-            plane_split, s->coded_fragment_list_index, residual_eob_run);
+            s->first_coded_c_fragment, s->last_coded_c_fragment, residual_eob_run);
      }
  
-    /* unpack the level 4 AC coefficients (coeffs 28-63) */
+    /* unpack the group 4 AC coefficients (coeffs 28-63) */
      for (i = 28; i <= 63; i++) {
  
          debug_vp3("  vp3: unpacking level %d Y plane AC coefficients using table %d\n",
              i, ac_y_table);
          residual_eob_run = unpack_vlcs(s, gb, &s->ac_vlc_4[ac_y_table], i, 
-            0, plane_split, residual_eob_run);
+            s->first_coded_y_fragment, s->last_coded_y_fragment, residual_eob_run);
  
          debug_vp3("  vp3: unpacking level %d C plane AC coefficients using table %d\n",
              i, ac_c_table);
          residual_eob_run = unpack_vlcs(s, gb, &s->ac_vlc_4[ac_c_table], i, 
-            plane_split, s->coded_fragment_list_index, residual_eob_run);
+            s->first_coded_c_fragment, s->last_coded_c_fragment, residual_eob_run);
      }
+
+    return 0;
  }
  
  /*
@@ -1670,7 +2056,6 @@ static void unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
  #define COMPATIBLE_FRAME(x) \
    (compatible_frame[s->all_fragments[x].coding_method] == current_frame_type)
  #define FRAME_CODED(x) (s->all_fragments[x].coding_method != MODE_COPY)
-#define HIGHBITDUPPED(X) (((signed short) X)  >> 15)
  static inline int iabs (int x) { return ((x < 0) ? -x : x); }
  
  static void reverse_dc_prediction(Vp3DecodeContext *s,
@@ -1944,20 +2329,22 @@ static void reverse_dc_prediction(Vp3DecodeContext *s,
   */
  static void render_fragments(Vp3DecodeContext *s,
                               int first_fragment,
-                             int fragment_width,
-                             int fragment_height,
+                             int width,
+                             int height,
                               int plane /* 0 = Y, 1 = U, 2 = V */) 
  {
      int x, y;
      int m, n;
      int i = first_fragment;
-    int j;
      int16_t *dequantizer;
-    DCTELEM dequant_block[64];
      unsigned char *output_plane;
      unsigned char *last_plane;
      unsigned char *golden_plane;
      int stride;
+    int motion_x, motion_y;
+    int upper_motion_limit, lower_motion_limit;
+    int motion_halfpel_index;
+    uint8_t *motion_source;
  
      debug_vp3("  vp3: rendering final fragments for %s\n",
          (plane == 0) ? "Y plane" : (plane == 1) ? "U plane" : "V plane");
@@ -1966,75 +2353,130 @@ static void render_fragments(Vp3DecodeContext *s,
      if (plane == 0) {
          dequantizer = s->intra_y_dequant;
          output_plane = s->current_frame.data[0];
-        last_plane = s->current_frame.data[0];
-        golden_plane = s->current_frame.data[0];
+        last_plane = s->last_frame.data[0];
+        golden_plane = s->golden_frame.data[0];
          stride = -s->current_frame.linesize[0];
+        upper_motion_limit = 7 * s->current_frame.linesize[0];
+        lower_motion_limit = height * s->current_frame.linesize[0] + width - 8;
      } else if (plane == 1) {
          dequantizer = s->intra_c_dequant;
          output_plane = s->current_frame.data[1];
-        last_plane = s->current_frame.data[1];
-        golden_plane = s->current_frame.data[1];
+        last_plane = s->last_frame.data[1];
+        golden_plane = s->golden_frame.data[1];
          stride = -s->current_frame.linesize[1];
+        upper_motion_limit = 7 * s->current_frame.linesize[1];
+        lower_motion_limit = height * s->current_frame.linesize[1] + width - 8;
      } else {
          dequantizer = s->intra_c_dequant;
          output_plane = s->current_frame.data[2];
-        last_plane = s->current_frame.data[2];
-        golden_plane = s->current_frame.data[2];
+        last_plane = s->last_frame.data[2];
+        golden_plane = s->golden_frame.data[2];
          stride = -s->current_frame.linesize[2];
+        upper_motion_limit = 7 * s->current_frame.linesize[2];
+        lower_motion_limit = height * s->current_frame.linesize[2] + width - 8;
      }
  
      /* for each fragment row... */
-    for (y = 0; y < fragment_height; y++) {
+    for (y = 0; y < height; y += 8) {
  
          /* for each fragment in a row... */
-        for (x = 0; x < fragment_width; x++, i++) {
+        for (x = 0; x < width; x += 8, i++) {
+
+            if ((i < 0) || (i >= s->fragment_count)) {
+                printf ("  vp3:render_fragments(): bad fragment number (%d)\n", i);
+                return;
+            }
  
              /* transform if this block was coded */
-            if (s->all_fragments[i].coding_method == MODE_INTRA) {
-                /* dequantize the DCT coefficients */
-                for (j = 0; j < 64; j++)
-                    dequant_block[dequant_index[j]] =
-                        s->all_fragments[i].coeffs[j] *
-                        dequantizer[j];
-                dequant_block[0] += 1024;
-
-                debug_idct("fragment %d:\n", i);
-                debug_idct("dequantized block:\n");
-                for (m = 0; m < 8; m++) {
-                    for (n = 0; n < 8; n++) {
-                        debug_idct(" %5d", dequant_block[m * 8 + n]);
+            if (s->all_fragments[i].coding_method != MODE_COPY) {
+
+                if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) ||
+                    (s->all_fragments[i].coding_method == MODE_GOLDEN_MV))
+                    motion_source= golden_plane;
+                else 
+                    motion_source= last_plane;
+
+                motion_source += s->all_fragments[i].first_pixel;
+                motion_halfpel_index = 0;
+
+                /* sort out the motion vector if this fragment is coded
+                 * using a motion vector method */
+                if ((s->all_fragments[i].coding_method > MODE_INTRA) &&
+                    (s->all_fragments[i].coding_method != MODE_USING_GOLDEN)) {
+                    int src_x, src_y;
+                    motion_x = s->all_fragments[i].motion_x;
+                    motion_y = s->all_fragments[i].motion_y;
+                    if(plane){
+                        motion_x= (motion_x>>1) | (motion_x&1);
+                        motion_y= (motion_y>>1) | (motion_y&1);
+                    }
+
+                    src_x= (motion_x>>1) + x;
+                    src_y= (motion_y>>1) + y;
+if ((motion_x == 0xbeef) || (motion_y == 0xbeef))
+printf (" help! got beefy vector! (%X, %X)\n", motion_x, motion_y);
+
+                    motion_halfpel_index = motion_x & 0x01;
+                    motion_source += (motion_x >> 1);
+
+//                    motion_y = -motion_y;
+                    motion_halfpel_index |= (motion_y & 0x01) << 1;
+                    motion_source += ((motion_y >> 1) * stride);
+
+                    if(src_x<0 || src_y<0 || src_x + 9 >= width || src_y + 9 >= height){
+                        uint8_t *temp= s->edge_emu_buffer;
+                        if(stride<0) temp -= 9*stride;
+
+                        ff_emulated_edge_mc(temp, motion_source, stride, 9, 9, src_x, src_y, width, height);
+                        motion_source= temp;
                      }
-                    debug_idct("\n");
                  }
-                debug_idct("\n");
  
-                /* invert DCT and place in final output */
-                s->dsp.idct_put(
-                    output_plane + s->all_fragments[i].first_pixel,
-                    stride, dequant_block);
+                /* first, take care of copying a block from either the
+                 * previous or the golden frame */
+                if (s->all_fragments[i].coding_method != MODE_INTRA) {
  
-/*
-                debug_idct("idct block:\n");
+                    s->dsp.put_no_rnd_pixels_tab[1][motion_halfpel_index](
+                        output_plane + s->all_fragments[i].first_pixel,
+                        motion_source,
+                        stride, 8);
+                }
+
+                /* dequantize the DCT coefficients */
+                debug_idct("fragment %d, coding mode %d, DC = %d, dequant = %d:\n", 
+                    i, s->all_fragments[i].coding_method, 
+                    s->all_fragments[i].coeffs[0], dequantizer[0]);
+
+                /* invert DCT and place (or add) in final output */
+                if (s->all_fragments[i].coding_method == MODE_INTRA) {
+                    vp3_idct_put(s->all_fragments[i].coeffs, dequantizer,
+                        output_plane + s->all_fragments[i].first_pixel,
+                        stride);
+                } else {
+                    vp3_idct_add(s->all_fragments[i].coeffs, dequantizer,
+                        output_plane + s->all_fragments[i].first_pixel,
+                        stride);
+                }
+
+                debug_idct("block after idct_%s():\n",
+                    (s->all_fragments[i].coding_method == MODE_INTRA)?
+                    "put" : "add");
                  for (m = 0; m < 8; m++) {
                      for (n = 0; n < 8; n++) {
-                        debug_idct(" %3d", pixels[m * 8 + n]);
+                        debug_idct(" %3d", *(output_plane + 
+                            s->all_fragments[i].first_pixel + (m * stride + n)));
                      }
                      debug_idct("\n");
                  }
                  debug_idct("\n");
-*/
-            } else if (s->all_fragments[i].coding_method == MODE_COPY) {
-
-                /* copy directly from the previous frame */
-                for (m = 0; m < 8; m++)
-                    memcpy(
-                        output_plane + s->all_fragments[i].first_pixel + stride * m,
-                        last_plane + s->all_fragments[i].first_pixel + stride * m,
-                        8);
  
              } else {
  
-                /* carry out the motion compensation */
+                /* copy directly from the previous frame */
+                s->dsp.put_pixels_tab[1][0](
+                    output_plane + s->all_fragments[i].first_pixel,
+                    last_plane + s->all_fragments[i].first_pixel,
+                    stride, 8);
  
              }
          }
@@ -2102,10 +2544,19 @@ static int vp3_decode_init(AVCodecContext *avctx)
  {
      Vp3DecodeContext *s = avctx->priv_data;
      int i;
+    int c_width;
+    int c_height;
+    int y_superblock_count;
+    int c_superblock_count;
  
      s->avctx = avctx;
+#if 0
      s->width = avctx->width;
      s->height = avctx->height;
+#else
+    s->width = (avctx->width + 15) & 0xFFFFFFF0;
+    s->height = (avctx->height + 15) & 0xFFFFFFF0;
+#endif
      avctx->pix_fmt = PIX_FMT_YUV420P;
      avctx->has_b_frames = 0;
      dsputil_init(&s->dsp, avctx);
@@ -2114,11 +2565,20 @@ static int vp3_decode_init(AVCodecContext *avctx)
       * in the first frame decode */
      s->quality_index = -1;
  
-    s->superblock_width = (s->width + 31) / 32;
-    s->superblock_height = (s->height + 31) / 32;
-    s->superblock_count = s->superblock_width * s->superblock_height * 3 / 2;
-    s->u_superblock_start = s->superblock_width * s->superblock_height;
-    s->v_superblock_start = s->superblock_width * s->superblock_height * 5 / 4;
+    s->y_superblock_width = (s->width + 31) / 32;
+    s->y_superblock_height = (s->height + 31) / 32;
+    y_superblock_count = s->y_superblock_width * s->y_superblock_height;
+
+    /* work out the dimensions for the C planes */
+    c_width = s->width / 2;
+    c_height = s->height / 2;
+    s->c_superblock_width = (c_width + 31) / 32;
+    s->c_superblock_height = (c_height + 31) / 32;
+    c_superblock_count = s->c_superblock_width * s->c_superblock_height;
+
+    s->superblock_count = y_superblock_count + (c_superblock_count * 2);
+    s->u_superblock_start = y_superblock_count;
+    s->v_superblock_start = s->u_superblock_start + c_superblock_count;
      s->superblock_coding = av_malloc(s->superblock_count);
  
      s->macroblock_width = (s->width + 15) / 16;
@@ -2133,9 +2593,14 @@ static int vp3_decode_init(AVCodecContext *avctx)
      s->u_fragment_start = s->fragment_width * s->fragment_height;
      s->v_fragment_start = s->fragment_width * s->fragment_height * 5 / 4;
  
-    debug_init("  width: %d x %d\n", s->width, s->height);
-    debug_init("  superblocks: %d x %d, %d total\n",
-        s->superblock_width, s->superblock_height, s->superblock_count);
+    debug_init("  Y plane: %d x %d\n", s->width, s->height);
+    debug_init("  C plane: %d x %d\n", c_width, c_height);
+    debug_init("  Y superblocks: %d x %d, %d total\n",
+        s->y_superblock_width, s->y_superblock_height, y_superblock_count);
+    debug_init("  C superblocks: %d x %d, %d total\n",
+        s->c_superblock_width, s->c_superblock_height, c_superblock_count);
+    debug_init("  total superblocks = %d, U starts @ %d, V starts @ %d\n", 
+        s->superblock_count, s->u_superblock_start, s->v_superblock_start);
      debug_init("  macroblocks: %d x %d, %d total\n",
          s->macroblock_width, s->macroblock_height, s->macroblock_count);
      debug_init("  %d fragments, %d x %d, u starts @ %d, v starts @ %d\n",
@@ -2152,43 +2617,49 @@ static int vp3_decode_init(AVCodecContext *avctx)
      /* init VLC tables */
      for (i = 0; i < 16; i++) {
  
-        /* Dc histograms */
+        /* DC histograms */
          init_vlc(&s->dc_vlc[i], 5, 32,
              &dc_bias[i][0][1], 4, 2,
              &dc_bias[i][0][0], 4, 2);
  
-        /* level 1 AC histograms */
+        /* group 1 AC histograms */
          init_vlc(&s->ac_vlc_1[i], 5, 32,
              &ac_bias_0[i][0][1], 4, 2,
              &ac_bias_0[i][0][0], 4, 2);
  
-        /* level 2 AC histograms */
+        /* group 2 AC histograms */
          init_vlc(&s->ac_vlc_2[i], 5, 32,
              &ac_bias_1[i][0][1], 4, 2,
              &ac_bias_1[i][0][0], 4, 2);
  
-        /* level 3 AC histograms */
+        /* group 3 AC histograms */
          init_vlc(&s->ac_vlc_3[i], 5, 32,
              &ac_bias_2[i][0][1], 4, 2,
              &ac_bias_2[i][0][0], 4, 2);
  
-        /* level 4 AC histograms */
+        /* group 4 AC histograms */
          init_vlc(&s->ac_vlc_4[i], 5, 32,
              &ac_bias_3[i][0][1], 4, 2,
              &ac_bias_3[i][0][0], 4, 2);
      }
  
-    /* build quantization table */
+    /* build quantization zigzag table */
      for (i = 0; i < 64; i++)
-        quant_index[dequant_index[i]] = i;
+        zigzag_index[dezigzag_index[i]] = i;
  
      /* work out the block mapping tables */
      s->superblock_fragments = av_malloc(s->superblock_count * 16 * sizeof(int));
      s->superblock_macroblocks = av_malloc(s->superblock_count * 4 * sizeof(int));
      s->macroblock_fragments = av_malloc(s->macroblock_count * 6 * sizeof(int));
-    s->macroblock_coded = av_malloc(s->macroblock_count + 1);
+    s->macroblock_coding = av_malloc(s->macroblock_count + 1);
      init_block_mapping(s);
  
+    for (i = 0; i < 3; i++) {
+        s->current_frame.data[i] = NULL;
+        s->last_frame.data[i] = NULL;
+        s->golden_frame.data[i] = NULL;
+    }
+
      return 0;
  }
  
@@ -2212,28 +2683,37 @@ static int vp3_decode_frame(AVCodecContext *avctx,
      skip_bits(&gb, 1);
      s->last_quality_index = s->quality_index;
      s->quality_index = get_bits(&gb, 6);
-    if (s->quality_index != s->last_quality_index)
-        init_dequantizer(s);
  
      debug_vp3(" VP3 frame #%d: Q index = %d", counter, s->quality_index);
      counter++;
  
+    if (s->quality_index != s->last_quality_index)
+        init_dequantizer(s);
+
      if (s->keyframe) {
-        /* release the previous golden frame and get a new one */
-        if (counter > 1)
-            avctx->release_buffer(avctx, &s->golden_frame);
  
-        s->golden_frame.reference = 0;
+        debug_vp3(", keyframe\n");
+        /* skip the other 2 header bytes for now */
+        skip_bits(&gb, 16);
+        if (s->last_frame.data[0] == s->golden_frame.data[0]) {
+            if (s->golden_frame.data[0])
+                avctx->release_buffer(avctx, &s->golden_frame);
+            s->last_frame= s->golden_frame; /* ensure that we catch any access to this released frame */
+        } else {
+            if (s->golden_frame.data[0])
+                avctx->release_buffer(avctx, &s->golden_frame);
+            if (s->last_frame.data[0])
+                avctx->release_buffer(avctx, &s->last_frame);
+        }
+
+        s->golden_frame.reference = 3;
          if(avctx->get_buffer(avctx, &s->golden_frame) < 0) {
              printf("vp3: get_buffer() failed\n");
              return -1;
          }
  
-        /* last frame is hereby invalidated */
-        avctx->release_buffer(avctx, &s->last_frame);
-
          /* golden frame is also the current frame */
-        s->current_frame = s->golden_frame;
+        memcpy(&s->current_frame, &s->golden_frame, sizeof(AVFrame));
  
          /* time to figure out pixel addresses? */
          if (!s->pixel_addresses_inited)
@@ -2241,50 +2721,74 @@ static int vp3_decode_frame(AVCodecContext *avctx,
  
      } else {
  
+        debug_vp3("\n");
+
          /* allocate a new current frame */
-        s->current_frame.reference = 0;
+        s->current_frame.reference = 3;
          if(avctx->get_buffer(avctx, &s->current_frame) < 0) {
              printf("vp3: get_buffer() failed\n");
              return -1;
          }
-
      }
  
-    if (s->keyframe) {
-      debug_vp3(", keyframe\n");
-      /* skip the other 2 header bytes for now */
-      skip_bits(&gb, 16);
-    } else
-      debug_vp3("\n");
+    s->current_frame.qscale_table= s->qscale_table; //FIXME allocate individual tables per AVFrame
+    s->current_frame.qstride= 0;
  
      init_frame(s, &gb);
  
-    unpack_superblocks(s, &gb);
-    unpack_modes(s, &gb);
-    unpack_vectors(s, &gb);
-    unpack_dct_coeffs(s, &gb);
+#if KEYFRAMES_ONLY
+if (!s->keyframe) {
  
-    reverse_dc_prediction(s, 0, s->fragment_width, s->fragment_height);
-    reverse_dc_prediction(s, s->u_fragment_start,
-        s->fragment_width / 2, s->fragment_height / 2);
-    reverse_dc_prediction(s, s->v_fragment_start,
-        s->fragment_width / 2, s->fragment_height / 2);
+    memcpy(s->current_frame.data[0], s->golden_frame.data[0],
+        s->current_frame.linesize[0] * s->height);
+    memcpy(s->current_frame.data[1], s->golden_frame.data[1],
+        s->current_frame.linesize[1] * s->height / 2);
+    memcpy(s->current_frame.data[2], s->golden_frame.data[2],
+        s->current_frame.linesize[2] * s->height / 2);
+
+} else {
+#endif
+
+    if (unpack_superblocks(s, &gb) ||
+        unpack_modes(s, &gb) ||
+        unpack_vectors(s, &gb) ||
+        unpack_dct_coeffs(s, &gb)) {
+
+        printf("  vp3: could not decode frame\n");
+        return -1;
+    }
  
-    render_fragments(s, 0, s->fragment_width, s->fragment_height, 0);
-    render_fragments(s, s->u_fragment_start,
-        s->fragment_width / 2, s->fragment_height / 2, 1);
-    render_fragments(s, s->v_fragment_start,
-        s->fragment_width / 2, s->fragment_height / 2, 2);
+    reverse_dc_prediction(s, 0, s->fragment_width, s->fragment_height);
+    render_fragments(s, 0, s->width, s->height, 0);
+
+    if ((avctx->flags & CODEC_FLAG_GRAY) == 0) {
+        reverse_dc_prediction(s, s->u_fragment_start,
+            s->fragment_width / 2, s->fragment_height / 2);
+        reverse_dc_prediction(s, s->v_fragment_start,
+            s->fragment_width / 2, s->fragment_height / 2);
+        render_fragments(s, s->u_fragment_start, s->width / 2, s->height / 2, 1);
+        render_fragments(s, s->v_fragment_start, s->width / 2, s->height / 2, 2);
+    } else {
+        memset(s->current_frame.data[1], 0x80, s->width * s->height / 4);
+        memset(s->current_frame.data[2], 0x80, s->width * s->height / 4);
+    }
  
+#if KEYFRAMES_ONLY
+}
+#endif
  
      *data_size=sizeof(AVFrame);
      *(AVFrame*)data= s->current_frame;
  
-    /* release the last frame, if it was allocated */
-    avctx->release_buffer(avctx, &s->last_frame);
+    /* release the last frame, if it is allocated and if it is not the
+     * golden frame */
+    if ((s->last_frame.data[0]) &&
+        (s->last_frame.data[0] != s->golden_frame.data[0]))
+        avctx->release_buffer(avctx, &s->last_frame);
  
-    /* shuffle frames */
-    s->last_frame = s->current_frame;
+    /* shuffle frames (last = current) */
+    memcpy(&s->last_frame, &s->current_frame, sizeof(AVFrame));
+    s->current_frame.data[0]= NULL; /* ensure that we catch any access to this released frame */
  
      return buf_size;
  }
@@ -2301,12 +2805,15 @@ static int vp3_decode_end(AVCodecContext *avctx)
      av_free(s->superblock_fragments);
      av_free(s->superblock_macroblocks);
      av_free(s->macroblock_fragments);
-    av_free(s->macroblock_coded);
-
+    av_free(s->macroblock_coding);
+    
      /* release all frames */
-    avctx->release_buffer(avctx, &s->golden_frame);
-    avctx->release_buffer(avctx, &s->last_frame);
-    avctx->release_buffer(avctx, &s->current_frame);
+    if (s->golden_frame.data[0] && s->golden_frame.data[0] != s->last_frame.data[0])
+        avctx->release_buffer(avctx, &s->golden_frame);
+    if (s->last_frame.data[0])
+        avctx->release_buffer(avctx, &s->last_frame);
+    /* no need to release the current_frame since it will always be pointing
+     * to the same frame as either the golden or last frame */
  
      return 0;
  }