alacdec: fix packed sample output with 5.1

[ffmpeg] / libavcodec / snowenc.c
diff --git a/libavcodec/snowenc.c b/libavcodec/snowenc.c

index 70919603a6d5064927ca82765f4fcaa298830a73..e9dbb421ba7581d9e842f2a54fbd8b79f639d453 100644 (file)
--- a/libavcodec/snowenc.c
+++ b/libavcodec/snowenc.c
@@ -94,7 +94,7 @@ static void dwt_quantize(SnowContext *s, Plane *p, DWTELEM *buffer, int width, i
      //FIXME pass the copy cleanly ?
  
  //    memcpy(dwt_buffer, buffer, height * stride * sizeof(DWTELEM));
-    ff_spatial_dwt(buffer, width, height, stride, type, s->spatial_decomposition_count);
+    ff_spatial_dwt(buffer, s->temp_dwt_buffer, width, height, stride, type, s->spatial_decomposition_count);
  
      for(level=0; level<s->spatial_decomposition_count; level++){
          for(orientation=level ? 1 : 0; orientation<4; orientation++){
@@ -119,7 +119,7 @@ static void dwt_quantize(SnowContext *s, Plane *p, DWTELEM *buffer, int width, i
                      for(xs= 0; xs<Q2_STEP; xs++){
                          memcpy(idwt2_buffer, best_dequant, height * stride * sizeof(IDWTELEM));
                          dequantize_all(s, p, idwt2_buffer, width, height);
-                        ff_spatial_idwt(idwt2_buffer, width, height, stride, type, s->spatial_decomposition_count);
+                        ff_spatial_idwt(idwt2_buffer, s->temp_idwt_buffer, width, height, stride, type, s->spatial_decomposition_count);
                          find_sse(s, p, best_score, score_stride, idwt2_buffer, s->spatial_idwt_buffer, level, orientation);
                          memcpy(idwt2_buffer, best_dequant, height * stride * sizeof(IDWTELEM));
                          for(y=ys; y<b->height; y+= Q2_STEP){
@@ -130,7 +130,7 @@ static void dwt_quantize(SnowContext *s, Plane *p, DWTELEM *buffer, int width, i
                              }
                          }
                          dequantize_all(s, p, idwt2_buffer, width, height);
-                        ff_spatial_idwt(idwt2_buffer, width, height, stride, type, s->spatial_decomposition_count);
+                        ff_spatial_idwt(idwt2_buffer, s->temp_idwt_buffer, width, height, stride, type, s->spatial_decomposition_count);
                          find_sse(s, p, score, score_stride, idwt2_buffer, s->spatial_idwt_buffer, level, orientation);
                          for(y=ys; y<b->height; y+= Q2_STEP){
                              for(x=xs; x<b->width; x+= Q2_STEP){
@@ -156,7 +156,7 @@ static void dwt_quantize(SnowContext *s, Plane *p, DWTELEM *buffer, int width, i
  static av_cold int encode_init(AVCodecContext *avctx)
  {
      SnowContext *s = avctx->priv_data;
-    int plane_index;
+    int plane_index, ret;
  
      if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
          av_log(avctx, AV_LOG_ERROR, "This codec is under development, files encoded with it may not be decodable with future versions!!!\n"
@@ -185,7 +185,10 @@ static av_cold int encode_init(AVCodecContext *avctx)
          s->plane[plane_index].fast_mc= 1;
      }
  
-    ff_snow_common_init(avctx);
+    if ((ret = ff_snow_common_init(avctx)) < 0) {
+        ff_snow_common_end(avctx->priv_data);
+        return ret;
+    }
      ff_snow_alloc_blocks(s);
  
      s->version=0;
@@ -215,12 +218,12 @@ static av_cold int encode_init(AVCodecContext *avctx)
  
      avctx->coded_frame= &s->current_picture;
      switch(avctx->pix_fmt){
-//    case PIX_FMT_YUV444P:
+    case PIX_FMT_YUV444P:
  //    case PIX_FMT_YUV422P:
      case PIX_FMT_YUV420P:
  //     case PIX_FMT_GRAY8:
  //    case PIX_FMT_YUV411P:
-//    case PIX_FMT_YUV410P:
+    case PIX_FMT_YUV410P:
          s->colorspace_type= 0;
          break;
  /*    case PIX_FMT_RGB32:
@@ -230,9 +233,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
          av_log(avctx, AV_LOG_ERROR, "pixel format not supported\n");
          return -1;
      }
-//    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
-    s->chroma_h_shift= 1;
-    s->chroma_v_shift= 1;
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
  
      ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
      ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
@@ -248,18 +249,16 @@ static av_cold int encode_init(AVCodecContext *avctx)
          }
      }
  
-    s->runs = av_malloc(avctx->width * avctx->height * sizeof(*s->runs));
-
      return 0;
  }
  
  //near copy & paste from dsputil, FIXME
-static int pix_sum(uint8_t * pix, int line_size, int w)
+static int pix_sum(uint8_t * pix, int line_size, int w, int h)
  {
      int s, i, j;
  
      s = 0;
-    for (i = 0; i < w; i++) {
+    for (i = 0; i < h; i++) {
          for (j = 0; j < w; j++) {
              s += pix[0];
              pix ++;
@@ -325,8 +324,8 @@ static int encode_q_branch(SnowContext *s, int level, int x, int y){
      const int stride= s->current_picture.linesize[0];
      const int uvstride= s->current_picture.linesize[1];
      uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
-                                s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
-                                s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
+                                s->input_picture.data[1] + ((x*block_w)>>s->chroma_h_shift) + ((y*uvstride*block_w)>>s->chroma_v_shift),
+                                s->input_picture.data[2] + ((x*block_w)>>s->chroma_h_shift) + ((y*uvstride*block_w)>>s->chroma_v_shift)};
      int P[10][2];
      int16_t last_mv[3][2];
      int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
@@ -446,15 +445,15 @@ static int encode_q_branch(SnowContext *s, int level, int x, int y){
      score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
  
      block_s= block_w*block_w;
-    sum = pix_sum(current_data[0], stride, block_w);
+    sum = pix_sum(current_data[0], stride, block_w, block_w);
      l= (sum + block_s/2)/block_s;
      iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
  
-    block_s= block_w*block_w>>2;
-    sum = pix_sum(current_data[1], uvstride, block_w>>1);
+    block_s= block_w*block_w>>(s->chroma_h_shift + s->chroma_v_shift);
+    sum = pix_sum(current_data[1], uvstride, block_w>>s->chroma_h_shift, block_w>>s->chroma_v_shift);
      cb= (sum + block_s/2)/block_s;
  //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
-    sum = pix_sum(current_data[2], uvstride, block_w>>1);
+    sum = pix_sum(current_data[2], uvstride, block_w>>s->chroma_h_shift, block_w>>s->chroma_v_shift);
      cr= (sum + block_s/2)/block_s;
  //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
  
@@ -576,9 +575,10 @@ static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
      int i, x2, y2;
      Plane *p= &s->plane[plane_index];
      const int block_size = MB_SIZE >> s->block_max_depth;
-    const int block_w    = plane_index ? block_size/2 : block_size;
-    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+1] : ff_obmc_tab[s->block_max_depth];
-    const int obmc_stride= plane_index ? block_size : 2*block_size;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
      const int ref_stride= s->current_picture.linesize[plane_index];
      uint8_t *src= s-> input_picture.data[plane_index];
      IDWTELEM *dst= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4; //FIXME change to unsigned
@@ -591,6 +591,8 @@ static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
      int ab=0;
      int aa=0;
  
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc stuff above
+
      b->type|= BLOCK_INTRA;
      b->color[plane_index]= 0;
      memset(dst, 0, obmc_stride*obmc_stride*sizeof(IDWTELEM));
@@ -599,19 +601,19 @@ static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
          int mb_x2= mb_x + (i &1) - 1;
          int mb_y2= mb_y + (i>>1) - 1;
          int x= block_w*mb_x2 + block_w/2;
-        int y= block_w*mb_y2 + block_w/2;
+        int y= block_h*mb_y2 + block_h/2;
  
-        add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
-                    x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
+        add_yblock(s, 0, NULL, dst + (i&1)*block_w + (i>>1)*obmc_stride*block_h, NULL, obmc,
+                    x, y, block_w, block_h, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
  
-        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
+        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_h); y2++){
              for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
-                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
+                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_h*mb_y - block_h/2))*obmc_stride;
                  int obmc_v= obmc[index];
                  int d;
-                if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
+                if(y<0) obmc_v += obmc[index + block_h*obmc_stride];
                  if(x<0) obmc_v += obmc[index + block_w];
-                if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
+                if(y+block_h>h) obmc_v += obmc[index - block_h*obmc_stride];
                  if(x+block_w>w) obmc_v += obmc[index - block_w];
                  //FIXME precalculate this or simplify it somehow else
  
@@ -665,17 +667,18 @@ static inline int get_block_bits(SnowContext *s, int x, int y, int w){
      }
  }
  
-static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
+static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, uint8_t (*obmc_edged)[MB_SIZE * 2]){
      Plane *p= &s->plane[plane_index];
      const int block_size = MB_SIZE >> s->block_max_depth;
-    const int block_w    = plane_index ? block_size/2 : block_size;
-    const int obmc_stride= plane_index ? block_size : 2*block_size;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
      const int ref_stride= s->current_picture.linesize[plane_index];
      uint8_t *dst= s->current_picture.data[plane_index];
      uint8_t *src= s->  input_picture.data[plane_index];
      IDWTELEM *pred= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
      uint8_t *cur = s->scratchbuf;
-    uint8_t tmp[ref_stride*(2*MB_SIZE+HTAPS_MAX-1)];
+    uint8_t *tmp = s->emu_edge_buffer;
      const int b_stride = s->b_width << s->block_max_depth;
      const int b_height = s->b_height<< s->block_max_depth;
      const int w= p->width;
@@ -684,17 +687,19 @@ static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, con
      int rate= 0;
      const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
      int sx= block_w*mb_x - block_w/2;
-    int sy= block_w*mb_y - block_w/2;
+    int sy= block_h*mb_y - block_h/2;
      int x0= FFMAX(0,-sx);
      int y0= FFMAX(0,-sy);
      int x1= FFMIN(block_w*2, w-sx);
-    int y1= FFMIN(block_w*2, h-sy);
+    int y1= FFMIN(block_h*2, h-sy);
      int i,x,y;
  
-    ff_snow_pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc and square assumtions below chckinhg only block_w
+
+    ff_snow_pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_h*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
  
      for(y=y0; y<y1; y++){
-        const uint8_t *obmc1= obmc_edged + y*obmc_stride;
+        const uint8_t *obmc1= obmc_edged[y];
          const IDWTELEM *pred1 = pred + y*obmc_stride;
          uint8_t *cur1 = cur + y*ref_stride;
          uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
@@ -719,9 +724,9 @@ static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, con
          else
              x0 = block_w;
          if(mb_y == 0)
-            y1 = block_w;
+            y1 = block_h;
          else
-            y0 = block_w;
+            y0 = block_h;
          for(y=y0; y<y1; y++)
              memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
      }
@@ -767,9 +772,10 @@ static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
      int i, y2;
      Plane *p= &s->plane[plane_index];
      const int block_size = MB_SIZE >> s->block_max_depth;
-    const int block_w    = plane_index ? block_size/2 : block_size;
-    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+1] : ff_obmc_tab[s->block_max_depth];
-    const int obmc_stride= plane_index ? block_size : 2*block_size;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
      const int ref_stride= s->current_picture.linesize[plane_index];
      uint8_t *dst= s->current_picture.data[plane_index];
      uint8_t *src= s-> input_picture.data[plane_index];
@@ -783,31 +789,33 @@ static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
      int rate= 0;
      const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
  
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc and square assumtions below
+
      for(i=0; i<9; i++){
          int mb_x2= mb_x + (i%3) - 1;
          int mb_y2= mb_y + (i/3) - 1;
          int x= block_w*mb_x2 + block_w/2;
-        int y= block_w*mb_y2 + block_w/2;
+        int y= block_h*mb_y2 + block_h/2;
  
          add_yblock(s, 0, NULL, zero_dst, dst, obmc,
-                   x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
+                   x, y, block_w, block_h, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
  
          //FIXME find a cleaner/simpler way to skip the outside stuff
          for(y2= y; y2<0; y2++)
              memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
-        for(y2= h; y2<y+block_w; y2++)
+        for(y2= h; y2<y+block_h; y2++)
              memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
          if(x<0){
-            for(y2= y; y2<y+block_w; y2++)
+            for(y2= y; y2<y+block_h; y2++)
                  memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
          }
          if(x+block_w > w){
-            for(y2= y; y2<y+block_w; y2++)
+            for(y2= y; y2<y+block_h; y2++)
                  memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
          }
  
          assert(block_w== 8 || block_w==16);
-        distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
+        distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_h);
      }
  
      if(plane_index==0){
@@ -836,6 +844,7 @@ static int encode_subband_c0run(SnowContext *s, SubBand *b, const IDWTELEM *src,
  
      if(1){
          int run=0;
+        int *runs = s->run_buffer;
          int run_index=0;
          int max_index;
  
@@ -869,7 +878,7 @@ static int encode_subband_c0run(SnowContext *s, SubBand *b, const IDWTELEM *src,
                  }
                  if(!(/*ll|*/l|lt|t|rt|p)){
                      if(v){
-                        s->runs[run_index++]= run;
+                        runs[run_index++]= run;
                          run=0;
                      }else{
                          run++;
@@ -878,9 +887,9 @@ static int encode_subband_c0run(SnowContext *s, SubBand *b, const IDWTELEM *src,
              }
          }
          max_index= run_index;
-        s->runs[run_index++]= run;
+        runs[run_index++]= run;
          run_index=0;
-        run= s->runs[run_index++];
+        run= runs[run_index++];
  
          put_symbol2(&s->c, b->state[30], max_index, 0);
          if(run_index <= max_index)
@@ -924,7 +933,7 @@ static int encode_subband_c0run(SnowContext *s, SubBand *b, const IDWTELEM *src,
                      put_rac(&s->c, &b->state[0][context], !!v);
                  }else{
                      if(!run){
-                        run= s->runs[run_index++];
+                        run= runs[run_index++];
  
                          if(run_index <= max_index)
                              put_symbol2(&s->c, b->state[1], run, 3);
@@ -955,7 +964,7 @@ static int encode_subband(SnowContext *s, SubBand *b, const IDWTELEM *src, const
  //    encode_subband_dzr(s, b, src, parent, stride, orientation);
  }
  
-static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
+static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, uint8_t (*obmc_edged)[MB_SIZE * 2], int *best_rd){
      const int b_stride= s->b_width << s->block_max_depth;
      BlockNode *block= &s->block[mb_x + mb_y * b_stride];
      BlockNode backup= *block;
@@ -996,7 +1005,7 @@ static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int
  
  /* special case for int[2] args we discard afterwards,
   * fixes compilation problem with gcc 2.95 */
-static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
+static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, uint8_t (*obmc_edged)[MB_SIZE * 2], int *best_rd){
      int p[2] = {p0, p1};
      return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
  }
@@ -1004,10 +1013,18 @@ static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y
  static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
      const int b_stride= s->b_width << s->block_max_depth;
      BlockNode *block= &s->block[mb_x + mb_y * b_stride];
-    BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
+    BlockNode backup[4];
      unsigned value;
      int rd, index;
  
+    /* We don't initialize backup[] during variable declaration, because
+     * that fails to compile on MSVC: "cannot convert from 'BlockNode' to
+     * 'int16_t'". */
+    backup[0] = block[0];
+    backup[1] = block[1];
+    backup[2] = block[b_stride];
+    backup[3] = block[b_stride + 1];
+
      assert(mb_x>=0 && mb_y>=0);
      assert(mb_x<b_stride);
      assert(((mb_x|mb_y)&1) == 0);
@@ -1076,7 +1093,7 @@ static void iterative_me(SnowContext *s){
                  BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
                  BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
                  const int b_w= (MB_SIZE >> s->block_max_depth);
-                uint8_t obmc_edged[b_w*2][b_w*2];
+                uint8_t obmc_edged[MB_SIZE * 2][MB_SIZE * 2];
  
                  if(pass && (block->type & BLOCK_OPT))
                      continue;
@@ -1091,7 +1108,8 @@ static void iterative_me(SnowContext *s){
                  //FIXME precalculate
                  {
                      int x, y;
-                    memcpy(obmc_edged, ff_obmc_tab[s->block_max_depth], b_w*b_w*4);
+                    for (y = 0; y < b_w * 2; y++)
+                        memcpy(obmc_edged[y], ff_obmc_tab[s->block_max_depth] + y * b_w * 2, b_w * 2);
                      if(mb_x==0)
                          for(y=0; y<b_w*2; y++)
                              memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
@@ -1118,22 +1136,23 @@ static void iterative_me(SnowContext *s){
                      uint8_t *dst= s->current_picture.data[0];
                      const int stride= s->current_picture.linesize[0];
                      const int block_w= MB_SIZE >> s->block_max_depth;
+                    const int block_h= MB_SIZE >> s->block_max_depth;
                      const int sx= block_w*mb_x - block_w/2;
-                    const int sy= block_w*mb_y - block_w/2;
+                    const int sy= block_h*mb_y - block_h/2;
                      const int w= s->plane[0].width;
                      const int h= s->plane[0].height;
                      int y;
  
                      for(y=sy; y<0; y++)
                          memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
-                    for(y=h; y<sy+block_w*2; y++)
+                    for(y=h; y<sy+block_h*2; y++)
                          memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
                      if(sx<0){
-                        for(y=sy; y<sy+block_w*2; y++)
+                        for(y=sy; y<sy+block_h*2; y++)
                              memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
                      }
                      if(sx+block_w*2 > w){
-                        for(y=sy; y<sy+block_w*2; y++)
+                        for(y=sy; y<sy+block_h*2; y++)
                              memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
                      }
                  }
@@ -1145,9 +1164,9 @@ static void iterative_me(SnowContext *s){
                  // get previous score (cannot be cached due to OBMC)
                  if(pass > 0 && (block->type&BLOCK_INTRA)){
                      int color0[3]= {block->color[0], block->color[1], block->color[2]};
-                    check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
+                    check_block(s, mb_x, mb_y, color0, 1, obmc_edged, &best_rd);
                  }else
-                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
+                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, obmc_edged, &best_rd);
  
                  ref_b= *block;
                  ref_rd= best_rd;
@@ -1158,16 +1177,16 @@ static void iterative_me(SnowContext *s){
                      block->ref= ref;
                      best_rd= INT_MAX;
  
-                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
-                    check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
+                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], obmc_edged, &best_rd);
+                    check_block_inter(s, mb_x, mb_y, 0, 0, obmc_edged, &best_rd);
                      if(tb)
-                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
+                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], obmc_edged, &best_rd);
                      if(lb)
-                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
+                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], obmc_edged, &best_rd);
                      if(rb)
-                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
+                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], obmc_edged, &best_rd);
                      if(bb)
-                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
+                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], obmc_edged, &best_rd);
  
                      /* fullpel ME */
                      //FIXME avoid subpel interpolation / round to nearest integer
@@ -1175,10 +1194,10 @@ static void iterative_me(SnowContext *s){
                          dia_change=0;
                          for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
                              for(j=0; j<i; j++){
-                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
-                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
-                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
-                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), obmc_edged, &best_rd);
                              }
                          }
                      }while(dia_change);
@@ -1187,7 +1206,7 @@ static void iterative_me(SnowContext *s){
                          static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
                          dia_change=0;
                          for(i=0; i<8; i++)
-                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
+                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], obmc_edged, &best_rd);
                      }while(dia_change);
                      //FIXME or try the standard 2 pass qpel or similar
  
@@ -1200,7 +1219,7 @@ static void iterative_me(SnowContext *s){
                  }
                  best_rd= ref_rd;
                  *block= ref_b;
-                check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
+                check_block(s, mb_x, mb_y, color, 1, obmc_edged, &best_rd);
                  //FIXME RD style color selection
                  if(!same_block(block, &backup)){
                      if(tb ) tb ->type &= ~BLOCK_OPT;
@@ -1588,7 +1607,7 @@ static void calculate_visual_weight(SnowContext *s, Plane *p){
  
              memset(s->spatial_idwt_buffer, 0, sizeof(*s->spatial_idwt_buffer)*width*height);
              ibuf[b->width/2 + b->height/2*b->stride]= 256*16;
-            ff_spatial_idwt(s->spatial_idwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+            ff_spatial_idwt(s->spatial_idwt_buffer, s->temp_idwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
              for(y=0; y<height; y++){
                  for(x=0; x<width; x++){
                      int64_t d= s->spatial_idwt_buffer[x + y*width]*16;
@@ -1620,11 +1639,12 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
      ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
  
      for(i=0; i<3; i++){
-        int shift= !!i;
-        for(y=0; y<(height>>shift); y++)
+        int hshift= i ? s->chroma_h_shift : 0;
+        int vshift= i ? s->chroma_v_shift : 0;
+        for(y=0; y<(height>>vshift); y++)
              memcpy(&s->input_picture.data[i][y * s->input_picture.linesize[i]],
                     &pict->data[i][y * pict->linesize[i]],
-                   width>>shift);
+                   width>>hshift);
      }
      s->new_picture = *pict;
  
@@ -1778,7 +1798,7 @@ redo_frame:
              /*  if(QUANTIZE2)
                  dwt_quantize(s, p, s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type);
              else*/
-                ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
+                ff_spatial_dwt(s->spatial_dwt_buffer, s->temp_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
  
              if(s->pass1_rc && plane_index==0){
                  int delta_qlog = ratecontrol_1pass(s, pic);
@@ -1818,7 +1838,7 @@ redo_frame:
                  }
              }
  
-            ff_spatial_idwt(s->spatial_idwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
+            ff_spatial_idwt(s->spatial_idwt_buffer, s->temp_idwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
              if(s->qlog == LOSSLESS_QLOG){
                  for(y=0; y<h; y++){
                      for(x=0; x<w; x++){
@@ -1899,7 +1919,6 @@ static av_cold int encode_end(AVCodecContext *avctx)
      if (s->input_picture.data[0])
          avctx->release_buffer(avctx, &s->input_picture);
      av_free(avctx->stats_out);
-    av_freep(&s->runs);
  
      return 0;
  }
@@ -1927,6 +1946,10 @@ AVCodec ff_snow_encoder = {
      .init           = encode_init,
      .encode2        = encode_frame,
      .close          = encode_end,
+    .pix_fmts       = (const enum PixelFormat[]){
+        PIX_FMT_YUV420P, PIX_FMT_YUV410P, PIX_FMT_YUV444P,
+        PIX_FMT_NONE
+    },
      .long_name      = NULL_IF_CONFIG_SMALL("Snow"),
      .priv_class     = &snowenc_class,
  };
@@ -1950,14 +1973,17 @@ int main(void){
      s.spatial_decomposition_count=6;
      s.spatial_decomposition_type=1;
  
+    s.temp_dwt_buffer  = av_mallocz(width * sizeof(DWTELEM));
+    s.temp_idwt_buffer = av_mallocz(width * sizeof(IDWTELEM));
+
      av_lfg_init(&prng, 1);
  
      printf("testing 5/3 DWT\n");
      for(i=0; i<width*height; i++)
          buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
  
-    ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
-    ff_spatial_idwt((IDWTELEM*)buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+    ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+    ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
  
      for(i=0; i<width*height; i++)
          if(buffer[0][i]!= buffer[1][i]) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
@@ -1967,8 +1993,8 @@ int main(void){
      for(i=0; i<width*height; i++)
          buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
  
-    ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
-    ff_spatial_idwt((IDWTELEM*)buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+    ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+    ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
  
      for(i=0; i<width*height; i++)
          if(FFABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
@@ -1994,7 +2020,7 @@ int main(void){
  
                  memset(buffer[0], 0, sizeof(int)*width*height);
                  buf[w/2 + h/2*stride]= 256*256;
-                ff_spatial_idwt((IDWTELEM*)buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+                ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
                  for(y=0; y<height; y++){
                      for(x=0; x<width; x++){
                          int64_t d= buffer[0][x + y*width];
@@ -2036,7 +2062,7 @@ int main(void){
                      buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
                  }
              }
-            ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+            ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
              for(y=0; y<height; y++){
                  for(x=0; x<width; x++){
                      int64_t d= buffer[0][x + y*width];