git.sesse.net Git - ffmpeg/blob - libavcodec/snow.c

   1 /*
   2  * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "libavutil/intmath.h"
  22 #include "avcodec.h"
  23 #include "dsputil.h"
  24 #include "dwt.h"
  25 #include "snow.h"
  26
  27 #include "rangecoder.h"
  28 #include "mathops.h"
  29
  30 #include "mpegvideo.h"
  31 #include "h263.h"
  32
  33 #undef NDEBUG
  34 #include <assert.h>
  35
  36 static const int8_t quant3bA[256]={
  37  0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  38  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  39  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  40  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  41  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  42  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  43  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  44  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  45  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  46  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  47  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  48  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  49  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  50  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  51  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  52  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  53 };
  54
  55 static const uint8_t obmc32[1024]={
  56   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
  57   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
  58   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
  59   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
  60   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
  61   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
  62   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
  63   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
  64   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
  65   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
  66   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
  67   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
  68   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
  69   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
  70   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
  71   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
  72   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
  73   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
  74   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
  75   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
  76   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
  77   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
  78   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
  79   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
  80   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
  81   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
  82   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
  83   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
  84   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
  85   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
  86   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
  87   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
  88  //error:0.000020
  89 };
  90 static const uint8_t obmc16[256]={
  91   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
  92   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
  93   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
  94   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
  95   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
  96  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
  97  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
  98  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
  99  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 100  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 101  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 102   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 103   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 104   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 105   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 106   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 107 //error:0.000015
 108 };
 109
 110 //linear *64
 111 static const uint8_t obmc8[64]={
 112   4, 12, 20, 28, 28, 20, 12,  4,
 113  12, 36, 60, 84, 84, 60, 36, 12,
 114  20, 60,100,140,140,100, 60, 20,
 115  28, 84,140,196,196,140, 84, 28,
 116  28, 84,140,196,196,140, 84, 28,
 117  20, 60,100,140,140,100, 60, 20,
 118  12, 36, 60, 84, 84, 60, 36, 12,
 119   4, 12, 20, 28, 28, 20, 12,  4,
 120 //error:0.000000
 121 };
 122
 123 //linear *64
 124 static const uint8_t obmc4[16]={
 125  16, 48, 48, 16,
 126  48,144,144, 48,
 127  48,144,144, 48,
 128  16, 48, 48, 16,
 129 //error:0.000000
 130 };
 131
 132 static const uint8_t * const obmc_tab[4]={
 133     obmc32, obmc16, obmc8, obmc4
 134 };
 135
 136 static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
 137
 138 typedef struct BlockNode{
 139     int16_t mx;
 140     int16_t my;
 141     uint8_t ref;
 142     uint8_t color[3];
 143     uint8_t type;
 144 //#define TYPE_SPLIT    1
 145 #define BLOCK_INTRA   1
 146 #define BLOCK_OPT     2
 147 //#define TYPE_NOCOLOR  4
 148     uint8_t level; //FIXME merge into type?
 149 }BlockNode;
 150
 151 static const BlockNode null_block= { //FIXME add border maybe
 152     .color= {128,128,128},
 153     .mx= 0,
 154     .my= 0,
 155     .ref= 0,
 156     .type= 0,
 157     .level= 0,
 158 };
 159
 160 #define LOG2_MB_SIZE 4
 161 #define MB_SIZE (1<<LOG2_MB_SIZE)
 162 #define ENCODER_EXTRA_BITS 4
 163 #define HTAPS_MAX 8
 164
 165 typedef struct x_and_coeff{
 166     int16_t x;
 167     uint16_t coeff;
 168 } x_and_coeff;
 169
 170 typedef struct SubBand{
 171     int level;
 172     int stride;
 173     int width;
 174     int height;
 175     int qlog;        ///< log(qscale)/log[2^(1/6)]
 176     DWTELEM *buf;
 177     IDWTELEM *ibuf;
 178     int buf_x_offset;
 179     int buf_y_offset;
 180     int stride_line; ///< Stride measured in lines, not pixels.
 181     x_and_coeff * x_coeff;
 182     struct SubBand *parent;
 183     uint8_t state[/*7*2*/ 7 + 512][32];
 184 }SubBand;
 185
 186 typedef struct Plane{
 187     int width;
 188     int height;
 189     SubBand band[MAX_DECOMPOSITIONS][4];
 190
 191     int htaps;
 192     int8_t hcoeff[HTAPS_MAX/2];
 193     int diag_mc;
 194     int fast_mc;
 195
 196     int last_htaps;
 197     int8_t last_hcoeff[HTAPS_MAX/2];
 198     int last_diag_mc;
 199 }Plane;
 200
 201 typedef struct SnowContext{
 202
 203     AVCodecContext *avctx;
 204     RangeCoder c;
 205     DSPContext dsp;
 206     DWTContext dwt;
 207     AVFrame new_picture;
 208     AVFrame input_picture;              ///< new_picture with the internal linesizes
 209     AVFrame current_picture;
 210     AVFrame last_picture[MAX_REF_FRAMES];
 211     uint8_t *halfpel_plane[MAX_REF_FRAMES][4][4];
 212     AVFrame mconly_picture;
 213 //     uint8_t q_context[16];
 214     uint8_t header_state[32];
 215     uint8_t block_state[128 + 32*128];
 216     int keyframe;
 217     int always_reset;
 218     int version;
 219     int spatial_decomposition_type;
 220     int last_spatial_decomposition_type;
 221     int temporal_decomposition_type;
 222     int spatial_decomposition_count;
 223     int last_spatial_decomposition_count;
 224     int temporal_decomposition_count;
 225     int max_ref_frames;
 226     int ref_frames;
 227     int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
 228     uint32_t *ref_scores[MAX_REF_FRAMES];
 229     DWTELEM *spatial_dwt_buffer;
 230     IDWTELEM *spatial_idwt_buffer;
 231     int colorspace_type;
 232     int chroma_h_shift;
 233     int chroma_v_shift;
 234     int spatial_scalability;
 235     int qlog;
 236     int last_qlog;
 237     int lambda;
 238     int lambda2;
 239     int pass1_rc;
 240     int mv_scale;
 241     int last_mv_scale;
 242     int qbias;
 243     int last_qbias;
 244 #define QBIAS_SHIFT 3
 245     int b_width;
 246     int b_height;
 247     int block_max_depth;
 248     int last_block_max_depth;
 249     Plane plane[MAX_PLANES];
 250     BlockNode *block;
 251 #define ME_CACHE_SIZE 1024
 252     int me_cache[ME_CACHE_SIZE];
 253     int me_cache_generation;
 254     slice_buffer sb;
 255
 256     MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to eventually make the motion estimation independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 257
 258     uint8_t *scratchbuf;
 259 }SnowContext;
 260
 261 #ifdef __sgi
 262 // Avoid a name clash on SGI IRIX
 263 #undef qexp
 264 #endif
 265 #define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
 266 static uint8_t qexp[QROOT];
 267
 268 static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
 269     int i;
 270
 271     if(v){
 272         const int a= FFABS(v);
 273         const int e= av_log2(a);
 274         const int el= FFMIN(e, 10);
 275         put_rac(c, state+0, 0);
 276
 277         for(i=0; i<el; i++){
 278             put_rac(c, state+1+i, 1);  //1..10
 279         }
 280         for(; i<e; i++){
 281             put_rac(c, state+1+9, 1);  //1..10
 282         }
 283         put_rac(c, state+1+FFMIN(i,9), 0);
 284
 285         for(i=e-1; i>=el; i--){
 286             put_rac(c, state+22+9, (a>>i)&1); //22..31
 287         }
 288         for(; i>=0; i--){
 289             put_rac(c, state+22+i, (a>>i)&1); //22..31
 290         }
 291
 292         if(is_signed)
 293             put_rac(c, state+11 + el, v < 0); //11..21
 294     }else{
 295         put_rac(c, state+0, 1);
 296     }
 297 }
 298
 299 static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
 300     if(get_rac(c, state+0))
 301         return 0;
 302     else{
 303         int i, e, a;
 304         e= 0;
 305         while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
 306             e++;
 307         }
 308
 309         a= 1;
 310         for(i=e-1; i>=0; i--){
 311             a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
 312         }
 313
 314         e= -(is_signed && get_rac(c, state+11 + FFMIN(e,10))); //11..21
 315         return (a^e)-e;
 316     }
 317 }
 318
 319 static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
 320     int i;
 321     int r= log2>=0 ? 1<<log2 : 1;
 322
 323     assert(v>=0);
 324     assert(log2>=-4);
 325
 326     while(v >= r){
 327         put_rac(c, state+4+log2, 1);
 328         v -= r;
 329         log2++;
 330         if(log2>0) r+=r;
 331     }
 332     put_rac(c, state+4+log2, 0);
 333
 334     for(i=log2-1; i>=0; i--){
 335         put_rac(c, state+31-i, (v>>i)&1);
 336     }
 337 }
 338
 339 static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
 340     int i;
 341     int r= log2>=0 ? 1<<log2 : 1;
 342     int v=0;
 343
 344     assert(log2>=-4);
 345
 346     while(get_rac(c, state+4+log2)){
 347         v+= r;
 348         log2++;
 349         if(log2>0) r+=r;
 350     }
 351
 352     for(i=log2-1; i>=0; i--){
 353         v+= get_rac(c, state+31-i)<<i;
 354     }
 355
 356     return v;
 357 }
 358
 359 static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
 360     const int w= b->width;
 361     const int h= b->height;
 362     int x,y;
 363
 364     int run, runs;
 365     x_and_coeff *xc= b->x_coeff;
 366     x_and_coeff *prev_xc= NULL;
 367     x_and_coeff *prev2_xc= xc;
 368     x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
 369     x_and_coeff *prev_parent_xc= parent_xc;
 370
 371     runs= get_symbol2(&s->c, b->state[30], 0);
 372     if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
 373     else           run= INT_MAX;
 374
 375     for(y=0; y<h; y++){
 376         int v=0;
 377         int lt=0, t=0, rt=0;
 378
 379         if(y && prev_xc->x == 0){
 380             rt= prev_xc->coeff;
 381         }
 382         for(x=0; x<w; x++){
 383             int p=0;
 384             const int l= v;
 385
 386             lt= t; t= rt;
 387
 388             if(y){
 389                 if(prev_xc->x <= x)
 390                     prev_xc++;
 391                 if(prev_xc->x == x + 1)
 392                     rt= prev_xc->coeff;
 393                 else
 394                     rt=0;
 395             }
 396             if(parent_xc){
 397                 if(x>>1 > parent_xc->x){
 398                     parent_xc++;
 399                 }
 400                 if(x>>1 == parent_xc->x){
 401                     p= parent_xc->coeff;
 402                 }
 403             }
 404             if(/*ll|*/l|lt|t|rt|p){
 405                 int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
 406
 407                 v=get_rac(&s->c, &b->state[0][context]);
 408                 if(v){
 409                     v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
 410                     v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
 411
 412                     xc->x=x;
 413                     (xc++)->coeff= v;
 414                 }
 415             }else{
 416                 if(!run){
 417                     if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
 418                     else           run= INT_MAX;
 419                     v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
 420                     v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
 421
 422                     xc->x=x;
 423                     (xc++)->coeff= v;
 424                 }else{
 425                     int max_run;
 426                     run--;
 427                     v=0;
 428
 429                     if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
 430                     else  max_run= FFMIN(run, w-x-1);
 431                     if(parent_xc)
 432                         max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
 433                     x+= max_run;
 434                     run-= max_run;
 435                 }
 436             }
 437         }
 438         (xc++)->x= w+1; //end marker
 439         prev_xc= prev2_xc;
 440         prev2_xc= xc;
 441
 442         if(parent_xc){
 443             if(y&1){
 444                 while(parent_xc->x != parent->width+1)
 445                     parent_xc++;
 446                 parent_xc++;
 447                 prev_parent_xc= parent_xc;
 448             }else{
 449                 parent_xc= prev_parent_xc;
 450             }
 451         }
 452     }
 453
 454     (xc++)->x= w+1; //end marker
 455 }
 456
 457 static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
 458     const int w= b->width;
 459     int y;
 460     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
 461     int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
 462     int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
 463     int new_index = 0;
 464
 465     if(b->ibuf == s->spatial_idwt_buffer || s->qlog == LOSSLESS_QLOG){
 466         qadd= 0;
 467         qmul= 1<<QEXPSHIFT;
 468     }
 469
 470     /* If we are on the second or later slice, restore our index. */
 471     if (start_y != 0)
 472         new_index = save_state[0];
 473
 474
 475     for(y=start_y; y<h; y++){
 476         int x = 0;
 477         int v;
 478         IDWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
 479         memset(line, 0, b->width*sizeof(IDWTELEM));
 480         v = b->x_coeff[new_index].coeff;
 481         x = b->x_coeff[new_index++].x;
 482         while(x < w){
 483             register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
 484             register int u= -(v&1);
 485             line[x] = (t^u) - u;
 486
 487             v = b->x_coeff[new_index].coeff;
 488             x = b->x_coeff[new_index++].x;
 489         }
 490     }
 491
 492     /* Save our variables for the next slice. */
 493     save_state[0] = new_index;
 494
 495     return;
 496 }
 497
 498 static void reset_contexts(SnowContext *s){ //FIXME better initial contexts
 499     int plane_index, level, orientation;
 500
 501     for(plane_index=0; plane_index<3; plane_index++){
 502         for(level=0; level<MAX_DECOMPOSITIONS; level++){
 503             for(orientation=level ? 1:0; orientation<4; orientation++){
 504                 memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
 505             }
 506         }
 507     }
 508     memset(s->header_state, MID_STATE, sizeof(s->header_state));
 509     memset(s->block_state, MID_STATE, sizeof(s->block_state));
 510 }
 511
 512 static int alloc_blocks(SnowContext *s){
 513     int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
 514     int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
 515
 516     s->b_width = w;
 517     s->b_height= h;
 518
 519     av_free(s->block);
 520     s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
 521     return 0;
 522 }
 523
 524 static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
 525     const int w= s->b_width << s->block_max_depth;
 526     const int rem_depth= s->block_max_depth - level;
 527     const int index= (x + y*w) << rem_depth;
 528     const int block_w= 1<<rem_depth;
 529     BlockNode block;
 530     int i,j;
 531
 532     block.color[0]= l;
 533     block.color[1]= cb;
 534     block.color[2]= cr;
 535     block.mx= mx;
 536     block.my= my;
 537     block.ref= ref;
 538     block.type= type;
 539     block.level= level;
 540
 541     for(j=0; j<block_w; j++){
 542         for(i=0; i<block_w; i++){
 543             s->block[index + i + j*w]= block;
 544         }
 545     }
 546 }
 547
 548 static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
 549     const int offset[3]= {
 550           y*c->  stride + x,
 551         ((y*c->uvstride + x)>>1),
 552         ((y*c->uvstride + x)>>1),
 553     };
 554     int i;
 555     for(i=0; i<3; i++){
 556         c->src[0][i]= src [i];
 557         c->ref[0][i]= ref [i] + offset[i];
 558     }
 559     assert(!ref_index);
 560 }
 561
 562 static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
 563                            const BlockNode *left, const BlockNode *top, const BlockNode *tr){
 564     if(s->ref_frames == 1){
 565         *mx = mid_pred(left->mx, top->mx, tr->mx);
 566         *my = mid_pred(left->my, top->my, tr->my);
 567     }else{
 568         const int *scale = scale_mv_ref[ref];
 569         *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
 570                        (top ->mx * scale[top ->ref] + 128) >>8,
 571                        (tr  ->mx * scale[tr  ->ref] + 128) >>8);
 572         *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
 573                        (top ->my * scale[top ->ref] + 128) >>8,
 574                        (tr  ->my * scale[tr  ->ref] + 128) >>8);
 575     }
 576 }
 577
 578 static av_always_inline int same_block(BlockNode *a, BlockNode *b){
 579     if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
 580         return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
 581     }else{
 582         return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
 583     }
 584 }
 585
 586 static void decode_q_branch(SnowContext *s, int level, int x, int y){
 587     const int w= s->b_width << s->block_max_depth;
 588     const int rem_depth= s->block_max_depth - level;
 589     const int index= (x + y*w) << rem_depth;
 590     int trx= (x+1)<<rem_depth;
 591     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
 592     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
 593     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
 594     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
 595     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
 596
 597     if(s->keyframe){
 598         set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
 599         return;
 600     }
 601
 602     if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
 603         int type, mx, my;
 604         int l = left->color[0];
 605         int cb= left->color[1];
 606         int cr= left->color[2];
 607         int ref = 0;
 608         int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
 609         int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
 610         int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
 611
 612         type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
 613
 614         if(type){
 615             pred_mv(s, &mx, &my, 0, left, top, tr);
 616             l += get_symbol(&s->c, &s->block_state[32], 1);
 617             cb+= get_symbol(&s->c, &s->block_state[64], 1);
 618             cr+= get_symbol(&s->c, &s->block_state[96], 1);
 619         }else{
 620             if(s->ref_frames > 1)
 621                 ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
 622             pred_mv(s, &mx, &my, ref, left, top, tr);
 623             mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
 624             my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
 625         }
 626         set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
 627     }else{
 628         decode_q_branch(s, level+1, 2*x+0, 2*y+0);
 629         decode_q_branch(s, level+1, 2*x+1, 2*y+0);
 630         decode_q_branch(s, level+1, 2*x+0, 2*y+1);
 631         decode_q_branch(s, level+1, 2*x+1, 2*y+1);
 632     }
 633 }
 634
 635 static void decode_blocks(SnowContext *s){
 636     int x, y;
 637     int w= s->b_width;
 638     int h= s->b_height;
 639
 640     for(y=0; y<h; y++){
 641         for(x=0; x<w; x++){
 642             decode_q_branch(s, 0, x, y);
 643         }
 644     }
 645 }
 646
 647 static void mc_block(Plane *p, uint8_t *dst, const uint8_t *src, int stride, int b_w, int b_h, int dx, int dy){
 648     static const uint8_t weight[64]={
 649     8,7,6,5,4,3,2,1,
 650     7,7,0,0,0,0,0,1,
 651     6,0,6,0,0,0,2,0,
 652     5,0,0,5,0,3,0,0,
 653     4,0,0,0,4,0,0,0,
 654     3,0,0,5,0,3,0,0,
 655     2,0,6,0,0,0,2,0,
 656     1,7,0,0,0,0,0,1,
 657     };
 658
 659     static const uint8_t brane[256]={
 660     0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x11,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
 661     0x04,0x05,0xcc,0xcc,0xcc,0xcc,0xcc,0x41,0x15,0x16,0xcc,0xcc,0xcc,0xcc,0xcc,0x52,
 662     0x04,0xcc,0x05,0xcc,0xcc,0xcc,0x41,0xcc,0x15,0xcc,0x16,0xcc,0xcc,0xcc,0x52,0xcc,
 663     0x04,0xcc,0xcc,0x05,0xcc,0x41,0xcc,0xcc,0x15,0xcc,0xcc,0x16,0xcc,0x52,0xcc,0xcc,
 664     0x04,0xcc,0xcc,0xcc,0x41,0xcc,0xcc,0xcc,0x15,0xcc,0xcc,0xcc,0x16,0xcc,0xcc,0xcc,
 665     0x04,0xcc,0xcc,0x41,0xcc,0x05,0xcc,0xcc,0x15,0xcc,0xcc,0x52,0xcc,0x16,0xcc,0xcc,
 666     0x04,0xcc,0x41,0xcc,0xcc,0xcc,0x05,0xcc,0x15,0xcc,0x52,0xcc,0xcc,0xcc,0x16,0xcc,
 667     0x04,0x41,0xcc,0xcc,0xcc,0xcc,0xcc,0x05,0x15,0x52,0xcc,0xcc,0xcc,0xcc,0xcc,0x16,
 668     0x44,0x45,0x45,0x45,0x45,0x45,0x45,0x45,0x55,0x56,0x56,0x56,0x56,0x56,0x56,0x56,
 669     0x48,0x49,0xcc,0xcc,0xcc,0xcc,0xcc,0x85,0x59,0x5A,0xcc,0xcc,0xcc,0xcc,0xcc,0x96,
 670     0x48,0xcc,0x49,0xcc,0xcc,0xcc,0x85,0xcc,0x59,0xcc,0x5A,0xcc,0xcc,0xcc,0x96,0xcc,
 671     0x48,0xcc,0xcc,0x49,0xcc,0x85,0xcc,0xcc,0x59,0xcc,0xcc,0x5A,0xcc,0x96,0xcc,0xcc,
 672     0x48,0xcc,0xcc,0xcc,0x49,0xcc,0xcc,0xcc,0x59,0xcc,0xcc,0xcc,0x96,0xcc,0xcc,0xcc,
 673     0x48,0xcc,0xcc,0x85,0xcc,0x49,0xcc,0xcc,0x59,0xcc,0xcc,0x96,0xcc,0x5A,0xcc,0xcc,
 674     0x48,0xcc,0x85,0xcc,0xcc,0xcc,0x49,0xcc,0x59,0xcc,0x96,0xcc,0xcc,0xcc,0x5A,0xcc,
 675     0x48,0x85,0xcc,0xcc,0xcc,0xcc,0xcc,0x49,0x59,0x96,0xcc,0xcc,0xcc,0xcc,0xcc,0x5A,
 676     };
 677
 678     static const uint8_t needs[16]={
 679     0,1,0,0,
 680     2,4,2,0,
 681     0,1,0,0,
 682     15
 683     };
 684
 685     int x, y, b, r, l;
 686     int16_t tmpIt   [64*(32+HTAPS_MAX)];
 687     uint8_t tmp2t[3][stride*(32+HTAPS_MAX)];
 688     int16_t *tmpI= tmpIt;
 689     uint8_t *tmp2= tmp2t[0];
 690     const uint8_t *hpel[11];
 691     assert(dx<16 && dy<16);
 692     r= brane[dx + 16*dy]&15;
 693     l= brane[dx + 16*dy]>>4;
 694
 695     b= needs[l] | needs[r];
 696     if(p && !p->diag_mc)
 697         b= 15;
 698
 699     if(b&5){
 700         for(y=0; y < b_h+HTAPS_MAX-1; y++){
 701             for(x=0; x < b_w; x++){
 702                 int a_1=src[x + HTAPS_MAX/2-4];
 703                 int a0= src[x + HTAPS_MAX/2-3];
 704                 int a1= src[x + HTAPS_MAX/2-2];
 705                 int a2= src[x + HTAPS_MAX/2-1];
 706                 int a3= src[x + HTAPS_MAX/2+0];
 707                 int a4= src[x + HTAPS_MAX/2+1];
 708                 int a5= src[x + HTAPS_MAX/2+2];
 709                 int a6= src[x + HTAPS_MAX/2+3];
 710                 int am=0;
 711                 if(!p || p->fast_mc){
 712                     am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
 713                     tmpI[x]= am;
 714                     am= (am+16)>>5;
 715                 }else{
 716                     am= p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6);
 717                     tmpI[x]= am;
 718                     am= (am+32)>>6;
 719                 }
 720
 721                 if(am&(~255)) am= ~(am>>31);
 722                 tmp2[x]= am;
 723             }
 724             tmpI+= 64;
 725             tmp2+= stride;
 726             src += stride;
 727         }
 728         src -= stride*y;
 729     }
 730     src += HTAPS_MAX/2 - 1;
 731     tmp2= tmp2t[1];
 732
 733     if(b&2){
 734         for(y=0; y < b_h; y++){
 735             for(x=0; x < b_w+1; x++){
 736                 int a_1=src[x + (HTAPS_MAX/2-4)*stride];
 737                 int a0= src[x + (HTAPS_MAX/2-3)*stride];
 738                 int a1= src[x + (HTAPS_MAX/2-2)*stride];
 739                 int a2= src[x + (HTAPS_MAX/2-1)*stride];
 740                 int a3= src[x + (HTAPS_MAX/2+0)*stride];
 741                 int a4= src[x + (HTAPS_MAX/2+1)*stride];
 742                 int a5= src[x + (HTAPS_MAX/2+2)*stride];
 743                 int a6= src[x + (HTAPS_MAX/2+3)*stride];
 744                 int am=0;
 745                 if(!p || p->fast_mc)
 746                     am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 16)>>5;
 747                 else
 748                     am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 32)>>6;
 749
 750                 if(am&(~255)) am= ~(am>>31);
 751                 tmp2[x]= am;
 752             }
 753             src += stride;
 754             tmp2+= stride;
 755         }
 756         src -= stride*y;
 757     }
 758     src += stride*(HTAPS_MAX/2 - 1);
 759     tmp2= tmp2t[2];
 760     tmpI= tmpIt;
 761     if(b&4){
 762         for(y=0; y < b_h; y++){
 763             for(x=0; x < b_w; x++){
 764                 int a_1=tmpI[x + (HTAPS_MAX/2-4)*64];
 765                 int a0= tmpI[x + (HTAPS_MAX/2-3)*64];
 766                 int a1= tmpI[x + (HTAPS_MAX/2-2)*64];
 767                 int a2= tmpI[x + (HTAPS_MAX/2-1)*64];
 768                 int a3= tmpI[x + (HTAPS_MAX/2+0)*64];
 769                 int a4= tmpI[x + (HTAPS_MAX/2+1)*64];
 770                 int a5= tmpI[x + (HTAPS_MAX/2+2)*64];
 771                 int a6= tmpI[x + (HTAPS_MAX/2+3)*64];
 772                 int am=0;
 773                 if(!p || p->fast_mc)
 774                     am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 512)>>10;
 775                 else
 776                     am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 2048)>>12;
 777                 if(am&(~255)) am= ~(am>>31);
 778                 tmp2[x]= am;
 779             }
 780             tmpI+= 64;
 781             tmp2+= stride;
 782         }
 783     }
 784
 785     hpel[ 0]= src;
 786     hpel[ 1]= tmp2t[0] + stride*(HTAPS_MAX/2-1);
 787     hpel[ 2]= src + 1;
 788
 789     hpel[ 4]= tmp2t[1];
 790     hpel[ 5]= tmp2t[2];
 791     hpel[ 6]= tmp2t[1] + 1;
 792
 793     hpel[ 8]= src + stride;
 794     hpel[ 9]= hpel[1] + stride;
 795     hpel[10]= hpel[8] + 1;
 796
 797     if(b==15){
 798         const uint8_t *src1= hpel[dx/8 + dy/8*4  ];
 799         const uint8_t *src2= hpel[dx/8 + dy/8*4+1];
 800         const uint8_t *src3= hpel[dx/8 + dy/8*4+4];
 801         const uint8_t *src4= hpel[dx/8 + dy/8*4+5];
 802         dx&=7;
 803         dy&=7;
 804         for(y=0; y < b_h; y++){
 805             for(x=0; x < b_w; x++){
 806                 dst[x]= ((8-dx)*(8-dy)*src1[x] + dx*(8-dy)*src2[x]+
 807                          (8-dx)*   dy *src3[x] + dx*   dy *src4[x]+32)>>6;
 808             }
 809             src1+=stride;
 810             src2+=stride;
 811             src3+=stride;
 812             src4+=stride;
 813             dst +=stride;
 814         }
 815     }else{
 816         const uint8_t *src1= hpel[l];
 817         const uint8_t *src2= hpel[r];
 818         int a= weight[((dx&7) + (8*(dy&7)))];
 819         int b= 8-a;
 820         for(y=0; y < b_h; y++){
 821             for(x=0; x < b_w; x++){
 822                 dst[x]= (a*src1[x] + b*src2[x] + 4)>>3;
 823             }
 824             src1+=stride;
 825             src2+=stride;
 826             dst +=stride;
 827         }
 828     }
 829 }
 830
 831 #define mca(dx,dy,b_w)\
 832 static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, const uint8_t *src, int stride, int h){\
 833     assert(h==b_w);\
 834     mc_block(NULL, dst, src-(HTAPS_MAX/2-1)-(HTAPS_MAX/2-1)*stride, stride, b_w, b_w, dx, dy);\
 835 }
 836
 837 mca( 0, 0,16)
 838 mca( 8, 0,16)
 839 mca( 0, 8,16)
 840 mca( 8, 8,16)
 841 mca( 0, 0,8)
 842 mca( 8, 0,8)
 843 mca( 0, 8,8)
 844 mca( 8, 8,8)
 845
 846 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
 847     if(block->type & BLOCK_INTRA){
 848         int x, y;
 849         const int color = block->color[plane_index];
 850         const int color4= color*0x01010101;
 851         if(b_w==32){
 852             for(y=0; y < b_h; y++){
 853                 *(uint32_t*)&dst[0 + y*stride]= color4;
 854                 *(uint32_t*)&dst[4 + y*stride]= color4;
 855                 *(uint32_t*)&dst[8 + y*stride]= color4;
 856                 *(uint32_t*)&dst[12+ y*stride]= color4;
 857                 *(uint32_t*)&dst[16+ y*stride]= color4;
 858                 *(uint32_t*)&dst[20+ y*stride]= color4;
 859                 *(uint32_t*)&dst[24+ y*stride]= color4;
 860                 *(uint32_t*)&dst[28+ y*stride]= color4;
 861             }
 862         }else if(b_w==16){
 863             for(y=0; y < b_h; y++){
 864                 *(uint32_t*)&dst[0 + y*stride]= color4;
 865                 *(uint32_t*)&dst[4 + y*stride]= color4;
 866                 *(uint32_t*)&dst[8 + y*stride]= color4;
 867                 *(uint32_t*)&dst[12+ y*stride]= color4;
 868             }
 869         }else if(b_w==8){
 870             for(y=0; y < b_h; y++){
 871                 *(uint32_t*)&dst[0 + y*stride]= color4;
 872                 *(uint32_t*)&dst[4 + y*stride]= color4;
 873             }
 874         }else if(b_w==4){
 875             for(y=0; y < b_h; y++){
 876                 *(uint32_t*)&dst[0 + y*stride]= color4;
 877             }
 878         }else{
 879             for(y=0; y < b_h; y++){
 880                 for(x=0; x < b_w; x++){
 881                     dst[x + y*stride]= color;
 882                 }
 883             }
 884         }
 885     }else{
 886         uint8_t *src= s->last_picture[block->ref].data[plane_index];
 887         const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
 888         int mx= block->mx*scale;
 889         int my= block->my*scale;
 890         const int dx= mx&15;
 891         const int dy= my&15;
 892         const int tab_index= 3 - (b_w>>2) + (b_w>>4);
 893         sx += (mx>>4) - (HTAPS_MAX/2-1);
 894         sy += (my>>4) - (HTAPS_MAX/2-1);
 895         src += sx + sy*stride;
 896         if(   (unsigned)sx >= w - b_w - (HTAPS_MAX-2)
 897            || (unsigned)sy >= h - b_h - (HTAPS_MAX-2)){
 898             s->dsp.emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+HTAPS_MAX-1, b_h+HTAPS_MAX-1, sx, sy, w, h);
 899             src= tmp + MB_SIZE;
 900         }
 901 //        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
 902 //        assert(!(b_w&(b_w-1)));
 903         assert(b_w>1 && b_h>1);
 904         assert((tab_index>=0 && tab_index<4) || b_w==32);
 905         if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)) || !s->plane[plane_index].fast_mc )
 906             mc_block(&s->plane[plane_index], dst, src, stride, b_w, b_h, dx, dy);
 907         else if(b_w==32){
 908             int y;
 909             for(y=0; y<b_h; y+=16){
 910                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 3 + (y+3)*stride,stride);
 911                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 19 + (y+3)*stride,stride);
 912             }
 913         }else if(b_w==b_h)
 914             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 3 + 3*stride,stride);
 915         else if(b_w==2*b_h){
 916             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 3       + 3*stride,stride);
 917             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 3 + b_h + 3*stride,stride);
 918         }else{
 919             assert(2*b_w==b_h);
 920             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 3 + 3*stride           ,stride);
 921             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 3 + 3*stride+b_w*stride,stride);
 922         }
 923     }
 924 }
 925
 926 void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
 927                               int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
 928     int y, x;
 929     IDWTELEM * dst;
 930     for(y=0; y<b_h; y++){
 931         //FIXME ugly misuse of obmc_stride
 932         const uint8_t *obmc1= obmc + y*obmc_stride;
 933         const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
 934         const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
 935         const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
 936         dst = slice_buffer_get_line(sb, src_y + y);
 937         for(x=0; x<b_w; x++){
 938             int v=   obmc1[x] * block[3][x + y*src_stride]
 939                     +obmc2[x] * block[2][x + y*src_stride]
 940                     +obmc3[x] * block[1][x + y*src_stride]
 941                     +obmc4[x] * block[0][x + y*src_stride];
 942
 943             v <<= 8 - LOG2_OBMC_MAX;
 944             if(FRAC_BITS != 8){
 945                 v >>= 8 - FRAC_BITS;
 946             }
 947             if(add){
 948                 v += dst[x + src_x];
 949                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
 950                 if(v&(~255)) v= ~(v>>31);
 951                 dst8[x + y*src_stride] = v;
 952             }else{
 953                 dst[x + src_x] -= v;
 954             }
 955         }
 956     }
 957 }
 958
 959 //FIXME name cleanup (b_w, block_w, b_width stuff)
 960 static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, IDWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
 961     const int b_width = s->b_width  << s->block_max_depth;
 962     const int b_height= s->b_height << s->block_max_depth;
 963     const int b_stride= b_width;
 964     BlockNode *lt= &s->block[b_x + b_y*b_stride];
 965     BlockNode *rt= lt+1;
 966     BlockNode *lb= lt+b_stride;
 967     BlockNode *rb= lb+1;
 968     uint8_t *block[4];
 969     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
 970     uint8_t *tmp = s->scratchbuf;
 971     uint8_t *ptmp;
 972     int x,y;
 973
 974     if(b_x<0){
 975         lt= rt;
 976         lb= rb;
 977     }else if(b_x + 1 >= b_width){
 978         rt= lt;
 979         rb= lb;
 980     }
 981     if(b_y<0){
 982         lt= lb;
 983         rt= rb;
 984     }else if(b_y + 1 >= b_height){
 985         lb= lt;
 986         rb= rt;
 987     }
 988
 989     if(src_x<0){ //FIXME merge with prev & always round internal width up to *16
 990         obmc -= src_x;
 991         b_w += src_x;
 992         if(!sliced && !offset_dst)
 993             dst -= src_x;
 994         src_x=0;
 995     }else if(src_x + b_w > w){
 996         b_w = w - src_x;
 997     }
 998     if(src_y<0){
 999         obmc -= src_y*obmc_stride;
1000         b_h += src_y;
1001         if(!sliced && !offset_dst)
1002             dst -= src_y*dst_stride;
1003         src_y=0;
1004     }else if(src_y + b_h> h){
1005         b_h = h - src_y;
1006     }
1007
1008     if(b_w<=0 || b_h<=0) return;
1009
1010     assert(src_stride > 2*MB_SIZE + 5);
1011
1012     if(!sliced && offset_dst)
1013         dst += src_x + src_y*dst_stride;
1014     dst8+= src_x + src_y*src_stride;
1015 //    src += src_x + src_y*src_stride;
1016
1017     ptmp= tmp + 3*tmp_step;
1018     block[0]= ptmp;
1019     ptmp+=tmp_step;
1020     pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
1021
1022     if(same_block(lt, rt)){
1023         block[1]= block[0];
1024     }else{
1025         block[1]= ptmp;
1026         ptmp+=tmp_step;
1027         pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
1028     }
1029
1030     if(same_block(lt, lb)){
1031         block[2]= block[0];
1032     }else if(same_block(rt, lb)){
1033         block[2]= block[1];
1034     }else{
1035         block[2]= ptmp;
1036         ptmp+=tmp_step;
1037         pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
1038     }
1039
1040     if(same_block(lt, rb) ){
1041         block[3]= block[0];
1042     }else if(same_block(rt, rb)){
1043         block[3]= block[1];
1044     }else if(same_block(lb, rb)){
1045         block[3]= block[2];
1046     }else{
1047         block[3]= ptmp;
1048         pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
1049     }
1050     if(sliced){
1051         s->dwt.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
1052     }else{
1053         for(y=0; y<b_h; y++){
1054             //FIXME ugly misuse of obmc_stride
1055             const uint8_t *obmc1= obmc + y*obmc_stride;
1056             const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
1057             const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
1058             const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
1059             for(x=0; x<b_w; x++){
1060                 int v=   obmc1[x] * block[3][x + y*src_stride]
1061                         +obmc2[x] * block[2][x + y*src_stride]
1062                         +obmc3[x] * block[1][x + y*src_stride]
1063                         +obmc4[x] * block[0][x + y*src_stride];
1064
1065                 v <<= 8 - LOG2_OBMC_MAX;
1066                 if(FRAC_BITS != 8){
1067                     v >>= 8 - FRAC_BITS;
1068                 }
1069                 if(add){
1070                     v += dst[x + y*dst_stride];
1071                     v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
1072                     if(v&(~255)) v= ~(v>>31);
1073                     dst8[x + y*src_stride] = v;
1074                 }else{
1075                     dst[x + y*dst_stride] -= v;
1076                 }
1077             }
1078         }
1079     }
1080 }
1081
1082 static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, IDWTELEM * old_buffer, int plane_index, int add, int mb_y){
1083     Plane *p= &s->plane[plane_index];
1084     const int mb_w= s->b_width  << s->block_max_depth;
1085     const int mb_h= s->b_height << s->block_max_depth;
1086     int x, y, mb_x;
1087     int block_size = MB_SIZE >> s->block_max_depth;
1088     int block_w    = plane_index ? block_size/2 : block_size;
1089     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
1090     int obmc_stride= plane_index ? block_size : 2*block_size;
1091     int ref_stride= s->current_picture.linesize[plane_index];
1092     uint8_t *dst8= s->current_picture.data[plane_index];
1093     int w= p->width;
1094     int h= p->height;
1095
1096     if(s->keyframe || (s->avctx->debug&512)){
1097         if(mb_y==mb_h)
1098             return;
1099
1100         if(add){
1101             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
1102 //                DWTELEM * line = slice_buffer_get_line(sb, y);
1103                 IDWTELEM * line = sb->line[y];
1104                 for(x=0; x<w; x++){
1105 //                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
1106                     int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
1107                     v >>= FRAC_BITS;
1108                     if(v&(~255)) v= ~(v>>31);
1109                     dst8[x + y*ref_stride]= v;
1110                 }
1111             }
1112         }else{
1113             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
1114 //                DWTELEM * line = slice_buffer_get_line(sb, y);
1115                 IDWTELEM * line = sb->line[y];
1116                 for(x=0; x<w; x++){
1117                     line[x] -= 128 << FRAC_BITS;
1118 //                    buf[x + y*w]-= 128<<FRAC_BITS;
1119                 }
1120             }
1121         }
1122
1123         return;
1124     }
1125
1126     for(mb_x=0; mb_x<=mb_w; mb_x++){
1127         add_yblock(s, 1, sb, old_buffer, dst8, obmc,
1128                    block_w*mb_x - block_w/2,
1129                    block_w*mb_y - block_w/2,
1130                    block_w, block_w,
1131                    w, h,
1132                    w, ref_stride, obmc_stride,
1133                    mb_x - 1, mb_y - 1,
1134                    add, 0, plane_index);
1135     }
1136 }
1137
1138 static av_always_inline void predict_slice(SnowContext *s, IDWTELEM *buf, int plane_index, int add, int mb_y){
1139     Plane *p= &s->plane[plane_index];
1140     const int mb_w= s->b_width  << s->block_max_depth;
1141     const int mb_h= s->b_height << s->block_max_depth;
1142     int x, y, mb_x;
1143     int block_size = MB_SIZE >> s->block_max_depth;
1144     int block_w    = plane_index ? block_size/2 : block_size;
1145     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
1146     const int obmc_stride= plane_index ? block_size : 2*block_size;
1147     int ref_stride= s->current_picture.linesize[plane_index];
1148     uint8_t *dst8= s->current_picture.data[plane_index];
1149     int w= p->width;
1150     int h= p->height;
1151
1152     if(s->keyframe || (s->avctx->debug&512)){
1153         if(mb_y==mb_h)
1154             return;
1155
1156         if(add){
1157             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
1158                 for(x=0; x<w; x++){
1159                     int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
1160                     v >>= FRAC_BITS;
1161                     if(v&(~255)) v= ~(v>>31);
1162                     dst8[x + y*ref_stride]= v;
1163                 }
1164             }
1165         }else{
1166             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
1167                 for(x=0; x<w; x++){
1168                     buf[x + y*w]-= 128<<FRAC_BITS;
1169                 }
1170             }
1171         }
1172
1173         return;
1174     }
1175
1176     for(mb_x=0; mb_x<=mb_w; mb_x++){
1177         add_yblock(s, 0, NULL, buf, dst8, obmc,
1178                    block_w*mb_x - block_w/2,
1179                    block_w*mb_y - block_w/2,
1180                    block_w, block_w,
1181                    w, h,
1182                    w, ref_stride, obmc_stride,
1183                    mb_x - 1, mb_y - 1,
1184                    add, 1, plane_index);
1185     }
1186 }
1187
1188 static av_always_inline void predict_plane(SnowContext *s, IDWTELEM *buf, int plane_index, int add){
1189     const int mb_h= s->b_height << s->block_max_depth;
1190     int mb_y;
1191     for(mb_y=0; mb_y<=mb_h; mb_y++)
1192         predict_slice(s, buf, plane_index, add, mb_y);
1193 }
1194
1195 static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int start_y, int end_y){
1196     const int w= b->width;
1197     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
1198     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1199     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1200     int x,y;
1201
1202     if(s->qlog == LOSSLESS_QLOG) return;
1203
1204     for(y=start_y; y<end_y; y++){
1205 //        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
1206         IDWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
1207         for(x=0; x<w; x++){
1208             int i= line[x];
1209             if(i<0){
1210                 line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
1211             }else if(i>0){
1212                 line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
1213             }
1214         }
1215     }
1216 }
1217
1218 static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
1219     const int w= b->width;
1220     int x,y;
1221
1222     IDWTELEM * line=0; // silence silly "could be used without having been initialized" warning
1223     IDWTELEM * prev;
1224
1225     if (start_y != 0)
1226         line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
1227
1228     for(y=start_y; y<end_y; y++){
1229         prev = line;
1230 //        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
1231         line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
1232         for(x=0; x<w; x++){
1233             if(x){
1234                 if(use_median){
1235                     if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
1236                     else  line[x] += line[x - 1];
1237                 }else{
1238                     if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
1239                     else  line[x] += line[x - 1];
1240                 }
1241             }else{
1242                 if(y) line[x] += prev[x];
1243             }
1244         }
1245     }
1246 }
1247
1248 static void decode_qlogs(SnowContext *s){
1249     int plane_index, level, orientation;
1250
1251     for(plane_index=0; plane_index<3; plane_index++){
1252         for(level=0; level<s->spatial_decomposition_count; level++){
1253             for(orientation=level ? 1:0; orientation<4; orientation++){
1254                 int q;
1255                 if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
1256                 else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
1257                 else                    q= get_symbol(&s->c, s->header_state, 1);
1258                 s->plane[plane_index].band[level][orientation].qlog= q;
1259             }
1260         }
1261     }
1262 }
1263
1264 #define GET_S(dst, check) \
1265     tmp= get_symbol(&s->c, s->header_state, 0);\
1266     if(!(check)){\
1267         av_log(s->avctx, AV_LOG_ERROR, "Error " #dst " is %d\n", tmp);\
1268         return -1;\
1269     }\
1270     dst= tmp;
1271
1272 static int decode_header(SnowContext *s){
1273     int plane_index, tmp;
1274     uint8_t kstate[32];
1275
1276     memset(kstate, MID_STATE, sizeof(kstate));
1277
1278     s->keyframe= get_rac(&s->c, kstate);
1279     if(s->keyframe || s->always_reset){
1280         reset_contexts(s);
1281         s->spatial_decomposition_type=
1282         s->qlog=
1283         s->qbias=
1284         s->mv_scale=
1285         s->block_max_depth= 0;
1286     }
1287     if(s->keyframe){
1288         GET_S(s->version, tmp <= 0U)
1289         s->always_reset= get_rac(&s->c, s->header_state);
1290         s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
1291         s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
1292         GET_S(s->spatial_decomposition_count, 0 < tmp && tmp <= MAX_DECOMPOSITIONS)
1293         s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
1294         s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
1295         s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
1296         s->spatial_scalability= get_rac(&s->c, s->header_state);
1297 //        s->rate_scalability= get_rac(&s->c, s->header_state);
1298         GET_S(s->max_ref_frames, tmp < (unsigned)MAX_REF_FRAMES)
1299         s->max_ref_frames++;
1300
1301         decode_qlogs(s);
1302     }
1303
1304     if(!s->keyframe){
1305         if(get_rac(&s->c, s->header_state)){
1306             for(plane_index=0; plane_index<2; plane_index++){
1307                 int htaps, i, sum=0;
1308                 Plane *p= &s->plane[plane_index];
1309                 p->diag_mc= get_rac(&s->c, s->header_state);
1310                 htaps= get_symbol(&s->c, s->header_state, 0)*2 + 2;
1311                 if((unsigned)htaps > HTAPS_MAX || htaps==0)
1312                     return -1;
1313                 p->htaps= htaps;
1314                 for(i= htaps/2; i; i--){
1315                     p->hcoeff[i]= get_symbol(&s->c, s->header_state, 0) * (1-2*(i&1));
1316                     sum += p->hcoeff[i];
1317                 }
1318                 p->hcoeff[0]= 32-sum;
1319             }
1320             s->plane[2].diag_mc= s->plane[1].diag_mc;
1321             s->plane[2].htaps  = s->plane[1].htaps;
1322             memcpy(s->plane[2].hcoeff, s->plane[1].hcoeff, sizeof(s->plane[1].hcoeff));
1323         }
1324         if(get_rac(&s->c, s->header_state)){
1325             GET_S(s->spatial_decomposition_count, 0 < tmp && tmp <= MAX_DECOMPOSITIONS)
1326             decode_qlogs(s);
1327         }
1328     }
1329
1330     s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
1331     if(s->spatial_decomposition_type > 1U){
1332         av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
1333         return -1;
1334     }
1335     if(FFMIN(s->avctx-> width>>s->chroma_h_shift,
1336              s->avctx->height>>s->chroma_v_shift) >> (s->spatial_decomposition_count-1) <= 0){
1337         av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_count %d too large for size", s->spatial_decomposition_count);
1338         return -1;
1339     }
1340
1341     s->qlog           += get_symbol(&s->c, s->header_state, 1);
1342     s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
1343     s->qbias          += get_symbol(&s->c, s->header_state, 1);
1344     s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
1345     if(s->block_max_depth > 1 || s->block_max_depth < 0){
1346         av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
1347         s->block_max_depth= 0;
1348         return -1;
1349     }
1350
1351     return 0;
1352 }
1353
1354 static void init_qexp(void){
1355     int i;
1356     double v=128;
1357
1358     for(i=0; i<QROOT; i++){
1359         qexp[i]= lrintf(v);
1360         v *= pow(2, 1.0 / QROOT);
1361     }
1362 }
1363
1364 static av_cold int common_init(AVCodecContext *avctx){
1365     SnowContext *s = avctx->priv_data;
1366     int width, height;
1367     int i, j;
1368
1369     s->avctx= avctx;
1370     s->max_ref_frames=1; //just make sure its not an invalid value in case of no initial keyframe
1371
1372     dsputil_init(&s->dsp, avctx);
1373     ff_dwt_init(&s->dwt);
1374
1375 #define mcf(dx,dy)\
1376     s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
1377     s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
1378         s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
1379     s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
1380     s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
1381         s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
1382
1383     mcf( 0, 0)
1384     mcf( 4, 0)
1385     mcf( 8, 0)
1386     mcf(12, 0)
1387     mcf( 0, 4)
1388     mcf( 4, 4)
1389     mcf( 8, 4)
1390     mcf(12, 4)
1391     mcf( 0, 8)
1392     mcf( 4, 8)
1393     mcf( 8, 8)
1394     mcf(12, 8)
1395     mcf( 0,12)
1396     mcf( 4,12)
1397     mcf( 8,12)
1398     mcf(12,12)
1399
1400 #define mcfh(dx,dy)\
1401     s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
1402     s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
1403         mc_block_hpel ## dx ## dy ## 16;\
1404     s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
1405     s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
1406         mc_block_hpel ## dx ## dy ## 8;
1407
1408     mcfh(0, 0)
1409     mcfh(8, 0)
1410     mcfh(0, 8)
1411     mcfh(8, 8)
1412
1413     if(!qexp[0])
1414         init_qexp();
1415
1416 //    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
1417
1418     width= s->avctx->width;
1419     height= s->avctx->height;
1420
1421     s->spatial_idwt_buffer= av_mallocz(width*height*sizeof(IDWTELEM));
1422     s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM)); //FIXME this does not belong here
1423
1424     for(i=0; i<MAX_REF_FRAMES; i++)
1425         for(j=0; j<MAX_REF_FRAMES; j++)
1426             scale_mv_ref[i][j] = 256*(i+1)/(j+1);
1427
1428     s->avctx->get_buffer(s->avctx, &s->mconly_picture);
1429     s->scratchbuf = av_malloc(s->mconly_picture.linesize[0]*7*MB_SIZE);
1430
1431     return 0;
1432 }
1433
1434 static int common_init_after_header(AVCodecContext *avctx){
1435     SnowContext *s = avctx->priv_data;
1436     int plane_index, level, orientation;
1437
1438     for(plane_index=0; plane_index<3; plane_index++){
1439         int w= s->avctx->width;
1440         int h= s->avctx->height;
1441
1442         if(plane_index){
1443             w>>= s->chroma_h_shift;
1444             h>>= s->chroma_v_shift;
1445         }
1446         s->plane[plane_index].width = w;
1447         s->plane[plane_index].height= h;
1448
1449         for(level=s->spatial_decomposition_count-1; level>=0; level--){
1450             for(orientation=level ? 1 : 0; orientation<4; orientation++){
1451                 SubBand *b= &s->plane[plane_index].band[level][orientation];
1452
1453                 b->buf= s->spatial_dwt_buffer;
1454                 b->level= level;
1455                 b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
1456                 b->width = (w + !(orientation&1))>>1;
1457                 b->height= (h + !(orientation>1))>>1;
1458
1459                 b->stride_line = 1 << (s->spatial_decomposition_count - level);
1460                 b->buf_x_offset = 0;
1461                 b->buf_y_offset = 0;
1462
1463                 if(orientation&1){
1464                     b->buf += (w+1)>>1;
1465                     b->buf_x_offset = (w+1)>>1;
1466                 }
1467                 if(orientation>1){
1468                     b->buf += b->stride>>1;
1469                     b->buf_y_offset = b->stride_line >> 1;
1470                 }
1471                 b->ibuf= s->spatial_idwt_buffer + (b->buf - s->spatial_dwt_buffer);
1472
1473                 if(level)
1474                     b->parent= &s->plane[plane_index].band[level-1][orientation];
1475                 //FIXME avoid this realloc
1476                 av_freep(&b->x_coeff);
1477                 b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
1478             }
1479             w= (w+1)>>1;
1480             h= (h+1)>>1;
1481         }
1482     }
1483
1484     return 0;
1485 }
1486
1487 #define QUANTIZE2 0
1488
1489 #if QUANTIZE2==1
1490 #define Q2_STEP 8
1491
1492 static void find_sse(SnowContext *s, Plane *p, int *score, int score_stride, IDWTELEM *r0, IDWTELEM *r1, int level, int orientation){
1493     SubBand *b= &p->band[level][orientation];
1494     int x, y;
1495     int xo=0;
1496     int yo=0;
1497     int step= 1 << (s->spatial_decomposition_count - level);
1498
1499     if(orientation&1)
1500         xo= step>>1;
1501     if(orientation&2)
1502         yo= step>>1;
1503
1504     //FIXME bias for nonzero ?
1505     //FIXME optimize
1506     memset(score, 0, sizeof(*score)*score_stride*((p->height + Q2_STEP-1)/Q2_STEP));
1507     for(y=0; y<p->height; y++){
1508         for(x=0; x<p->width; x++){
1509             int sx= (x-xo + step/2) / step / Q2_STEP;
1510             int sy= (y-yo + step/2) / step / Q2_STEP;
1511             int v= r0[x + y*p->width] - r1[x + y*p->width];
1512             assert(sx>=0 && sy>=0 && sx < score_stride);
1513             v= ((v+8)>>4)<<4;
1514             score[sx + sy*score_stride] += v*v;
1515             assert(score[sx + sy*score_stride] >= 0);
1516         }
1517     }
1518 }
1519
1520 static void dequantize_all(SnowContext *s, Plane *p, IDWTELEM *buffer, int width, int height){
1521     int level, orientation;
1522
1523     for(level=0; level<s->spatial_decomposition_count; level++){
1524         for(orientation=level ? 1 : 0; orientation<4; orientation++){
1525             SubBand *b= &p->band[level][orientation];
1526             IDWTELEM *dst= buffer + (b->ibuf - s->spatial_idwt_buffer);
1527
1528             dequantize(s, b, dst, b->stride);
1529         }
1530     }
1531 }
1532
1533 static void dwt_quantize(SnowContext *s, Plane *p, DWTELEM *buffer, int width, int height, int stride, int type){
1534     int level, orientation, ys, xs, x, y, pass;
1535     IDWTELEM best_dequant[height * stride];
1536     IDWTELEM idwt2_buffer[height * stride];
1537     const int score_stride= (width + 10)/Q2_STEP;
1538     int best_score[(width + 10)/Q2_STEP * (height + 10)/Q2_STEP]; //FIXME size
1539     int score[(width + 10)/Q2_STEP * (height + 10)/Q2_STEP]; //FIXME size
1540     int threshold= (s->m.lambda * s->m.lambda) >> 6;
1541
1542     //FIXME pass the copy cleanly ?
1543
1544 //    memcpy(dwt_buffer, buffer, height * stride * sizeof(DWTELEM));
1545     ff_spatial_dwt(buffer, width, height, stride, type, s->spatial_decomposition_count);
1546
1547     for(level=0; level<s->spatial_decomposition_count; level++){
1548         for(orientation=level ? 1 : 0; orientation<4; orientation++){
1549             SubBand *b= &p->band[level][orientation];
1550             IDWTELEM *dst= best_dequant + (b->ibuf - s->spatial_idwt_buffer);
1551              DWTELEM *src=       buffer + (b-> buf - s->spatial_dwt_buffer);
1552             assert(src == b->buf); // code does not depend on this but it is true currently
1553
1554             quantize(s, b, dst, src, b->stride, s->qbias);
1555         }
1556     }
1557     for(pass=0; pass<1; pass++){
1558         if(s->qbias == 0) //keyframe
1559             continue;
1560         for(level=0; level<s->spatial_decomposition_count; level++){
1561             for(orientation=level ? 1 : 0; orientation<4; orientation++){
1562                 SubBand *b= &p->band[level][orientation];
1563                 IDWTELEM *dst= idwt2_buffer + (b->ibuf - s->spatial_idwt_buffer);
1564                 IDWTELEM *best_dst= best_dequant + (b->ibuf - s->spatial_idwt_buffer);
1565
1566                 for(ys= 0; ys<Q2_STEP; ys++){
1567                     for(xs= 0; xs<Q2_STEP; xs++){
1568                         memcpy(idwt2_buffer, best_dequant, height * stride * sizeof(IDWTELEM));
1569                         dequantize_all(s, p, idwt2_buffer, width, height);
1570                         ff_spatial_idwt(idwt2_buffer, width, height, stride, type, s->spatial_decomposition_count);
1571                         find_sse(s, p, best_score, score_stride, idwt2_buffer, s->spatial_idwt_buffer, level, orientation);
1572                         memcpy(idwt2_buffer, best_dequant, height * stride * sizeof(IDWTELEM));
1573                         for(y=ys; y<b->height; y+= Q2_STEP){
1574                             for(x=xs; x<b->width; x+= Q2_STEP){
1575                                 if(dst[x + y*b->stride]<0) dst[x + y*b->stride]++;
1576                                 if(dst[x + y*b->stride]>0) dst[x + y*b->stride]--;
1577                                 //FIXME try more than just --
1578                             }
1579                         }
1580                         dequantize_all(s, p, idwt2_buffer, width, height);
1581                         ff_spatial_idwt(idwt2_buffer, width, height, stride, type, s->spatial_decomposition_count);
1582                         find_sse(s, p, score, score_stride, idwt2_buffer, s->spatial_idwt_buffer, level, orientation);
1583                         for(y=ys; y<b->height; y+= Q2_STEP){
1584                             for(x=xs; x<b->width; x+= Q2_STEP){
1585                                 int score_idx= x/Q2_STEP + (y/Q2_STEP)*score_stride;
1586                                 if(score[score_idx] <= best_score[score_idx] + threshold){
1587                                     best_score[score_idx]= score[score_idx];
1588                                     if(best_dst[x + y*b->stride]<0) best_dst[x + y*b->stride]++;
1589                                     if(best_dst[x + y*b->stride]>0) best_dst[x + y*b->stride]--;
1590                                     //FIXME copy instead
1591                                 }
1592                             }
1593                         }
1594                     }
1595                 }
1596             }
1597         }
1598     }
1599     memcpy(s->spatial_idwt_buffer, best_dequant, height * stride * sizeof(IDWTELEM)); //FIXME work with that directly instead of copy at the end
1600 }
1601
1602 #endif /* QUANTIZE2==1 */
1603
1604 #define USE_HALFPEL_PLANE 0
1605
1606 static void halfpel_interpol(SnowContext *s, uint8_t *halfpel[4][4], AVFrame *frame){
1607     int p,x,y;
1608
1609     assert(!(s->avctx->flags & CODEC_FLAG_EMU_EDGE));
1610
1611     for(p=0; p<3; p++){
1612         int is_chroma= !!p;
1613         int w= s->avctx->width  >>is_chroma;
1614         int h= s->avctx->height >>is_chroma;
1615         int ls= frame->linesize[p];
1616         uint8_t *src= frame->data[p];
1617
1618         halfpel[1][p]= (uint8_t*)av_malloc(ls * (h+2*EDGE_WIDTH)) + EDGE_WIDTH*(1+ls);
1619         halfpel[2][p]= (uint8_t*)av_malloc(ls * (h+2*EDGE_WIDTH)) + EDGE_WIDTH*(1+ls);
1620         halfpel[3][p]= (uint8_t*)av_malloc(ls * (h+2*EDGE_WIDTH)) + EDGE_WIDTH*(1+ls);
1621
1622         halfpel[0][p]= src;
1623         for(y=0; y<h; y++){
1624             for(x=0; x<w; x++){
1625                 int i= y*ls + x;
1626
1627                 halfpel[1][p][i]= (20*(src[i] + src[i+1]) - 5*(src[i-1] + src[i+2]) + (src[i-2] + src[i+3]) + 16 )>>5;
1628             }
1629         }
1630         for(y=0; y<h; y++){
1631             for(x=0; x<w; x++){
1632                 int i= y*ls + x;
1633
1634                 halfpel[2][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
1635             }
1636         }
1637         src= halfpel[1][p];
1638         for(y=0; y<h; y++){
1639             for(x=0; x<w; x++){
1640                 int i= y*ls + x;
1641
1642                 halfpel[3][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
1643             }
1644         }
1645
1646 //FIXME border!
1647     }
1648 }
1649
1650 static void release_buffer(AVCodecContext *avctx){
1651     SnowContext *s = avctx->priv_data;
1652     int i;
1653
1654     if(s->last_picture[s->max_ref_frames-1].data[0]){
1655         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
1656         for(i=0; i<9; i++)
1657             if(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3])
1658                 av_free(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3] - EDGE_WIDTH*(1+s->current_picture.linesize[i%3]));
1659     }
1660 }
1661
1662 static int frame_start(SnowContext *s){
1663    AVFrame tmp;
1664    int w= s->avctx->width; //FIXME round up to x16 ?
1665    int h= s->avctx->height;
1666
1667     if(s->current_picture.data[0]){
1668         s->dsp.draw_edges(s->current_picture.data[0],
1669                           s->current_picture.linesize[0], w   , h   ,
1670                           EDGE_WIDTH  , EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
1671         s->dsp.draw_edges(s->current_picture.data[1],
1672                           s->current_picture.linesize[1], w>>1, h>>1,
1673                           EDGE_WIDTH/2, EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
1674         s->dsp.draw_edges(s->current_picture.data[2],
1675                           s->current_picture.linesize[2], w>>1, h>>1,
1676                           EDGE_WIDTH/2, EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
1677     }
1678
1679     release_buffer(s->avctx);
1680
1681     tmp= s->last_picture[s->max_ref_frames-1];
1682     memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
1683     memmove(s->halfpel_plane+1, s->halfpel_plane, (s->max_ref_frames-1)*sizeof(void*)*4*4);
1684     if(USE_HALFPEL_PLANE && s->current_picture.data[0])
1685         halfpel_interpol(s, s->halfpel_plane[0], &s->current_picture);
1686     s->last_picture[0]= s->current_picture;
1687     s->current_picture= tmp;
1688
1689     if(s->keyframe){
1690         s->ref_frames= 0;
1691     }else{
1692         int i;
1693         for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
1694             if(i && s->last_picture[i-1].key_frame)
1695                 break;
1696         s->ref_frames= i;
1697         if(s->ref_frames==0){
1698             av_log(s->avctx,AV_LOG_ERROR, "No reference frames\n");
1699             return -1;
1700         }
1701     }
1702
1703     s->current_picture.reference= 1;
1704     if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
1705         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
1706         return -1;
1707     }
1708
1709     s->current_picture.key_frame= s->keyframe;
1710
1711     return 0;
1712 }
1713
1714 static av_cold void common_end(SnowContext *s){
1715     int plane_index, level, orientation, i;
1716
1717     av_freep(&s->spatial_dwt_buffer);
1718     av_freep(&s->spatial_idwt_buffer);
1719
1720     s->m.me.temp= NULL;
1721     av_freep(&s->m.me.scratchpad);
1722     av_freep(&s->m.me.map);
1723     av_freep(&s->m.me.score_map);
1724     av_freep(&s->m.obmc_scratchpad);
1725
1726     av_freep(&s->block);
1727     av_freep(&s->scratchbuf);
1728
1729     for(i=0; i<MAX_REF_FRAMES; i++){
1730         av_freep(&s->ref_mvs[i]);
1731         av_freep(&s->ref_scores[i]);
1732         if(s->last_picture[i].data[0])
1733             s->avctx->release_buffer(s->avctx, &s->last_picture[i]);
1734     }
1735
1736     for(plane_index=0; plane_index<3; plane_index++){
1737         for(level=s->spatial_decomposition_count-1; level>=0; level--){
1738             for(orientation=level ? 1 : 0; orientation<4; orientation++){
1739                 SubBand *b= &s->plane[plane_index].band[level][orientation];
1740
1741                 av_freep(&b->x_coeff);
1742             }
1743         }
1744     }
1745     if (s->mconly_picture.data[0])
1746         s->avctx->release_buffer(s->avctx, &s->mconly_picture);
1747     if (s->current_picture.data[0])
1748         s->avctx->release_buffer(s->avctx, &s->current_picture);
1749 }
1750
1751 static av_cold int decode_init(AVCodecContext *avctx)
1752 {
1753     avctx->pix_fmt= PIX_FMT_YUV420P;
1754
1755     common_init(avctx);
1756
1757     return 0;
1758 }
1759
1760 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt){
1761     const uint8_t *buf = avpkt->data;
1762     int buf_size = avpkt->size;
1763     SnowContext *s = avctx->priv_data;
1764     RangeCoder * const c= &s->c;
1765     int bytes_read;
1766     AVFrame *picture = data;
1767     int level, orientation, plane_index;
1768
1769     ff_init_range_decoder(c, buf, buf_size);
1770     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
1771
1772     s->current_picture.pict_type= AV_PICTURE_TYPE_I; //FIXME I vs. P
1773     if(decode_header(s)<0)
1774         return -1;
1775     common_init_after_header(avctx);
1776
1777     // realloc slice buffer for the case that spatial_decomposition_count changed
1778     ff_slice_buffer_destroy(&s->sb);
1779     ff_slice_buffer_init(&s->sb, s->plane[0].height, (MB_SIZE >> s->block_max_depth) + s->spatial_decomposition_count * 8 + 1, s->plane[0].width, s->spatial_idwt_buffer);
1780
1781     for(plane_index=0; plane_index<3; plane_index++){
1782         Plane *p= &s->plane[plane_index];
1783         p->fast_mc= p->diag_mc && p->htaps==6 && p->hcoeff[0]==40
1784                                               && p->hcoeff[1]==-10
1785                                               && p->hcoeff[2]==2;
1786     }
1787
1788     alloc_blocks(s);
1789
1790     if(frame_start(s) < 0)
1791         return -1;
1792     //keyframe flag duplication mess FIXME
1793     if(avctx->debug&FF_DEBUG_PICT_INFO)
1794         av_log(avctx, AV_LOG_ERROR, "keyframe:%d qlog:%d\n", s->keyframe, s->qlog);
1795
1796     decode_blocks(s);
1797
1798     for(plane_index=0; plane_index<3; plane_index++){
1799         Plane *p= &s->plane[plane_index];
1800         int w= p->width;
1801         int h= p->height;
1802         int x, y;
1803         int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
1804
1805         if(s->avctx->debug&2048){
1806             memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
1807             predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
1808
1809             for(y=0; y<h; y++){
1810                 for(x=0; x<w; x++){
1811                     int v= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x];
1812                     s->mconly_picture.data[plane_index][y*s->mconly_picture.linesize[plane_index] + x]= v;
1813                 }
1814             }
1815         }
1816
1817         {
1818         for(level=0; level<s->spatial_decomposition_count; level++){
1819             for(orientation=level ? 1 : 0; orientation<4; orientation++){
1820                 SubBand *b= &p->band[level][orientation];
1821                 unpack_coeffs(s, b, b->parent, orientation);
1822             }
1823         }
1824         }
1825
1826         {
1827         const int mb_h= s->b_height << s->block_max_depth;
1828         const int block_size = MB_SIZE >> s->block_max_depth;
1829         const int block_w    = plane_index ? block_size/2 : block_size;
1830         int mb_y;
1831         DWTCompose cs[MAX_DECOMPOSITIONS];
1832         int yd=0, yq=0;
1833         int y;
1834         int end_y;
1835
1836         ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
1837         for(mb_y=0; mb_y<=mb_h; mb_y++){
1838
1839             int slice_starty = block_w*mb_y;
1840             int slice_h = block_w*(mb_y+1);
1841             if (!(s->keyframe || s->avctx->debug&512)){
1842                 slice_starty = FFMAX(0, slice_starty - (block_w >> 1));
1843                 slice_h -= (block_w >> 1);
1844             }
1845
1846             for(level=0; level<s->spatial_decomposition_count; level++){
1847                 for(orientation=level ? 1 : 0; orientation<4; orientation++){
1848                     SubBand *b= &p->band[level][orientation];
1849                     int start_y;
1850                     int end_y;
1851                     int our_mb_start = mb_y;
1852                     int our_mb_end = (mb_y + 1);
1853                     const int extra= 3;
1854                     start_y = (mb_y ? ((block_w * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
1855                     end_y = (((block_w * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
1856                     if (!(s->keyframe || s->avctx->debug&512)){
1857                         start_y = FFMAX(0, start_y - (block_w >> (1+s->spatial_decomposition_count - level)));
1858                         end_y = FFMAX(0, end_y - (block_w >> (1+s->spatial_decomposition_count - level)));
1859                     }
1860                     start_y = FFMIN(b->height, start_y);
1861                     end_y = FFMIN(b->height, end_y);
1862
1863                     if (start_y != end_y){
1864                         if (orientation == 0){
1865                             SubBand * correlate_band = &p->band[0][0];
1866                             int correlate_end_y = FFMIN(b->height, end_y + 1);
1867                             int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
1868                             decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
1869                             correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
1870                             dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, start_y, end_y);
1871                         }
1872                         else
1873                             decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
1874                     }
1875                 }
1876             }
1877
1878             for(; yd<slice_h; yd+=4){
1879                 ff_spatial_idwt_buffered_slice(&s->dwt, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
1880             }
1881
1882             if(s->qlog == LOSSLESS_QLOG){
1883                 for(; yq<slice_h && yq<h; yq++){
1884                     IDWTELEM * line = slice_buffer_get_line(&s->sb, yq);
1885                     for(x=0; x<w; x++){
1886                         line[x] <<= FRAC_BITS;
1887                     }
1888                 }
1889             }
1890
1891             predict_slice_buffered(s, &s->sb, s->spatial_idwt_buffer, plane_index, 1, mb_y);
1892
1893             y = FFMIN(p->height, slice_starty);
1894             end_y = FFMIN(p->height, slice_h);
1895             while(y < end_y)
1896                 ff_slice_buffer_release(&s->sb, y++);
1897         }
1898
1899         ff_slice_buffer_flush(&s->sb);
1900         }
1901
1902     }
1903
1904     emms_c();
1905
1906     release_buffer(avctx);
1907
1908     if(!(s->avctx->debug&2048))
1909         *picture= s->current_picture;
1910     else
1911         *picture= s->mconly_picture;
1912
1913     *data_size = sizeof(AVFrame);
1914
1915     bytes_read= c->bytestream - c->bytestream_start;
1916     if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
1917
1918     return bytes_read;
1919 }
1920
1921 static av_cold int decode_end(AVCodecContext *avctx)
1922 {
1923     SnowContext *s = avctx->priv_data;
1924
1925     ff_slice_buffer_destroy(&s->sb);
1926
1927     common_end(s);
1928
1929     return 0;
1930 }
1931
1932 AVCodec ff_snow_decoder = {
1933     .name           = "snow",
1934     .type           = AVMEDIA_TYPE_VIDEO,
1935     .id             = CODEC_ID_SNOW,
1936     .priv_data_size = sizeof(SnowContext),
1937     .init           = decode_init,
1938     .close          = decode_end,
1939     .decode         = decode_frame,
1940     .capabilities   = CODEC_CAP_DR1 /*| CODEC_CAP_DRAW_HORIZ_BAND*/,
1941     .long_name = NULL_IF_CONFIG_SMALL("Snow"),
1942 };
1943
1944 #if CONFIG_SNOW_ENCODER
1945 static av_cold int encode_init(AVCodecContext *avctx)
1946 {
1947     SnowContext *s = avctx->priv_data;
1948     int plane_index;
1949
1950     if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
1951         av_log(avctx, AV_LOG_ERROR, "This codec is under development, files encoded with it may not be decodable with future versions!!!\n"
1952                "Use vstrict=-2 / -strict -2 to use it anyway.\n");
1953         return -1;
1954     }
1955
1956     if(avctx->prediction_method == DWT_97
1957        && (avctx->flags & CODEC_FLAG_QSCALE)
1958        && avctx->global_quality == 0){
1959         av_log(avctx, AV_LOG_ERROR, "The 9/7 wavelet is incompatible with lossless mode.\n");
1960         return -1;
1961     }
1962
1963     s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
1964
1965     s->mv_scale       = (avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
1966     s->block_max_depth= (avctx->flags & CODEC_FLAG_4MV ) ? 1 : 0;
1967
1968     for(plane_index=0; plane_index<3; plane_index++){
1969         s->plane[plane_index].diag_mc= 1;
1970         s->plane[plane_index].htaps= 6;
1971         s->plane[plane_index].hcoeff[0]=  40;
1972         s->plane[plane_index].hcoeff[1]= -10;
1973         s->plane[plane_index].hcoeff[2]=   2;
1974         s->plane[plane_index].fast_mc= 1;
1975     }
1976
1977     common_init(avctx);
1978     alloc_blocks(s);
1979
1980     s->version=0;
1981
1982     s->m.avctx   = avctx;
1983     s->m.flags   = avctx->flags;
1984     s->m.bit_rate= avctx->bit_rate;
1985
1986     s->m.me.temp      =
1987     s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
1988     s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
1989     s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
1990     s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
1991     h263_encode_init(&s->m); //mv_penalty
1992
1993     s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
1994
1995     if(avctx->flags&CODEC_FLAG_PASS1){
1996         if(!avctx->stats_out)
1997             avctx->stats_out = av_mallocz(256);
1998     }
1999     if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
2000         if(ff_rate_control_init(&s->m) < 0)
2001             return -1;
2002     }
2003     s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
2004
2005     avctx->coded_frame= &s->current_picture;
2006     switch(avctx->pix_fmt){
2007 //    case PIX_FMT_YUV444P:
2008 //    case PIX_FMT_YUV422P:
2009     case PIX_FMT_YUV420P:
2010     case PIX_FMT_GRAY8:
2011 //    case PIX_FMT_YUV411P:
2012 //    case PIX_FMT_YUV410P:
2013         s->colorspace_type= 0;
2014         break;
2015 /*    case PIX_FMT_RGB32:
2016         s->colorspace= 1;
2017         break;*/
2018     default:
2019         av_log(avctx, AV_LOG_ERROR, "pixel format not supported\n");
2020         return -1;
2021     }
2022 //    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
2023     s->chroma_h_shift= 1;
2024     s->chroma_v_shift= 1;
2025
2026     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
2027     ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
2028
2029     s->avctx->get_buffer(s->avctx, &s->input_picture);
2030
2031     if(s->avctx->me_method == ME_ITER){
2032         int i;
2033         int size= s->b_width * s->b_height << 2*s->block_max_depth;
2034         for(i=0; i<s->max_ref_frames; i++){
2035             s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
2036             s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
2037         }
2038     }
2039
2040     return 0;
2041 }
2042
2043 //near copy & paste from dsputil, FIXME
2044 static int pix_sum(uint8_t * pix, int line_size, int w)
2045 {
2046     int s, i, j;
2047
2048     s = 0;
2049     for (i = 0; i < w; i++) {
2050         for (j = 0; j < w; j++) {
2051             s += pix[0];
2052             pix ++;
2053         }
2054         pix += line_size - w;
2055     }
2056     return s;
2057 }
2058
2059 //near copy & paste from dsputil, FIXME
2060 static int pix_norm1(uint8_t * pix, int line_size, int w)
2061 {
2062     int s, i, j;
2063     uint32_t *sq = ff_squareTbl + 256;
2064
2065     s = 0;
2066     for (i = 0; i < w; i++) {
2067         for (j = 0; j < w; j ++) {
2068             s += sq[pix[0]];
2069             pix ++;
2070         }
2071         pix += line_size - w;
2072     }
2073     return s;
2074 }
2075
2076 //FIXME copy&paste
2077 #define P_LEFT P[1]
2078 #define P_TOP P[2]
2079 #define P_TOPRIGHT P[3]
2080 #define P_MEDIAN P[4]
2081 #define P_MV1 P[9]
2082 #define FLAG_QPEL   1 //must be 1
2083
2084 static int encode_q_branch(SnowContext *s, int level, int x, int y){
2085     uint8_t p_buffer[1024];
2086     uint8_t i_buffer[1024];
2087     uint8_t p_state[sizeof(s->block_state)];
2088     uint8_t i_state[sizeof(s->block_state)];
2089     RangeCoder pc, ic;
2090     uint8_t *pbbak= s->c.bytestream;
2091     uint8_t *pbbak_start= s->c.bytestream_start;
2092     int score, score2, iscore, i_len, p_len, block_s, sum, base_bits;
2093     const int w= s->b_width  << s->block_max_depth;
2094     const int h= s->b_height << s->block_max_depth;
2095     const int rem_depth= s->block_max_depth - level;
2096     const int index= (x + y*w) << rem_depth;
2097     const int block_w= 1<<(LOG2_MB_SIZE - level);
2098     int trx= (x+1)<<rem_depth;
2099     int try= (y+1)<<rem_depth;
2100     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2101     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2102     const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
2103     const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
2104     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2105     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2106     int pl = left->color[0];
2107     int pcb= left->color[1];
2108     int pcr= left->color[2];
2109     int pmx, pmy;
2110     int mx=0, my=0;
2111     int l,cr,cb;
2112     const int stride= s->current_picture.linesize[0];
2113     const int uvstride= s->current_picture.linesize[1];
2114     uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
2115                                 s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
2116                                 s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
2117     int P[10][2];
2118     int16_t last_mv[3][2];
2119     int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
2120     const int shift= 1+qpel;
2121     MotionEstContext *c= &s->m.me;
2122     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2123     int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2124     int my_context= av_log2(2*FFABS(left->my - top->my));
2125     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2126     int ref, best_ref, ref_score, ref_mx, ref_my;
2127
2128     assert(sizeof(s->block_state) >= 256);
2129     if(s->keyframe){
2130         set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2131         return 0;
2132     }
2133
2134 //    clip predictors / edge ?
2135
2136     P_LEFT[0]= left->mx;
2137     P_LEFT[1]= left->my;
2138     P_TOP [0]= top->mx;
2139     P_TOP [1]= top->my;
2140     P_TOPRIGHT[0]= tr->mx;
2141     P_TOPRIGHT[1]= tr->my;
2142
2143     last_mv[0][0]= s->block[index].mx;
2144     last_mv[0][1]= s->block[index].my;
2145     last_mv[1][0]= right->mx;
2146     last_mv[1][1]= right->my;
2147     last_mv[2][0]= bottom->mx;
2148     last_mv[2][1]= bottom->my;
2149
2150     s->m.mb_stride=2;
2151     s->m.mb_x=
2152     s->m.mb_y= 0;
2153     c->skip= 0;
2154
2155     assert(c->  stride ==   stride);
2156     assert(c->uvstride == uvstride);
2157
2158     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2159     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2160     c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2161     c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2162
2163     c->xmin = - x*block_w - 16+3;
2164     c->ymin = - y*block_w - 16+3;
2165     c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-3;
2166     c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-3;
2167
2168     if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2169     if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2170     if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2171     if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2172     if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2173     if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2174     if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2175
2176     P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2177     P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2178
2179     if (!y) {
2180         c->pred_x= P_LEFT[0];
2181         c->pred_y= P_LEFT[1];
2182     } else {
2183         c->pred_x = P_MEDIAN[0];
2184         c->pred_y = P_MEDIAN[1];
2185     }
2186
2187     score= INT_MAX;
2188     best_ref= 0;
2189     for(ref=0; ref<s->ref_frames; ref++){
2190         init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
2191
2192         ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
2193                                          (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2194
2195         assert(ref_mx >= c->xmin);
2196         assert(ref_mx <= c->xmax);
2197         assert(ref_my >= c->ymin);
2198         assert(ref_my <= c->ymax);
2199
2200         ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2201         ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2202         ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
2203         if(s->ref_mvs[ref]){
2204             s->ref_mvs[ref][index][0]= ref_mx;
2205             s->ref_mvs[ref][index][1]= ref_my;
2206             s->ref_scores[ref][index]= ref_score;
2207         }
2208         if(score > ref_score){
2209             score= ref_score;
2210             best_ref= ref;
2211             mx= ref_mx;
2212             my= ref_my;
2213         }
2214     }
2215     //FIXME if mb_cmp != SSE then intra cannot be compared currently and mb_penalty vs. lambda2
2216
2217   //  subpel search
2218     base_bits= get_rac_count(&s->c) - 8*(s->c.bytestream - s->c.bytestream_start);
2219     pc= s->c;
2220     pc.bytestream_start=
2221     pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2222     memcpy(p_state, s->block_state, sizeof(s->block_state));
2223
2224     if(level!=s->block_max_depth)
2225         put_rac(&pc, &p_state[4 + s_context], 1);
2226     put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2227     if(s->ref_frames > 1)
2228         put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
2229     pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
2230     put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
2231     put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
2232     p_len= pc.bytestream - pc.bytestream_start;
2233     score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
2234
2235     block_s= block_w*block_w;
2236     sum = pix_sum(current_data[0], stride, block_w);
2237     l= (sum + block_s/2)/block_s;
2238     iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2239
2240     block_s= block_w*block_w>>2;
2241     sum = pix_sum(current_data[1], uvstride, block_w>>1);
2242     cb= (sum + block_s/2)/block_s;
2243 //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2244     sum = pix_sum(current_data[2], uvstride, block_w>>1);
2245     cr= (sum + block_s/2)/block_s;
2246 //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2247
2248     ic= s->c;
2249     ic.bytestream_start=
2250     ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2251     memcpy(i_state, s->block_state, sizeof(s->block_state));
2252     if(level!=s->block_max_depth)
2253         put_rac(&ic, &i_state[4 + s_context], 1);
2254     put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2255     put_symbol(&ic, &i_state[32],  l-pl , 1);
2256     put_symbol(&ic, &i_state[64], cb-pcb, 1);
2257     put_symbol(&ic, &i_state[96], cr-pcr, 1);
2258     i_len= ic.bytestream - ic.bytestream_start;
2259     iscore += (s->lambda2*(get_rac_count(&ic)-base_bits))>>FF_LAMBDA_SHIFT;
2260
2261 //    assert(score==256*256*256*64-1);
2262     assert(iscore < 255*255*256 + s->lambda2*10);
2263     assert(iscore >= 0);
2264     assert(l>=0 && l<=255);
2265     assert(pl>=0 && pl<=255);
2266
2267     if(level==0){
2268         int varc= iscore >> 8;
2269         int vard= score >> 8;
2270         if (vard <= 64 || vard < varc)
2271             c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2272         else
2273             c->scene_change_score+= s->m.qscale;
2274     }
2275
2276     if(level!=s->block_max_depth){
2277         put_rac(&s->c, &s->block_state[4 + s_context], 0);
2278         score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2279         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2280         score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2281         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2282         score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2283
2284         if(score2 < score && score2 < iscore)
2285             return score2;
2286     }
2287
2288     if(iscore < score){
2289         pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2290         memcpy(pbbak, i_buffer, i_len);
2291         s->c= ic;
2292         s->c.bytestream_start= pbbak_start;
2293         s->c.bytestream= pbbak + i_len;
2294         set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
2295         memcpy(s->block_state, i_state, sizeof(s->block_state));
2296         return iscore;
2297     }else{
2298         memcpy(pbbak, p_buffer, p_len);
2299         s->c= pc;
2300         s->c.bytestream_start= pbbak_start;
2301         s->c.bytestream= pbbak + p_len;
2302         set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2303         memcpy(s->block_state, p_state, sizeof(s->block_state));
2304         return score;
2305     }
2306 }
2307
2308 static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2309     const int w= s->b_width  << s->block_max_depth;
2310     const int rem_depth= s->block_max_depth - level;
2311     const int index= (x + y*w) << rem_depth;
2312     int trx= (x+1)<<rem_depth;
2313     BlockNode *b= &s->block[index];
2314     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2315     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2316     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2317     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2318     int pl = left->color[0];
2319     int pcb= left->color[1];
2320     int pcr= left->color[2];
2321     int pmx, pmy;
2322     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2323     int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
2324     int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
2325     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2326
2327     if(s->keyframe){
2328         set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2329         return;
2330     }
2331
2332     if(level!=s->block_max_depth){
2333         if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2334             put_rac(&s->c, &s->block_state[4 + s_context], 1);
2335         }else{
2336             put_rac(&s->c, &s->block_state[4 + s_context], 0);
2337             encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2338             encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2339             encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2340             encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2341             return;
2342         }
2343     }
2344     if(b->type & BLOCK_INTRA){
2345         pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2346         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2347         put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2348         put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2349         put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2350         set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2351     }else{
2352         pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2353         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2354         if(s->ref_frames > 1)
2355             put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2356         put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2357         put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2358         set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2359     }
2360 }
2361
2362 static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2363     int i, x2, y2;
2364     Plane *p= &s->plane[plane_index];
2365     const int block_size = MB_SIZE >> s->block_max_depth;
2366     const int block_w    = plane_index ? block_size/2 : block_size;
2367     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2368     const int obmc_stride= plane_index ? block_size : 2*block_size;
2369     const int ref_stride= s->current_picture.linesize[plane_index];
2370     uint8_t *src= s-> input_picture.data[plane_index];
2371     IDWTELEM *dst= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4; //FIXME change to unsigned
2372     const int b_stride = s->b_width << s->block_max_depth;
2373     const int w= p->width;
2374     const int h= p->height;
2375     int index= mb_x + mb_y*b_stride;
2376     BlockNode *b= &s->block[index];
2377     BlockNode backup= *b;
2378     int ab=0;
2379     int aa=0;
2380
2381     b->type|= BLOCK_INTRA;
2382     b->color[plane_index]= 0;
2383     memset(dst, 0, obmc_stride*obmc_stride*sizeof(IDWTELEM));
2384
2385     for(i=0; i<4; i++){
2386         int mb_x2= mb_x + (i &1) - 1;
2387         int mb_y2= mb_y + (i>>1) - 1;
2388         int x= block_w*mb_x2 + block_w/2;
2389         int y= block_w*mb_y2 + block_w/2;
2390
2391         add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2392                     x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2393
2394         for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2395             for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2396                 int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2397                 int obmc_v= obmc[index];
2398                 int d;
2399                 if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2400                 if(x<0) obmc_v += obmc[index + block_w];
2401                 if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2402                 if(x+block_w>w) obmc_v += obmc[index - block_w];
2403                 //FIXME precalculate this or simplify it somehow else
2404
2405                 d = -dst[index] + (1<<(FRAC_BITS-1));
2406                 dst[index] = d;
2407                 ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2408                 aa += obmc_v * obmc_v; //FIXME precalculate this
2409             }
2410         }
2411     }
2412     *b= backup;
2413
2414     return av_clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we should not need clipping
2415 }
2416
2417 static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2418     const int b_stride = s->b_width << s->block_max_depth;
2419     const int b_height = s->b_height<< s->block_max_depth;
2420     int index= x + y*b_stride;
2421     const BlockNode *b     = &s->block[index];
2422     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2423     const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2424     const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2425     const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2426     int dmx, dmy;
2427 //  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2428 //  int my_context= av_log2(2*FFABS(left->my - top->my));
2429
2430     if(x<0 || x>=b_stride || y>=b_height)
2431         return 0;
2432 /*
2433 1            0      0
2434 01X          1-2    1
2435 001XX        3-6    2-3
2436 0001XXX      7-14   4-7
2437 00001XXXX   15-30   8-15
2438 */
2439 //FIXME try accurate rate
2440 //FIXME intra and inter predictors if surrounding blocks are not the same type
2441     if(b->type & BLOCK_INTRA){
2442         return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2443                    + av_log2(2*FFABS(left->color[1] - b->color[1]))
2444                    + av_log2(2*FFABS(left->color[2] - b->color[2])));
2445     }else{
2446         pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2447         dmx-= b->mx;
2448         dmy-= b->my;
2449         return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2450                     + av_log2(2*FFABS(dmy))
2451                     + av_log2(2*b->ref));
2452     }
2453 }
2454
2455 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2456     Plane *p= &s->plane[plane_index];
2457     const int block_size = MB_SIZE >> s->block_max_depth;
2458     const int block_w    = plane_index ? block_size/2 : block_size;
2459     const int obmc_stride= plane_index ? block_size : 2*block_size;
2460     const int ref_stride= s->current_picture.linesize[plane_index];
2461     uint8_t *dst= s->current_picture.data[plane_index];
2462     uint8_t *src= s->  input_picture.data[plane_index];
2463     IDWTELEM *pred= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2464     uint8_t *cur = s->scratchbuf;
2465     uint8_t tmp[ref_stride*(2*MB_SIZE+HTAPS_MAX-1)];
2466     const int b_stride = s->b_width << s->block_max_depth;
2467     const int b_height = s->b_height<< s->block_max_depth;
2468     const int w= p->width;
2469     const int h= p->height;
2470     int distortion;
2471     int rate= 0;
2472     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2473     int sx= block_w*mb_x - block_w/2;
2474     int sy= block_w*mb_y - block_w/2;
2475     int x0= FFMAX(0,-sx);
2476     int y0= FFMAX(0,-sy);
2477     int x1= FFMIN(block_w*2, w-sx);
2478     int y1= FFMIN(block_w*2, h-sy);
2479     int i,x,y;
2480
2481     pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2482
2483     for(y=y0; y<y1; y++){
2484         const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2485         const IDWTELEM *pred1 = pred + y*obmc_stride;
2486         uint8_t *cur1 = cur + y*ref_stride;
2487         uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2488         for(x=x0; x<x1; x++){
2489 #if FRAC_BITS >= LOG2_OBMC_MAX
2490             int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2491 #else
2492             int v = (cur1[x] * obmc1[x] + (1<<(LOG2_OBMC_MAX - FRAC_BITS-1))) >> (LOG2_OBMC_MAX - FRAC_BITS);
2493 #endif
2494             v = (v + pred1[x]) >> FRAC_BITS;
2495             if(v&(~255)) v= ~(v>>31);
2496             dst1[x] = v;
2497         }
2498     }
2499
2500     /* copy the regions where obmc[] = (uint8_t)256 */
2501     if(LOG2_OBMC_MAX == 8
2502         && (mb_x == 0 || mb_x == b_stride-1)
2503         && (mb_y == 0 || mb_y == b_height-1)){
2504         if(mb_x == 0)
2505             x1 = block_w;
2506         else
2507             x0 = block_w;
2508         if(mb_y == 0)
2509             y1 = block_w;
2510         else
2511             y0 = block_w;
2512         for(y=y0; y<y1; y++)
2513             memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2514     }
2515
2516     if(block_w==16){
2517         /* FIXME rearrange dsputil to fit 32x32 cmp functions */
2518         /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
2519         /* FIXME cmps overlap but do not cover the wavelet's whole support.
2520          * So improving the score of one block is not strictly guaranteed
2521          * to improve the score of the whole frame, thus iterative motion
2522          * estimation does not always converge. */
2523         if(s->avctx->me_cmp == FF_CMP_W97)
2524             distortion = ff_w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
2525         else if(s->avctx->me_cmp == FF_CMP_W53)
2526             distortion = ff_w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
2527         else{
2528             distortion = 0;
2529             for(i=0; i<4; i++){
2530                 int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
2531                 distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
2532             }
2533         }
2534     }else{
2535         assert(block_w==8);
2536         distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
2537     }
2538
2539     if(plane_index==0){
2540         for(i=0; i<4; i++){
2541 /* ..RRr
2542  * .RXx.
2543  * rxx..
2544  */
2545             rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
2546         }
2547         if(mb_x == b_stride-2)
2548             rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
2549     }
2550     return distortion + rate*penalty_factor;
2551 }
2552
2553 static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
2554     int i, y2;
2555     Plane *p= &s->plane[plane_index];
2556     const int block_size = MB_SIZE >> s->block_max_depth;
2557     const int block_w    = plane_index ? block_size/2 : block_size;
2558     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2559     const int obmc_stride= plane_index ? block_size : 2*block_size;
2560     const int ref_stride= s->current_picture.linesize[plane_index];
2561     uint8_t *dst= s->current_picture.data[plane_index];
2562     uint8_t *src= s-> input_picture.data[plane_index];
2563     //FIXME zero_dst is const but add_yblock changes dst if add is 0 (this is never the case for dst=zero_dst
2564     // const has only been removed from zero_dst to suppress a warning
2565     static IDWTELEM zero_dst[4096]; //FIXME
2566     const int b_stride = s->b_width << s->block_max_depth;
2567     const int w= p->width;
2568     const int h= p->height;
2569     int distortion= 0;
2570     int rate= 0;
2571     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2572
2573     for(i=0; i<9; i++){
2574         int mb_x2= mb_x + (i%3) - 1;
2575         int mb_y2= mb_y + (i/3) - 1;
2576         int x= block_w*mb_x2 + block_w/2;
2577         int y= block_w*mb_y2 + block_w/2;
2578
2579         add_yblock(s, 0, NULL, zero_dst, dst, obmc,
2580                    x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
2581
2582         //FIXME find a cleaner/simpler way to skip the outside stuff
2583         for(y2= y; y2<0; y2++)
2584             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
2585         for(y2= h; y2<y+block_w; y2++)
2586             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
2587         if(x<0){
2588             for(y2= y; y2<y+block_w; y2++)
2589                 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
2590         }
2591         if(x+block_w > w){
2592             for(y2= y; y2<y+block_w; y2++)
2593                 memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
2594         }
2595
2596         assert(block_w== 8 || block_w==16);
2597         distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
2598     }
2599
2600     if(plane_index==0){
2601         BlockNode *b= &s->block[mb_x+mb_y*b_stride];
2602         int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
2603
2604 /* ..RRRr
2605  * .RXXx.
2606  * .RXXx.
2607  * rxxx.
2608  */
2609         if(merged)
2610             rate = get_block_bits(s, mb_x, mb_y, 2);
2611         for(i=merged?4:0; i<9; i++){
2612             static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
2613             rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
2614         }
2615     }
2616     return distortion + rate*penalty_factor;
2617 }
2618
2619 static int encode_subband_c0run(SnowContext *s, SubBand *b, IDWTELEM *src, IDWTELEM *parent, int stride, int orientation){
2620     const int w= b->width;
2621     const int h= b->height;
2622     int x, y;
2623
2624     if(1){
2625         int run=0;
2626         int runs[w*h];
2627         int run_index=0;
2628         int max_index;
2629
2630         for(y=0; y<h; y++){
2631             for(x=0; x<w; x++){
2632                 int v, p=0;
2633                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
2634                 v= src[x + y*stride];
2635
2636                 if(y){
2637                     t= src[x + (y-1)*stride];
2638                     if(x){
2639                         lt= src[x - 1 + (y-1)*stride];
2640                     }
2641                     if(x + 1 < w){
2642                         rt= src[x + 1 + (y-1)*stride];
2643                     }
2644                 }
2645                 if(x){
2646                     l= src[x - 1 + y*stride];
2647                     /*if(x > 1){
2648                         if(orientation==1) ll= src[y + (x-2)*stride];
2649                         else               ll= src[x - 2 + y*stride];
2650                     }*/
2651                 }
2652                 if(parent){
2653                     int px= x>>1;
2654                     int py= y>>1;
2655                     if(px<b->parent->width && py<b->parent->height)
2656                         p= parent[px + py*2*stride];
2657                 }
2658                 if(!(/*ll|*/l|lt|t|rt|p)){
2659                     if(v){
2660                         runs[run_index++]= run;
2661                         run=0;
2662                     }else{
2663                         run++;
2664                     }
2665                 }
2666             }
2667         }
2668         max_index= run_index;
2669         runs[run_index++]= run;
2670         run_index=0;
2671         run= runs[run_index++];
2672
2673         put_symbol2(&s->c, b->state[30], max_index, 0);
2674         if(run_index <= max_index)
2675             put_symbol2(&s->c, b->state[1], run, 3);
2676
2677         for(y=0; y<h; y++){
2678             if(s->c.bytestream_end - s->c.bytestream < w*40){
2679                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2680                 return -1;
2681             }
2682             for(x=0; x<w; x++){
2683                 int v, p=0;
2684                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
2685                 v= src[x + y*stride];
2686
2687                 if(y){
2688                     t= src[x + (y-1)*stride];
2689                     if(x){
2690                         lt= src[x - 1 + (y-1)*stride];
2691                     }
2692                     if(x + 1 < w){
2693                         rt= src[x + 1 + (y-1)*stride];
2694                     }
2695                 }
2696                 if(x){
2697                     l= src[x - 1 + y*stride];
2698                     /*if(x > 1){
2699                         if(orientation==1) ll= src[y + (x-2)*stride];
2700                         else               ll= src[x - 2 + y*stride];
2701                     }*/
2702                 }
2703                 if(parent){
2704                     int px= x>>1;
2705                     int py= y>>1;
2706                     if(px<b->parent->width && py<b->parent->height)
2707                         p= parent[px + py*2*stride];
2708                 }
2709                 if(/*ll|*/l|lt|t|rt|p){
2710                     int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
2711
2712                     put_rac(&s->c, &b->state[0][context], !!v);
2713                 }else{
2714                     if(!run){
2715                         run= runs[run_index++];
2716
2717                         if(run_index <= max_index)
2718                             put_symbol2(&s->c, b->state[1], run, 3);
2719                         assert(v);
2720                     }else{
2721                         run--;
2722                         assert(!v);
2723                     }
2724                 }
2725                 if(v){
2726                     int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
2727                     int l2= 2*FFABS(l) + (l<0);
2728                     int t2= 2*FFABS(t) + (t<0);
2729
2730                     put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
2731                     put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
2732                 }
2733             }
2734         }
2735     }
2736     return 0;
2737 }
2738
2739 static int encode_subband(SnowContext *s, SubBand *b, IDWTELEM *src, IDWTELEM *parent, int stride, int orientation){
2740 //    encode_subband_qtree(s, b, src, parent, stride, orientation);
2741 //    encode_subband_z0run(s, b, src, parent, stride, orientation);
2742     return encode_subband_c0run(s, b, src, parent, stride, orientation);
2743 //    encode_subband_dzr(s, b, src, parent, stride, orientation);
2744 }
2745
2746 static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
2747     const int b_stride= s->b_width << s->block_max_depth;
2748     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
2749     BlockNode backup= *block;
2750     int rd, index, value;
2751
2752     assert(mb_x>=0 && mb_y>=0);
2753     assert(mb_x<b_stride);
2754
2755     if(intra){
2756         block->color[0] = p[0];
2757         block->color[1] = p[1];
2758         block->color[2] = p[2];
2759         block->type |= BLOCK_INTRA;
2760     }else{
2761         index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
2762         value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
2763         if(s->me_cache[index] == value)
2764             return 0;
2765         s->me_cache[index]= value;
2766
2767         block->mx= p[0];
2768         block->my= p[1];
2769         block->type &= ~BLOCK_INTRA;
2770     }
2771
2772     rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
2773
2774 //FIXME chroma
2775     if(rd < *best_rd){
2776         *best_rd= rd;
2777         return 1;
2778     }else{
2779         *block= backup;
2780         return 0;
2781     }
2782 }
2783
2784 /* special case for int[2] args we discard afterwards,
2785  * fixes compilation problem with gcc 2.95 */
2786 static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
2787     int p[2] = {p0, p1};
2788     return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
2789 }
2790
2791 static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
2792     const int b_stride= s->b_width << s->block_max_depth;
2793     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
2794     BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
2795     int rd, index, value;
2796
2797     assert(mb_x>=0 && mb_y>=0);
2798     assert(mb_x<b_stride);
2799     assert(((mb_x|mb_y)&1) == 0);
2800
2801     index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
2802     value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
2803     if(s->me_cache[index] == value)
2804         return 0;
2805     s->me_cache[index]= value;
2806
2807     block->mx= p0;
2808     block->my= p1;
2809     block->ref= ref;
2810     block->type &= ~BLOCK_INTRA;
2811     block[1]= block[b_stride]= block[b_stride+1]= *block;
2812
2813     rd= get_4block_rd(s, mb_x, mb_y, 0);
2814
2815 //FIXME chroma
2816     if(rd < *best_rd){
2817         *best_rd= rd;
2818         return 1;
2819     }else{
2820         block[0]= backup[0];
2821         block[1]= backup[1];
2822         block[b_stride]= backup[2];
2823         block[b_stride+1]= backup[3];
2824         return 0;
2825     }
2826 }
2827
2828 static void iterative_me(SnowContext *s){
2829     int pass, mb_x, mb_y;
2830     const int b_width = s->b_width  << s->block_max_depth;
2831     const int b_height= s->b_height << s->block_max_depth;
2832     const int b_stride= b_width;
2833     int color[3];
2834
2835     {
2836         RangeCoder r = s->c;
2837         uint8_t state[sizeof(s->block_state)];
2838         memcpy(state, s->block_state, sizeof(s->block_state));
2839         for(mb_y= 0; mb_y<s->b_height; mb_y++)
2840             for(mb_x= 0; mb_x<s->b_width; mb_x++)
2841                 encode_q_branch(s, 0, mb_x, mb_y);
2842         s->c = r;
2843         memcpy(s->block_state, state, sizeof(s->block_state));
2844     }
2845
2846     for(pass=0; pass<25; pass++){
2847         int change= 0;
2848
2849         for(mb_y= 0; mb_y<b_height; mb_y++){
2850             for(mb_x= 0; mb_x<b_width; mb_x++){
2851                 int dia_change, i, j, ref;
2852                 int best_rd= INT_MAX, ref_rd;
2853                 BlockNode backup, ref_b;
2854                 const int index= mb_x + mb_y * b_stride;
2855                 BlockNode *block= &s->block[index];
2856                 BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
2857                 BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
2858                 BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
2859                 BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
2860                 BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
2861                 BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
2862                 BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
2863                 BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
2864                 const int b_w= (MB_SIZE >> s->block_max_depth);
2865                 uint8_t obmc_edged[b_w*2][b_w*2];
2866
2867                 if(pass && (block->type & BLOCK_OPT))
2868                     continue;
2869                 block->type |= BLOCK_OPT;
2870
2871                 backup= *block;
2872
2873                 if(!s->me_cache_generation)
2874                     memset(s->me_cache, 0, sizeof(s->me_cache));
2875                 s->me_cache_generation += 1<<22;
2876
2877                 //FIXME precalculate
2878                 {
2879                     int x, y;
2880                     memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
2881                     if(mb_x==0)
2882                         for(y=0; y<b_w*2; y++)
2883                             memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
2884                     if(mb_x==b_stride-1)
2885                         for(y=0; y<b_w*2; y++)
2886                             memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
2887                     if(mb_y==0){
2888                         for(x=0; x<b_w*2; x++)
2889                             obmc_edged[0][x] += obmc_edged[b_w-1][x];
2890                         for(y=1; y<b_w; y++)
2891                             memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
2892                     }
2893                     if(mb_y==b_height-1){
2894                         for(x=0; x<b_w*2; x++)
2895                             obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
2896                         for(y=b_w; y<b_w*2-1; y++)
2897                             memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
2898                     }
2899                 }
2900
2901                 //skip stuff outside the picture
2902                 if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1){
2903                     uint8_t *src= s->  input_picture.data[0];
2904                     uint8_t *dst= s->current_picture.data[0];
2905                     const int stride= s->current_picture.linesize[0];
2906                     const int block_w= MB_SIZE >> s->block_max_depth;
2907                     const int sx= block_w*mb_x - block_w/2;
2908                     const int sy= block_w*mb_y - block_w/2;
2909                     const int w= s->plane[0].width;
2910                     const int h= s->plane[0].height;
2911                     int y;
2912
2913                     for(y=sy; y<0; y++)
2914                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
2915                     for(y=h; y<sy+block_w*2; y++)
2916                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
2917                     if(sx<0){
2918                         for(y=sy; y<sy+block_w*2; y++)
2919                             memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
2920                     }
2921                     if(sx+block_w*2 > w){
2922                         for(y=sy; y<sy+block_w*2; y++)
2923                             memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
2924                     }
2925                 }
2926
2927                 // intra(black) = neighbors' contribution to the current block
2928                 for(i=0; i<3; i++)
2929                     color[i]= get_dc(s, mb_x, mb_y, i);
2930
2931                 // get previous score (cannot be cached due to OBMC)
2932                 if(pass > 0 && (block->type&BLOCK_INTRA)){
2933                     int color0[3]= {block->color[0], block->color[1], block->color[2]};
2934                     check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
2935                 }else
2936                     check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
2937
2938                 ref_b= *block;
2939                 ref_rd= best_rd;
2940                 for(ref=0; ref < s->ref_frames; ref++){
2941                     int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
2942                     if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
2943                         continue;
2944                     block->ref= ref;
2945                     best_rd= INT_MAX;
2946
2947                     check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
2948                     check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
2949                     if(tb)
2950                         check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
2951                     if(lb)
2952                         check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
2953                     if(rb)
2954                         check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
2955                     if(bb)
2956                         check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
2957
2958                     /* fullpel ME */
2959                     //FIXME avoid subpel interpolation / round to nearest integer
2960                     do{
2961                         dia_change=0;
2962                         for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
2963                             for(j=0; j<i; j++){
2964                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
2965                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
2966                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
2967                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
2968                             }
2969                         }
2970                     }while(dia_change);
2971                     /* subpel ME */
2972                     do{
2973                         static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
2974                         dia_change=0;
2975                         for(i=0; i<8; i++)
2976                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
2977                     }while(dia_change);
2978                     //FIXME or try the standard 2 pass qpel or similar
2979
2980                     mvr[0][0]= block->mx;
2981                     mvr[0][1]= block->my;
2982                     if(ref_rd > best_rd){
2983                         ref_rd= best_rd;
2984                         ref_b= *block;
2985                     }
2986                 }
2987                 best_rd= ref_rd;
2988                 *block= ref_b;
2989                 check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
2990                 //FIXME RD style color selection
2991                 if(!same_block(block, &backup)){
2992                     if(tb ) tb ->type &= ~BLOCK_OPT;
2993                     if(lb ) lb ->type &= ~BLOCK_OPT;
2994                     if(rb ) rb ->type &= ~BLOCK_OPT;
2995                     if(bb ) bb ->type &= ~BLOCK_OPT;
2996                     if(tlb) tlb->type &= ~BLOCK_OPT;
2997                     if(trb) trb->type &= ~BLOCK_OPT;
2998                     if(blb) blb->type &= ~BLOCK_OPT;
2999                     if(brb) brb->type &= ~BLOCK_OPT;
3000                     change ++;
3001                 }
3002             }
3003         }
3004         av_log(s->avctx, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3005         if(!change)
3006             break;
3007     }
3008
3009     if(s->block_max_depth == 1){
3010         int change= 0;
3011         for(mb_y= 0; mb_y<b_height; mb_y+=2){
3012             for(mb_x= 0; mb_x<b_width; mb_x+=2){
3013                 int i;
3014                 int best_rd, init_rd;
3015                 const int index= mb_x + mb_y * b_stride;
3016                 BlockNode *b[4];
3017
3018                 b[0]= &s->block[index];
3019                 b[1]= b[0]+1;
3020                 b[2]= b[0]+b_stride;
3021                 b[3]= b[2]+1;
3022                 if(same_block(b[0], b[1]) &&
3023                    same_block(b[0], b[2]) &&
3024                    same_block(b[0], b[3]))
3025                     continue;
3026
3027                 if(!s->me_cache_generation)
3028                     memset(s->me_cache, 0, sizeof(s->me_cache));
3029                 s->me_cache_generation += 1<<22;
3030
3031                 init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3032
3033                 //FIXME more multiref search?
3034                 check_4block_inter(s, mb_x, mb_y,
3035                                    (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3036                                    (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3037
3038                 for(i=0; i<4; i++)
3039                     if(!(b[i]->type&BLOCK_INTRA))
3040                         check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3041
3042                 if(init_rd != best_rd)
3043                     change++;
3044             }
3045         }
3046         av_log(s->avctx, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3047     }
3048 }
3049
3050 static void encode_blocks(SnowContext *s, int search){
3051     int x, y;
3052     int w= s->b_width;
3053     int h= s->b_height;
3054
3055     if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
3056         iterative_me(s);
3057
3058     for(y=0; y<h; y++){
3059         if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
3060             av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
3061             return;
3062         }
3063         for(x=0; x<w; x++){
3064             if(s->avctx->me_method == ME_ITER || !search)
3065                 encode_q_branch2(s, 0, x, y);
3066             else
3067                 encode_q_branch (s, 0, x, y);
3068         }
3069     }
3070 }
3071
3072 static void quantize(SnowContext *s, SubBand *b, IDWTELEM *dst, DWTELEM *src, int stride, int bias){
3073     const int w= b->width;
3074     const int h= b->height;
3075     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3076     const int qmul= qexp[qlog&(QROOT-1)]<<((qlog>>QSHIFT) + ENCODER_EXTRA_BITS);
3077     int x,y, thres1, thres2;
3078
3079     if(s->qlog == LOSSLESS_QLOG){
3080         for(y=0; y<h; y++)
3081             for(x=0; x<w; x++)
3082                 dst[x + y*stride]= src[x + y*stride];
3083         return;
3084     }
3085
3086     bias= bias ? 0 : (3*qmul)>>3;
3087     thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3088     thres2= 2*thres1;
3089
3090     if(!bias){
3091         for(y=0; y<h; y++){
3092             for(x=0; x<w; x++){
3093                 int i= src[x + y*stride];
3094
3095                 if((unsigned)(i+thres1) > thres2){
3096                     if(i>=0){
3097                         i<<= QEXPSHIFT;
3098                         i/= qmul; //FIXME optimize
3099                         dst[x + y*stride]=  i;
3100                     }else{
3101                         i= -i;
3102                         i<<= QEXPSHIFT;
3103                         i/= qmul; //FIXME optimize
3104                         dst[x + y*stride]= -i;
3105                     }
3106                 }else
3107                     dst[x + y*stride]= 0;
3108             }
3109         }
3110     }else{
3111         for(y=0; y<h; y++){
3112             for(x=0; x<w; x++){
3113                 int i= src[x + y*stride];
3114
3115                 if((unsigned)(i+thres1) > thres2){
3116                     if(i>=0){
3117                         i<<= QEXPSHIFT;
3118                         i= (i + bias) / qmul; //FIXME optimize
3119                         dst[x + y*stride]=  i;
3120                     }else{
3121                         i= -i;
3122                         i<<= QEXPSHIFT;
3123                         i= (i + bias) / qmul; //FIXME optimize
3124                         dst[x + y*stride]= -i;
3125                     }
3126                 }else
3127                     dst[x + y*stride]= 0;
3128             }
3129         }
3130     }
3131 }
3132
3133 static void dequantize(SnowContext *s, SubBand *b, IDWTELEM *src, int stride){
3134     const int w= b->width;
3135     const int h= b->height;
3136     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3137     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3138     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3139     int x,y;
3140
3141     if(s->qlog == LOSSLESS_QLOG) return;
3142
3143     for(y=0; y<h; y++){
3144         for(x=0; x<w; x++){
3145             int i= src[x + y*stride];
3146             if(i<0){
3147                 src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3148             }else if(i>0){
3149                 src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3150             }
3151         }
3152     }
3153 }
3154
3155 static void decorrelate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
3156     const int w= b->width;
3157     const int h= b->height;
3158     int x,y;
3159
3160     for(y=h-1; y>=0; y--){
3161         for(x=w-1; x>=0; x--){
3162             int i= x + y*stride;
3163
3164             if(x){
3165                 if(use_median){
3166                     if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3167                     else  src[i] -= src[i - 1];
3168                 }else{
3169                     if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3170                     else  src[i] -= src[i - 1];
3171                 }
3172             }else{
3173                 if(y) src[i] -= src[i - stride];
3174             }
3175         }
3176     }
3177 }
3178
3179 static void correlate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
3180     const int w= b->width;
3181     const int h= b->height;
3182     int x,y;
3183
3184     for(y=0; y<h; y++){
3185         for(x=0; x<w; x++){
3186             int i= x + y*stride;
3187
3188             if(x){
3189                 if(use_median){
3190                     if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3191                     else  src[i] += src[i - 1];
3192                 }else{
3193                     if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3194                     else  src[i] += src[i - 1];
3195                 }
3196             }else{
3197                 if(y) src[i] += src[i - stride];
3198             }
3199         }
3200     }
3201 }
3202
3203 static void encode_qlogs(SnowContext *s){
3204     int plane_index, level, orientation;
3205
3206     for(plane_index=0; plane_index<2; plane_index++){
3207         for(level=0; level<s->spatial_decomposition_count; level++){
3208             for(orientation=level ? 1:0; orientation<4; orientation++){
3209                 if(orientation==2) continue;
3210                 put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3211             }
3212         }
3213     }
3214 }
3215
3216 static void encode_header(SnowContext *s){
3217     int plane_index, i;
3218     uint8_t kstate[32];
3219
3220     memset(kstate, MID_STATE, sizeof(kstate));
3221
3222     put_rac(&s->c, kstate, s->keyframe);
3223     if(s->keyframe || s->always_reset){
3224         reset_contexts(s);
3225         s->last_spatial_decomposition_type=
3226         s->last_qlog=
3227         s->last_qbias=
3228         s->last_mv_scale=
3229         s->last_block_max_depth= 0;
3230         for(plane_index=0; plane_index<2; plane_index++){
3231             Plane *p= &s->plane[plane_index];
3232             p->last_htaps=0;
3233             p->last_diag_mc=0;
3234             memset(p->last_hcoeff, 0, sizeof(p->last_hcoeff));
3235         }
3236     }
3237     if(s->keyframe){
3238         put_symbol(&s->c, s->header_state, s->version, 0);
3239         put_rac(&s->c, s->header_state, s->always_reset);
3240         put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3241         put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3242         put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3243         put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3244         put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3245         put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3246         put_rac(&s->c, s->header_state, s->spatial_scalability);
3247 //        put_rac(&s->c, s->header_state, s->rate_scalability);
3248         put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3249
3250         encode_qlogs(s);
3251     }
3252
3253     if(!s->keyframe){
3254         int update_mc=0;
3255         for(plane_index=0; plane_index<2; plane_index++){
3256             Plane *p= &s->plane[plane_index];
3257             update_mc |= p->last_htaps   != p->htaps;
3258             update_mc |= p->last_diag_mc != p->diag_mc;
3259             update_mc |= !!memcmp(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
3260         }
3261         put_rac(&s->c, s->header_state, update_mc);
3262         if(update_mc){
3263             for(plane_index=0; plane_index<2; plane_index++){
3264                 Plane *p= &s->plane[plane_index];
3265                 put_rac(&s->c, s->header_state, p->diag_mc);
3266                 put_symbol(&s->c, s->header_state, p->htaps/2-1, 0);
3267                 for(i= p->htaps/2; i; i--)
3268                     put_symbol(&s->c, s->header_state, FFABS(p->hcoeff[i]), 0);
3269             }
3270         }
3271         if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){
3272             put_rac(&s->c, s->header_state, 1);
3273             put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3274             encode_qlogs(s);
3275         }else
3276             put_rac(&s->c, s->header_state, 0);
3277     }
3278
3279     put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
3280     put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
3281     put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
3282     put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
3283     put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
3284
3285 }
3286
3287 static void update_last_header_values(SnowContext *s){
3288     int plane_index;
3289
3290     if(!s->keyframe){
3291         for(plane_index=0; plane_index<2; plane_index++){
3292             Plane *p= &s->plane[plane_index];
3293             p->last_diag_mc= p->diag_mc;
3294             p->last_htaps  = p->htaps;
3295             memcpy(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
3296         }
3297     }
3298
3299     s->last_spatial_decomposition_type  = s->spatial_decomposition_type;
3300     s->last_qlog                        = s->qlog;
3301     s->last_qbias                       = s->qbias;
3302     s->last_mv_scale                    = s->mv_scale;
3303     s->last_block_max_depth             = s->block_max_depth;
3304     s->last_spatial_decomposition_count = s->spatial_decomposition_count;
3305 }
3306
3307 static int qscale2qlog(int qscale){
3308     return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3309            + 61*QROOT/8; ///< 64 > 60
3310 }
3311
3312 static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3313 {
3314     /* Estimate the frame's complexity as a sum of weighted dwt coefficients.
3315      * FIXME we know exact mv bits at this point,
3316      * but ratecontrol isn't set up to include them. */
3317     uint32_t coef_sum= 0;
3318     int level, orientation, delta_qlog;
3319
3320     for(level=0; level<s->spatial_decomposition_count; level++){
3321         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3322             SubBand *b= &s->plane[0].band[level][orientation];
3323             IDWTELEM *buf= b->ibuf;
3324             const int w= b->width;
3325             const int h= b->height;
3326             const int stride= b->stride;
3327             const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
3328             const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3329             const int qdiv= (1<<16)/qmul;
3330             int x, y;
3331             //FIXME this is ugly
3332             for(y=0; y<h; y++)
3333                 for(x=0; x<w; x++)
3334                     buf[x+y*stride]= b->buf[x+y*stride];
3335             if(orientation==0)
3336                 decorrelate(s, b, buf, stride, 1, 0);
3337             for(y=0; y<h; y++)
3338                 for(x=0; x<w; x++)
3339                     coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3340         }
3341     }
3342
3343     /* ugly, ratecontrol just takes a sqrt again */
3344     coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3345     assert(coef_sum < INT_MAX);
3346
3347     if(pict->pict_type == AV_PICTURE_TYPE_I){
3348         s->m.current_picture.mb_var_sum= coef_sum;
3349         s->m.current_picture.mc_mb_var_sum= 0;
3350     }else{
3351         s->m.current_picture.mc_mb_var_sum= coef_sum;
3352         s->m.current_picture.mb_var_sum= 0;
3353     }
3354
3355     pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3356     if (pict->quality < 0)
3357         return INT_MIN;
3358     s->lambda= pict->quality * 3/2;
3359     delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3360     s->qlog+= delta_qlog;
3361     return delta_qlog;
3362 }
3363
3364 static void calculate_visual_weight(SnowContext *s, Plane *p){
3365     int width = p->width;
3366     int height= p->height;
3367     int level, orientation, x, y;
3368
3369     for(level=0; level<s->spatial_decomposition_count; level++){
3370         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3371             SubBand *b= &p->band[level][orientation];
3372             IDWTELEM *ibuf= b->ibuf;
3373             int64_t error=0;
3374
3375             memset(s->spatial_idwt_buffer, 0, sizeof(*s->spatial_idwt_buffer)*width*height);
3376             ibuf[b->width/2 + b->height/2*b->stride]= 256*16;
3377             ff_spatial_idwt(s->spatial_idwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3378             for(y=0; y<height; y++){
3379                 for(x=0; x<width; x++){
3380                     int64_t d= s->spatial_idwt_buffer[x + y*width]*16;
3381                     error += d*d;
3382                 }
3383             }
3384
3385             b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3386         }
3387     }
3388 }
3389
3390 static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
3391     SnowContext *s = avctx->priv_data;
3392     RangeCoder * const c= &s->c;
3393     AVFrame *pict = data;
3394     const int width= s->avctx->width;
3395     const int height= s->avctx->height;
3396     int level, orientation, plane_index, i, y;
3397     uint8_t rc_header_bak[sizeof(s->header_state)];
3398     uint8_t rc_block_bak[sizeof(s->block_state)];
3399
3400     ff_init_range_encoder(c, buf, buf_size);
3401     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
3402
3403     for(i=0; i<3; i++){
3404         int shift= !!i;
3405         for(y=0; y<(height>>shift); y++)
3406             memcpy(&s->input_picture.data[i][y * s->input_picture.linesize[i]],
3407                    &pict->data[i][y * pict->linesize[i]],
3408                    width>>shift);
3409     }
3410     s->new_picture = *pict;
3411
3412     s->m.picture_number= avctx->frame_number;
3413     if(avctx->flags&CODEC_FLAG_PASS2){
3414         s->m.pict_type =
3415         pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
3416         s->keyframe= pict->pict_type==AV_PICTURE_TYPE_I;
3417         if(!(avctx->flags&CODEC_FLAG_QSCALE)) {
3418             pict->quality= ff_rate_estimate_qscale(&s->m, 0);
3419             if (pict->quality < 0)
3420                 return -1;
3421         }
3422     }else{
3423         s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
3424         s->m.pict_type=
3425         pict->pict_type= s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3426     }
3427
3428     if(s->pass1_rc && avctx->frame_number == 0)
3429         pict->quality= 2*FF_QP2LAMBDA;
3430     if(pict->quality){
3431         s->qlog= qscale2qlog(pict->quality);
3432         s->lambda = pict->quality * 3/2;
3433     }
3434     if(s->qlog < 0 || (!pict->quality && (avctx->flags & CODEC_FLAG_QSCALE))){
3435         s->qlog= LOSSLESS_QLOG;
3436         s->lambda = 0;
3437     }//else keep previous frame's qlog until after motion estimation
3438
3439     frame_start(s);
3440
3441     s->m.current_picture_ptr= &s->m.current_picture;
3442     s->m.last_picture.f.pts = s->m.current_picture.f.pts;
3443     s->m.current_picture.f.pts = pict->pts;
3444     if(pict->pict_type == AV_PICTURE_TYPE_P){
3445         int block_width = (width +15)>>4;
3446         int block_height= (height+15)>>4;
3447         int stride= s->current_picture.linesize[0];
3448
3449         assert(s->current_picture.data[0]);
3450         assert(s->last_picture[0].data[0]);
3451
3452         s->m.avctx= s->avctx;
3453         s->m.current_picture.f.data[0] = s->current_picture.data[0];
3454         s->m.   last_picture.f.data[0] = s->last_picture[0].data[0];
3455         s->m.    new_picture.f.data[0] = s->  input_picture.data[0];
3456         s->m.   last_picture_ptr= &s->m.   last_picture;
3457         s->m.linesize=
3458         s->m.   last_picture.f.linesize[0] =
3459         s->m.    new_picture.f.linesize[0] =
3460         s->m.current_picture.f.linesize[0] = stride;
3461         s->m.uvlinesize= s->current_picture.linesize[1];
3462         s->m.width = width;
3463         s->m.height= height;
3464         s->m.mb_width = block_width;
3465         s->m.mb_height= block_height;
3466         s->m.mb_stride=   s->m.mb_width+1;
3467         s->m.b8_stride= 2*s->m.mb_width+1;
3468         s->m.f_code=1;
3469         s->m.pict_type= pict->pict_type;
3470         s->m.me_method= s->avctx->me_method;
3471         s->m.me.scene_change_score=0;
3472         s->m.flags= s->avctx->flags;
3473         s->m.quarter_sample= (s->avctx->flags & CODEC_FLAG_QPEL)!=0;
3474         s->m.out_format= FMT_H263;
3475         s->m.unrestricted_mv= 1;
3476
3477         s->m.lambda = s->lambda;
3478         s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
3479         s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
3480
3481         s->m.dsp= s->dsp; //move
3482         ff_init_me(&s->m);
3483         s->dsp= s->m.dsp;
3484     }
3485
3486     if(s->pass1_rc){
3487         memcpy(rc_header_bak, s->header_state, sizeof(s->header_state));
3488         memcpy(rc_block_bak, s->block_state, sizeof(s->block_state));
3489     }
3490
3491 redo_frame:
3492
3493     if(pict->pict_type == AV_PICTURE_TYPE_I)
3494         s->spatial_decomposition_count= 5;
3495     else
3496         s->spatial_decomposition_count= 5;
3497
3498     s->m.pict_type = pict->pict_type;
3499     s->qbias= pict->pict_type == AV_PICTURE_TYPE_P ? 2 : 0;
3500
3501     common_init_after_header(avctx);
3502
3503     if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){
3504         for(plane_index=0; plane_index<3; plane_index++){
3505             calculate_visual_weight(s, &s->plane[plane_index]);
3506         }
3507     }
3508
3509     encode_header(s);
3510     s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
3511     encode_blocks(s, 1);
3512     s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
3513
3514     for(plane_index=0; plane_index<3; plane_index++){
3515         Plane *p= &s->plane[plane_index];
3516         int w= p->width;
3517         int h= p->height;
3518         int x, y;
3519 //        int bits= put_bits_count(&s->c.pb);
3520
3521         if(!(avctx->flags2 & CODEC_FLAG2_MEMC_ONLY)){
3522             //FIXME optimize
3523             if(pict->data[plane_index]) //FIXME gray hack
3524                 for(y=0; y<h; y++){
3525                     for(x=0; x<w; x++){
3526                         s->spatial_idwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
3527                     }
3528                 }
3529             predict_plane(s, s->spatial_idwt_buffer, plane_index, 0);
3530
3531             if(   plane_index==0
3532                && pict->pict_type == AV_PICTURE_TYPE_P
3533                && !(avctx->flags&CODEC_FLAG_PASS2)
3534                && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
3535                 ff_init_range_encoder(c, buf, buf_size);
3536                 ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
3537                 pict->pict_type= AV_PICTURE_TYPE_I;
3538                 s->keyframe=1;
3539                 s->current_picture.key_frame=1;
3540                 goto redo_frame;
3541             }
3542
3543             if(s->qlog == LOSSLESS_QLOG){
3544                 for(y=0; y<h; y++){
3545                     for(x=0; x<w; x++){
3546                         s->spatial_dwt_buffer[y*w + x]= (s->spatial_idwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
3547                     }
3548                 }
3549             }else{
3550                 for(y=0; y<h; y++){
3551                     for(x=0; x<w; x++){
3552                         s->spatial_dwt_buffer[y*w + x]=s->spatial_idwt_buffer[y*w + x]<<ENCODER_EXTRA_BITS;
3553                     }
3554                 }
3555             }
3556
3557             /*  if(QUANTIZE2)
3558                 dwt_quantize(s, p, s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type);
3559             else*/
3560                 ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
3561
3562             if(s->pass1_rc && plane_index==0){
3563                 int delta_qlog = ratecontrol_1pass(s, pict);
3564                 if (delta_qlog <= INT_MIN)
3565                     return -1;
3566                 if(delta_qlog){
3567                     //reordering qlog in the bitstream would eliminate this reset
3568                     ff_init_range_encoder(c, buf, buf_size);
3569                     memcpy(s->header_state, rc_header_bak, sizeof(s->header_state));
3570                     memcpy(s->block_state, rc_block_bak, sizeof(s->block_state));
3571                     encode_header(s);
3572                     encode_blocks(s, 0);
3573                 }
3574             }
3575
3576             for(level=0; level<s->spatial_decomposition_count; level++){
3577                 for(orientation=level ? 1 : 0; orientation<4; orientation++){
3578                     SubBand *b= &p->band[level][orientation];
3579
3580                     if(!QUANTIZE2)
3581                         quantize(s, b, b->ibuf, b->buf, b->stride, s->qbias);
3582                     if(orientation==0)
3583                         decorrelate(s, b, b->ibuf, b->stride, pict->pict_type == AV_PICTURE_TYPE_P, 0);
3584                     encode_subband(s, b, b->ibuf, b->parent ? b->parent->ibuf : NULL, b->stride, orientation);
3585                     assert(b->parent==NULL || b->parent->stride == b->stride*2);
3586                     if(orientation==0)
3587                         correlate(s, b, b->ibuf, b->stride, 1, 0);
3588                 }
3589             }
3590
3591             for(level=0; level<s->spatial_decomposition_count; level++){
3592                 for(orientation=level ? 1 : 0; orientation<4; orientation++){
3593                     SubBand *b= &p->band[level][orientation];
3594
3595                     dequantize(s, b, b->ibuf, b->stride);
3596                 }
3597             }
3598
3599             ff_spatial_idwt(s->spatial_idwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
3600             if(s->qlog == LOSSLESS_QLOG){
3601                 for(y=0; y<h; y++){
3602                     for(x=0; x<w; x++){
3603                         s->spatial_idwt_buffer[y*w + x]<<=FRAC_BITS;
3604                     }
3605                 }
3606             }
3607             predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
3608         }else{
3609             //ME/MC only
3610             if(pict->pict_type == AV_PICTURE_TYPE_I){
3611                 for(y=0; y<h; y++){
3612                     for(x=0; x<w; x++){
3613                         s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x]=
3614                             pict->data[plane_index][y*pict->linesize[plane_index] + x];
3615                     }
3616                 }
3617             }else{
3618                 memset(s->spatial_idwt_buffer, 0, sizeof(IDWTELEM)*w*h);
3619                 predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
3620             }
3621         }
3622         if(s->avctx->flags&CODEC_FLAG_PSNR){
3623             int64_t error= 0;
3624
3625             if(pict->data[plane_index]) //FIXME gray hack
3626                 for(y=0; y<h; y++){
3627                     for(x=0; x<w; x++){
3628                         int d= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
3629                         error += d*d;
3630                     }
3631                 }
3632             s->avctx->error[plane_index] += error;
3633             s->current_picture.error[plane_index] = error;
3634         }
3635
3636     }
3637
3638     update_last_header_values(s);
3639
3640     release_buffer(avctx);
3641
3642     s->current_picture.coded_picture_number = avctx->frame_number;
3643     s->current_picture.pict_type = pict->pict_type;
3644     s->current_picture.quality = pict->quality;
3645     s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start);
3646     s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
3647     s->m.current_picture.f.display_picture_number =
3648     s->m.current_picture.f.coded_picture_number   = avctx->frame_number;
3649     s->m.current_picture.f.quality                = pict->quality;
3650     s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
3651     if(s->pass1_rc)
3652         if (ff_rate_estimate_qscale(&s->m, 0) < 0)
3653             return -1;
3654     if(avctx->flags&CODEC_FLAG_PASS1)
3655         ff_write_pass1_stats(&s->m);
3656     s->m.last_pict_type = s->m.pict_type;
3657     avctx->frame_bits = s->m.frame_bits;
3658     avctx->mv_bits = s->m.mv_bits;
3659     avctx->misc_bits = s->m.misc_bits;
3660     avctx->p_tex_bits = s->m.p_tex_bits;
3661
3662     emms_c();
3663
3664     return ff_rac_terminate(c);
3665 }
3666
3667 static av_cold int encode_end(AVCodecContext *avctx)
3668 {
3669     SnowContext *s = avctx->priv_data;
3670
3671     common_end(s);
3672     if (s->input_picture.data[0])
3673         avctx->release_buffer(avctx, &s->input_picture);
3674     av_free(avctx->stats_out);
3675
3676     return 0;
3677 }
3678
3679 AVCodec ff_snow_encoder = {
3680     .name           = "snow",
3681     .type           = AVMEDIA_TYPE_VIDEO,
3682     .id             = CODEC_ID_SNOW,
3683     .priv_data_size = sizeof(SnowContext),
3684     .init           = encode_init,
3685     .encode         = encode_frame,
3686     .close          = encode_end,
3687     .long_name = NULL_IF_CONFIG_SMALL("Snow"),
3688 };
3689 #endif
3690
3691
3692 #ifdef TEST
3693 #undef malloc
3694 #undef free
3695 #undef printf
3696
3697 #include "libavutil/lfg.h"
3698 #include "libavutil/mathematics.h"
3699
3700 int main(void){
3701     int width=256;
3702     int height=256;
3703     int buffer[2][width*height];
3704     SnowContext s;
3705     int i;
3706     AVLFG prng;
3707     s.spatial_decomposition_count=6;
3708     s.spatial_decomposition_type=1;
3709
3710     av_lfg_init(&prng, 1);
3711
3712     printf("testing 5/3 DWT\n");
3713     for(i=0; i<width*height; i++)
3714         buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
3715
3716     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
3717     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
3718
3719     for(i=0; i<width*height; i++)
3720         if(buffer[0][i]!= buffer[1][i]) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
3721
3722     printf("testing 9/7 DWT\n");
3723     s.spatial_decomposition_type=0;
3724     for(i=0; i<width*height; i++)
3725         buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
3726
3727     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
3728     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
3729
3730     for(i=0; i<width*height; i++)
3731         if(FFABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
3732
3733     {
3734     int level, orientation, x, y;
3735     int64_t errors[8][4];
3736     int64_t g=0;
3737
3738         memset(errors, 0, sizeof(errors));
3739         s.spatial_decomposition_count=3;
3740         s.spatial_decomposition_type=0;
3741         for(level=0; level<s.spatial_decomposition_count; level++){
3742             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3743                 int w= width  >> (s.spatial_decomposition_count-level);
3744                 int h= height >> (s.spatial_decomposition_count-level);
3745                 int stride= width  << (s.spatial_decomposition_count-level);
3746                 DWTELEM *buf= buffer[0];
3747                 int64_t error=0;
3748
3749                 if(orientation&1) buf+=w;
3750                 if(orientation>1) buf+=stride>>1;
3751
3752                 memset(buffer[0], 0, sizeof(int)*width*height);
3753                 buf[w/2 + h/2*stride]= 256*256;
3754                 ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
3755                 for(y=0; y<height; y++){
3756                     for(x=0; x<width; x++){
3757                         int64_t d= buffer[0][x + y*width];
3758                         error += d*d;
3759                         if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9 && level==2) printf("%8"PRId64" ", d);
3760                     }
3761                     if(FFABS(height/2-y)<9 && level==2) printf("\n");
3762                 }
3763                 error= (int)(sqrt(error)+0.5);
3764                 errors[level][orientation]= error;
3765                 if(g) g=av_gcd(g, error);
3766                 else g= error;
3767             }
3768         }
3769         printf("static int const visual_weight[][4]={\n");
3770         for(level=0; level<s.spatial_decomposition_count; level++){
3771             printf("  {");
3772             for(orientation=0; orientation<4; orientation++){
3773                 printf("%8"PRId64",", errors[level][orientation]/g);
3774             }
3775             printf("},\n");
3776         }
3777         printf("};\n");
3778         {
3779             int level=2;
3780             int w= width  >> (s.spatial_decomposition_count-level);
3781             //int h= height >> (s.spatial_decomposition_count-level);
3782             int stride= width  << (s.spatial_decomposition_count-level);
3783             DWTELEM *buf= buffer[0];
3784             int64_t error=0;
3785
3786             buf+=w;
3787             buf+=stride>>1;
3788
3789             memset(buffer[0], 0, sizeof(int)*width*height);
3790             for(y=0; y<height; y++){
3791                 for(x=0; x<width; x++){
3792                     int tab[4]={0,2,3,1};
3793                     buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
3794                 }
3795             }
3796             ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
3797             for(y=0; y<height; y++){
3798                 for(x=0; x<width; x++){
3799                     int64_t d= buffer[0][x + y*width];
3800                     error += d*d;
3801                     if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9) printf("%8"PRId64" ", d);
3802                 }
3803                 if(FFABS(height/2-y)<9) printf("\n");
3804             }
3805         }
3806
3807     }
3808     return 0;
3809 }
3810 #endif /* TEST */