git.sesse.net Git - x264/blob - common/opencl/downscale.cl

   1 /*
   2  * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
   3  *
   4  * --
   5  *
   6  * fenc_img is an output image (area of memory referenced through a texture
   7  * cache). A read of any pixel location (x,y) returns four pixel values:
   8  *
   9  * val.s0 = P(x,y)
  10  * val.s1 = P(x+1,y)
  11  * val.s2 = P(x+2,y)
  12  * val.s3 = P(x+3,y)
  13  *
  14  * This is a 4x replication of the lowres pixels, a trade-off between memory
  15  * size and read latency.
  16  *
  17  * --
  18  *
  19  * hpel_planes is an output image that contains the four HPEL planes used for
  20  * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with
  21  * the four planar values C | V | H | F
  22  *
  23  * launch dimensions:  [lowres-width, lowres-height]
  24  */
  25 kernel void downscale_hpel( const global pixel *fenc,
  26                             write_only image2d_t fenc_img,
  27                             write_only image2d_t hpel_planes,
  28                             int stride )
  29 {
  30     int x = get_global_id( 0 );
  31     int y = get_global_id( 1 );
  32     uint4 values;
  33
  34     fenc += y * stride * 2;
  35     const global pixel *src1 = fenc + stride;
  36     const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
  37     int2 pos = (int2)(x, y);
  38     pixel right, left;
  39
  40     right = rhadd( fenc[x*2], src1[x*2] );
  41     left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  42     values.s0 = rhadd( right, left );           // F
  43
  44     right = rhadd( fenc[2*x+1], src1[2*x+1] );
  45     left  = rhadd( fenc[2*x+2], src1[2*x+2] );
  46     values.s1 = rhadd( right, left );           // H
  47
  48     right = rhadd( src1[2*x], src2[2*x] );
  49     left  = rhadd( src1[2*x+1], src2[2*x+1] );
  50     values.s2 = rhadd( right, left );           // V
  51
  52     right = rhadd( src1[2*x+1], src2[2*x+1] );
  53     left  = rhadd( src1[2*x+2], src2[2*x+2] );
  54     values.s3 = rhadd( right, left );           // C
  55
  56     uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff);
  57     write_imageui( hpel_planes, pos, val );
  58
  59     x = select( x, x+1, x+1 < get_global_size( 0 ) );
  60     right = rhadd( fenc[x*2], src1[x*2] );
  61     left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  62     values.s1 = rhadd( right, left );
  63
  64     x = select( x, x+1, x+1 < get_global_size( 0 ) );
  65     right = rhadd( fenc[x*2], src1[x*2] );
  66     left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  67     values.s2 = rhadd( right, left );
  68
  69     x = select( x, x+1, x+1 < get_global_size( 0 ) );
  70     right = rhadd( fenc[x*2], src1[x*2] );
  71     left  = rhadd( fenc[x*2+1], src1[x*2+1] );
  72     values.s3 = rhadd( right, left );
  73
  74     write_imageui( fenc_img, pos, values );
  75 }
  76
  77 /*
  78  * downscale lowres hierarchical motion search image, copy from one image to
  79  * another decimated image.  This kernel is called iteratively to generate all
  80  * of the downscales.
  81  *
  82  * launch dimensions:  [lower_res width, lower_res height]
  83  */
  84 kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
  85 {
  86     int x = get_global_id( 0 );
  87     int y = get_global_id( 1 );
  88     int2 pos = (int2)(x, y);
  89     int gs = get_global_size( 0 );
  90     uint4 top, bot, values;
  91     top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
  92     bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
  93     values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
  94
  95     /* these select statements appear redundant, and they should be, but tests break when
  96      * they are not here.  I believe this was caused by a driver bug
  97      */
  98     values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
  99     top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
 100     bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
 101     values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
 102     values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
 103     write_imageui( lower_res, pos, (uint4)(values) );
 104 }
 105
 106 /*
 107  * Second copy of downscale kernel, no differences. This is a (no perf loss)
 108  * workaround for a scheduling bug in current Tahiti drivers.  This bug has
 109  * theoretically been fixed in the July 2012 driver release from AMD.
 110  */
 111 kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
 112 {
 113     int x = get_global_id( 0 );
 114     int y = get_global_id( 1 );
 115     int2 pos = (int2)(x, y);
 116     int gs = get_global_size( 0 );
 117     uint4 top, bot, values;
 118     top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
 119     bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
 120     values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
 121
 122     // see comment in above function copy
 123     values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
 124     top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
 125     bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
 126     values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
 127     values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
 128     write_imageui( lower_res, pos, (uint4)(values) );
 129 }
 130
 131 /* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
 132 kernel void memset_int16( global int16_t *buf, int16_t value )
 133 {
 134     buf[get_global_id( 0 )] = value;
 135 }