git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/compress.c
Disable pristine-tar option in gbp.conf, since there is no pristine-tar branch.
[bcachefs-tools-debian] / libbcachefs / compress.c
index 80b12f3b35ae2f5fedee70b74836d1f4d31fa8e4..1410365a889156450c78da9165bdb146872370ed 100644 (file)
@@ -1,83 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
+#include "checksum.h"
 #include "compress.h"
 #include "extents.h"
-#include "io.h"
 #include "super-io.h"
 
-#include "lz4.h"
+#include <linux/lz4.h>
 #include <linux/zlib.h>
-
-enum bounced {
-       BOUNCED_CONTIG,
-       BOUNCED_MAPPED,
-       BOUNCED_KMALLOCED,
-       BOUNCED_VMALLOCED,
-       BOUNCED_MEMPOOLED,
+#include <linux/zstd.h>
+
+/* Bounce buffer: */
+struct bbuf {
+       void            *b;
+       enum {
+               BB_NONE,
+               BB_VMAP,
+               BB_KMALLOC,
+               BB_MEMPOOL,
+       }               type;
+       int             rw;
 };
 
-static void *__bounce_alloc(struct bch_fs *c, unsigned size,
-                           unsigned *bounced, int direction)
+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
 {
-       void *data;
+       void *b;
 
-       *bounced = BOUNCED_KMALLOCED;
-       data = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
-       if (data)
-               return data;
+       BUG_ON(size > c->opts.encoded_extent_max);
 
-       *bounced = BOUNCED_MEMPOOLED;
-       data = mempool_alloc(&c->compression_bounce[direction], GFP_NOWAIT);
-       if (data)
-               return page_address(data);
+       b = kmalloc(size, GFP_NOFS|__GFP_NOWARN);
+       if (b)
+               return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
 
-       *bounced = BOUNCED_VMALLOCED;
-       data = vmalloc(size);
-       if (data)
-               return data;
+       b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS);
+       if (b)
+               return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
+
+       BUG();
+}
+
+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
+{
+       struct bio_vec bv;
+       struct bvec_iter iter;
+       void *expected_start = NULL;
 
-       *bounced = BOUNCED_MEMPOOLED;
-       data = mempool_alloc(&c->compression_bounce[direction], GFP_NOIO);
-       return page_address(data);
+       __bio_for_each_bvec(bv, bio, iter, start) {
+               if (expected_start &&
+                   expected_start != page_address(bv.bv_page) + bv.bv_offset)
+                       return false;
+
+               expected_start = page_address(bv.bv_page) +
+                       bv.bv_offset + bv.bv_len;
+       }
+
+       return true;
 }
 
-static void *__bio_map_or_bounce(struct bch_fs *c,
-                                struct bio *bio, struct bvec_iter start,
-                                unsigned *bounced, int direction)
+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
+                                      struct bvec_iter start, int rw)
 {
+       struct bbuf ret;
        struct bio_vec bv;
        struct bvec_iter iter;
        unsigned nr_pages = 0;
        struct page *stack_pages[16];
        struct page **pages = NULL;
-       bool first = true;
-       unsigned prev_end = PAGE_SIZE;
        void *data;
 
-       BUG_ON(bvec_iter_sectors(start) > BCH_ENCODED_EXTENT_MAX);
+       BUG_ON(start.bi_size > c->opts.encoded_extent_max);
 
-#ifndef CONFIG_HIGHMEM
-       *bounced = BOUNCED_CONTIG;
-
-       __bio_for_each_contig_segment(bv, bio, iter, start) {
-               if (bv.bv_len == start.bi_size)
-                       return page_address(bv.bv_page) + bv.bv_offset;
-       }
-#endif
-       *bounced = BOUNCED_MAPPED;
+       if (!PageHighMem(bio_iter_page(bio, start)) &&
+           bio_phys_contig(bio, start))
+               return (struct bbuf) {
+                       .b = page_address(bio_iter_page(bio, start)) +
+                               bio_iter_offset(bio, start),
+                       .type = BB_NONE, .rw = rw
+               };
 
+       /* check if we can map the pages contiguously: */
        __bio_for_each_segment(bv, bio, iter, start) {
-               if ((!first && bv.bv_offset) ||
-                   prev_end != PAGE_SIZE)
+               if (iter.bi_size != start.bi_size &&
+                   bv.bv_offset)
+                       goto bounce;
+
+               if (bv.bv_len < iter.bi_size &&
+                   bv.bv_offset + bv.bv_len < PAGE_SIZE)
                        goto bounce;
 
-               prev_end = bv.bv_offset + bv.bv_len;
                nr_pages++;
        }
 
        BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
 
        pages = nr_pages > ARRAY_SIZE(stack_pages)
-               ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO)
+               ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
                : stack_pages;
        if (!pages)
                goto bounce;
@@ -90,41 +106,39 @@ static void *__bio_map_or_bounce(struct bch_fs *c,
        if (pages != stack_pages)
                kfree(pages);
 
-       return data + bio_iter_offset(bio, start);
+       if (data)
+               return (struct bbuf) {
+                       .b = data + bio_iter_offset(bio, start),
+                       .type = BB_VMAP, .rw = rw
+               };
 bounce:
-       data = __bounce_alloc(c, start.bi_size, bounced, direction);
+       ret = __bounce_alloc(c, start.bi_size, rw);
 
-       if (direction == READ)
-               memcpy_from_bio(data, bio, start);
+       if (rw == READ)
+               memcpy_from_bio(ret.b, bio, start);
 
-       return data;
+       return ret;
 }
 
-static void *bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
-                              unsigned *bounced, int direction)
+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
 {
-       return __bio_map_or_bounce(c, bio, bio->bi_iter, bounced, direction);
+       return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
 }
 
-static void bio_unmap_or_unbounce(struct bch_fs *c, void *data,
-                                 unsigned bounced, int direction)
+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
 {
-       if (!data)
-               return;
-
-       switch (bounced) {
-       case BOUNCED_MAPPED:
-               vunmap((void *) ((unsigned long) data & PAGE_MASK));
-               return;
-       case BOUNCED_KMALLOCED:
-               kfree(data);
-               return;
-       case BOUNCED_VMALLOCED:
-               vfree(data);
-               return;
-       case BOUNCED_MEMPOOLED:
-               mempool_free(virt_to_page(data), &c->compression_bounce[direction]);
-               return;
+       switch (buf.type) {
+       case BB_NONE:
+               break;
+       case BB_VMAP:
+               vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
+               break;
+       case BB_KMALLOC:
+               kfree(buf.b);
+               break;
+       case BB_MEMPOOL:
+               mempool_free(buf.b, &c->compression_bounce[buf.rw]);
+               break;
        }
 }
 
@@ -136,364 +150,578 @@ static inline void zlib_set_workspace(z_stream *strm, void *workspace)
 }
 
 static int __bio_uncompress(struct bch_fs *c, struct bio *src,
-                           void *dst_data, struct bch_extent_crc128 crc)
+                           void *dst_data, struct bch_extent_crc_unpacked crc)
 {
-       void *src_data = NULL;
-       unsigned src_bounced;
+       struct bbuf src_data = { NULL };
        size_t src_len = src->bi_iter.bi_size;
-       size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
+       size_t dst_len = crc.uncompressed_size << 9;
+       void *workspace;
        int ret;
 
-       src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
+       src_data = bio_map_or_bounce(c, src, READ);
 
        switch (crc.compression_type) {
-       case BCH_COMPRESSION_LZ4:
-               ret = lz4_decompress(src_data, &src_len,
-                                    dst_data, dst_len);
-               if (ret) {
-                       ret = -EIO;
+       case BCH_COMPRESSION_TYPE_lz4_old:
+       case BCH_COMPRESSION_TYPE_lz4:
+               ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
+                                                 src_len, dst_len, dst_len);
+               if (ret != dst_len)
                        goto err;
-               }
                break;
-       case BCH_COMPRESSION_GZIP: {
-               void *workspace;
-               z_stream strm;
-
-               workspace = kmalloc(zlib_inflate_workspacesize(),
-                                   GFP_NOIO|__GFP_NOWARN);
-               if (!workspace) {
-                       mutex_lock(&c->zlib_workspace_lock);
-                       workspace = c->zlib_workspace;
-               }
+       case BCH_COMPRESSION_TYPE_gzip: {
+               z_stream strm = {
+                       .next_in        = src_data.b,
+                       .avail_in       = src_len,
+                       .next_out       = dst_data,
+                       .avail_out      = dst_len,
+               };
+
+               workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
 
-               strm.next_in    = src_data;
-               strm.avail_in   = src_len;
-               strm.next_out   = dst_data;
-               strm.avail_out  = dst_len;
                zlib_set_workspace(&strm, workspace);
                zlib_inflateInit2(&strm, -MAX_WBITS);
-
                ret = zlib_inflate(&strm, Z_FINISH);
 
-               if (workspace == c->zlib_workspace)
-                       mutex_unlock(&c->zlib_workspace_lock);
-               else
-                       kfree(workspace);
+               mempool_free(workspace, &c->decompress_workspace);
 
-               if (ret != Z_STREAM_END) {
-                       ret = -EIO;
+               if (ret != Z_STREAM_END)
+                       goto err;
+               break;
+       }
+       case BCH_COMPRESSION_TYPE_zstd: {
+               ZSTD_DCtx *ctx;
+               size_t real_src_len = le32_to_cpup(src_data.b);
+
+               if (real_src_len > src_len - 4)
+                       goto err;
+
+               workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+               ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
+
+               ret = zstd_decompress_dctx(ctx,
+                               dst_data,       dst_len,
+                               src_data.b + 4, real_src_len);
+
+               mempool_free(workspace, &c->decompress_workspace);
+
+               if (ret != dst_len)
                        goto err;
-               }
                break;
        }
        default:
                BUG();
        }
        ret = 0;
-err:
-       bio_unmap_or_unbounce(c, src_data, src_bounced, READ);
+out:
+       bio_unmap_or_unbounce(c, src_data);
        return ret;
+err:
+       ret = -EIO;
+       goto out;
 }
 
 int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
-                              unsigned live_data_sectors,
-                              struct bch_extent_crc128 crc)
+                               struct bch_extent_crc_unpacked *crc)
 {
-       void *dst_data = NULL;
-       size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
-       int ret = -ENOMEM;
+       struct bbuf data = { NULL };
+       size_t dst_len = crc->uncompressed_size << 9;
 
-       BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
+       /* bio must own its pages: */
+       BUG_ON(!bio->bi_vcnt);
+       BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
 
-       /* XXX mempoolify */
-       dst_data = kmalloc(dst_len, GFP_NOIO|__GFP_NOWARN);
-       if (!dst_data) {
-               dst_data = vmalloc(dst_len);
-               if (!dst_data)
-                       goto err;
+       if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
+           crc->compressed_size << 9   > c->opts.encoded_extent_max) {
+               bch_err(c, "error rewriting existing data: extent too big");
+               return -EIO;
        }
 
-       ret = __bio_uncompress(c, bio, dst_data, crc);
-       if (ret)
-               goto err;
-
-       while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) {
-               struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
+       data = __bounce_alloc(c, dst_len, WRITE);
 
-               bv->bv_page = alloc_page(GFP_NOIO);
-               if (!bv->bv_page)
-                       goto use_mempool;
-
-               bv->bv_len = PAGE_SIZE;
-               bv->bv_offset = 0;
-               bio->bi_vcnt++;
+       if (__bio_uncompress(c, bio, data.b, *crc)) {
+               if (!c->opts.no_data_io)
+                       bch_err(c, "error rewriting existing data: decompression error");
+               bio_unmap_or_unbounce(c, data);
+               return -EIO;
        }
 
-       bio->bi_iter.bi_size = live_data_sectors << 9;
-copy_data:
-       memcpy_to_bio(bio, bio->bi_iter, dst_data + (crc.offset << 9));
-err:
-       kvfree(dst_data);
-       return ret;
-use_mempool:
        /*
-        * We already allocated from mempool, we can't allocate from it again
-        * without freeing the pages we already allocated or else we could
-        * deadlock:
+        * XXX: don't have a good way to assert that the bio was allocated with
+        * enough space, we depend on bch2_move_extent doing the right thing
         */
+       bio->bi_iter.bi_size = crc->live_size << 9;
 
-       bch2_bio_free_pages_pool(c, bio);
-       bch2_bio_alloc_pages_pool(c, bio, live_data_sectors << 9);
-       goto copy_data;
+       memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
+
+       crc->csum_type          = 0;
+       crc->compression_type   = 0;
+       crc->compressed_size    = crc->live_size;
+       crc->uncompressed_size  = crc->live_size;
+       crc->offset             = 0;
+       crc->csum               = (struct bch_csum) { 0, 0 };
+
+       bio_unmap_or_unbounce(c, data);
+       return 0;
 }
 
 int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
                       struct bio *dst, struct bvec_iter dst_iter,
-                      struct bch_extent_crc128 crc)
+                      struct bch_extent_crc_unpacked crc)
 {
-       void *dst_data = NULL;
-       unsigned dst_bounced;
-       size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
-       int ret = -ENOMEM;
+       struct bbuf dst_data = { NULL };
+       size_t dst_len = crc.uncompressed_size << 9;
+       int ret;
+
+       if (crc.uncompressed_size << 9  > c->opts.encoded_extent_max ||
+           crc.compressed_size << 9    > c->opts.encoded_extent_max)
+               return -EIO;
 
        dst_data = dst_len == dst_iter.bi_size
-               ? __bio_map_or_bounce(c, dst, dst_iter, &dst_bounced, WRITE)
-               : __bounce_alloc(c, dst_len, &dst_bounced, WRITE);
+               ? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
+               : __bounce_alloc(c, dst_len, WRITE);
 
-       ret = __bio_uncompress(c, src, dst_data, crc);
+       ret = __bio_uncompress(c, src, dst_data.b, crc);
        if (ret)
                goto err;
 
-       if (dst_bounced)
-               memcpy_to_bio(dst, dst_iter, dst_data + (crc.offset << 9));
+       if (dst_data.type != BB_NONE &&
+           dst_data.type != BB_VMAP)
+               memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
 err:
-       bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE);
+       bio_unmap_or_unbounce(c, dst_data);
        return ret;
 }
 
-static int __bio_compress(struct bch_fs *c,
-                         struct bio *dst, size_t *dst_len,
-                         struct bio *src, size_t *src_len,
-                         unsigned compression_type)
+static int attempt_compress(struct bch_fs *c,
+                           void *workspace,
+                           void *dst, size_t dst_len,
+                           void *src, size_t src_len,
+                           struct bch_compression_opt compression)
 {
-       void *src_data = NULL, *dst_data = NULL;
-       unsigned src_bounced, dst_bounced, pad;
-       int ret = -1;
-
-       dst_data = bio_map_or_bounce(c, dst, &dst_bounced, WRITE);
-       src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
+       enum bch_compression_type compression_type =
+               __bch2_compression_opt_to_type[compression.type];
 
        switch (compression_type) {
-       case BCH_COMPRESSION_LZ4: {
-               void *workspace;
-
-               *dst_len = dst->bi_iter.bi_size;
-               *src_len = src->bi_iter.bi_size;
-
-               workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
-
-               while (*src_len > block_bytes(c) &&
-                      (ret = lz4_compress(src_data, *src_len,
-                                          dst_data, dst_len,
-                                          workspace))) {
-                       /*
-                        * On error, the compressed data was bigger than
-                        * dst_len, and -ret is the amount of data we were able
-                        * to compress - round down to nearest block and try
-                        * again:
-                        */
-                       BUG_ON(ret > 0);
-                       BUG_ON(-ret >= *src_len);
-
-                       *src_len = round_down(-ret, block_bytes(c));
-               }
-
-               mempool_free(workspace, &c->lz4_workspace_pool);
+       case BCH_COMPRESSION_TYPE_lz4:
+               if (compression.level < LZ4HC_MIN_CLEVEL) {
+                       int len = src_len;
+                       int ret = LZ4_compress_destSize(
+                                       src,            dst,
+                                       &len,           dst_len,
+                                       workspace);
+                       if (len < src_len)
+                               return -len;
 
-               if (ret)
-                       goto err;
-               break;
-       }
-       case BCH_COMPRESSION_GZIP: {
-               void *workspace;
-               z_stream strm;
-
-               workspace = kmalloc(zlib_deflate_workspacesize(MAX_WBITS,
-                                                              DEF_MEM_LEVEL),
-                                   GFP_NOIO|__GFP_NOWARN);
-               if (!workspace) {
-                       mutex_lock(&c->zlib_workspace_lock);
-                       workspace = c->zlib_workspace;
+                       return ret;
+               } else {
+                       int ret = LZ4_compress_HC(
+                                       src,            dst,
+                                       src_len,        dst_len,
+                                       compression.level,
+                                       workspace);
+
+                       return ret ?: -1;
                }
+       case BCH_COMPRESSION_TYPE_gzip: {
+               z_stream strm = {
+                       .next_in        = src,
+                       .avail_in       = src_len,
+                       .next_out       = dst,
+                       .avail_out      = dst_len,
+               };
 
-               strm.next_in    = src_data;
-               strm.avail_in   = min(src->bi_iter.bi_size,
-                                     dst->bi_iter.bi_size);
-               strm.next_out   = dst_data;
-               strm.avail_out  = dst->bi_iter.bi_size;
                zlib_set_workspace(&strm, workspace);
-               zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+               zlib_deflateInit2(&strm,
+                                 compression.level
+                                 ? clamp_t(unsigned, compression.level,
+                                           Z_BEST_SPEED, Z_BEST_COMPRESSION)
+                                 : Z_DEFAULT_COMPRESSION,
                                  Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
                                  Z_DEFAULT_STRATEGY);
 
-               ret = zlib_deflate(&strm, Z_FINISH);
-               if (ret != Z_STREAM_END) {
-                       ret = -EIO;
-                       goto zlib_err;
-               }
-
-               ret = zlib_deflateEnd(&strm);
-               if (ret != Z_OK) {
-                       ret = -EIO;
-                       goto zlib_err;
-               }
+               if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
+                       return 0;
 
-               ret = 0;
-zlib_err:
-               if (workspace == c->zlib_workspace)
-                       mutex_unlock(&c->zlib_workspace_lock);
-               else
-                       kfree(workspace);
+               if (zlib_deflateEnd(&strm) != Z_OK)
+                       return 0;
 
-               if (ret)
-                       goto err;
+               return strm.total_out;
+       }
+       case BCH_COMPRESSION_TYPE_zstd: {
+               /*
+                * rescale:
+                * zstd max compression level is 22, our max level is 15
+                */
+               unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
+               ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
+               ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
+
+               /*
+                * ZSTD requires that when we decompress we pass in the exact
+                * compressed size - rounding it up to the nearest sector
+                * doesn't work, so we use the first 4 bytes of the buffer for
+                * that.
+                *
+                * Additionally, the ZSTD code seems to have a bug where it will
+                * write just past the end of the buffer - so subtract a fudge
+                * factor (7 bytes) from the dst buffer size to account for
+                * that.
+                */
+               size_t len = zstd_compress_cctx(ctx,
+                               dst + 4,        dst_len - 4 - 7,
+                               src,            src_len,
+                               &params);
+               if (zstd_is_error(len))
+                       return 0;
 
-               *dst_len = strm.total_out;
-               *src_len = strm.total_in;
-               break;
+               *((__le32 *) dst) = cpu_to_le32(len);
+               return len + 4;
        }
        default:
                BUG();
        }
+}
 
-       BUG_ON(!*dst_len);
-       BUG_ON(*dst_len > dst->bi_iter.bi_size);
+static unsigned __bio_compress(struct bch_fs *c,
+                              struct bio *dst, size_t *dst_len,
+                              struct bio *src, size_t *src_len,
+                              struct bch_compression_opt compression)
+{
+       struct bbuf src_data = { NULL }, dst_data = { NULL };
+       void *workspace;
+       enum bch_compression_type compression_type =
+               __bch2_compression_opt_to_type[compression.type];
+       unsigned pad;
+       int ret = 0;
 
-       BUG_ON(*src_len & (block_bytes(c) - 1));
-       BUG_ON(*src_len > src->bi_iter.bi_size);
+       BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR);
+       BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+
+       /* If it's only one block, don't bother trying to compress: */
+       if (src->bi_iter.bi_size <= c->opts.block_size)
+               return BCH_COMPRESSION_TYPE_incompressible;
+
+       dst_data = bio_map_or_bounce(c, dst, WRITE);
+       src_data = bio_map_or_bounce(c, src, READ);
+
+       workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS);
+
+       *src_len = src->bi_iter.bi_size;
+       *dst_len = dst->bi_iter.bi_size;
+
+       /*
+        * XXX: this algorithm sucks when the compression code doesn't tell us
+        * how much would fit, like LZ4 does:
+        */
+       while (1) {
+               if (*src_len <= block_bytes(c)) {
+                       ret = -1;
+                       break;
+               }
+
+               ret = attempt_compress(c, workspace,
+                                      dst_data.b,      *dst_len,
+                                      src_data.b,      *src_len,
+                                      compression);
+               if (ret > 0) {
+                       *dst_len = ret;
+                       ret = 0;
+                       break;
+               }
+
+               /* Didn't fit: should we retry with a smaller amount?  */
+               if (*src_len <= *dst_len) {
+                       ret = -1;
+                       break;
+               }
+
+               /*
+                * If ret is negative, it's a hint as to how much data would fit
+                */
+               BUG_ON(-ret >= *src_len);
+
+               if (ret < 0)
+                       *src_len = -ret;
+               else
+                       *src_len -= (*src_len - *dst_len) / 2;
+               *src_len = round_down(*src_len, block_bytes(c));
+       }
+
+       mempool_free(workspace, &c->compress_workspace[compression_type]);
+
+       if (ret)
+               goto err;
 
        /* Didn't get smaller: */
-       if (round_up(*dst_len, block_bytes(c)) >= *src_len) {
-               ret = -1;
+       if (round_up(*dst_len, block_bytes(c)) >= *src_len)
                goto err;
-       }
 
        pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
 
-       memset(dst_data + *dst_len, 0, pad);
+       memset(dst_data.b + *dst_len, 0, pad);
        *dst_len += pad;
 
-       if (dst_bounced)
-               memcpy_to_bio(dst, dst->bi_iter, dst_data);
-err:
-       bio_unmap_or_unbounce(c, src_data, src_bounced, READ);
-       bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE);
+       if (dst_data.type != BB_NONE &&
+           dst_data.type != BB_VMAP)
+               memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
+
+       BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
+       BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
+       BUG_ON(*dst_len & (block_bytes(c) - 1));
+       BUG_ON(*src_len & (block_bytes(c) - 1));
+       ret = compression_type;
+out:
+       bio_unmap_or_unbounce(c, src_data);
+       bio_unmap_or_unbounce(c, dst_data);
        return ret;
+err:
+       ret = BCH_COMPRESSION_TYPE_incompressible;
+       goto out;
 }
 
-void bch2_bio_compress(struct bch_fs *c,
-                     struct bio *dst, size_t *dst_len,
-                     struct bio *src, size_t *src_len,
-                     unsigned *compression_type)
+unsigned bch2_bio_compress(struct bch_fs *c,
+                          struct bio *dst, size_t *dst_len,
+                          struct bio *src, size_t *src_len,
+                          unsigned compression_opt)
 {
        unsigned orig_dst = dst->bi_iter.bi_size;
        unsigned orig_src = src->bi_iter.bi_size;
+       unsigned compression_type;
 
        /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
-       src->bi_iter.bi_size =
-               min(src->bi_iter.bi_size, BCH_ENCODED_EXTENT_MAX << 9);
-
+       src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
+                                    c->opts.encoded_extent_max);
        /* Don't generate a bigger output than input: */
-       dst->bi_iter.bi_size =
-               min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+       dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+
+       compression_type =
+               __bio_compress(c, dst, dst_len, src, src_len,
+                              bch2_compression_decode(compression_opt));
 
-       /* If it's only one block, don't bother trying to compress: */
-       if (*compression_type != BCH_COMPRESSION_NONE &&
-           bio_sectors(src) > c->sb.block_size &&
-           !__bio_compress(c, dst, dst_len, src, src_len, *compression_type))
-               goto out;
-
-       /* If compressing failed (didn't get smaller), just copy: */
-       *compression_type = BCH_COMPRESSION_NONE;
-       *dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
-       bio_copy_data(dst, src);
-out:
        dst->bi_iter.bi_size = orig_dst;
        src->bi_iter.bi_size = orig_src;
+       return compression_type;
 }
 
-/* doesn't write superblock: */
-int bch2_check_set_has_compressed_data(struct bch_fs *c,
-                                     unsigned compression_type)
+static int __bch2_fs_compress_init(struct bch_fs *, u64);
+
+#define BCH_FEATURE_none       0
+
+static const unsigned bch2_compression_opt_to_feature[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+       BCH_COMPRESSION_OPTS()
+#undef x
+};
+
+#undef BCH_FEATURE_none
+
+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
 {
-       switch (compression_type) {
-       case BCH_COMPRESSION_NONE:
+       int ret = 0;
+
+       if ((c->sb.features & f) == f)
                return 0;
-       case BCH_COMPRESSION_LZ4:
-               if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4))
-                       return 0;
 
-               bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4);
-               break;
-       case BCH_COMPRESSION_GZIP:
-               if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
-                       return 0;
+       mutex_lock(&c->sb_lock);
 
-               bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP);
-               break;
+       if ((c->sb.features & f) == f) {
+               mutex_unlock(&c->sb_lock);
+               return 0;
+       }
+
+       ret = __bch2_fs_compress_init(c, c->sb.features|f);
+       if (ret) {
+               mutex_unlock(&c->sb_lock);
+               return ret;
        }
 
-       return bch2_fs_compress_init(c);
+       c->disk_sb.sb->features[0] |= cpu_to_le64(f);
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+int bch2_check_set_has_compressed_data(struct bch_fs *c,
+                                      unsigned compression_opt)
+{
+       unsigned compression_type = bch2_compression_decode(compression_opt).type;
+
+       BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+       return compression_type
+               ? __bch2_check_set_has_compressed_data(c,
+                               1ULL << bch2_compression_opt_to_feature[compression_type])
+               : 0;
 }
 
 void bch2_fs_compress_exit(struct bch_fs *c)
 {
-       vfree(c->zlib_workspace);
-       mempool_exit(&c->lz4_workspace_pool);
+       unsigned i;
+
+       mempool_exit(&c->decompress_workspace);
+       for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
+               mempool_exit(&c->compress_workspace[i]);
        mempool_exit(&c->compression_bounce[WRITE]);
        mempool_exit(&c->compression_bounce[READ]);
 }
 
-#define COMPRESSION_WORKSPACE_SIZE                                     \
-       max_t(size_t, zlib_inflate_workspacesize(),                     \
-             zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
+/*
+ * Initialize the mempools needed for the compression features in @features.
+ *
+ * @features is a 64-bit mask of BCH_FEATURE_* bit numbers. If none of the
+ * lz4, gzip or zstd bits is set, nothing is allocated and 0 is returned.
+ * Otherwise this sets up the READ and WRITE bounce pools (each sized to
+ * c->opts.encoded_extent_max), one compression workspace pool for every
+ * requested type, and a single decompression workspace pool sized to the
+ * largest decompression workspace of all listed types. Pools that are
+ * already initialized are skipped.
+ *
+ * c->zstd_workspace_size is always updated, even when no feature is set.
+ *
+ * Returns 0 on success or a negative BCH_ERR_ENOMEM_* code.
+ */
+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
+{
+       size_t decompress_workspace_size = 0;
+       ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
+                                                c->opts.encoded_extent_max);
+
+       c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
+
+       struct {
+               unsigned                        feature;
+               enum bch_compression_type       type;
+               size_t                          compress_workspace;
+               size_t                          decompress_workspace;
+       } compression_types[] = {
+               { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
+                       max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
+                       0 },
+               { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
+                       zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+                       zlib_inflate_workspacesize(), },
+               { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
+                       c->zstd_workspace_size,
+                       zstd_dctx_workspace_bound() },
+       }, *i;
+       bool have_compressed = false;
+
+       /*
+        * @features is a u64, so the mask must be built in 64 bits:
+        * BIT_ULL() rather than an int-typed "1 << n".
+        */
+       for (i = compression_types;
+            i < compression_types + ARRAY_SIZE(compression_types);
+            i++)
+               have_compressed |= (features & BIT_ULL(i->feature)) != 0;
+
+       if (!have_compressed)
+               return 0;
+
+       if (!mempool_initialized(&c->compression_bounce[READ]) &&
+           mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
+                                      1, c->opts.encoded_extent_max))
+               return -BCH_ERR_ENOMEM_compression_bounce_read_init;
+
+       if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
+           mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
+                                      1, c->opts.encoded_extent_max))
+               return -BCH_ERR_ENOMEM_compression_bounce_write_init;
+
+       for (i = compression_types;
+            i < compression_types + ARRAY_SIZE(compression_types);
+            i++) {
+               /*
+                * The decompression workspace is sized before the feature
+                * check, so it covers every listed type, enabled or not.
+                */
+               decompress_workspace_size =
+                       max(decompress_workspace_size, i->decompress_workspace);
+
+               if (!(features & BIT_ULL(i->feature)))
+                       continue;
+
+               if (mempool_initialized(&c->compress_workspace[i->type]))
+                       continue;
+
+               if (mempool_init_kvmalloc_pool(
+                               &c->compress_workspace[i->type],
+                               1, i->compress_workspace))
+                       return -BCH_ERR_ENOMEM_compression_workspace_init;
+       }
+
+       if (!mempool_initialized(&c->decompress_workspace) &&
+           mempool_init_kvmalloc_pool(&c->decompress_workspace,
+                                      1, decompress_workspace_size))
+               return -BCH_ERR_ENOMEM_decompression_workspace_init;
+
+       return 0;
+}
+
+/*
+ * Translate an encoded compression option @v into a one-bit feature mask,
+ * using the option's type as an index into bch2_compression_opt_to_feature[].
+ *
+ * NOTE(review): unlike bch2_check_set_has_compressed_data(), type 0 is not
+ * special-cased here and the index is not range-checked - confirm that
+ * callers only pass validated options and that the bit produced for type 0
+ * is intended.
+ */
+static u64 compression_opt_to_feature(unsigned v)
+{
+       struct bch_compression_opt decoded = bch2_compression_decode(v);
+
+       return BIT_ULL(bch2_compression_opt_to_feature[decoded.type]);
+}
 
+/*
+ * Initialize compression state for @c, covering the superblock feature bits
+ * plus whatever the current compression and background_compression options
+ * map to. Returns the result of __bch2_fs_compress_init().
+ */
 int bch2_fs_compress_init(struct bch_fs *c)
 {
-       unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9);
+       u64 features = c->sb.features |
+               compression_opt_to_feature(c->opts.compression) |
+               compression_opt_to_feature(c->opts.background_compression);
+
+       return __bch2_fs_compress_init(c, features);
+}
+
+/*
+ * Parse a compression option string of the form "type" or "type:level".
+ *
+ * @_val is duplicated before being tokenized, so the caller's string is not
+ * modified. The type is looked up in bch2_compression_opts[]. An optional
+ * level must be a decimal number no greater than 15, and must be 0 when the
+ * matched type index is 0.
+ *
+ * On success the encoded option is stored in *@res and 0 is returned. On
+ * failure a negative error code is returned and *@res is left untouched; for
+ * an invalid type or level a short message is also appended to @err when it
+ * is non-NULL. @c is not used by this function.
+ */
+int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
+                              struct printbuf *err)
+{
+       char *val = kstrdup(_val, GFP_KERNEL);
+       char *p = val, *type_str, *level_str;
+       struct bch_compression_opt opt = { 0 };
         int ret;
 
-       if (!bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
-           !bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
-               return 0;
+       if (!val)
+               return -ENOMEM;
 
-       if (!mempool_initialized(&c->compression_bounce[READ])) {
-               ret = mempool_init_page_pool(&c->compression_bounce[READ],
-                                            1, order);
-               if (ret)
-                       return ret;
-       }
+       type_str = strsep(&p, ":");
+       level_str = p;
 
-       if (!mempool_initialized(&c->compression_bounce[WRITE])) {
-               ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
-                                            1, order);
-               if (ret)
-                       return ret;
-       }
+       ret = match_string(bch2_compression_opts, -1, type_str);
+       if (ret < 0 && err)
+               prt_str(err, "invalid compression type");
+       if (ret < 0)
+               goto err;
 
-       if (!mempool_initialized(&c->lz4_workspace_pool) &&
-           bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) {
-               ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool,
-                                               1, LZ4_MEM_COMPRESS);
-               if (ret)
-                       return ret;
+       opt.type = ret;
+
+       if (level_str) {
+               unsigned level;
+
+               ret = kstrtouint(level_str, 10, &level);
+               if (!ret && !opt.type && level)
+                       ret = -EINVAL;
+               if (!ret && level > 15)
+                       ret = -EINVAL;
+               if (ret < 0 && err)
+                       prt_str(err, "invalid compression level");
+               if (ret < 0)
+                       goto err;
+
+               opt.level = level;
         }
 
-       if (!c->zlib_workspace &&
-           bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) {
-               c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
-               if (!c->zlib_workspace)
-                       return -ENOMEM;
+       /*
+        * Without a level suffix, ret still holds the non-negative index
+        * returned by match_string(); reset it so success is always 0.
+        */
+       ret = 0;
+       *res = bch2_compression_encode(opt);
+err:
+       kfree(val);
+       return ret;
+}
+
+/*
+ * Print the encoded compression option @v to @out as "name" or "name:level".
+ *
+ * A type outside bch2_compression_opts[] is printed as an "unknown" marker
+ * carrying the numeric type. The ":level" suffix is only emitted for a
+ * nonzero level.
+ */
+void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
+{
+       struct bch_compression_opt decoded = bch2_compression_decode(v);
+
+       if (decoded.type >= BCH_COMPRESSION_OPT_NR)
+               prt_printf(out, "(unknown compression opt %u)", decoded.type);
+       else
+               prt_str(out, bch2_compression_opts[decoded.type]);
+
+       if (decoded.level)
+               prt_printf(out, ":%u", decoded.level);
+}
+
+/*
+ * Option-table wrapper around bch2_compression_opt_to_text(): prints the
+ * encoded compression option @v to @out. @c and @sb are not used.
+ */
+void bch2_opt_compression_to_text(struct printbuf *out,
+                                 struct bch_fs *c,
+                                 struct bch_sb *sb,
+                                 u64 v)
+{
+       /* Plain call: a void function must not "return" an expression. */
+       bch2_compression_opt_to_text(out, v);
+}
+
+int bch2_opt_compression_validate(u64 v, struct printbuf *err)
+{
+       if (!bch2_compression_opt_valid(v)) {
+               prt_printf(err, "invalid compression opt %llu", v);
+               return -BCH_ERR_invalid_sb_opt_compression;
        }
 
        return 0;