X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fio.c;h=3c614c864b6eef698c9dac8ebc1884d7f4a61bed;hb=8d5e53b88aaafe7c01fc369e52dbd1fc8955a77d;hp=15ce0657c37b5830d2c5aeb6438e384a9b6ec75d;hpb=30cca2e94d0dfa8c3151daf1393f402d32bb9407;p=bcachefs-tools-debian

diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 15ce065..3c614c8 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -32,14 +32,13 @@
 #include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
+#include "trace.h"
 
 #include
 #include
 #include
 #include
 
-#include
-
 const char *bch2_blk_status_to_str(blk_status_t status)
 {
 	if (status == BLK_STS_REMOVED)
@@ -164,7 +163,7 @@ static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
 	struct page *page;
 
 	if (likely(!*using_mempool)) {
-		page = alloc_page(GFP_NOIO);
+		page = alloc_page(GFP_NOFS);
 		if (unlikely(!page)) {
 			mutex_lock(&c->bio_bounce_pages_lock);
 			*using_mempool = true;
@@ -173,7 +172,7 @@ static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
 		}
 	} else {
 pool_alloc:
-		page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+		page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
 	}
 
 	return page;
@@ -218,7 +217,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
 
 	bch2_trans_copy_iter(&iter, extent_iter);
 
-	for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
+	for_each_btree_key_upto_continue_norestart(iter,
+				new->k.p, BTREE_ITER_SLOTS, old, ret) {
 		s64 sectors = min(new->k.p.offset, old.k->p.offset) -
 			max(bkey_start_offset(&new->k),
 			    bkey_start_offset(old.k));
@@ -257,15 +257,14 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
 	unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
 	int ret;
 
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
-			     SPOS(0,
-				  extent_iter->pos.inode,
-				  extent_iter->snapshot),
-			     BTREE_ITER_INTENT|BTREE_ITER_CACHED);
-	k = bch2_bkey_get_mut(trans, &iter);
+	k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+			      SPOS(0,
+				   extent_iter->pos.inode,
+				   extent_iter->snapshot),
+			      BTREE_ITER_CACHED);
 	ret = PTR_ERR_OR_ZERO(k);
 	if (unlikely(ret))
-		goto err;
+		return ret;
 
 	if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
 		k = bch2_inode_to_v3(trans, k);
@@ -381,9 +380,10 @@ int bch2_extent_fallocate(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	struct disk_reservation disk_res = { 0 };
 	struct closure cl;
-	struct open_buckets open_buckets;
+	struct open_buckets open_buckets = { 0 };
 	struct bkey_s_c k;
 	struct bkey_buf old, new;
+	unsigned sectors_allocated = 0;
 	bool have_reservation = false;
 	bool unwritten = opts.nocow &&
 	    c->sb.version >= bcachefs_metadata_version_unwritten_extents;
@@ -392,8 +392,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
 	bch2_bkey_buf_init(&old);
 	bch2_bkey_buf_init(&new);
 	closure_init_stack(&cl);
-	open_buckets.nr = 0;
-retry:
+
 	k = bch2_btree_iter_peek_slot(iter);
 	ret = bkey_err(k);
 	if (ret)
@@ -411,14 +410,14 @@ retry:
 		 */
 		ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
 		if (unlikely(ret))
-			goto out;
+			goto err;
 
 		bch2_bkey_buf_reassemble(&old, c, k);
 	}
 
 	if (have_reservation) {
 		if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
-			goto out;
+			goto err;
 
 		bch2_key_resize(&new.k->k, sectors);
 	} else if (!unwritten) {
@@ -449,16 +448,14 @@ retry:
 				       &devs_have,
 				       opts.data_replicas,
 				       opts.data_replicas,
-				       RESERVE_none, 0, &cl, &wp);
-		if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
-			bch2_trans_unlock(trans);
-			closure_sync(&cl);
-			goto retry;
-		}
+				       BCH_WATERMARK_normal, 0, &cl, &wp);
+		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+			ret = -BCH_ERR_transaction_restart_nested;
 		if (ret)
-			return ret;
+			goto err;
 
 		sectors = min(sectors, wp->sectors_free);
+		sectors_allocated = sectors;
 
 		bch2_key_resize(&e->k, sectors);
 
@@ -474,22 +471,20 @@ retry:
 	ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
 				 0, i_sectors_delta, true);
-out:
-	if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) {
-		bch2_trans_unlock(trans);
-		closure_sync(&cl);
-	}
-
-	if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-		bch2_trans_begin(trans);
-		goto retry;
-	}
+err:
+	if (!ret && sectors_allocated)
+		bch2_increment_clock(c, sectors_allocated, WRITE);
 
 	bch2_open_buckets_put(c, &open_buckets);
 	bch2_disk_reservation_put(c, &disk_res);
 	bch2_bkey_buf_exit(&new, c);
 	bch2_bkey_buf_exit(&old, c);
 
+	if (closure_nr_remaining(&cl) != 1) {
+		bch2_trans_unlock(trans);
+		closure_sync(&cl);
+	}
+
 	return ret;
 }
@@ -654,7 +649,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
 		if (to_entry(ptr + 1) < ptrs.end) {
 			n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
-						GFP_NOIO, &ca->replica_set));
+						GFP_NOFS, &ca->replica_set));
 
 			n->bio.bi_end_io	= wbio->bio.bi_end_io;
 			n->bio.bi_private	= wbio->bio.bi_private;
@@ -704,11 +699,14 @@ static void bch2_write_done(struct closure *cl)
 	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 	struct bch_fs *c = op->c;
 
-	bch2_disk_reservation_put(c, &op->res);
-	bch2_write_ref_put(c, BCH_WRITE_REF_write);
-	bch2_keylist_free(&op->insert_keys, op->inline_keys);
+	EBUG_ON(op->open_buckets.nr);
 
 	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+	bch2_disk_reservation_put(c, &op->res);
+
+	if (!(op->flags & BCH_WRITE_MOVE))
+		bch2_write_ref_put(c, BCH_WRITE_REF_write);
+	bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
 	EBUG_ON(cl->parent);
 	closure_debug_destroy(cl);
@@ -734,7 +732,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
 		}
 
 		if (dst != src)
-			memmove_u64s_down(dst, src, src->u64s);
+			memmove_u64s_down(dst, src, src->k.u64s);
 		dst = bkey_next(dst);
 	}
 
@@ -834,22 +832,32 @@ static void bch2_write_index(struct closure *cl)
 	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 	struct write_point *wp = op->wp;
 	struct workqueue_struct *wq = index_update_wq(op);
+	unsigned long flags;
 
-	barrier();
+	if ((op->flags & BCH_WRITE_DONE) &&
+	    (op->flags & BCH_WRITE_MOVE))
+		bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
 
-	/*
-	 * We're not using wp->writes_lock here, so this is racey: that's ok,
-	 * because this is just for diagnostic purposes, and we're running out
-	 * of interrupt context here so if we were to take the lock we'd have to
-	 * switch to spin_lock_irq()/irqsave(), which is not free:
-	 */
+	spin_lock_irqsave(&wp->writes_lock, flags);
 	if (wp->state == WRITE_POINT_waiting_io)
 		__wp_update_state(wp, WRITE_POINT_waiting_work);
+	list_add_tail(&op->wp_list, &wp->writes);
+	spin_unlock_irqrestore(&wp->writes_lock, flags);
 
-	op->btree_update_ready = true;
 	queue_work(wq, &wp->index_update_work);
 }
 
+static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
+{
+	op->wp = wp;
+
+	if (wp->state == WRITE_POINT_stopped) {
+		spin_lock_irq(&wp->writes_lock);
+		__wp_update_state(wp, WRITE_POINT_waiting_io);
+		spin_unlock_irq(&wp->writes_lock);
+	}
+}
+
 void bch2_write_point_do_index_updates(struct work_struct *work)
 {
@@ -857,16 +865,12 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
 	struct bch_write_op *op;
 
 	while (1) {
-		spin_lock(&wp->writes_lock);
-		list_for_each_entry(op, &wp->writes, wp_list)
-			if (op->btree_update_ready) {
-				list_del(&op->wp_list);
-				goto unlock;
-			}
-		op = NULL;
-unlock:
+		spin_lock_irq(&wp->writes_lock);
+		op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+		if (op)
+			list_del(&op->wp_list);
 		wp_update_state(wp, op != NULL);
-		spin_unlock(&wp->writes_lock);
+		spin_unlock_irq(&wp->writes_lock);
 
 		if (!op)
 			break;
@@ -963,7 +967,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
 	pages = min(pages, BIO_MAX_VECS);
 
 	bio = bio_alloc_bioset(NULL, pages, 0,
-			       GFP_NOIO, &c->bio_write);
+			       GFP_NOFS, &c->bio_write);
 	wbio			= wbio_init(bio);
 	wbio->put_bio		= true;
 	/* copy WRITE_SYNC flag */
@@ -1065,11 +1069,12 @@ static enum prep_encoded_ret {
 	/* Can we just write the entire extent as is? */
 	if (op->crc.uncompressed_size == op->crc.live_size &&
 	    op->crc.compressed_size <= wp->sectors_free &&
-	    (op->crc.compression_type == op->compression_type ||
+	    (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
 	     op->incompressible)) {
 		if (!crc_is_compressed(op->crc) &&
 		    op->csum_type != op->crc.csum_type &&
-		    bch2_write_rechecksum(c, op, op->csum_type))
+		    bch2_write_rechecksum(c, op, op->csum_type) &&
+		    !c->opts.no_data_io)
 			return PREP_ENCODED_CHECKSUM_ERR;
 
 		return PREP_ENCODED_DO_WRITE;
@@ -1089,7 +1094,7 @@ static enum prep_encoded_ret {
 		csum = bch2_checksum_bio(c, op->crc.csum_type,
 					 extent_nonce(op->version, op->crc),
 					 bio);
-		if (bch2_crc_cmp(op->crc.csum, csum))
+		if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
 			return PREP_ENCODED_CHECKSUM_ERR;
 
 		if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
@@ -1107,13 +1112,14 @@ static enum prep_encoded_ret {
 	 */
 	if ((op->crc.live_size != op->crc.uncompressed_size ||
 	     op->crc.csum_type != op->csum_type) &&
-	    bch2_write_rechecksum(c, op, op->csum_type))
+	    bch2_write_rechecksum(c, op, op->csum_type) &&
+	    !c->opts.no_data_io)
 		return PREP_ENCODED_CHECKSUM_ERR;
 
 	/*
 	 * If we want to compress the data, it has to be decrypted:
 	 */
-	if ((op->compression_type ||
+	if ((op->compression_opt ||
 	     bch2_csum_type_is_encryption(op->crc.csum_type) !=
 	     bch2_csum_type_is_encryption(op->csum_type)) &&
 	    bch2_write_decrypt(op))
@@ -1160,7 +1166,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 	}
 
 	if (ec_buf ||
-	    op->compression_type ||
+	    op->compression_opt ||
 	    (op->csum_type &&
 	     !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
 	    (bch2_csum_type_is_encryption(op->csum_type) &&
@@ -1183,16 +1189,16 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 		    dst->bi_iter.bi_size < c->opts.encoded_extent_max)
 			break;
 
-		BUG_ON(op->compression_type &&
+		BUG_ON(op->compression_opt &&
 		       (op->flags & BCH_WRITE_DATA_ENCODED) &&
 		       bch2_csum_type_is_encryption(op->crc.csum_type));
-		BUG_ON(op->compression_type && !bounce);
+		BUG_ON(op->compression_opt && !bounce);
 
 		crc.compression_type = op->incompressible
 			? BCH_COMPRESSION_TYPE_incompressible
-			: op->compression_type
+			: op->compression_opt
 			? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
-					    op->compression_type)
+					    op->compression_opt)
 			: 0;
 		if (!crc_is_compressed(crc)) {
 			dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
@@ -1301,7 +1307,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 	BUG_ON(total_output != total_input);
 
 	dst = bio_split(src, total_input >> 9,
-			GFP_NOIO, &c->bio_write);
+			GFP_NOFS, &c->bio_write);
 	wbio_init(dst)->put_bio = true;
 	/* copy WRITE_SYNC flag */
 	dst->bi_opf		= src->bi_opf;
@@ -1380,7 +1386,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
 		return 0;
 	}
 
-	new = bch2_bkey_make_mut(trans, k);
+	new = bch2_bkey_make_mut_noupdate(trans, k);
 	ret = PTR_ERR_OR_ZERO(new);
 	if (ret)
 		return ret;
@@ -1467,7 +1473,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bkey_ptrs_c ptrs;
-	const struct bch_extent_ptr *ptr, *ptr2;
+	const struct bch_extent_ptr *ptr;
 	struct {
 		struct bpos	b;
 		unsigned	gen;
@@ -1522,11 +1528,12 @@ retry:
 						      bucket_to_u64(buckets[nr_buckets].b));
 
 			prefetch(buckets[nr_buckets].l);
-			nr_buckets++;
 
 			if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
 				goto err_get_ioref;
 
+			nr_buckets++;
+
 			if (ptr->unwritten)
 				op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
 		}
@@ -1617,12 +1624,8 @@ err:
 	}
 	return;
 err_get_ioref:
-	bkey_for_each_ptr(ptrs, ptr2) {
-		if (ptr2 == ptr)
-			break;
-
-		percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
-	}
+	for (i = 0; i < nr_buckets; i++)
+		percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
 
 	/* Fall back to COW path: */
 	goto out;
@@ -1631,12 +1634,11 @@ err_bucket_stale:
 		bch2_bucket_nocow_unlock(&c->nocow_locks,
 					 buckets[i].b,
 					 BUCKET_NOCOW_LOCK_UPDATE);
-
-	bkey_for_each_ptr(ptrs, ptr2)
-		percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
+	for (i = 0; i < nr_buckets; i++)
+		percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
 
 	/* We can retry this: */
-	ret = BCH_ERR_transaction_restart;
+	ret = -BCH_ERR_transaction_restart;
 	goto out;
 }
@@ -1657,7 +1659,6 @@ static void __bch2_write(struct bch_write_op *op)
 	}
 again:
 	memset(&op->failed, 0, sizeof(op->failed));
-	op->btree_update_ready = false;
 
 	do {
 		struct bkey_i *key_to_write;
@@ -1688,7 +1689,7 @@ again:
 				&op->devs_have,
 				op->nr_replicas,
 				op->nr_replicas_required,
-				op->alloc_reserve,
+				op->watermark,
 				op->flags,
 				(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
 					      BCH_WRITE_ONLY_SPECIFIED_DEVS))
@@ -1708,15 +1709,6 @@ again:
 		bch2_alloc_sectors_done_inlined(c, wp);
 err:
 		if (ret <= 0) {
-			if (!(op->flags & BCH_WRITE_SYNC)) {
-				spin_lock(&wp->writes_lock);
-				op->wp = wp;
-				list_add_tail(&op->wp_list, &wp->writes);
-				if (wp->state == WRITE_POINT_stopped)
-					__wp_update_state(wp, WRITE_POINT_waiting_io);
-				spin_unlock(&wp->writes_lock);
-			}
-
 			op->flags |= BCH_WRITE_DONE;
 
 			if (ret < 0) {
@@ -1755,6 +1747,7 @@ err:
 			goto again;
 		bch2_write_done(&op->cl);
 	} else {
+		bch2_write_queue(op, wp);
 		continue_at(&op->cl, bch2_write_index, NULL);
 	}
 out_nofs_restore:
@@ -1845,7 +1838,12 @@ void bch2_write(struct closure *cl)
 		goto err;
 	}
 
-	if (c->opts.nochanges ||
+	if (c->opts.nochanges) {
+		op->error = -BCH_ERR_erofs_no_writes;
+		goto err;
+	}
+
+	if (!(op->flags & BCH_WRITE_MOVE) &&
 	    !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
 		op->error = -BCH_ERR_erofs_no_writes;
 		goto err;
 	}
@@ -1873,6 +1871,34 @@ err:
 	op->end_io(op);
 }
 
+static const char * const bch2_write_flags[] = {
+#define x(f)	#f,
+	BCH_WRITE_FLAGS()
+#undef x
+	NULL
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+	prt_str(out, "pos: ");
+	bch2_bpos_to_text(out, op->pos);
+	prt_newline(out);
+	printbuf_indent_add(out, 2);
+
+	prt_str(out, "started: ");
+	bch2_pr_time_units(out, local_clock() - op->start_time);
+	prt_newline(out);
+
+	prt_str(out, "flags: ");
+	prt_bitflags(out, bch2_write_flags, op->flags);
+	prt_newline(out);
+
+	prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
+	prt_newline(out);
+
+	printbuf_indent_sub(out, 2);
+}
+
 /* Cache promotion on read */
 
 struct promote_op {
@@ -1980,7 +2006,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
 	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
 		return NULL;
 
-	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
 	if (!op)
 		goto err;
 
@@ -1993,7 +2019,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
 	 */
 	*rbio = kzalloc(sizeof(struct bch_read_bio) +
 			sizeof(struct bio_vec) * pages,
-			GFP_NOIO);
+			GFP_NOFS);
 	if (!*rbio)
 		goto err;
 
@@ -2001,7 +2027,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
 	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
 
 	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
-				 GFP_NOIO))
+				 GFP_NOFS))
 		goto err;
 
 	(*rbio)->bounce		= true;
@@ -2024,14 +2050,17 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
 			.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
 			},
 			btree_id, k);
-	if (ret == -BCH_ERR_nocow_lock_blocked) {
+	/*
+	 * possible errors: -BCH_ERR_nocow_lock_blocked,
+	 * -BCH_ERR_ENOSPC_disk_reservation:
+	 */
+	if (ret) {
 		ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
 					     bch_promote_params);
 		BUG_ON(ret);
 		goto err;
 	}
 
-	BUG_ON(ret);
 	op->write.op.end_io = promote_done;
 
 	return op;
@@ -2278,9 +2307,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
 	if (crc_is_compressed(rbio->pick.crc))
 		return 0;
 
-	bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos,
-			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-	k = bch2_btree_iter_peek_slot(&iter);
+	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+			       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 	if ((ret = bkey_err(k)))
 		goto out;
 
@@ -2381,7 +2409,8 @@ static void __bch2_read_endio(struct work_struct *work)
 		if (ret)
 			goto decrypt_err;
 
-		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
+		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+		    !c->opts.no_data_io)
 			goto decompression_err;
 	} else {
 		/* don't need to decrypt the entire bio: */
@@ -2397,6 +2426,7 @@ static void __bch2_read_endio(struct work_struct *work)
 
 		if (rbio->bounce) {
 			struct bvec_iter src_iter = src->bi_iter;
+
 			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
 		}
 	}
@@ -2516,10 +2546,8 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
 	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
 		*offset_into_extent;
 
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
-			     POS(0, reflink_offset),
-			     BTREE_ITER_SLOTS);
-	k = bch2_btree_iter_peek_slot(&iter);
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
+			       POS(0, reflink_offset), 0);
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
@@ -2716,7 +2744,7 @@ get_bio:
 		rbio = rbio_init(bio_alloc_bioset(NULL,
 						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
 						  0,
-						  GFP_NOIO,
+						  GFP_NOFS,
 						  &c->bio_read_split),
 				 orig->opts);
 
@@ -2732,7 +2760,7 @@ get_bio:
 		 * from the whole bio, in which case we don't want to retry and
 		 * lose the error)
 		 */
-		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
+		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
 						 &c->bio_read_split),
 				 orig->opts);
 		rbio->bio.bi_iter = iter;
@@ -2998,18 +3026,26 @@ void bch2_fs_io_exit(struct bch_fs *c)
 int bch2_fs_io_init(struct bch_fs *c)
 {
 	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
-			BIOSET_NEED_BVECS) ||
-	    bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
-			BIOSET_NEED_BVECS) ||
-	    bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
-			BIOSET_NEED_BVECS) ||
-	    mempool_init_page_pool(&c->bio_bounce_pages,
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_read_init;
+
+	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+	if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_write_init;
+
+	if (mempool_init_page_pool(&c->bio_bounce_pages,
 			    max_t(unsigned,
 				  c->opts.btree_node_size,
 				  c->opts.encoded_extent_max) /
-			    PAGE_SIZE, 0) ||
-	    rhashtable_init(&c->promote_table, &bch_promote_params))
-		return -ENOMEM;
+			    PAGE_SIZE, 0))
+		return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+	if (rhashtable_init(&c->promote_table, &bch_promote_params))
+		return -BCH_ERR_ENOMEM_promote_table_init;
 
 	return 0;
 }