1 // SPDX-License-Identifier: GPL-2.0
3 * Some low level IO code, and hacks for various block layer limitations
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc.
10 #include "alloc_background.h"
11 #include "alloc_foreground.h"
14 #include "btree_update.h"
19 #include "data_update.h"
21 #include "disk_groups.h"
24 #include "extent_update.h"
30 #include "rebalance.h"
31 #include "subvolume.h"
35 #include <linux/blkdev.h>
36 #include <linux/random.h>
37 #include <linux/sched/mm.h>
39 #include <trace/events/bcachefs.h>
41 const char *bch2_blk_status_to_str(blk_status_t status)
43 if (status == BLK_STS_REMOVED)
44 return "device removed";
45 return blk_status_to_str(status);
48 static bool bch2_target_congested(struct bch_fs *c, u16 target)
50 const struct bch_devs_mask *devs;
51 unsigned d, nr = 0, total = 0;
52 u64 now = local_clock(), last;
60 devs = bch2_target_to_mask(c, target) ?:
61 &c->rw_devs[BCH_DATA_user];
63 for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
64 ca = rcu_dereference(c->devs[d]);
68 congested = atomic_read(&ca->congested);
69 last = READ_ONCE(ca->congested_last);
70 if (time_after64(now, last))
71 congested -= (now - last) >> 12;
73 total += max(congested, 0LL);
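/*
 * Back off probabilistically: return true with probability
 * total / (nr * CONGESTED_MAX), i.e. proportional to the average
 * congestion across the target's devices.
 */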
78 return bch2_rand_range(nr * CONGESTED_MAX) < total;
81 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
85 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
86 /* ideally we'd be taking into account the device's variance here: */
87 u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
88 s64 latency_over = io_latency - latency_threshold;
90 if (latency_threshold && latency_over > 0) {
92 * bump up congested by approximately latency_over * 4 /
93 * latency_threshold - we don't need much accuracy here so don't
94 * bother with the divide:
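* (e.g. with latency_threshold == 1024 and latency_over == 512, the shift
* below is ilog2(1024) - 2 == 8, so we add 512 >> 8 == 2, matching
* 512 * 4 / 1024)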
96 if (atomic_read(&ca->congested) < CONGESTED_MAX)
97 atomic_add(latency_over >>
98 max_t(int, ilog2(latency_threshold) - 2, 0),
101 ca->congested_last = now;
102 } else if (atomic_read(&ca->congested) > 0) {
103 atomic_dec(&ca->congested);
107 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
109 atomic64_t *latency = &ca->cur_latency[rw];
110 u64 now = local_clock();
111 u64 io_latency = time_after64(now, submit_time)
114 u64 old, new, v = atomic64_read(latency);
120 * If the io latency was reasonably close to the current
121 * latency, skip doing the update and atomic operation - most of
124 if (abs((int) (old - io_latency)) < (old >> 1) &&
128 new = ewma_add(old, io_latency, 5);
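/*
 * ewma_add(old, v, 5) computes roughly old + (v - old) / 32, i.e. an
 * exponentially weighted moving average with weight 1/32
 */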
129 } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
131 bch2_congested_acct(ca, io_latency, now, rw);
133 __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
136 /* Allocate, free from mempool: */
138 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
140 struct bvec_iter_all iter;
143 bio_for_each_segment_all(bv, bio, iter)
144 if (bv->bv_page != ZERO_PAGE(0))
145 mempool_free(bv->bv_page, &c->bio_bounce_pages);
149 static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
153 if (likely(!*using_mempool)) {
154 page = alloc_page(GFP_NOIO);
155 if (unlikely(!page)) {
156 mutex_lock(&c->bio_bounce_pages_lock);
157 *using_mempool = true;
163 page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
169 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
172 bool using_mempool = false;
175 struct page *page = __bio_alloc_page_pool(c, &using_mempool);
176 unsigned len = min_t(size_t, PAGE_SIZE, size);
178 BUG_ON(!bio_add_page(bio, page, len, 0));
183 mutex_unlock(&c->bio_bounce_pages_lock);
186 /* Extent update path: */
188 int bch2_sum_sector_overwrites(struct btree_trans *trans,
189 struct btree_iter *extent_iter,
191 bool *usage_increasing,
192 s64 *i_sectors_delta,
193 s64 *disk_sectors_delta)
195 struct bch_fs *c = trans->c;
196 struct btree_iter iter;
198 unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
199 bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
202 *usage_increasing = false;
203 *i_sectors_delta = 0;
204 *disk_sectors_delta = 0;
206 bch2_trans_copy_iter(&iter, extent_iter);
208 for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
209 s64 sectors = min(new->k.p.offset, old.k->p.offset) -
210 max(bkey_start_offset(&new->k),
211 bkey_start_offset(old.k));
213 *i_sectors_delta += sectors *
214 (bkey_extent_is_allocation(&new->k) -
215 bkey_extent_is_allocation(old.k));
217 *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
218 *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
219 ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
222 if (!*usage_increasing &&
223 (new->k.p.snapshot != old.k->p.snapshot ||
224 new_replicas > bch2_bkey_replicas(c, old) ||
225 (!new_compressed && bch2_bkey_sectors_compressed(old))))
226 *usage_increasing = true;
228 if (bkey_ge(old.k->p, new->k.p))
232 bch2_trans_iter_exit(trans, &iter);
236 static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
237 struct btree_iter *extent_iter,
241 struct btree_iter iter;
243 struct bkey_i_inode_v3 *inode;
246 bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
248 extent_iter->pos.inode,
249 extent_iter->snapshot),
250 BTREE_ITER_INTENT|BTREE_ITER_CACHED);
251 k = bch2_bkey_get_mut(trans, &iter);
252 ret = PTR_ERR_OR_ZERO(k);
256 if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
257 k = bch2_inode_to_v3(trans, k);
258 ret = PTR_ERR_OR_ZERO(k);
263 inode = bkey_i_to_inode_v3(k);
265 if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
266 new_i_size > le64_to_cpu(inode->v.bi_size))
267 inode->v.bi_size = cpu_to_le64(new_i_size);
269 le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
271 inode->k.p.snapshot = iter.snapshot;
273 ret = bch2_trans_update(trans, &iter, &inode->k_i,
274 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
276 bch2_trans_iter_exit(trans, &iter);
280 int bch2_extent_update(struct btree_trans *trans,
282 struct btree_iter *iter,
284 struct disk_reservation *disk_res,
286 s64 *i_sectors_delta_total,
289 struct bpos next_pos;
290 bool usage_increasing;
291 s64 i_sectors_delta = 0, disk_sectors_delta = 0;
295 * This traverses the iterator without changing iter->path->pos to
296 * search_key() (which is pos + 1 for extents): we want there to be a
297 * path already traversed at iter->pos because
298 * bch2_trans_extent_update() will use it to attempt extent merging
300 ret = __bch2_btree_iter_traverse(iter);
304 ret = bch2_extent_trim_atomic(trans, iter, k);
310 ret = bch2_sum_sector_overwrites(trans, iter, k,
313 &disk_sectors_delta);
318 disk_sectors_delta > (s64) disk_res->sectors) {
319 ret = bch2_disk_reservation_add(trans->c, disk_res,
320 disk_sectors_delta - disk_res->sectors,
321 !check_enospc || !usage_increasing
322 ? BCH_DISK_RESERVATION_NOFAIL : 0);
329 * We always have to do an inode update - even when i_size/i_sectors
330 * aren't changing - for fsync to work properly; fsync relies on
331 * inode->bi_journal_seq which is updated by the trigger code:
333 ret = bch2_extent_update_i_size_sectors(trans, iter,
334 min(k->k.p.offset << 9, new_i_size),
336 bch2_trans_update(trans, iter, k, 0) ?:
337 bch2_trans_commit(trans, disk_res, NULL,
338 BTREE_INSERT_NOCHECK_RW|
339 BTREE_INSERT_NOFAIL);
343 if (i_sectors_delta_total)
344 *i_sectors_delta_total += i_sectors_delta;
345 bch2_btree_iter_set_pos(iter, next_pos);
349 /* Overwrites whatever was present with zeroes: */
350 int bch2_extent_fallocate(struct btree_trans *trans,
352 struct btree_iter *iter,
354 struct bch_io_opts opts,
355 s64 *i_sectors_delta,
356 struct write_point_specifier write_point)
358 struct bch_fs *c = trans->c;
359 struct disk_reservation disk_res = { 0 };
361 struct open_buckets open_buckets;
363 struct bkey_buf old, new;
364 bool have_reservation = false;
365 bool unwritten = opts.nocow &&
366 c->sb.version >= bcachefs_metadata_version_unwritten_extents;
369 bch2_bkey_buf_init(&old);
370 bch2_bkey_buf_init(&new);
371 closure_init_stack(&cl);
374 k = bch2_btree_iter_peek_slot(iter);
379 sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
381 if (!have_reservation) {
382 unsigned new_replicas =
383 max(0, (int) opts.data_replicas -
384 (int) bch2_bkey_nr_ptrs_fully_allocated(k));
386 * Get a disk reservation before (in the nocow case) calling
387 * into the allocator:
389 ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
393 bch2_bkey_buf_reassemble(&old, c, k);
396 if (have_reservation) {
397 if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
400 bch2_key_resize(&new.k->k, sectors);
401 } else if (!unwritten) {
402 struct bkey_i_reservation *reservation;
404 bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
405 reservation = bkey_reservation_init(new.k);
406 reservation->k.p = iter->pos;
407 bch2_key_resize(&reservation->k, sectors);
408 reservation->v.nr_replicas = opts.data_replicas;
410 struct bkey_i_extent *e;
411 struct bch_devs_list devs_have;
412 struct write_point *wp;
413 struct bch_extent_ptr *ptr;
417 bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
419 e = bkey_extent_init(new.k);
422 ret = bch2_alloc_sectors_start_trans(trans,
423 opts.foreground_target,
429 RESERVE_none, 0, &cl, &wp);
430 if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
431 bch2_trans_unlock(trans);
438 sectors = min(sectors, wp->sectors_free);
440 bch2_key_resize(&e->k, sectors);
442 bch2_open_bucket_get(c, wp, &open_buckets);
443 bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
444 bch2_alloc_sectors_done(c, wp);
446 extent_for_each_ptr(extent_i_to_s(e), ptr)
447 ptr->unwritten = true;
450 have_reservation = true;
452 ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
453 0, i_sectors_delta, true);
455 if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) {
456 bch2_trans_unlock(trans);
460 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
461 bch2_trans_begin(trans);
465 bch2_open_buckets_put(c, &open_buckets);
466 bch2_disk_reservation_put(c, &disk_res);
467 bch2_bkey_buf_exit(&new, c);
468 bch2_bkey_buf_exit(&old, c);
474 * Returns -BCH_ERR_transaction_restart if we had to drop locks:
476 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
477 subvol_inum inum, u64 end,
478 s64 *i_sectors_delta)
480 struct bch_fs *c = trans->c;
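/* the largest extent one delete key can cover, aligned to the block size */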
481 unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
482 struct bpos end_pos = POS(inum.inum, end);
484 int ret = 0, ret2 = 0;
488 bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
489 struct disk_reservation disk_res =
490 bch2_disk_reservation_init(c, 0);
491 struct bkey_i delete;
496 bch2_trans_begin(trans);
498 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
502 bch2_btree_iter_set_snapshot(iter, snapshot);
505 * peek_upto() doesn't have ideal semantics for extents:
507 k = bch2_btree_iter_peek_upto(iter, end_pos);
515 BUG_ON(bkey_ge(iter->pos, end_pos));
517 bkey_init(&delete.k);
518 delete.k.p = iter->pos;
520 /* create the biggest key we can */
521 bch2_key_resize(&delete.k, max_sectors);
522 bch2_cut_back(end_pos, &delete);
524 ret = bch2_extent_update(trans, inum, iter, &delete,
525 &disk_res, 0, i_sectors_delta, false);
526 bch2_disk_reservation_put(c, &disk_res);
529 BUG_ON(bkey_gt(iter->pos, end_pos));
534 int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
535 s64 *i_sectors_delta)
537 struct btree_trans trans;
538 struct btree_iter iter;
541 bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
542 bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
543 POS(inum.inum, start),
546 ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta);
548 bch2_trans_iter_exit(&trans, &iter);
549 bch2_trans_exit(&trans);
551 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
557 static int bch2_write_index_default(struct bch_write_op *op)
559 struct bch_fs *c = op->c;
561 struct keylist *keys = &op->insert_keys;
562 struct bkey_i *k = bch2_keylist_front(keys);
563 struct btree_trans trans;
564 struct btree_iter iter;
566 .subvol = op->subvol,
567 .inum = k->k.p.inode,
571 BUG_ON(!inum.subvol);
573 bch2_bkey_buf_init(&sk);
574 bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
577 bch2_trans_begin(&trans);
579 k = bch2_keylist_front(keys);
580 bch2_bkey_buf_copy(&sk, c, k);
582 ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
583 &sk.k->k.p.snapshot);
584 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
589 bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
590 bkey_start_pos(&sk.k->k),
591 BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
593 ret = bch2_extent_update(&trans, inum, &iter, sk.k,
595 op->new_i_size, &op->i_sectors_delta,
596 op->flags & BCH_WRITE_CHECK_ENOSPC);
597 bch2_trans_iter_exit(&trans, &iter);
599 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
604 if (bkey_ge(iter.pos, k->k.p))
605 bch2_keylist_pop_front(&op->insert_keys);
607 bch2_cut_front(iter.pos, k);
608 } while (!bch2_keylist_empty(keys));
610 bch2_trans_exit(&trans);
611 bch2_bkey_buf_exit(&sk, c);
618 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
619 enum bch_data_type type,
620 const struct bkey_i *k,
623 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
624 const struct bch_extent_ptr *ptr;
625 struct bch_write_bio *n;
628 BUG_ON(c->opts.nochanges);
630 bkey_for_each_ptr(ptrs, ptr) {
631 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
634 ca = bch_dev_bkey_exists(c, ptr->dev);
636 if (to_entry(ptr + 1) < ptrs.end) {
637 n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
638 GFP_NOIO, &ca->replica_set));
640 n->bio.bi_end_io = wbio->bio.bi_end_io;
641 n->bio.bi_private = wbio->bio.bi_private;
646 n->bio.bi_opf = wbio->bio.bi_opf;
647 bio_inc_remaining(&wbio->bio);
655 n->have_ioref = nocow || bch2_dev_get_ioref(ca,
656 type == BCH_DATA_btree ? READ : WRITE);
658 n->submit_time = local_clock();
659 n->inode_offset = bkey_start_offset(&k->k);
660 n->bio.bi_iter.bi_sector = ptr->offset;
662 if (likely(n->have_ioref)) {
663 this_cpu_add(ca->io_done->sectors[WRITE][type],
664 bio_sectors(&n->bio));
666 bio_set_dev(&n->bio, ca->disk_sb.bdev);
669 n->bio.bi_status = BLK_STS_REMOVED;
675 static void __bch2_write(struct bch_write_op *);
677 static void bch2_write_done(struct closure *cl)
679 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
680 struct bch_fs *c = op->c;
682 bch2_disk_reservation_put(c, &op->res);
683 percpu_ref_put(&c->writes);
684 bch2_keylist_free(&op->insert_keys, op->inline_keys);
686 bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
688 closure_debug_destroy(cl);
693 static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
695 struct keylist *keys = &op->insert_keys;
696 struct bch_extent_ptr *ptr;
697 struct bkey_i *src, *dst = keys->keys, *n;
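/*
 * Strip pointers to failed devices from each key; a key left with no
 * pointers at all means the write failed entirely.
 */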
699 for (src = keys->keys; src != keys->top; src = n) {
702 if (bkey_extent_is_direct_data(&src->k)) {
703 bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
704 test_bit(ptr->dev, op->failed.d));
706 if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
711 memmove_u64s_down(dst, src, src->u64s);
712 dst = bkey_next(dst);
720 * __bch2_write_index - after a write, update index to point to new data
722 static void __bch2_write_index(struct bch_write_op *op)
724 struct bch_fs *c = op->c;
725 struct keylist *keys = &op->insert_keys;
730 if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
731 ret = bch2_write_drop_io_error_ptrs(op);
737 * probably not the ideal place to hook this in, but I don't
738 * particularly want to plumb io_opts all the way through the btree
739 * update stack right now
741 for_each_keylist_key(keys, k)
742 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
744 if (!bch2_keylist_empty(keys)) {
745 u64 sectors_start = keylist_sectors(keys);
747 ret = !(op->flags & BCH_WRITE_MOVE)
748 ? bch2_write_index_default(op)
749 : bch2_data_update_index_update(op);
751 BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
752 BUG_ON(keylist_sectors(keys) && !ret);
754 op->written += sectors_start - keylist_sectors(keys);
757 struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
759 bch_err_inum_offset_ratelimited(c,
760 k->k.p.inode, k->k.p.offset << 9,
761 "write error while doing btree update: %s",
767 /* If a bucket wasn't written, we can't erasure code it: */
768 for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
769 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
771 bch2_open_buckets_put(c, &op->open_buckets);
774 keys->top = keys->keys;
776 op->flags |= BCH_WRITE_DONE;
780 static void bch2_write_index(struct closure *cl)
782 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
783 struct write_point *wp = op->wp;
784 struct workqueue_struct *wq = index_update_wq(op);
787 op->btree_update_ready = true;
788 queue_work(wq, &wp->index_update_work);
791 void bch2_write_point_do_index_updates(struct work_struct *work)
793 struct write_point *wp =
794 container_of(work, struct write_point, index_update_work);
795 struct bch_write_op *op;
798 spin_lock(&wp->writes_lock);
799 op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
800 if (op && !op->btree_update_ready)
803 list_del(&op->wp_list);
804 spin_unlock(&wp->writes_lock);
809 __bch2_write_index(op);
811 if (!(op->flags & BCH_WRITE_DONE))
814 bch2_write_done(&op->cl);
818 static void bch2_write_endio(struct bio *bio)
820 struct closure *cl = bio->bi_private;
821 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
822 struct bch_write_bio *wbio = to_wbio(bio);
823 struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
824 struct bch_fs *c = wbio->c;
825 struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
827 if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
829 wbio->inode_offset << 9,
830 "data write error: %s",
831 bch2_blk_status_to_str(bio->bi_status))) {
832 set_bit(wbio->dev, op->failed.d);
833 op->flags |= BCH_WRITE_IO_ERROR;
837 set_bit(wbio->dev, op->devs_need_flush->d);
839 if (wbio->have_ioref) {
840 bch2_latency_acct(ca, wbio->submit_time, WRITE);
841 percpu_ref_put(&ca->io_ref);
845 bch2_bio_free_pages_pool(c, bio);
851 bio_endio(&parent->bio);
858 static void init_append_extent(struct bch_write_op *op,
859 struct write_point *wp,
860 struct bversion version,
861 struct bch_extent_crc_unpacked crc)
863 struct bch_fs *c = op->c;
864 struct bkey_i_extent *e;
866 op->pos.offset += crc.uncompressed_size;
868 e = bkey_extent_init(op->insert_keys.top);
870 e->k.size = crc.uncompressed_size;
871 e->k.version = version;
874 crc.compression_type ||
876 bch2_extent_crc_append(&e->k_i, crc);
878 bch2_alloc_sectors_append_ptrs_inlined(c, wp, &e->k_i, crc.compressed_size,
879 op->flags & BCH_WRITE_CACHED);
881 bch2_keylist_push(&op->insert_keys);
884 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
885 struct write_point *wp,
887 bool *page_alloc_failed,
890 struct bch_write_bio *wbio;
892 unsigned output_available =
893 min(wp->sectors_free << 9, src->bi_iter.bi_size);
894 unsigned pages = DIV_ROUND_UP(output_available +
896 ? ((unsigned long) buf & (PAGE_SIZE - 1))
899 pages = min(pages, BIO_MAX_VECS);
901 bio = bio_alloc_bioset(NULL, pages, 0,
902 GFP_NOIO, &c->bio_write);
903 wbio = wbio_init(bio);
904 wbio->put_bio = true;
905 /* copy WRITE_SYNC flag */
906 wbio->bio.bi_opf = src->bi_opf;
909 bch2_bio_map(bio, buf, output_available);
916 * We can't use mempool for more than c->sb.encoded_extent_max
917 * worth of pages, but we'd like to allocate more if we can:
919 bch2_bio_alloc_pages_pool(c, bio,
920 min_t(unsigned, output_available,
921 c->opts.encoded_extent_max));
923 if (bio->bi_iter.bi_size < output_available)
925 bch2_bio_alloc_pages(bio,
927 bio->bi_iter.bi_size,
933 static int bch2_write_rechecksum(struct bch_fs *c,
934 struct bch_write_op *op,
935 unsigned new_csum_type)
937 struct bio *bio = &op->wbio.bio;
938 struct bch_extent_crc_unpacked new_crc;
941 /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
943 if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
944 bch2_csum_type_is_encryption(new_csum_type))
945 new_csum_type = op->crc.csum_type;
947 ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
949 op->crc.offset, op->crc.live_size,
954 bio_advance(bio, op->crc.offset << 9);
955 bio->bi_iter.bi_size = op->crc.live_size << 9;
960 static int bch2_write_decrypt(struct bch_write_op *op)
962 struct bch_fs *c = op->c;
963 struct nonce nonce = extent_nonce(op->version, op->crc);
964 struct bch_csum csum;
967 if (!bch2_csum_type_is_encryption(op->crc.csum_type))
971 * If we need to decrypt data in the write path, we'll no longer be able
972 * to verify the existing checksum (poly1305 mac, in this case) after
973 * it's decrypted - this is the last point we'll be able to reverify the
976 csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
977 if (bch2_crc_cmp(op->crc.csum, csum))
980 ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
981 op->crc.csum_type = 0;
982 op->crc.csum = (struct bch_csum) { 0, 0 };
986 static enum prep_encoded_ret {
989 PREP_ENCODED_CHECKSUM_ERR,
990 PREP_ENCODED_DO_WRITE,
991 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
993 struct bch_fs *c = op->c;
994 struct bio *bio = &op->wbio.bio;
996 if (!(op->flags & BCH_WRITE_DATA_ENCODED))
997 return PREP_ENCODED_OK;
999 BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
1001 /* Can we just write the entire extent as is? */
1002 if (op->crc.uncompressed_size == op->crc.live_size &&
1003 op->crc.compressed_size <= wp->sectors_free &&
1004 (op->crc.compression_type == op->compression_type ||
1005 op->incompressible)) {
1006 if (!crc_is_compressed(op->crc) &&
1007 op->csum_type != op->crc.csum_type &&
1008 bch2_write_rechecksum(c, op, op->csum_type))
1009 return PREP_ENCODED_CHECKSUM_ERR;
1011 return PREP_ENCODED_DO_WRITE;
1015 * If the data is compressed and we couldn't write the entire extent as
1016 * is, we have to decompress it:
1018 if (crc_is_compressed(op->crc)) {
1019 struct bch_csum csum;
1021 if (bch2_write_decrypt(op))
1022 return PREP_ENCODED_CHECKSUM_ERR;
1024 /* Last point we can still verify checksum: */
1025 csum = bch2_checksum_bio(c, op->crc.csum_type,
1026 extent_nonce(op->version, op->crc),
1028 if (bch2_crc_cmp(op->crc.csum, csum))
1029 return PREP_ENCODED_CHECKSUM_ERR;
1031 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
1032 return PREP_ENCODED_ERR;
1036 * No longer have compressed data after this point - data might be
1041 * If the data is checksummed and we're only writing a subset,
1042 * rechecksum and adjust bio to point to currently live data:
1044 if ((op->crc.live_size != op->crc.uncompressed_size ||
1045 op->crc.csum_type != op->csum_type) &&
1046 bch2_write_rechecksum(c, op, op->csum_type))
1047 return PREP_ENCODED_CHECKSUM_ERR;
1050 * If we want to compress the data, it has to be decrypted:
1052 if ((op->compression_type ||
1053 bch2_csum_type_is_encryption(op->crc.csum_type) !=
1054 bch2_csum_type_is_encryption(op->csum_type)) &&
1055 bch2_write_decrypt(op))
1056 return PREP_ENCODED_CHECKSUM_ERR;
1058 return PREP_ENCODED_OK;
1061 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
1064 struct bch_fs *c = op->c;
1065 struct bio *src = &op->wbio.bio, *dst = src;
1066 struct bvec_iter saved_iter;
1068 unsigned total_output = 0, total_input = 0;
1069 bool bounce = false;
1070 bool page_alloc_failed = false;
1073 BUG_ON(!bio_sectors(src));
1075 ec_buf = bch2_writepoint_ec_buf(c, wp);
1077 switch (bch2_write_prep_encoded_data(op, wp)) {
1078 case PREP_ENCODED_OK:
1080 case PREP_ENCODED_ERR:
1083 case PREP_ENCODED_CHECKSUM_ERR:
1085 case PREP_ENCODED_DO_WRITE:
1086 /* XXX look for bug here */
1088 dst = bch2_write_bio_alloc(c, wp, src,
1091 bio_copy_data(dst, src);
1094 init_append_extent(op, wp, op->version, op->crc);
1099 op->compression_type ||
1101 !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
1102 (bch2_csum_type_is_encryption(op->csum_type) &&
1103 !(op->flags & BCH_WRITE_PAGES_OWNED))) {
1104 dst = bch2_write_bio_alloc(c, wp, src,
1110 saved_iter = dst->bi_iter;
1113 struct bch_extent_crc_unpacked crc = { 0 };
1114 struct bversion version = op->version;
1115 size_t dst_len, src_len;
1117 if (page_alloc_failed &&
1118 dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
1119 dst->bi_iter.bi_size < c->opts.encoded_extent_max)
1122 BUG_ON(op->compression_type &&
1123 (op->flags & BCH_WRITE_DATA_ENCODED) &&
1124 bch2_csum_type_is_encryption(op->crc.csum_type));
1125 BUG_ON(op->compression_type && !bounce);
1127 crc.compression_type = op->incompressible
1128 ? BCH_COMPRESSION_TYPE_incompressible
1129 : op->compression_type
1130 ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
1131 op->compression_type)
1133 if (!crc_is_compressed(crc)) {
1134 dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
1135 dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
1138 dst_len = min_t(unsigned, dst_len,
1139 c->opts.encoded_extent_max);
1142 swap(dst->bi_iter.bi_size, dst_len);
1143 bio_copy_data(dst, src);
1144 swap(dst->bi_iter.bi_size, dst_len);
1150 BUG_ON(!src_len || !dst_len);
1152 if (bch2_csum_type_is_encryption(op->csum_type)) {
1153 if (bversion_zero(version)) {
1154 version.lo = atomic64_inc_return(&c->key_version);
1156 crc.nonce = op->nonce;
1157 op->nonce += src_len >> 9;
1161 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
1162 !crc_is_compressed(crc) &&
1163 bch2_csum_type_is_encryption(op->crc.csum_type) ==
1164 bch2_csum_type_is_encryption(op->csum_type)) {
1165 u8 compression_type = crc.compression_type;
1166 u16 nonce = crc.nonce;
1168 * Note: when we're using rechecksum(), we need to be
1169 * checksumming @src because it has all the data our
1170 * existing checksum covers - if we bounced (because we
1171 * were trying to compress), @dst will only have the
1172 * part of the data the new checksum will cover.
1174 * But normally we want to be checksumming post bounce,
1175 * because part of the reason for bouncing is so the
1176 * data can't be modified (by userspace) while it's in
1179 if (bch2_rechecksum_bio(c, src, version, op->crc,
1182 bio_sectors(src) - (src_len >> 9),
1186 * bch2_rechecksum_bio() sets compression_type on crc from op->crc,
1187 * this isn't always correct as sometimes we're changing
1188 * an extent from uncompressed to incompressible.
1190 crc.compression_type = compression_type;
1193 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
1194 bch2_rechecksum_bio(c, src, version, op->crc,
1197 bio_sectors(src) - (src_len >> 9),
1201 crc.compressed_size = dst_len >> 9;
1202 crc.uncompressed_size = src_len >> 9;
1203 crc.live_size = src_len >> 9;
1205 swap(dst->bi_iter.bi_size, dst_len);
1206 ret = bch2_encrypt_bio(c, op->csum_type,
1207 extent_nonce(version, crc), dst);
1211 crc.csum = bch2_checksum_bio(c, op->csum_type,
1212 extent_nonce(version, crc), dst);
1213 crc.csum_type = op->csum_type;
1214 swap(dst->bi_iter.bi_size, dst_len);
1217 init_append_extent(op, wp, version, crc);
1220 bio_advance(dst, dst_len);
1221 bio_advance(src, src_len);
1222 total_output += dst_len;
1223 total_input += src_len;
1224 } while (dst->bi_iter.bi_size &&
1225 src->bi_iter.bi_size &&
1227 !bch2_keylist_realloc(&op->insert_keys,
1229 ARRAY_SIZE(op->inline_keys),
1230 BKEY_EXTENT_U64s_MAX));
1232 more = src->bi_iter.bi_size != 0;
1234 dst->bi_iter = saved_iter;
1236 if (dst == src && more) {
1237 BUG_ON(total_output != total_input);
1239 dst = bio_split(src, total_input >> 9,
1240 GFP_NOIO, &c->bio_write);
1241 wbio_init(dst)->put_bio = true;
1242 /* copy WRITE_SYNC flag */
1243 dst->bi_opf = src->bi_opf;
1246 dst->bi_iter.bi_size = total_output;
1251 bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
1254 if (to_wbio(dst)->bounce)
1255 bch2_bio_free_pages_pool(c, dst);
1256 if (to_wbio(dst)->put_bio)
1262 static bool bch2_extent_is_writeable(struct bch_write_op *op,
1265 struct bch_fs *c = op->c;
1266 struct bkey_s_c_extent e;
1267 struct extent_ptr_decoded p;
1268 const union bch_extent_entry *entry;
1269 unsigned replicas = 0;
1271 if (k.k->type != KEY_TYPE_extent)
1274 e = bkey_s_c_to_extent(k);
1275 extent_for_each_ptr_decode(e, p, entry) {
1276 if (p.crc.csum_type ||
1277 crc_is_compressed(p.crc) ||
1281 replicas += bch2_extent_ptr_durability(c, &p);
1284 return replicas >= op->opts.data_replicas;
1287 static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
1289 struct bch_fs *c = op->c;
1290 const struct bch_extent_ptr *ptr;
1293 for_each_keylist_key(&op->insert_keys, k) {
1294 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
1296 bkey_for_each_ptr(ptrs, ptr)
1297 bch2_bucket_nocow_unlock(&c->nocow_locks,
1298 PTR_BUCKET_POS(c, ptr),
1299 BUCKET_NOCOW_LOCK_UPDATE);
1303 static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
1304 struct btree_iter *iter,
1305 struct bkey_i *orig,
1310 struct bkey_ptrs ptrs;
1311 struct bch_extent_ptr *ptr;
1314 if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
1319 new = bch2_bkey_make_mut(trans, k);
1320 ret = PTR_ERR_OR_ZERO(new);
1324 bch2_cut_front(bkey_start_pos(&orig->k), new);
1325 bch2_cut_back(orig->k.p, new);
1327 ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
1328 bkey_for_each_ptr(ptrs, ptr)
1332 * Note that we're not calling bch2_subvol_get_snapshot() in this path -
1333 * that was done when we kicked off the write, and here it's important
1334 * that we update the extent that we wrote to - even if a snapshot has
1335 * since been created. The write is still outstanding, so we're ok
1336 * w.r.t. snapshot atomicity:
1338 return bch2_extent_update_i_size_sectors(trans, iter,
1339 min(new->k.p.offset << 9, new_i_size), 0) ?:
1340 bch2_trans_update(trans, iter, new,
1341 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1344 static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
1346 struct bch_fs *c = op->c;
1347 struct btree_trans trans;
1348 struct btree_iter iter;
1349 struct bkey_i *orig;
1353 bch2_trans_init(&trans, c, 0, 0);
1355 for_each_keylist_key(&op->insert_keys, orig) {
1356 ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents,
1357 bkey_start_pos(&orig->k), orig->k.p,
1358 BTREE_ITER_INTENT, k,
1359 NULL, NULL, BTREE_INSERT_NOFAIL, ({
1360 BUG_ON(bkey_ge(bkey_start_pos(k.k), orig->k.p));
1362 bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size);
1366 struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
1368 bch_err_inum_offset_ratelimited(c,
1369 k->k.p.inode, k->k.p.offset << 9,
1370 "write error while doing btree update: %s",
1377 bch2_trans_exit(&trans);
1380 static void __bch2_nocow_write_done(struct bch_write_op *op)
1382 bch2_nocow_write_unlock(op);
1384 if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
1386 } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
1387 bch2_nocow_write_convert_unwritten(op);
1390 static void bch2_nocow_write_done(struct closure *cl)
1392 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1394 __bch2_nocow_write_done(op);
1395 bch2_write_done(cl);
1398 static void bch2_nocow_write(struct bch_write_op *op)
1400 struct bch_fs *c = op->c;
1401 struct btree_trans trans;
1402 struct btree_iter iter;
1404 struct bkey_ptrs_c ptrs;
1405 const struct bch_extent_ptr *ptr, *ptr2;
1409 two_state_lock_t *l;
1410 } buckets[BCH_REPLICAS_MAX];
1411 unsigned nr_buckets = 0;
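/*
 * Remember each bucket this write touches so that, after dropping btree
 * locks, we can take its nocow lock and re-check its gen before doing IO.
 */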
1415 if (op->flags & BCH_WRITE_MOVE)
1418 bch2_trans_init(&trans, c, 0, 0);
1420 bch2_trans_begin(&trans);
1422 ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot);
1426 bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
1427 SPOS(op->pos.inode, op->pos.offset, snapshot),
1430 struct bio *bio = &op->wbio.bio;
1434 k = bch2_btree_iter_peek_slot(&iter);
1439 /* fall back to normal cow write path? */
1440 if (unlikely(k.k->p.snapshot != snapshot ||
1441 !bch2_extent_is_writeable(op, k)))
1444 if (bch2_keylist_realloc(&op->insert_keys,
1446 ARRAY_SIZE(op->inline_keys),
1450 /* Get iorefs before dropping btree locks: */
1451 ptrs = bch2_bkey_ptrs_c(k);
1452 bkey_for_each_ptr(ptrs, ptr) {
1453 buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
1454 buckets[nr_buckets].gen = ptr->gen;
1455 buckets[nr_buckets].l =
1456 bucket_nocow_lock(&c->nocow_locks, buckets[nr_buckets].b);
1458 prefetch(buckets[nr_buckets].l);
1461 if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
1465 op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
1468 /* Unlock before taking nocow locks, doing IO: */
1469 bkey_reassemble(op->insert_keys.top, k);
1470 bch2_trans_unlock(&trans);
1472 bch2_cut_front(op->pos, op->insert_keys.top);
1473 if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
1474 bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
1476 for (i = 0; i < nr_buckets; i++) {
1477 struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
1478 two_state_lock_t *l = buckets[i].l;
1481 if (!bch2_two_state_trylock(l, BUCKET_NOCOW_LOCK_UPDATE))
1482 __bch2_bucket_nocow_lock(&c->nocow_locks, l, BUCKET_NOCOW_LOCK_UPDATE);
1485 stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
1488 if (unlikely(stale))
1489 goto err_bucket_stale;
1492 bio = &op->wbio.bio;
1493 if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
1494 bio = bio_split(bio, k.k->p.offset - op->pos.offset,
1495 GFP_KERNEL, &c->bio_write);
1496 wbio_init(bio)->put_bio = true;
1497 bio->bi_opf = op->wbio.bio.bi_opf;
1499 op->flags |= BCH_WRITE_DONE;
1502 op->pos.offset += bio_sectors(bio);
1503 op->written += bio_sectors(bio);
1505 bio->bi_end_io = bch2_write_endio;
1506 bio->bi_private = &op->cl;
1507 bio->bi_opf |= REQ_OP_WRITE;
1508 closure_get(&op->cl);
1509 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
1510 op->insert_keys.top, true);
1512 bch2_keylist_push(&op->insert_keys);
1513 if (op->flags & BCH_WRITE_DONE)
1515 bch2_btree_iter_advance(&iter);
1518 bch2_trans_iter_exit(&trans, &iter);
1520 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1524 bch_err_inum_offset_ratelimited(c,
1526 op->pos.offset << 9,
1527 "%s: btree lookup error %s",
1528 __func__, bch2_err_str(ret));
1530 op->flags |= BCH_WRITE_DONE;
1533 bch2_trans_exit(&trans);
1535 /* fallback to cow write path? */
1536 if (!(op->flags & BCH_WRITE_DONE)) {
1537 closure_sync(&op->cl);
1538 __bch2_nocow_write_done(op);
1539 op->insert_keys.top = op->insert_keys.keys;
1540 } else if (op->flags & BCH_WRITE_SYNC) {
1541 closure_sync(&op->cl);
1542 bch2_nocow_write_done(&op->cl);
1546 * needs to run out of process context because ei_quota_lock is
1549 continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
1553 bkey_for_each_ptr(ptrs, ptr2) {
1557 percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
1560 /* Fall back to COW path: */
1564 bch2_bucket_nocow_unlock(&c->nocow_locks,
1566 BUCKET_NOCOW_LOCK_UPDATE);
1568 bkey_for_each_ptr(ptrs, ptr2)
1569 percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
1571 /* We can retry this: */
1572 ret = BCH_ERR_transaction_restart;
1576 static void __bch2_write(struct bch_write_op *op)
1578 struct bch_fs *c = op->c;
1579 struct write_point *wp = NULL;
1580 struct bio *bio = NULL;
1581 unsigned nofs_flags;
1584 nofs_flags = memalloc_nofs_save();
1586 if (unlikely(op->opts.nocow)) {
1587 bch2_nocow_write(op);
1588 if (op->flags & BCH_WRITE_DONE)
1589 goto out_nofs_restore;
1592 memset(&op->failed, 0, sizeof(op->failed));
1593 op->btree_update_ready = false;
1596 struct bkey_i *key_to_write;
1597 unsigned key_to_write_offset = op->insert_keys.top_p -
1598 op->insert_keys.keys_p;
1600 /* +1 for possible cache device: */
1601 if (op->open_buckets.nr + op->nr_replicas + 1 >
1602 ARRAY_SIZE(op->open_buckets.v))
1605 if (bch2_keylist_realloc(&op->insert_keys,
1607 ARRAY_SIZE(op->inline_keys),
1608 BKEY_EXTENT_U64s_MAX))
1612 * The copygc thread is now global, which means it's no longer
1613 * freeing up space on specific disks, which means that
1614 * allocations for specific disks may hang arbitrarily long:
1616 ret = bch2_trans_do(c, NULL, NULL, 0,
1617 bch2_alloc_sectors_start_trans(&trans,
1619 op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
1623 op->nr_replicas_required,
1626 (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
1627 BCH_WRITE_ONLY_SPECIFIED_DEVS))
1628 ? NULL : &op->cl, &wp));
1629 if (unlikely(ret)) {
1630 if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
1636 ret = bch2_write_extent(op, wp, &bio);
1639 bch2_open_bucket_get(c, wp, &op->open_buckets);
1640 bch2_alloc_sectors_done_inlined(c, wp);
1643 if (!(op->flags & BCH_WRITE_SYNC)) {
1644 spin_lock(&wp->writes_lock);
1646 list_add_tail(&op->wp_list, &wp->writes);
1647 spin_unlock(&wp->writes_lock);
1650 op->flags |= BCH_WRITE_DONE;
1658 bio->bi_end_io = bch2_write_endio;
1659 bio->bi_private = &op->cl;
1660 bio->bi_opf |= REQ_OP_WRITE;
1662 closure_get(bio->bi_private);
1664 key_to_write = (void *) (op->insert_keys.keys_p +
1665 key_to_write_offset);
1667 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
1668 key_to_write, false);
1674 * If we're running asynchronously, we may still want to block
1675 * synchronously here if we weren't able to submit all of the IO at
1676 * once, as that signals backpressure to the caller.
1678 if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) {
1679 closure_sync(&op->cl);
1680 __bch2_write_index(op);
1682 if (!(op->flags & BCH_WRITE_DONE))
1684 bch2_write_done(&op->cl);
1686 continue_at(&op->cl, bch2_write_index, NULL);
1689 memalloc_nofs_restore(nofs_flags);
1692 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
1694 struct bio *bio = &op->wbio.bio;
1695 struct bvec_iter iter;
1696 struct bkey_i_inline_data *id;
1700 bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
1702 ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
1703 ARRAY_SIZE(op->inline_keys),
1704 BKEY_U64s + DIV_ROUND_UP(data_len, 8));
1710 sectors = bio_sectors(bio);
1711 op->pos.offset += sectors;
1713 id = bkey_inline_data_init(op->insert_keys.top);
1715 id->k.version = op->version;
1716 id->k.size = sectors;
1718 iter = bio->bi_iter;
1719 iter.bi_size = data_len;
1720 memcpy_from_bio(id->v.data, bio, iter);
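/* pad the value out to a whole number of u64s with zeroes: */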
1722 while (data_len & 7)
1723 id->v.data[data_len++] = '\0';
1724 set_bkey_val_bytes(&id->k, data_len);
1725 bch2_keylist_push(&op->insert_keys);
1727 op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
1728 op->flags |= BCH_WRITE_DONE;
1730 __bch2_write_index(op);
1732 bch2_write_done(&op->cl);
1736 * bch2_write - handle a write to a cache device or flash only volume
1738 * This is the starting point for any data to end up in a cache device; it could
1739 * be from a normal write, or a writeback write, or a write to a flash only
1740 * volume - it's also used by the moving garbage collector to compact data in
1741 * mostly empty buckets.
1743 * It first writes the data to the cache, creating a list of keys to be inserted
1744 * (if the data won't fit in a single open bucket, there will be multiple keys);
1745 * after the data is written it calls bch_journal, and after the keys have been
1746 * added to the next journal write they're inserted into the btree.
1748 * If op->discard is true, instead of inserting the data it invalidates the
1749 * region of the cache represented by op->bio and op->inode.
1751 void bch2_write(struct closure *cl)
1753 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1754 struct bio *bio = &op->wbio.bio;
1755 struct bch_fs *c = op->c;
1758 EBUG_ON(op->cl.parent);
1759 BUG_ON(!op->nr_replicas);
1760 BUG_ON(!op->write_point.v);
1761 BUG_ON(bkey_eq(op->pos, POS_MAX));
1763 op->start_time = local_clock();
1764 bch2_keylist_init(&op->insert_keys, op->inline_keys);
1765 wbio_init(bio)->put_bio = false;
1767 if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
1768 bch_err_inum_offset_ratelimited(c,
1770 op->pos.offset << 9,
1771 "misaligned write");
1776 if (c->opts.nochanges ||
1777 !percpu_ref_tryget_live(&c->writes)) {
1778 op->error = -BCH_ERR_erofs_no_writes;
1782 this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
1783 bch2_increment_clock(c, bio_sectors(bio), WRITE);
1785 data_len = min_t(u64, bio->bi_iter.bi_size,
1786 op->new_i_size - (op->pos.offset << 9));
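/*
 * Small enough writes are stored in the btree as inline data instead of
 * being written out to their own extent:
 */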
1788 if (c->opts.inline_data &&
1789 data_len <= min(block_bytes(c) / 2, 1024U)) {
1790 bch2_write_data_inline(op, data_len);
1797 bch2_disk_reservation_put(c, &op->res);
1799 closure_debug_destroy(&op->cl);
1804 /* Cache promotion on read */
1807 struct rcu_head rcu;
1810 struct rhash_head hash;
1813 struct data_update write;
1814 struct bio_vec bi_inline_vecs[0]; /* must be last */
1817 static const struct rhashtable_params bch_promote_params = {
1818 .head_offset = offsetof(struct promote_op, hash),
1819 .key_offset = offsetof(struct promote_op, pos),
1820 .key_len = sizeof(struct bpos),
1823 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
1825 struct bch_io_opts opts,
1828 if (!(flags & BCH_READ_MAY_PROMOTE))
1831 if (!opts.promote_target)
1834 if (bch2_bkey_has_target(c, k, opts.promote_target))
1837 if (bkey_extent_is_unwritten(k))
1840 if (bch2_target_congested(c, opts.promote_target)) {
1841 /* XXX trace this */
1845 if (rhashtable_lookup_fast(&c->promote_table, &pos,
1846 bch_promote_params))
1852 static void promote_free(struct bch_fs *c, struct promote_op *op)
1856 ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1857 bch_promote_params);
1859 percpu_ref_put(&c->writes);
1863 static void promote_done(struct bch_write_op *wop)
1865 struct promote_op *op =
1866 container_of(wop, struct promote_op, write.op);
1867 struct bch_fs *c = op->write.op.c;
1869 bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1872 bch2_data_update_exit(&op->write);
1873 promote_free(c, op);
1876 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1878 struct bio *bio = &op->write.op.wbio.bio;
1880 trace_and_count(op->write.op.c, read_promote, &rbio->bio);
1882 /* we now own pages: */
1883 BUG_ON(!rbio->bounce);
1884 BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1886 memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1887 sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1888 swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1890 bch2_data_update_read_done(&op->write, rbio->pick.crc);
1893 static struct promote_op *__promote_alloc(struct bch_fs *c,
1894 enum btree_id btree_id,
1897 struct extent_ptr_decoded *pick,
1898 struct bch_io_opts opts,
1900 struct bch_read_bio **rbio)
1902 struct promote_op *op = NULL;
1904 unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
1907 if (!percpu_ref_tryget_live(&c->writes))
1910 op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
1914 op->start_time = local_clock();
1918 * We don't use the mempool here because extents that aren't
1919 * checksummed or compressed can be too big for the mempool:
1921 *rbio = kzalloc(sizeof(struct bch_read_bio) +
1922 sizeof(struct bio_vec) * pages,
1927 rbio_init(&(*rbio)->bio, opts);
1928 bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
1930 if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
1934 (*rbio)->bounce = true;
1935 (*rbio)->split = true;
1936 (*rbio)->kmalloc = true;
1938 if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1939 bch_promote_params))
1942 bio = &op->write.op.wbio.bio;
1943 bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
1945 ret = bch2_data_update_init(c, &op->write,
1946 writepoint_hashed((unsigned long) current),
1948 (struct data_update_opts) {
1949 .target = opts.promote_target,
1950 .extra_replicas = 1,
1951 .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
1955 op->write.op.end_io = promote_done;
1960 bio_free_pages(&(*rbio)->bio);
1964 percpu_ref_put(&c->writes);
1969 static struct promote_op *promote_alloc(struct bch_fs *c,
1970 struct bvec_iter iter,
1972 struct extent_ptr_decoded *pick,
1973 struct bch_io_opts opts,
1975 struct bch_read_bio **rbio,
1979 bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1980 /* data might have to be decompressed in the write path: */
1981 unsigned sectors = promote_full
1982 ? max(pick->crc.compressed_size, pick->crc.live_size)
1983 : bvec_iter_sectors(iter);
1984 struct bpos pos = promote_full
1985 ? bkey_start_pos(k.k)
1986 : POS(k.k->p.inode, iter.bi_sector);
1987 struct promote_op *promote;
1989 if (!should_promote(c, k, pos, opts, flags))
1992 promote = __promote_alloc(c,
1993 k.k->type == KEY_TYPE_reflink_v
1996 k, pos, pick, opts, sectors, rbio);
2001 *read_full = promote_full;
2007 #define READ_RETRY_AVOID 1
2008 #define READ_RETRY 2
2013 RBIO_CONTEXT_HIGHPRI,
2014 RBIO_CONTEXT_UNBOUND,
2017 static inline struct bch_read_bio *
2018 bch2_rbio_parent(struct bch_read_bio *rbio)
2020 return rbio->split ? rbio->parent : rbio;
2024 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
2025 enum rbio_context context,
2026 struct workqueue_struct *wq)
2028 if (context <= rbio->context) {
2031 rbio->work.func = fn;
2032 rbio->context = context;
2033 queue_work(wq, &rbio->work);
2037 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
2039 BUG_ON(rbio->bounce && !rbio->split);
2042 promote_free(rbio->c, rbio->promote);
2043 rbio->promote = NULL;
2046 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
2049 struct bch_read_bio *parent = rbio->parent;
2054 bio_put(&rbio->bio);
2063 * Only called on a top level bch_read_bio to complete an entire read request,
2066 static void bch2_rbio_done(struct bch_read_bio *rbio)
2068 if (rbio->start_time)
2069 bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
2071 bio_endio(&rbio->bio);
2074 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
2075 struct bvec_iter bvec_iter,
2076 struct bch_io_failures *failed,
2079 struct btree_trans trans;
2080 struct btree_iter iter;
2085 flags &= ~BCH_READ_LAST_FRAGMENT;
2086 flags |= BCH_READ_MUST_CLONE;
2088 bch2_bkey_buf_init(&sk);
2089 bch2_trans_init(&trans, c, 0, 0);
2091 bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
2092 rbio->read_pos, BTREE_ITER_SLOTS);
2094 rbio->bio.bi_status = 0;
2096 k = bch2_btree_iter_peek_slot(&iter);
2100 bch2_bkey_buf_reassemble(&sk, c, k);
2101 k = bkey_i_to_s_c(sk.k);
2102 bch2_trans_unlock(&trans);
2104 if (!bch2_bkey_matches_ptr(c, k,
2106 rbio->data_pos.offset -
2107 rbio->pick.crc.offset)) {
2108 /* extent we wanted to read no longer exists: */
2113 ret = __bch2_read_extent(&trans, rbio, bvec_iter,
2116 k, 0, failed, flags);
2117 if (ret == READ_RETRY)
2122 bch2_rbio_done(rbio);
2123 bch2_trans_iter_exit(&trans, &iter);
2124 bch2_trans_exit(&trans);
2125 bch2_bkey_buf_exit(&sk, c);
2128 rbio->bio.bi_status = BLK_STS_IOERR;
2132 static void bch2_rbio_retry(struct work_struct *work)
2134 struct bch_read_bio *rbio =
2135 container_of(work, struct bch_read_bio, work);
2136 struct bch_fs *c = rbio->c;
2137 struct bvec_iter iter = rbio->bvec_iter;
2138 unsigned flags = rbio->flags;
2139 subvol_inum inum = {
2140 .subvol = rbio->subvol,
2141 .inum = rbio->read_pos.inode,
2143 struct bch_io_failures failed = { .nr = 0 };
2145 trace_and_count(c, read_retry, &rbio->bio);
2147 if (rbio->retry == READ_RETRY_AVOID)
2148 bch2_mark_io_failure(&failed, &rbio->pick);
2150 rbio->bio.bi_status = 0;
2152 rbio = bch2_rbio_free(rbio);
2154 flags |= BCH_READ_IN_RETRY;
2155 flags &= ~BCH_READ_MAY_PROMOTE;
2157 if (flags & BCH_READ_NODECODE) {
2158 bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
2160 flags &= ~BCH_READ_LAST_FRAGMENT;
2161 flags |= BCH_READ_MUST_CLONE;
2163 __bch2_read(c, rbio, iter, inum, &failed, flags);
2167 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
2170 rbio->retry = retry;
2172 if (rbio->flags & BCH_READ_IN_RETRY)
2175 if (retry == READ_ERR) {
2176 rbio = bch2_rbio_free(rbio);
2178 rbio->bio.bi_status = error;
2179 bch2_rbio_done(rbio);
2181 bch2_rbio_punt(rbio, bch2_rbio_retry,
2182 RBIO_CONTEXT_UNBOUND, system_unbound_wq);
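/*
 * "Narrowing" checksums: if an extent's checksum covers more data than is
 * still live (e.g. it was partially overwritten) and we just read and
 * verified that full range, we can replace it with a checksum of only the
 * live portion, so future reads don't have to read the dead regions just
 * to verify data.
 */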
2186 static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
2187 struct bch_read_bio *rbio)
2189 struct bch_fs *c = rbio->c;
2190 u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
2191 struct bch_extent_crc_unpacked new_crc;
2192 struct btree_iter iter;
2197 if (crc_is_compressed(rbio->pick.crc))
2200 bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos,
2201 BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
2202 k = bch2_btree_iter_peek_slot(&iter);
2203 if ((ret = bkey_err(k)))
2206 if (bversion_cmp(k.k->version, rbio->version) ||
2207 !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
2210 /* Extent was merged? */
2211 if (bkey_start_offset(k.k) < data_offset ||
2212 k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
2215 if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
2216 rbio->pick.crc, NULL, &new_crc,
2217 bkey_start_offset(k.k) - data_offset, k.k->size,
2218 rbio->pick.crc.csum_type)) {
2219 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
2225 * going to be temporarily appending another checksum entry:
2227 new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
2228 sizeof(struct bch_extent_crc128));
2229 if ((ret = PTR_ERR_OR_ZERO(new)))
2232 bkey_reassemble(new, k);
2234 if (!bch2_bkey_narrow_crcs(new, new_crc))
2237 ret = bch2_trans_update(trans, &iter, new,
2238 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
2240 bch2_trans_iter_exit(trans, &iter);
2244 static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
2246 bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
2247 __bch2_rbio_narrow_crcs(&trans, rbio));
2250 /* Inner part that may run in process context */
2251 static void __bch2_read_endio(struct work_struct *work)
2253 struct bch_read_bio *rbio =
2254 container_of(work, struct bch_read_bio, work);
2255 struct bch_fs *c = rbio->c;
2256 struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
2257 struct bio *src = &rbio->bio;
2258 struct bio *dst = &bch2_rbio_parent(rbio)->bio;
2259 struct bvec_iter dst_iter = rbio->bvec_iter;
2260 struct bch_extent_crc_unpacked crc = rbio->pick.crc;
2261 struct nonce nonce = extent_nonce(rbio->version, crc);
2262 unsigned nofs_flags;
2263 struct bch_csum csum;
2266 nofs_flags = memalloc_nofs_save();
2268 /* Reset iterator for checksumming and copying bounced data: */
2270 src->bi_iter.bi_size = crc.compressed_size << 9;
2271 src->bi_iter.bi_idx = 0;
2272 src->bi_iter.bi_bvec_done = 0;
2274 src->bi_iter = rbio->bvec_iter;
2277 csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
2278 if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
2283 * We need to rework the narrow_crcs path to deliver the read completion
2284 * first, and then punt to a different workqueue, otherwise we're
2285 * holding up reads while doing btree updates which is bad for memory
2288 if (unlikely(rbio->narrow_crcs))
2289 bch2_rbio_narrow_crcs(rbio);
2291 if (rbio->flags & BCH_READ_NODECODE)
2294 /* Adjust crc to point to subset of data we want: */
2295 crc.offset += rbio->offset_into_extent;
2296 crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
2298 if (crc_is_compressed(crc)) {
2299 ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
2303 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
2304 goto decompression_err;
2306 /* don't need to decrypt the entire bio: */
2307 nonce = nonce_add(nonce, crc.offset << 9);
2308 bio_advance(src, crc.offset << 9);
2310 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
2311 src->bi_iter.bi_size = dst_iter.bi_size;
2313 ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
2318 struct bvec_iter src_iter = src->bi_iter;
2319 bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
2323 if (rbio->promote) {
2325 * Re encrypt data we decrypted, so it's consistent with
2328 ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
2332 promote_start(rbio->promote, rbio);
2333 rbio->promote = NULL;
2336 if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
2337 rbio = bch2_rbio_free(rbio);
2338 bch2_rbio_done(rbio);
2341 memalloc_nofs_restore(nofs_flags);
2345 * Checksum error: if the bio wasn't bounced, we may have been
2346 * reading into buffers owned by userspace (that userspace can
2347 * scribble over) - retry the read, bouncing it this time:
2349 if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
2350 rbio->flags |= BCH_READ_MUST_BOUNCE;
2351 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
2355 bch_err_inum_offset_ratelimited(ca,
2356 rbio->read_pos.inode,
2357 rbio->read_pos.offset << 9,
2358 "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
2359 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
2360 csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
2362 bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2365 bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
2366 rbio->read_pos.offset << 9,
2367 "decompression error");
2368 bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
2371 bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
2372 rbio->read_pos.offset << 9,
2374 bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
2378 static void bch2_read_endio(struct bio *bio)
2380 struct bch_read_bio *rbio =
2381 container_of(bio, struct bch_read_bio, bio);
2382 struct bch_fs *c = rbio->c;
2383 struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
2384 struct workqueue_struct *wq = NULL;
2385 enum rbio_context context = RBIO_CONTEXT_NULL;
2387 if (rbio->have_ioref) {
2388 bch2_latency_acct(ca, rbio->submit_time, READ);
2389 percpu_ref_put(&ca->io_ref);
2393 rbio->bio.bi_end_io = rbio->end_io;
2395 if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
2396 rbio->read_pos.inode,
2397 rbio->read_pos.offset,
2398 "data read error: %s",
2399 bch2_blk_status_to_str(bio->bi_status))) {
2400 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
2404 if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
2405 ptr_stale(ca, &rbio->pick.ptr)) {
2406 trace_and_count(c, read_reuse_race, &rbio->bio);
2408 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
2409 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
2411 bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
2415 if (rbio->narrow_crcs ||
2417 crc_is_compressed(rbio->pick.crc) ||
2418 bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
2419 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
2420 else if (rbio->pick.crc.csum_type)
2421 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
2423 bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
2426 int __bch2_read_indirect_extent(struct btree_trans *trans,
2427 unsigned *offset_into_extent,
2428 struct bkey_buf *orig_k)
2430 struct btree_iter iter;
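/*
 * The reflink pointer's idx is a position in the reflink btree: add our
 * offset within the extent to find the indirect extent holding the data
 * for this read.
 */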
2435 reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
2436 *offset_into_extent;
2438 bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
2439 POS(0, reflink_offset),
2441 k = bch2_btree_iter_peek_slot(&iter);
2446 if (k.k->type != KEY_TYPE_reflink_v &&
2447 k.k->type != KEY_TYPE_indirect_inline_data) {
2448 bch_err_inum_offset_ratelimited(trans->c,
2449 orig_k->k->k.p.inode,
2450 orig_k->k->k.p.offset << 9,
2451 "%llu len %u points to nonexistent indirect extent %llu",
2452 orig_k->k->k.p.offset,
2455 bch2_inconsistent_error(trans->c);
2460 *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
2461 bch2_bkey_buf_reassemble(orig_k, trans->c, k);
2463 bch2_trans_iter_exit(trans, &iter);
2467 static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
2469 struct bch_extent_ptr ptr)
2471 struct bch_fs *c = trans->c;
2472 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
2473 struct btree_iter iter;
2474 struct printbuf buf = PRINTBUF;
2477 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
2478 PTR_BUCKET_POS(c, &ptr),
2481 prt_printf(&buf, "Attempting to read from stale dirty pointer:");
2482 printbuf_indent_add(&buf, 2);
2485 bch2_bkey_val_to_text(&buf, c, k);
2488 prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
2490 ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
2493 bch2_bkey_val_to_text(&buf, c, k);
2496 bch2_fs_inconsistent(c, "%s", buf.buf);
2498 bch2_trans_iter_exit(trans, &iter);
2499 printbuf_exit(&buf);
2502 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
2503 struct bvec_iter iter, struct bpos read_pos,
2504 enum btree_id data_btree, struct bkey_s_c k,
2505 unsigned offset_into_extent,
2506 struct bch_io_failures *failed, unsigned flags)
2508 struct bch_fs *c = trans->c;
2509 struct extent_ptr_decoded pick;
2510 struct bch_read_bio *rbio = NULL;
2511 struct bch_dev *ca = NULL;
2512 struct promote_op *promote = NULL;
2513 bool bounce = false, read_full = false, narrow_crcs = false;
2514 struct bpos data_pos = bkey_start_pos(k.k);
2517 if (bkey_extent_is_inline_data(k.k)) {
2518 unsigned bytes = min_t(unsigned, iter.bi_size,
2519 bkey_inline_data_bytes(k.k));
2521 swap(iter.bi_size, bytes);
2522 memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
2523 swap(iter.bi_size, bytes);
2524 bio_advance_iter(&orig->bio, &iter, bytes);
2525 zero_fill_bio_iter(&orig->bio, iter);
2526 goto out_read_done;
2527 }
2528 retry_pick:
2529 pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
2531 /* hole or reservation - just zero fill: */
2532 if (!pick_ret)
2533 goto hole;
2535 if (pick_ret < 0) {
2536 bch_err_inum_offset_ratelimited(c,
2537 read_pos.inode, read_pos.offset << 9,
2538 "no device to read from");
2539 goto err;
2540 }
2542 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
2544 /*
2545 * Stale dirty pointers are treated as IO errors, but @failed isn't
2546 * allocated unless we're in the retry path - so if we're not in the
2547 * retry path, don't check here, it'll be caught in bch2_read_endio()
2548 * and we'll end up in the retry path:
2549 */
2550 if ((flags & BCH_READ_IN_RETRY) &&
2551 !pick.ptr.cached &&
2552 unlikely(ptr_stale(ca, &pick.ptr))) {
2553 read_from_stale_dirty_pointer(trans, k, pick.ptr);
2554 bch2_mark_io_failure(failed, &pick);
2555 goto retry_pick;
2556 }
2558 /*
2559 * Unlock the iterator while the btree node's lock is still in
2560 * cache, before doing the IO:
2561 */
2562 bch2_trans_unlock(trans);
2564 if (flags & BCH_READ_NODECODE) {
2565 /*
2566 * can happen if we retry, and the extent we were going to read
2567 * has been merged in the meantime:
2568 */
2569 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
2570 goto hole;
2572 iter.bi_size = pick.crc.compressed_size << 9;
2573 goto get_bio;
2574 }
2576 if (!(flags & BCH_READ_LAST_FRAGMENT) ||
2577 bio_flagged(&orig->bio, BIO_CHAIN))
2578 flags |= BCH_READ_MUST_CLONE;
2580 narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
2581 bch2_can_narrow_extent_crcs(k, pick.crc);
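/*
 * A read into user mappings can't safely be checksummed in place (the pages
 * may be modified concurrently by userspace), so narrowing crcs requires a
 * bounce buffer:
 */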
2583 if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
2584 flags |= BCH_READ_MUST_BOUNCE;
2586 EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
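/*
 * If the extent is compressed, or it's checksummed and we're not reading
 * exactly the range the checksum covers (or we have to bounce anyway), read
 * the full extent into a bounce buffer:
 */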
2588 if (crc_is_compressed(pick.crc) ||
2589 (pick.crc.csum_type != BCH_CSUM_none &&
2590 (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2591 (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
2592 (flags & BCH_READ_USER_MAPPED)) ||
2593 (flags & BCH_READ_MUST_BOUNCE)))) {
2594 read_full = true;
2595 bounce = true;
2596 }
2598 if (orig->opts.promote_target)
2599 promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
2600 &rbio, &bounce, &read_full);
2602 if (!read_full) {
2603 EBUG_ON(crc_is_compressed(pick.crc));
2604 EBUG_ON(pick.crc.csum_type &&
2605 (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2606 bvec_iter_sectors(iter) != pick.crc.live_size ||
2607 pick.crc.offset ||
2608 offset_into_extent));
2610 data_pos.offset += offset_into_extent;
2611 pick.ptr.offset += pick.crc.offset +
2612 offset_into_extent;
2613 offset_into_extent = 0;
2614 pick.crc.compressed_size = bvec_iter_sectors(iter);
2615 pick.crc.uncompressed_size = bvec_iter_sectors(iter);
2616 pick.crc.offset = 0;
2617 pick.crc.live_size = bvec_iter_sectors(iter);
2618 offset_into_extent = 0;
2619 }
2620 get_bio:
2621 if (rbio) {
2622 /*
2623 * promote already allocated bounce rbio:
2624 * promote needs to allocate a bio big enough for uncompressing
2625 * data in the write path, but we're not going to use it all
2626 * here:
2627 */
2628 EBUG_ON(rbio->bio.bi_iter.bi_size <
2629 pick.crc.compressed_size << 9);
2630 rbio->bio.bi_iter.bi_size =
2631 pick.crc.compressed_size << 9;
2632 } else if (bounce) {
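/*
 * No bounce rbio from a promote: allocate our own, with pages from the
 * bounce page mempool, sized for the full (compressed) extent:
 */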
2633 unsigned sectors = pick.crc.compressed_size;
2635 rbio = rbio_init(bio_alloc_bioset(NULL,
2636 DIV_ROUND_UP(sectors, PAGE_SECTORS),
2637 0,
2638 GFP_NOIO,
2639 &c->bio_read_split),
2640 orig->opts);
2642 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
2643 rbio->bounce = true;
2644 rbio->split = true;
2645 } else if (flags & BCH_READ_MUST_CLONE) {
2646 /*
2647 * Have to clone if there were any splits, due to error
2648 * reporting issues (if a split errored, and retrying didn't
2649 * work, when it reports the error to its parent (us) we don't
2650 * know if the error was from our bio, and we should retry, or
2651 * from the whole bio, in which case we don't want to retry and
2652 * lose the error)
2653 */
2654 rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
2655 &c->bio_read_split),
2656 orig->opts);
2657 rbio->bio.bi_iter = iter;
2658 rbio->split = true;
2659 } else {
2660 rbio = orig;
2661 rbio->bio.bi_iter = iter;
2662 EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
2663 }
2665 EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
2667 rbio->c = c;
2668 rbio->submit_time = local_clock();
2669 if (rbio->split)
2670 rbio->parent = orig;
2671 else
2672 rbio->end_io = orig->bio.bi_end_io;
2673 rbio->bvec_iter = iter;
2674 rbio->offset_into_extent= offset_into_extent;
2675 rbio->flags = flags;
2676 rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
2677 rbio->narrow_crcs = narrow_crcs;
2678 rbio->hole = 0;
2679 rbio->retry = 0;
2680 rbio->context = 0;
2681 /* XXX: only initialize this if needed */
2682 rbio->devs_have = bch2_bkey_devs(k);
2683 rbio->pick = pick;
2684 rbio->subvol = orig->subvol;
2685 rbio->read_pos = read_pos;
2686 rbio->data_btree = data_btree;
2687 rbio->data_pos = data_pos;
2688 rbio->version = k.k->version;
2689 rbio->promote = promote;
2690 INIT_WORK(&rbio->work, NULL);
2692 rbio->bio.bi_opf = orig->bio.bi_opf;
2693 rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
2694 rbio->bio.bi_end_io = bch2_read_endio;
2696 if (rbio->bounce)
2697 trace_and_count(c, read_bounce, &rbio->bio);
2699 this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
2700 bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
2702 /*
2703 * If it's being moved internally, we don't want to flag it as a cache
2704 * hit:
2705 */
2706 if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
2707 bch2_bucket_io_time_reset(trans, pick.ptr.dev,
2708 PTR_BUCKET_NR(ca, &pick.ptr), READ);
2710 if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
2711 bio_inc_remaining(&orig->bio);
2712 trace_and_count(c, read_split, &orig->bio);
2713 }
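/*
 * pick.idx == 0: read the extent normally from the picked device; otherwise
 * attempt an erasure coding reconstruct read (see below).
 */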
2715 if (!rbio->pick.idx) {
2716 if (!rbio->have_ioref) {
2717 bch_err_inum_offset_ratelimited(c,
2718 read_pos.inode,
2719 read_pos.offset << 9,
2720 "no device to read from");
2721 bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2722 goto out;
2723 }
2725 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
2726 bio_sectors(&rbio->bio));
2727 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
2729 if (likely(!(flags & BCH_READ_IN_RETRY)))
2730 submit_bio(&rbio->bio);
2731 else
2732 submit_bio_wait(&rbio->bio);
2733 } else {
2734 /* Attempting reconstruct read: */
2735 if (bch2_ec_read_extent(c, rbio)) {
2736 bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2737 goto out;
2738 }
2740 if (likely(!(flags & BCH_READ_IN_RETRY)))
2741 bio_endio(&rbio->bio);
2742 }
2743 out:
2744 if (likely(!(flags & BCH_READ_IN_RETRY))) {
2745 return 0;
2746 } else {
2747 int ret;
2749 rbio->context = RBIO_CONTEXT_UNBOUND;
2750 bch2_read_endio(&rbio->bio);
2752 ret = rbio->retry;
2753 rbio = bch2_rbio_free(rbio);
2755 if (ret == READ_RETRY_AVOID) {
2756 bch2_mark_io_failure(failed, &pick);
2757 ret = READ_RETRY;
2758 }
2760 if (!ret)
2761 goto out_read_done;
2763 return ret;
2764 }
2766 err:
2767 if (flags & BCH_READ_IN_RETRY)
2768 return READ_ERR;
2770 orig->bio.bi_status = BLK_STS_IOERR;
2771 goto out_read_done;
2773 hole:
2774 /*
2775 * won't normally happen in the BCH_READ_NODECODE
2776 * (bch2_move_extent()) path, but if we retry and the extent we wanted
2777 * to read no longer exists we have to signal that:
2778 */
2779 if (flags & BCH_READ_NODECODE)
2780 orig->hole = true;
2782 zero_fill_bio_iter(&orig->bio, iter);
2783 out_read_done:
2784 if (flags & BCH_READ_LAST_FRAGMENT)
2785 bch2_rbio_done(orig);
2786 return 0;
2787 }
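/*
 * Entry point for normal reads: walk the extents btree for the given inode,
 * issuing a read for each extent that overlaps the request, restarting on
 * transaction restarts and retryable read errors.
 */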
2789 void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
2790 struct bvec_iter bvec_iter, subvol_inum inum,
2791 struct bch_io_failures *failed, unsigned flags)
2792 {
2793 struct btree_trans trans;
2794 struct btree_iter iter;
2795 struct bkey_buf sk;
2796 struct bkey_s_c k;
2797 u32 snapshot;
2798 int ret;
2800 BUG_ON(flags & BCH_READ_NODECODE);
2802 bch2_bkey_buf_init(&sk);
2803 bch2_trans_init(&trans, c, 0, 0);
2804 retry:
2805 bch2_trans_begin(&trans);
2806 iter = (struct btree_iter) { NULL };
2808 ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
2809 if (ret)
2810 goto err;
2812 bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
2813 SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
2814 BTREE_ITER_SLOTS);
2815 while (1) {
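/*
 * Each iteration reads one fragment: look up the extent at the current
 * position, resolve indirect extents, then read however much of it overlaps
 * the remainder of the request before advancing bvec_iter.
 */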
2816 unsigned bytes, sectors, offset_into_extent;
2817 enum btree_id data_btree = BTREE_ID_extents;
2819 /*
2820 * read_extent -> io_time_reset may cause a transaction restart
2821 * without returning an error, we need to check for that here:
2822 */
2823 ret = bch2_trans_relock(&trans);
2824 if (ret)
2825 break;
2827 bch2_btree_iter_set_pos(&iter,
2828 POS(inum.inum, bvec_iter.bi_sector));
2830 k = bch2_btree_iter_peek_slot(&iter);
2831 ret = bkey_err(k);
2832 if (ret)
2833 break;
2835 offset_into_extent = iter.pos.offset -
2836 bkey_start_offset(k.k);
2837 sectors = k.k->size - offset_into_extent;
2839 bch2_bkey_buf_reassemble(&sk, c, k);
2841 ret = bch2_read_indirect_extent(&trans, &data_btree,
2842 &offset_into_extent, &sk);
2843 if (ret)
2844 break;
2846 k = bkey_i_to_s_c(sk.k);
2848 /*
2849 * With indirect extents, the amount of data to read is the min
2850 * of the original extent and the indirect extent:
2851 */
2852 sectors = min(sectors, k.k->size - offset_into_extent);
2854 bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
2855 swap(bvec_iter.bi_size, bytes);
2857 if (bvec_iter.bi_size == bytes)
2858 flags |= BCH_READ_LAST_FRAGMENT;
2860 ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
2861 data_btree, k,
2862 offset_into_extent, failed, flags);
2863 if (ret)
2864 break;
2866 if (flags & BCH_READ_LAST_FRAGMENT)
2867 break;
2869 swap(bvec_iter.bi_size, bytes);
2870 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
2872 ret = btree_trans_too_many_iters(&trans);
2873 if (ret)
2874 break;
2875 }
2876 err:
2877 bch2_trans_iter_exit(&trans, &iter);
2879 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
2880 ret == READ_RETRY ||
2881 ret == READ_RETRY_AVOID)
2882 goto retry;
2884 bch2_trans_exit(&trans);
2885 bch2_bkey_buf_exit(&sk, c);
2887 if (ret) {
2888 bch_err_inum_offset_ratelimited(c, inum.inum,
2889 bvec_iter.bi_sector << 9,
2890 "read error %i from btree lookup", ret);
2891 rbio->bio.bi_status = BLK_STS_IOERR;
2892 bch2_rbio_done(rbio);
2893 }
2894 }
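/*
 * Per-filesystem IO state: biosets for reads, read splits and writes, the
 * bounce page mempool, the promote table, and (on init) the nocow locks.
 */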
2896 void bch2_fs_io_exit(struct bch_fs *c)
2897 {
2898 if (c->promote_table.tbl)
2899 rhashtable_destroy(&c->promote_table);
2900 mempool_exit(&c->bio_bounce_pages);
2901 bioset_exit(&c->bio_write);
2902 bioset_exit(&c->bio_read_split);
2903 bioset_exit(&c->bio_read);
2904 }
2906 int bch2_fs_io_init(struct bch_fs *c)
2907 {
2908 unsigned i;
2910 for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++)
2911 two_state_lock_init(&c->nocow_locks.l[i]);
2913 if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
2914 BIOSET_NEED_BVECS) ||
2915 bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
2916 BIOSET_NEED_BVECS) ||
2917 bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
2918 BIOSET_NEED_BVECS) ||
2919 mempool_init_page_pool(&c->bio_bounce_pages,
2920 max_t(unsigned,
2921 c->opts.btree_node_size,
2922 c->opts.encoded_extent_max) /
2923 PAGE_SIZE, 0) ||
2924 rhashtable_init(&c->promote_table, &bch_promote_params))
2925 return -ENOMEM;
2927 return 0;
2928 }