1 // SPDX-License-Identifier: GPL-2.0
3 * Some low level IO code, and hacks for various block layer limitations
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc.
10 #include "alloc_background.h"
11 #include "alloc_foreground.h"
14 #include "btree_update.h"
19 #include "data_update.h"
21 #include "disk_groups.h"
24 #include "extent_update.h"
30 #include "nocow_locking.h"
31 #include "rebalance.h"
32 #include "subvolume.h"
37 #include <linux/blkdev.h>
38 #include <linux/prefetch.h>
39 #include <linux/random.h>
40 #include <linux/sched/mm.h>
42 const char *bch2_blk_status_to_str(blk_status_t status)
44 if (status == BLK_STS_REMOVED)
45 return "device removed";
46 return blk_status_to_str(status);
49 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
51 static bool bch2_target_congested(struct bch_fs *c, u16 target)
53 const struct bch_devs_mask *devs;
54 unsigned d, nr = 0, total = 0;
55 u64 now = local_clock(), last;
63 devs = bch2_target_to_mask(c, target) ?:
64 &c->rw_devs[BCH_DATA_user];
66 for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
67 ca = rcu_dereference(c->devs[d]);
71 congested = atomic_read(&ca->congested);
72 last = READ_ONCE(ca->congested_last);
73 if (time_after64(now, last))
74 congested -= (now - last) >> 12;
76 total += max(congested, 0LL);
81 return bch2_rand_range(nr * CONGESTED_MAX) < total;
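/*
 * Illustrative note: each device's congested score decays by roughly one
 * unit per 4us since its last update (the >> 12 of a nanosecond delta in
 * bch2_congested_acct() below), so the target is reported congested with
 * probability of about total / (nr * CONGESTED_MAX).
 */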
84 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
88 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
89 /* ideally we'd be taking into account the device's variance here: */
90 u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
91 s64 latency_over = io_latency - latency_threshold;
93 if (latency_threshold && latency_over > 0) {
95 * bump up congested by approximately latency_over * 4 /
96 * latency_threshold - we don't need much accuracy here so don't
97 * bother with the divide:
99 if (atomic_read(&ca->congested) < CONGESTED_MAX)
100 atomic_add(latency_over >>
101 max_t(int, ilog2(latency_threshold) - 2, 0),
104 ca->congested_last = now;
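/*
 * Worked example (illustrative): with latency_threshold = 4096ns, ilog2()
 * gives 12, the shift above is 10, and we add latency_over / 1024 -
 * approximately the promised latency_over * 4 / latency_threshold, with no
 * divide.
 */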
105 } else if (atomic_read(&ca->congested) > 0) {
106 atomic_dec(&ca->congested);
110 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
112 atomic64_t *latency = &ca->cur_latency[rw];
113 u64 now = local_clock();
114 u64 io_latency = time_after64(now, submit_time)
117 u64 old, new, v = atomic64_read(latency);
123 * If the io latency was reasonably close to the current
124 * latency, skip doing the update and atomic operation - most of the time there's no need:
127 if (abs((int) (old - io_latency)) < (old >> 1) &&
131 new = ewma_add(old, io_latency, 5);
132 } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
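/*
 * Assuming the usual ewma_add() definition, this keeps a 1/32 weighted
 * moving average: new ~= (31 * old + io_latency) / 32, so a single outlier
 * only nudges the tracked latency.
 */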
134 bch2_congested_acct(ca, io_latency, now, rw);
136 __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
141 static bool bch2_target_congested(struct bch_fs *c, u16 target)
148 /* Allocate, free from mempool: */
150 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
152 struct bvec_iter_all iter;
155 bio_for_each_segment_all(bv, bio, iter)
156 if (bv->bv_page != ZERO_PAGE(0))
157 mempool_free(bv->bv_page, &c->bio_bounce_pages);
161 static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
165 if (likely(!*using_mempool)) {
166 page = alloc_page(GFP_NOFS);
167 if (unlikely(!page)) {
168 mutex_lock(&c->bio_bounce_pages_lock);
169 *using_mempool = true;
175 page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
181 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
184 bool using_mempool = false;
187 struct page *page = __bio_alloc_page_pool(c, &using_mempool);
188 unsigned len = min_t(size_t, PAGE_SIZE, size);
190 BUG_ON(!bio_add_page(bio, page, len, 0));
195 mutex_unlock(&c->bio_bounce_pages_lock);
198 /* Extent update path: */
200 int bch2_sum_sector_overwrites(struct btree_trans *trans,
201 struct btree_iter *extent_iter,
203 bool *usage_increasing,
204 s64 *i_sectors_delta,
205 s64 *disk_sectors_delta)
207 struct bch_fs *c = trans->c;
208 struct btree_iter iter;
210 unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
211 bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
214 *usage_increasing = false;
215 *i_sectors_delta = 0;
216 *disk_sectors_delta = 0;
218 bch2_trans_copy_iter(&iter, extent_iter);
220 for_each_btree_key_upto_continue_norestart(iter,
221 new->k.p, BTREE_ITER_SLOTS, old, ret) {
222 s64 sectors = min(new->k.p.offset, old.k->p.offset) -
223 max(bkey_start_offset(&new->k),
224 bkey_start_offset(old.k));
226 *i_sectors_delta += sectors *
227 (bkey_extent_is_allocation(&new->k) -
228 bkey_extent_is_allocation(old.k));
230 *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
231 *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
232 ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
235 if (!*usage_increasing &&
236 (new->k.p.snapshot != old.k->p.snapshot ||
237 new_replicas > bch2_bkey_replicas(c, old) ||
238 (!new_compressed && bch2_bkey_sectors_compressed(old))))
239 *usage_increasing = true;
241 if (bkey_ge(old.k->p, new->k.p))
245 bch2_trans_iter_exit(trans, &iter);
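/*
 * Illustrative example: overwriting 8 fully allocated sectors in the same
 * snapshot with an 8 sector, 2 replica extent gives i_sectors_delta = 0 and
 * disk_sectors_delta = 8 * 2 - 8 * 2 = 0; overwriting an 8 sector hole
 * instead gives i_sectors_delta = +8 and disk_sectors_delta = +16.
 */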
249 static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
250 struct btree_iter *extent_iter,
254 struct btree_iter iter;
256 struct bkey_i_inode_v3 *inode;
257 unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
260 k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
262 extent_iter->pos.inode,
263 extent_iter->snapshot),
265 ret = PTR_ERR_OR_ZERO(k);
269 if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
270 k = bch2_inode_to_v3(trans, k);
271 ret = PTR_ERR_OR_ZERO(k);
276 inode = bkey_i_to_inode_v3(k);
278 if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
279 new_i_size > le64_to_cpu(inode->v.bi_size)) {
280 inode->v.bi_size = cpu_to_le64(new_i_size);
281 inode_update_flags = 0;
284 if (i_sectors_delta) {
285 le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
286 inode_update_flags = 0;
289 if (inode->k.p.snapshot != iter.snapshot) {
290 inode->k.p.snapshot = iter.snapshot;
291 inode_update_flags = 0;
294 ret = bch2_trans_update(trans, &iter, &inode->k_i,
295 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
298 bch2_trans_iter_exit(trans, &iter);
302 int bch2_extent_update(struct btree_trans *trans,
304 struct btree_iter *iter,
306 struct disk_reservation *disk_res,
308 s64 *i_sectors_delta_total,
311 struct bpos next_pos;
312 bool usage_increasing;
313 s64 i_sectors_delta = 0, disk_sectors_delta = 0;
317 * This traverses the iterator without changing iter->path->pos to
318 * search_key() (which is pos + 1 for extents): we want there to be a
319 * path already traversed at iter->pos because
320 * bch2_trans_extent_update() will use it to attempt extent merging
322 ret = __bch2_btree_iter_traverse(iter);
326 ret = bch2_extent_trim_atomic(trans, iter, k);
332 ret = bch2_sum_sector_overwrites(trans, iter, k,
335 &disk_sectors_delta);
340 disk_sectors_delta > (s64) disk_res->sectors) {
341 ret = bch2_disk_reservation_add(trans->c, disk_res,
342 disk_sectors_delta - disk_res->sectors,
343 !check_enospc || !usage_increasing
344 ? BCH_DISK_RESERVATION_NOFAIL : 0);
351 * We always have to do an inode update - even when i_size/i_sectors
352 * aren't changing - for fsync to work properly; fsync relies on
353 * inode->bi_journal_seq which is updated by the trigger code:
355 ret = bch2_extent_update_i_size_sectors(trans, iter,
356 min(k->k.p.offset << 9, new_i_size),
358 bch2_trans_update(trans, iter, k, 0) ?:
359 bch2_trans_commit(trans, disk_res, NULL,
360 BTREE_INSERT_NOCHECK_RW|
361 BTREE_INSERT_NOFAIL);
365 if (i_sectors_delta_total)
366 *i_sectors_delta_total += i_sectors_delta;
367 bch2_btree_iter_set_pos(iter, next_pos);
371 /* Overwrites whatever was present with zeroes: */
372 int bch2_extent_fallocate(struct btree_trans *trans,
374 struct btree_iter *iter,
376 struct bch_io_opts opts,
377 s64 *i_sectors_delta,
378 struct write_point_specifier write_point)
380 struct bch_fs *c = trans->c;
381 struct disk_reservation disk_res = { 0 };
383 struct open_buckets open_buckets = { 0 };
385 struct bkey_buf old, new;
386 unsigned sectors_allocated = 0;
387 bool have_reservation = false;
388 bool unwritten = opts.nocow &&
389 c->sb.version >= bcachefs_metadata_version_unwritten_extents;
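/*
 * Two fallocate strategies, as seen below: normally we insert a reservation
 * key and leave allocation for later; for nocow files on a new enough
 * filesystem we allocate real buckets up front and mark the pointers
 * unwritten.
 */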
392 bch2_bkey_buf_init(&old);
393 bch2_bkey_buf_init(&new);
394 closure_init_stack(&cl);
396 k = bch2_btree_iter_peek_slot(iter);
401 sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
403 if (!have_reservation) {
404 unsigned new_replicas =
405 max(0, (int) opts.data_replicas -
406 (int) bch2_bkey_nr_ptrs_fully_allocated(k));
408 * Get a disk reservation before (in the nocow case) calling
409 * into the allocator:
411 ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
415 bch2_bkey_buf_reassemble(&old, c, k);
418 if (have_reservation) {
419 if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
422 bch2_key_resize(&new.k->k, sectors);
423 } else if (!unwritten) {
424 struct bkey_i_reservation *reservation;
426 bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
427 reservation = bkey_reservation_init(new.k);
428 reservation->k.p = iter->pos;
429 bch2_key_resize(&reservation->k, sectors);
430 reservation->v.nr_replicas = opts.data_replicas;
432 struct bkey_i_extent *e;
433 struct bch_devs_list devs_have;
434 struct write_point *wp;
435 struct bch_extent_ptr *ptr;
439 bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
441 e = bkey_extent_init(new.k);
444 ret = bch2_alloc_sectors_start_trans(trans,
445 opts.foreground_target,
451 BCH_WATERMARK_normal, 0, &cl, &wp);
452 if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
453 ret = -BCH_ERR_transaction_restart_nested;
457 sectors = min(sectors, wp->sectors_free);
458 sectors_allocated = sectors;
460 bch2_key_resize(&e->k, sectors);
462 bch2_open_bucket_get(c, wp, &open_buckets);
463 bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
464 bch2_alloc_sectors_done(c, wp);
466 extent_for_each_ptr(extent_i_to_s(e), ptr)
467 ptr->unwritten = true;
470 have_reservation = true;
472 ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
473 0, i_sectors_delta, true);
475 if (!ret && sectors_allocated)
476 bch2_increment_clock(c, sectors_allocated, WRITE);
478 bch2_open_buckets_put(c, &open_buckets);
479 bch2_disk_reservation_put(c, &disk_res);
480 bch2_bkey_buf_exit(&new, c);
481 bch2_bkey_buf_exit(&old, c);
483 if (closure_nr_remaining(&cl) != 1) {
484 bch2_trans_unlock(trans);
492 * Returns -BCH_ERR_transaction_restart if we had to drop locks:
494 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
495 subvol_inum inum, u64 end,
496 s64 *i_sectors_delta)
498 struct bch_fs *c = trans->c;
499 unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
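/*
 * Assuming block_bits is log2 of the block size in sectors, max_sectors is
 * the largest key size rounded down to whole filesystem blocks - e.g. with
 * 4KiB blocks the low three bits are cleared, keeping it a multiple of 8
 * sectors.
 */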
500 struct bpos end_pos = POS(inum.inum, end);
502 int ret = 0, ret2 = 0;
506 bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
507 struct disk_reservation disk_res =
508 bch2_disk_reservation_init(c, 0);
509 struct bkey_i delete;
514 bch2_trans_begin(trans);
516 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
520 bch2_btree_iter_set_snapshot(iter, snapshot);
523 * peek_upto() doesn't have ideal semantics for extents:
525 k = bch2_btree_iter_peek_upto(iter, end_pos);
533 bkey_init(&delete.k);
534 delete.k.p = iter->pos;
536 /* create the biggest key we can */
537 bch2_key_resize(&delete.k, max_sectors);
538 bch2_cut_back(end_pos, &delete);
540 ret = bch2_extent_update(trans, inum, iter, &delete,
541 &disk_res, 0, i_sectors_delta, false);
542 bch2_disk_reservation_put(c, &disk_res);
548 int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
549 s64 *i_sectors_delta)
551 struct btree_trans trans;
552 struct btree_iter iter;
555 bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
556 bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
557 POS(inum.inum, start),
560 ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta);
562 bch2_trans_iter_exit(&trans, &iter);
563 bch2_trans_exit(&trans);
565 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
571 static int bch2_write_index_default(struct bch_write_op *op)
573 struct bch_fs *c = op->c;
575 struct keylist *keys = &op->insert_keys;
576 struct bkey_i *k = bch2_keylist_front(keys);
577 struct btree_trans trans;
578 struct btree_iter iter;
580 .subvol = op->subvol,
581 .inum = k->k.p.inode,
585 BUG_ON(!inum.subvol);
587 bch2_bkey_buf_init(&sk);
588 bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
591 bch2_trans_begin(&trans);
593 k = bch2_keylist_front(keys);
594 bch2_bkey_buf_copy(&sk, c, k);
596 ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
597 &sk.k->k.p.snapshot);
598 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
603 bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
604 bkey_start_pos(&sk.k->k),
605 BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
607 ret = bch2_extent_update(&trans, inum, &iter, sk.k,
609 op->new_i_size, &op->i_sectors_delta,
610 op->flags & BCH_WRITE_CHECK_ENOSPC);
611 bch2_trans_iter_exit(&trans, &iter);
613 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
618 if (bkey_ge(iter.pos, k->k.p))
619 bch2_keylist_pop_front(&op->insert_keys);
621 bch2_cut_front(iter.pos, k);
622 } while (!bch2_keylist_empty(keys));
624 bch2_trans_exit(&trans);
625 bch2_bkey_buf_exit(&sk, c);
632 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
633 enum bch_data_type type,
634 const struct bkey_i *k,
637 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
638 const struct bch_extent_ptr *ptr;
639 struct bch_write_bio *n;
642 BUG_ON(c->opts.nochanges);
644 bkey_for_each_ptr(ptrs, ptr) {
645 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
648 ca = bch_dev_bkey_exists(c, ptr->dev);
650 if (to_entry(ptr + 1) < ptrs.end) {
651 n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
652 GFP_NOFS, &ca->replica_set));
654 n->bio.bi_end_io = wbio->bio.bi_end_io;
655 n->bio.bi_private = wbio->bio.bi_private;
660 n->bio.bi_opf = wbio->bio.bi_opf;
661 bio_inc_remaining(&wbio->bio);
669 n->have_ioref = nocow || bch2_dev_get_ioref(ca,
670 type == BCH_DATA_btree ? READ : WRITE);
672 n->submit_time = local_clock();
673 n->inode_offset = bkey_start_offset(&k->k);
674 n->bio.bi_iter.bi_sector = ptr->offset;
676 if (likely(n->have_ioref)) {
677 this_cpu_add(ca->io_done->sectors[WRITE][type],
678 bio_sectors(&n->bio));
680 bio_set_dev(&n->bio, ca->disk_sb.bdev);
682 if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
689 n->bio.bi_status = BLK_STS_REMOVED;
695 static void __bch2_write(struct bch_write_op *);
697 static void bch2_write_done(struct closure *cl)
699 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
700 struct bch_fs *c = op->c;
702 EBUG_ON(op->open_buckets.nr);
704 bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
705 bch2_disk_reservation_put(c, &op->res);
707 if (!(op->flags & BCH_WRITE_MOVE))
708 bch2_write_ref_put(c, BCH_WRITE_REF_write);
709 bch2_keylist_free(&op->insert_keys, op->inline_keys);
712 closure_debug_destroy(cl);
717 static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
719 struct keylist *keys = &op->insert_keys;
720 struct bch_extent_ptr *ptr;
721 struct bkey_i *src, *dst = keys->keys, *n;
723 for (src = keys->keys; src != keys->top; src = n) {
726 if (bkey_extent_is_direct_data(&src->k)) {
727 bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
728 test_bit(ptr->dev, op->failed.d));
730 if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
735 memmove_u64s_down(dst, src, src->k.u64s);
736 dst = bkey_next(dst);
744 * bch_write_index - after a write, update index to point to new data
746 static void __bch2_write_index(struct bch_write_op *op)
748 struct bch_fs *c = op->c;
749 struct keylist *keys = &op->insert_keys;
754 if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
755 ret = bch2_write_drop_io_error_ptrs(op);
761 * probably not the ideal place to hook this in, but I don't
762 * particularly want to plumb io_opts all the way through the btree
763 * update stack right now
765 for_each_keylist_key(keys, k)
766 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
768 if (!bch2_keylist_empty(keys)) {
769 u64 sectors_start = keylist_sectors(keys);
771 ret = !(op->flags & BCH_WRITE_MOVE)
772 ? bch2_write_index_default(op)
773 : bch2_data_update_index_update(op);
775 BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
776 BUG_ON(keylist_sectors(keys) && !ret);
778 op->written += sectors_start - keylist_sectors(keys);
780 if (ret && !bch2_err_matches(ret, EROFS)) {
781 struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
783 bch_err_inum_offset_ratelimited(c,
784 k->k.p.inode, k->k.p.offset << 9,
785 "write error while doing btree update: %s",
793 /* If a bucket wasn't written, we can't erasure code it: */
794 for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
795 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
797 bch2_open_buckets_put(c, &op->open_buckets);
800 keys->top = keys->keys;
802 op->flags |= BCH_WRITE_DONE;
806 static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
808 if (state != wp->state) {
809 u64 now = ktime_get_ns();
811 if (wp->last_state_change &&
812 time_after64(now, wp->last_state_change))
813 wp->time[wp->state] += now - wp->last_state_change;
815 wp->last_state_change = now;
819 static inline void wp_update_state(struct write_point *wp, bool running)
821 enum write_point_state state;
823 state = running ? WRITE_POINT_running :
824 !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
825 : WRITE_POINT_stopped;
827 __wp_update_state(wp, state);
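/*
 * Write point time accounting (summary): __wp_update_state() charges the
 * time since the last transition to whichever state the write point was in,
 * so each write point tracks how long it spent running, waiting on IO, or
 * stopped.
 */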
830 static void bch2_write_index(struct closure *cl)
832 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
833 struct write_point *wp = op->wp;
834 struct workqueue_struct *wq = index_update_wq(op);
837 if ((op->flags & BCH_WRITE_DONE) &&
838 (op->flags & BCH_WRITE_MOVE))
839 bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
841 spin_lock_irqsave(&wp->writes_lock, flags);
842 if (wp->state == WRITE_POINT_waiting_io)
843 __wp_update_state(wp, WRITE_POINT_waiting_work);
844 list_add_tail(&op->wp_list, &wp->writes);
845 spin_unlock_irqrestore(&wp->writes_lock, flags);
847 queue_work(wq, &wp->index_update_work);
850 static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
854 if (wp->state == WRITE_POINT_stopped) {
855 spin_lock_irq(&wp->writes_lock);
856 __wp_update_state(wp, WRITE_POINT_waiting_io);
857 spin_unlock_irq(&wp->writes_lock);
861 void bch2_write_point_do_index_updates(struct work_struct *work)
863 struct write_point *wp =
864 container_of(work, struct write_point, index_update_work);
865 struct bch_write_op *op;
868 spin_lock_irq(&wp->writes_lock);
869 op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
871 list_del(&op->wp_list);
872 wp_update_state(wp, op != NULL);
873 spin_unlock_irq(&wp->writes_lock);
878 op->flags |= BCH_WRITE_IN_WORKER;
880 __bch2_write_index(op);
882 if (!(op->flags & BCH_WRITE_DONE))
885 bch2_write_done(&op->cl);
889 static void bch2_write_endio(struct bio *bio)
891 struct closure *cl = bio->bi_private;
892 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
893 struct bch_write_bio *wbio = to_wbio(bio);
894 struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
895 struct bch_fs *c = wbio->c;
896 struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
898 if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
900 wbio->inode_offset << 9,
901 "data write error: %s",
902 bch2_blk_status_to_str(bio->bi_status))) {
903 set_bit(wbio->dev, op->failed.d);
904 op->flags |= BCH_WRITE_IO_ERROR;
908 set_bit(wbio->dev, op->devs_need_flush->d);
910 if (wbio->have_ioref) {
911 bch2_latency_acct(ca, wbio->submit_time, WRITE);
912 percpu_ref_put(&ca->io_ref);
916 bch2_bio_free_pages_pool(c, bio);
922 bio_endio(&parent->bio);
927 static void init_append_extent(struct bch_write_op *op,
928 struct write_point *wp,
929 struct bversion version,
930 struct bch_extent_crc_unpacked crc)
932 struct bkey_i_extent *e;
934 op->pos.offset += crc.uncompressed_size;
936 e = bkey_extent_init(op->insert_keys.top);
938 e->k.size = crc.uncompressed_size;
939 e->k.version = version;
942 crc.compression_type ||
944 bch2_extent_crc_append(&e->k_i, crc);
946 bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
947 op->flags & BCH_WRITE_CACHED);
949 bch2_keylist_push(&op->insert_keys);
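/*
 * Each chunk written gets its own extent key appended to op->insert_keys;
 * the keys are only inserted into the btree later, by __bch2_write_index(),
 * once the data IO has completed.
 */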
952 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
953 struct write_point *wp,
955 bool *page_alloc_failed,
958 struct bch_write_bio *wbio;
960 unsigned output_available =
961 min(wp->sectors_free << 9, src->bi_iter.bi_size);
962 unsigned pages = DIV_ROUND_UP(output_available +
964 ? ((unsigned long) buf & (PAGE_SIZE - 1))
967 pages = min(pages, BIO_MAX_VECS);
969 bio = bio_alloc_bioset(NULL, pages, 0,
970 GFP_NOFS, &c->bio_write);
971 wbio = wbio_init(bio);
972 wbio->put_bio = true;
973 /* copy WRITE_SYNC flag */
974 wbio->bio.bi_opf = src->bi_opf;
977 bch2_bio_map(bio, buf, output_available);
984 * We can't use mempool for more than c->sb.encoded_extent_max
985 * worth of pages, but we'd like to allocate more if we can:
987 bch2_bio_alloc_pages_pool(c, bio,
988 min_t(unsigned, output_available,
989 c->opts.encoded_extent_max));
991 if (bio->bi_iter.bi_size < output_available)
993 bch2_bio_alloc_pages(bio,
995 bio->bi_iter.bi_size,
1001 static int bch2_write_rechecksum(struct bch_fs *c,
1002 struct bch_write_op *op,
1003 unsigned new_csum_type)
1005 struct bio *bio = &op->wbio.bio;
1006 struct bch_extent_crc_unpacked new_crc;
1009 /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
1011 if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
1012 bch2_csum_type_is_encryption(new_csum_type))
1013 new_csum_type = op->crc.csum_type;
1015 ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
1017 op->crc.offset, op->crc.live_size,
1022 bio_advance(bio, op->crc.offset << 9);
1023 bio->bi_iter.bi_size = op->crc.live_size << 9;
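/*
 * The bio_advance()/bi_size adjustment above trims the bio to just the live
 * region the new checksum covers, so the data we submit matches what we
 * checksummed.
 */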
1028 static int bch2_write_decrypt(struct bch_write_op *op)
1030 struct bch_fs *c = op->c;
1031 struct nonce nonce = extent_nonce(op->version, op->crc);
1032 struct bch_csum csum;
1035 if (!bch2_csum_type_is_encryption(op->crc.csum_type))
1039 * If we need to decrypt data in the write path, we'll no longer be able
1040 * to verify the existing checksum (poly1305 mac, in this case) after
1041 * it's decrypted - this is the last point we'll be able to reverify the checksum:
1044 csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
1045 if (bch2_crc_cmp(op->crc.csum, csum))
1048 ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
1049 op->crc.csum_type = 0;
1050 op->crc.csum = (struct bch_csum) { 0, 0 };
1054 static enum prep_encoded_ret {
1057 PREP_ENCODED_CHECKSUM_ERR,
1058 PREP_ENCODED_DO_WRITE,
1059 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
1061 struct bch_fs *c = op->c;
1062 struct bio *bio = &op->wbio.bio;
1064 if (!(op->flags & BCH_WRITE_DATA_ENCODED))
1065 return PREP_ENCODED_OK;
1067 BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
1069 /* Can we just write the entire extent as is? */
1070 if (op->crc.uncompressed_size == op->crc.live_size &&
1071 op->crc.compressed_size <= wp->sectors_free &&
1072 (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
1073 op->incompressible)) {
1074 if (!crc_is_compressed(op->crc) &&
1075 op->csum_type != op->crc.csum_type &&
1076 bch2_write_rechecksum(c, op, op->csum_type) &&
1077 !c->opts.no_data_io)
1078 return PREP_ENCODED_CHECKSUM_ERR;
1080 return PREP_ENCODED_DO_WRITE;
1084 * If the data is compressed and we couldn't write the entire extent as
1085 * is, we have to decompress it:
1087 if (crc_is_compressed(op->crc)) {
1088 struct bch_csum csum;
1090 if (bch2_write_decrypt(op))
1091 return PREP_ENCODED_CHECKSUM_ERR;
1093 /* Last point we can still verify checksum: */
1094 csum = bch2_checksum_bio(c, op->crc.csum_type,
1095 extent_nonce(op->version, op->crc),
1097 if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
1098 return PREP_ENCODED_CHECKSUM_ERR;
1100 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
1101 return PREP_ENCODED_ERR;
1105 * No longer have compressed data after this point - data might be encrypted:
1110 * If the data is checksummed and we're only writing a subset,
1111 * rechecksum and adjust bio to point to currently live data:
1113 if ((op->crc.live_size != op->crc.uncompressed_size ||
1114 op->crc.csum_type != op->csum_type) &&
1115 bch2_write_rechecksum(c, op, op->csum_type) &&
1116 !c->opts.no_data_io)
1117 return PREP_ENCODED_CHECKSUM_ERR;
1120 * If we want to compress the data, it has to be decrypted:
1122 if ((op->compression_opt ||
1123 bch2_csum_type_is_encryption(op->crc.csum_type) !=
1124 bch2_csum_type_is_encryption(op->csum_type)) &&
1125 bch2_write_decrypt(op))
1126 return PREP_ENCODED_CHECKSUM_ERR;
1128 return PREP_ENCODED_OK;
1131 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
1134 struct bch_fs *c = op->c;
1135 struct bio *src = &op->wbio.bio, *dst = src;
1136 struct bvec_iter saved_iter;
1138 unsigned total_output = 0, total_input = 0;
1139 bool bounce = false;
1140 bool page_alloc_failed = false;
1143 BUG_ON(!bio_sectors(src));
1145 ec_buf = bch2_writepoint_ec_buf(c, wp);
1147 switch (bch2_write_prep_encoded_data(op, wp)) {
1148 case PREP_ENCODED_OK:
1150 case PREP_ENCODED_ERR:
1153 case PREP_ENCODED_CHECKSUM_ERR:
1155 case PREP_ENCODED_DO_WRITE:
1156 /* XXX look for bug here */
1158 dst = bch2_write_bio_alloc(c, wp, src,
1161 bio_copy_data(dst, src);
1164 init_append_extent(op, wp, op->version, op->crc);
1169 op->compression_opt ||
1171 !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
1172 (bch2_csum_type_is_encryption(op->csum_type) &&
1173 !(op->flags & BCH_WRITE_PAGES_OWNED))) {
1174 dst = bch2_write_bio_alloc(c, wp, src,
1180 saved_iter = dst->bi_iter;
1183 struct bch_extent_crc_unpacked crc = { 0 };
1184 struct bversion version = op->version;
1185 size_t dst_len, src_len;
1187 if (page_alloc_failed &&
1188 dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
1189 dst->bi_iter.bi_size < c->opts.encoded_extent_max)
1192 BUG_ON(op->compression_opt &&
1193 (op->flags & BCH_WRITE_DATA_ENCODED) &&
1194 bch2_csum_type_is_encryption(op->crc.csum_type));
1195 BUG_ON(op->compression_opt && !bounce);
1197 crc.compression_type = op->incompressible
1198 ? BCH_COMPRESSION_TYPE_incompressible
1199 : op->compression_opt
1200 ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
1201 op->compression_opt)
1203 if (!crc_is_compressed(crc)) {
1204 dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
1205 dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
1208 dst_len = min_t(unsigned, dst_len,
1209 c->opts.encoded_extent_max);
1212 swap(dst->bi_iter.bi_size, dst_len);
1213 bio_copy_data(dst, src);
1214 swap(dst->bi_iter.bi_size, dst_len);
1220 BUG_ON(!src_len || !dst_len);
1222 if (bch2_csum_type_is_encryption(op->csum_type)) {
1223 if (bversion_zero(version)) {
1224 version.lo = atomic64_inc_return(&c->key_version);
1226 crc.nonce = op->nonce;
1227 op->nonce += src_len >> 9;
1231 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
1232 !crc_is_compressed(crc) &&
1233 bch2_csum_type_is_encryption(op->crc.csum_type) ==
1234 bch2_csum_type_is_encryption(op->csum_type)) {
1235 u8 compression_type = crc.compression_type;
1236 u16 nonce = crc.nonce;
1238 * Note: when we're using rechecksum(), we need to be
1239 * checksumming @src because it has all the data our
1240 * existing checksum covers - if we bounced (because we
1241 * were trying to compress), @dst will only have the
1242 * part of the data the new checksum will cover.
1244 * But normally we want to be checksumming post bounce,
1245 * because part of the reason for bouncing is so the
1246 * data can't be modified (by userspace) while it's in
1249 if (bch2_rechecksum_bio(c, src, version, op->crc,
1252 bio_sectors(src) - (src_len >> 9),
1256 * rechecksum_bio sets compression_type on crc from op->crc,
1257 * this isn't always correct as sometimes we're changing
1258 * an extent from uncompressed to incompressible.
1260 crc.compression_type = compression_type;
1263 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
1264 bch2_rechecksum_bio(c, src, version, op->crc,
1267 bio_sectors(src) - (src_len >> 9),
1271 crc.compressed_size = dst_len >> 9;
1272 crc.uncompressed_size = src_len >> 9;
1273 crc.live_size = src_len >> 9;
1275 swap(dst->bi_iter.bi_size, dst_len);
1276 ret = bch2_encrypt_bio(c, op->csum_type,
1277 extent_nonce(version, crc), dst);
1281 crc.csum = bch2_checksum_bio(c, op->csum_type,
1282 extent_nonce(version, crc), dst);
1283 crc.csum_type = op->csum_type;
1284 swap(dst->bi_iter.bi_size, dst_len);
1287 init_append_extent(op, wp, version, crc);
1290 bio_advance(dst, dst_len);
1291 bio_advance(src, src_len);
1292 total_output += dst_len;
1293 total_input += src_len;
1294 } while (dst->bi_iter.bi_size &&
1295 src->bi_iter.bi_size &&
1297 !bch2_keylist_realloc(&op->insert_keys,
1299 ARRAY_SIZE(op->inline_keys),
1300 BKEY_EXTENT_U64s_MAX));
1302 more = src->bi_iter.bi_size != 0;
1304 dst->bi_iter = saved_iter;
1306 if (dst == src && more) {
1307 BUG_ON(total_output != total_input);
1309 dst = bio_split(src, total_input >> 9,
1310 GFP_NOFS, &c->bio_write);
1311 wbio_init(dst)->put_bio = true;
1312 /* copy WRITE_SYNC flag */
1313 dst->bi_opf = src->bi_opf;
1316 dst->bi_iter.bi_size = total_output;
1321 bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
1324 if (to_wbio(dst)->bounce)
1325 bch2_bio_free_pages_pool(c, dst);
1326 if (to_wbio(dst)->put_bio)
1332 static bool bch2_extent_is_writeable(struct bch_write_op *op,
1335 struct bch_fs *c = op->c;
1336 struct bkey_s_c_extent e;
1337 struct extent_ptr_decoded p;
1338 const union bch_extent_entry *entry;
1339 unsigned replicas = 0;
1341 if (k.k->type != KEY_TYPE_extent)
1344 e = bkey_s_c_to_extent(k);
1345 extent_for_each_ptr_decode(e, p, entry) {
1346 if (p.crc.csum_type ||
1347 crc_is_compressed(p.crc) ||
1351 replicas += bch2_extent_ptr_durability(c, &p);
1354 return replicas >= op->opts.data_replicas;
1357 static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
1359 struct bch_fs *c = op->c;
1360 const struct bch_extent_ptr *ptr;
1363 for_each_keylist_key(&op->insert_keys, k) {
1364 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
1366 bkey_for_each_ptr(ptrs, ptr)
1367 bch2_bucket_nocow_unlock(&c->nocow_locks,
1368 PTR_BUCKET_POS(c, ptr),
1369 BUCKET_NOCOW_LOCK_UPDATE);
1373 static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
1374 struct btree_iter *iter,
1375 struct bkey_i *orig,
1380 struct bkey_ptrs ptrs;
1381 struct bch_extent_ptr *ptr;
1384 if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
1389 new = bch2_bkey_make_mut_noupdate(trans, k);
1390 ret = PTR_ERR_OR_ZERO(new);
1394 bch2_cut_front(bkey_start_pos(&orig->k), new);
1395 bch2_cut_back(orig->k.p, new);
1397 ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
1398 bkey_for_each_ptr(ptrs, ptr)
1402 * Note that we're not calling bch2_subvol_get_snapshot() in this path -
1403 * that was done when we kicked off the write, and here it's important
1404 * that we update the extent that we wrote to - even if a snapshot has
1405 * since been created. The write is still outstanding, so we're ok
1406 * w.r.t. snapshot atomicity:
1408 return bch2_extent_update_i_size_sectors(trans, iter,
1409 min(new->k.p.offset << 9, new_i_size), 0) ?:
1410 bch2_trans_update(trans, iter, new,
1411 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1414 static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
1416 struct bch_fs *c = op->c;
1417 struct btree_trans trans;
1418 struct btree_iter iter;
1419 struct bkey_i *orig;
1423 bch2_trans_init(&trans, c, 0, 0);
1425 for_each_keylist_key(&op->insert_keys, orig) {
1426 ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents,
1427 bkey_start_pos(&orig->k), orig->k.p,
1428 BTREE_ITER_INTENT, k,
1429 NULL, NULL, BTREE_INSERT_NOFAIL, ({
1430 bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size);
1433 if (ret && !bch2_err_matches(ret, EROFS)) {
1434 struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
1436 bch_err_inum_offset_ratelimited(c,
1437 k->k.p.inode, k->k.p.offset << 9,
1438 "write error while doing btree update: %s",
1448 bch2_trans_exit(&trans);
1451 static void __bch2_nocow_write_done(struct bch_write_op *op)
1453 bch2_nocow_write_unlock(op);
1455 if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
1457 } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
1458 bch2_nocow_write_convert_unwritten(op);
1461 static void bch2_nocow_write_done(struct closure *cl)
1463 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1465 __bch2_nocow_write_done(op);
1466 bch2_write_done(cl);
1469 static void bch2_nocow_write(struct bch_write_op *op)
1471 struct bch_fs *c = op->c;
1472 struct btree_trans trans;
1473 struct btree_iter iter;
1475 struct bkey_ptrs_c ptrs;
1476 const struct bch_extent_ptr *ptr;
1480 struct nocow_lock_bucket *l;
1481 } buckets[BCH_REPLICAS_MAX];
1482 unsigned nr_buckets = 0;
1486 if (op->flags & BCH_WRITE_MOVE)
1489 bch2_trans_init(&trans, c, 0, 0);
1491 bch2_trans_begin(&trans);
1493 ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot);
1497 bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
1498 SPOS(op->pos.inode, op->pos.offset, snapshot),
1501 struct bio *bio = &op->wbio.bio;
1505 k = bch2_btree_iter_peek_slot(&iter);
1510 /* fall back to normal cow write path? */
1511 if (unlikely(k.k->p.snapshot != snapshot ||
1512 !bch2_extent_is_writeable(op, k)))
1515 if (bch2_keylist_realloc(&op->insert_keys,
1517 ARRAY_SIZE(op->inline_keys),
1521 /* Get iorefs before dropping btree locks: */
1522 ptrs = bch2_bkey_ptrs_c(k);
1523 bkey_for_each_ptr(ptrs, ptr) {
1524 buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
1525 buckets[nr_buckets].gen = ptr->gen;
1526 buckets[nr_buckets].l =
1527 bucket_nocow_lock(&c->nocow_locks,
1528 bucket_to_u64(buckets[nr_buckets].b));
1530 prefetch(buckets[nr_buckets].l);
1532 if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
1538 op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
1541 /* Unlock before taking nocow locks, doing IO: */
1542 bkey_reassemble(op->insert_keys.top, k);
1543 bch2_trans_unlock(&trans);
1545 bch2_cut_front(op->pos, op->insert_keys.top);
1546 if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
1547 bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
1549 for (i = 0; i < nr_buckets; i++) {
1550 struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
1551 struct nocow_lock_bucket *l = buckets[i].l;
1554 __bch2_bucket_nocow_lock(&c->nocow_locks, l,
1555 bucket_to_u64(buckets[i].b),
1556 BUCKET_NOCOW_LOCK_UPDATE);
1559 stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
1562 if (unlikely(stale))
1563 goto err_bucket_stale;
1566 bio = &op->wbio.bio;
1567 if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
1568 bio = bio_split(bio, k.k->p.offset - op->pos.offset,
1569 GFP_KERNEL, &c->bio_write);
1570 wbio_init(bio)->put_bio = true;
1571 bio->bi_opf = op->wbio.bio.bi_opf;
1573 op->flags |= BCH_WRITE_DONE;
1576 op->pos.offset += bio_sectors(bio);
1577 op->written += bio_sectors(bio);
1579 bio->bi_end_io = bch2_write_endio;
1580 bio->bi_private = &op->cl;
1581 bio->bi_opf |= REQ_OP_WRITE;
1582 closure_get(&op->cl);
1583 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
1584 op->insert_keys.top, true);
1586 bch2_keylist_push(&op->insert_keys);
1587 if (op->flags & BCH_WRITE_DONE)
1589 bch2_btree_iter_advance(&iter);
1592 bch2_trans_iter_exit(&trans, &iter);
1594 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1598 bch_err_inum_offset_ratelimited(c,
1600 op->pos.offset << 9,
1601 "%s: btree lookup error %s",
1602 __func__, bch2_err_str(ret));
1604 op->flags |= BCH_WRITE_DONE;
1607 bch2_trans_exit(&trans);
1609 /* fall back to cow write path? */
1610 if (!(op->flags & BCH_WRITE_DONE)) {
1611 closure_sync(&op->cl);
1612 __bch2_nocow_write_done(op);
1613 op->insert_keys.top = op->insert_keys.keys;
1614 } else if (op->flags & BCH_WRITE_SYNC) {
1615 closure_sync(&op->cl);
1616 bch2_nocow_write_done(&op->cl);
1620 * needs to run out of process context because ei_quota_lock is a mutex
1623 continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
1627 for (i = 0; i < nr_buckets; i++)
1628 percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
1630 /* Fall back to COW path: */
1634 bch2_bucket_nocow_unlock(&c->nocow_locks,
1636 BUCKET_NOCOW_LOCK_UPDATE);
1637 for (i = 0; i < nr_buckets; i++)
1638 percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
1640 /* We can retry this: */
1641 ret = -BCH_ERR_transaction_restart;
1645 static void __bch2_write(struct bch_write_op *op)
1647 struct bch_fs *c = op->c;
1648 struct write_point *wp = NULL;
1649 struct bio *bio = NULL;
1650 unsigned nofs_flags;
1653 nofs_flags = memalloc_nofs_save();
1655 if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
1656 bch2_nocow_write(op);
1657 if (op->flags & BCH_WRITE_DONE)
1658 goto out_nofs_restore;
1661 memset(&op->failed, 0, sizeof(op->failed));
1664 struct bkey_i *key_to_write;
1665 unsigned key_to_write_offset = op->insert_keys.top_p -
1666 op->insert_keys.keys_p;
1668 /* +1 for possible cache device: */
1669 if (op->open_buckets.nr + op->nr_replicas + 1 >
1670 ARRAY_SIZE(op->open_buckets.v))
1673 if (bch2_keylist_realloc(&op->insert_keys,
1675 ARRAY_SIZE(op->inline_keys),
1676 BKEY_EXTENT_U64s_MAX))
1680 * The copygc thread is now global, which means it's no longer
1681 * freeing up space on specific disks, which means that
1682 * allocations for specific disks may hang arbitrarily long:
1684 ret = bch2_trans_do(c, NULL, NULL, 0,
1685 bch2_alloc_sectors_start_trans(&trans,
1687 op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
1691 op->nr_replicas_required,
1694 (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
1695 BCH_WRITE_ONLY_SPECIFIED_DEVS))
1696 ? NULL : &op->cl, &wp));
1697 if (unlikely(ret)) {
1698 if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
1706 bch2_open_bucket_get(c, wp, &op->open_buckets);
1707 ret = bch2_write_extent(op, wp, &bio);
1709 bch2_alloc_sectors_done_inlined(c, wp);
1712 op->flags |= BCH_WRITE_DONE;
1720 bio->bi_end_io = bch2_write_endio;
1721 bio->bi_private = &op->cl;
1722 bio->bi_opf |= REQ_OP_WRITE;
1724 closure_get(bio->bi_private);
1726 key_to_write = (void *) (op->insert_keys.keys_p +
1727 key_to_write_offset);
1729 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
1730 key_to_write, false);
1736 * If we're running asynchronously, we may still want to block
1737 * synchronously here if we weren't able to submit all of the IO at
1738 * once, as that signals backpressure to the caller.
1740 if ((op->flags & BCH_WRITE_SYNC) ||
1741 (!(op->flags & BCH_WRITE_DONE) &&
1742 !(op->flags & BCH_WRITE_IN_WORKER))) {
1743 closure_sync(&op->cl);
1744 __bch2_write_index(op);
1746 if (!(op->flags & BCH_WRITE_DONE))
1748 bch2_write_done(&op->cl);
1750 bch2_write_queue(op, wp);
1751 continue_at(&op->cl, bch2_write_index, NULL);
1754 memalloc_nofs_restore(nofs_flags);
1757 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
1759 struct bio *bio = &op->wbio.bio;
1760 struct bvec_iter iter;
1761 struct bkey_i_inline_data *id;
1765 op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
1766 op->flags |= BCH_WRITE_DONE;
1768 bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
1770 ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
1771 ARRAY_SIZE(op->inline_keys),
1772 BKEY_U64s + DIV_ROUND_UP(data_len, 8));
1778 sectors = bio_sectors(bio);
1779 op->pos.offset += sectors;
1781 id = bkey_inline_data_init(op->insert_keys.top);
1783 id->k.version = op->version;
1784 id->k.size = sectors;
1786 iter = bio->bi_iter;
1787 iter.bi_size = data_len;
1788 memcpy_from_bio(id->v.data, bio, iter);
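/*
 * bkey values are stored in u64 units, so pad data_len up to the next
 * multiple of 8 with zeroes below - e.g. a 13 byte inline write is stored
 * as 16 bytes of value.
 */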
1790 while (data_len & 7)
1791 id->v.data[data_len++] = '\0';
1792 set_bkey_val_bytes(&id->k, data_len);
1793 bch2_keylist_push(&op->insert_keys);
1795 __bch2_write_index(op);
1797 bch2_write_done(&op->cl);
1801 * bch_write - handle a write to a cache device or flash only volume
1803 * This is the starting point for any data to end up in a cache device; it could
1804 * be from a normal write, or a writeback write, or a write to a flash only
1805 * volume - it's also used by the moving garbage collector to compact data in
1806 * mostly empty buckets.
1808 * It first writes the data to the cache, creating a list of keys to be inserted
1809 * (if the data won't fit in a single open bucket, there will be multiple keys);
1810 * after the data is written it calls bch_journal, and after the keys have been
1811 * added to the next journal write they're inserted into the btree.
1813 * If op->discard is true, instead of inserting the data it invalidates the
1814 * region of the cache represented by op->bio and op->inode.
1816 void bch2_write(struct closure *cl)
1818 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1819 struct bio *bio = &op->wbio.bio;
1820 struct bch_fs *c = op->c;
1823 EBUG_ON(op->cl.parent);
1824 BUG_ON(!op->nr_replicas);
1825 BUG_ON(!op->write_point.v);
1826 BUG_ON(bkey_eq(op->pos, POS_MAX));
1828 op->start_time = local_clock();
1829 bch2_keylist_init(&op->insert_keys, op->inline_keys);
1830 wbio_init(bio)->put_bio = false;
1832 if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
1833 bch_err_inum_offset_ratelimited(c,
1835 op->pos.offset << 9,
1836 "misaligned write");
1841 if (c->opts.nochanges) {
1842 op->error = -BCH_ERR_erofs_no_writes;
1846 if (!(op->flags & BCH_WRITE_MOVE) &&
1847 !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
1848 op->error = -BCH_ERR_erofs_no_writes;
1852 this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
1853 bch2_increment_clock(c, bio_sectors(bio), WRITE);
1855 data_len = min_t(u64, bio->bi_iter.bi_size,
1856 op->new_i_size - (op->pos.offset << 9));
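/*
 * Small writes can live inline in the extent key instead of allocating
 * space; e.g. with 4KiB blocks the threshold below works out to 1KiB.
 */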
1858 if (c->opts.inline_data &&
1859 data_len <= min(block_bytes(c) / 2, 1024U)) {
1860 bch2_write_data_inline(op, data_len);
1867 bch2_disk_reservation_put(c, &op->res);
1869 closure_debug_destroy(&op->cl);
1874 static const char * const bch2_write_flags[] = {
1881 void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
1883 prt_str(out, "pos: ");
1884 bch2_bpos_to_text(out, op->pos);
1886 printbuf_indent_add(out, 2);
1888 prt_str(out, "started: ");
1889 bch2_pr_time_units(out, local_clock() - op->start_time);
1892 prt_str(out, "flags: ");
1893 prt_bitflags(out, bch2_write_flags, op->flags);
1896 prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
1899 printbuf_indent_sub(out, 2);
1902 /* Cache promotion on read */
1905 struct rcu_head rcu;
1908 struct rhash_head hash;
1911 struct data_update write;
1912 struct bio_vec bi_inline_vecs[0]; /* must be last */
1915 static const struct rhashtable_params bch_promote_params = {
1916 .head_offset = offsetof(struct promote_op, hash),
1917 .key_offset = offsetof(struct promote_op, pos),
1918 .key_len = sizeof(struct bpos),
1921 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
1923 struct bch_io_opts opts,
1926 if (!(flags & BCH_READ_MAY_PROMOTE))
1929 if (!opts.promote_target)
1932 if (bch2_bkey_has_target(c, k, opts.promote_target))
1935 if (bkey_extent_is_unwritten(k))
1938 if (bch2_target_congested(c, opts.promote_target)) {
1939 /* XXX trace this */
1943 if (rhashtable_lookup_fast(&c->promote_table, &pos,
1944 bch_promote_params))
1950 static void promote_free(struct bch_fs *c, struct promote_op *op)
1954 bch2_data_update_exit(&op->write);
1956 ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1957 bch_promote_params);
1959 bch2_write_ref_put(c, BCH_WRITE_REF_promote);
1963 static void promote_done(struct bch_write_op *wop)
1965 struct promote_op *op =
1966 container_of(wop, struct promote_op, write.op);
1967 struct bch_fs *c = op->write.op.c;
1969 bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1971 promote_free(c, op);
1974 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1976 struct bio *bio = &op->write.op.wbio.bio;
1978 trace_and_count(op->write.op.c, read_promote, &rbio->bio);
1980 /* we now own pages: */
1981 BUG_ON(!rbio->bounce);
1982 BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1984 memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1985 sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1986 swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1988 bch2_data_update_read_done(&op->write, rbio->pick.crc);
1991 static struct promote_op *__promote_alloc(struct btree_trans *trans,
1992 enum btree_id btree_id,
1995 struct extent_ptr_decoded *pick,
1996 struct bch_io_opts opts,
1998 struct bch_read_bio **rbio)
2000 struct bch_fs *c = trans->c;
2001 struct promote_op *op = NULL;
2003 unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
2006 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
2009 op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
2013 op->start_time = local_clock();
2017 * We don't use the mempool here because extents that aren't
2018 * checksummed or compressed can be too big for the mempool:
2020 *rbio = kzalloc(sizeof(struct bch_read_bio) +
2021 sizeof(struct bio_vec) * pages,
2026 rbio_init(&(*rbio)->bio, opts);
2027 bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
2029 if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
2033 (*rbio)->bounce = true;
2034 (*rbio)->split = true;
2035 (*rbio)->kmalloc = true;
2037 if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
2038 bch_promote_params))
2041 bio = &op->write.op.wbio.bio;
2042 bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
2044 ret = bch2_data_update_init(trans, NULL, &op->write,
2045 writepoint_hashed((unsigned long) current),
2047 (struct data_update_opts) {
2048 .target = opts.promote_target,
2049 .extra_replicas = 1,
2050 .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
2054 * possible errors: -BCH_ERR_nocow_lock_blocked,
2055 * -BCH_ERR_ENOSPC_disk_reservation:
2058 ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
2059 bch_promote_params);
2064 op->write.op.end_io = promote_done;
2069 bio_free_pages(&(*rbio)->bio);
2073 bch2_write_ref_put(c, BCH_WRITE_REF_promote);
2078 static struct promote_op *promote_alloc(struct btree_trans *trans,
2079 struct bvec_iter iter,
2081 struct extent_ptr_decoded *pick,
2082 struct bch_io_opts opts,
2084 struct bch_read_bio **rbio,
2088 struct bch_fs *c = trans->c;
2089 bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
2090 /* data might have to be decompressed in the write path: */
2091 unsigned sectors = promote_full
2092 ? max(pick->crc.compressed_size, pick->crc.live_size)
2093 : bvec_iter_sectors(iter);
2094 struct bpos pos = promote_full
2095 ? bkey_start_pos(k.k)
2096 : POS(k.k->p.inode, iter.bi_sector);
2097 struct promote_op *promote;
2099 if (!should_promote(c, k, pos, opts, flags))
2102 promote = __promote_alloc(trans,
2103 k.k->type == KEY_TYPE_reflink_v
2106 k, pos, pick, opts, sectors, rbio);
2111 *read_full = promote_full;
2117 #define READ_RETRY_AVOID 1
2118 #define READ_RETRY 2
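/*
 * Read retry disposition (see bch2_rbio_retry() below): READ_RETRY_AVOID
 * marks the failing device so another replica is picked, READ_RETRY retries
 * without avoiding it, and READ_ERR gives up and completes the read with an
 * error.
 */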
2123 RBIO_CONTEXT_HIGHPRI,
2124 RBIO_CONTEXT_UNBOUND,
2127 static inline struct bch_read_bio *
2128 bch2_rbio_parent(struct bch_read_bio *rbio)
2130 return rbio->split ? rbio->parent : rbio;
2134 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
2135 enum rbio_context context,
2136 struct workqueue_struct *wq)
2138 if (context <= rbio->context) {
2141 rbio->work.func = fn;
2142 rbio->context = context;
2143 queue_work(wq, &rbio->work);
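/*
 * Illustrative: work is only punted to a workqueue when it needs a "bigger"
 * context than the one we're already in (NULL < HIGHPRI < UNBOUND);
 * otherwise the (elided) branch above just calls fn() directly in the
 * current context.
 */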
2147 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
2149 BUG_ON(rbio->bounce && !rbio->split);
2152 promote_free(rbio->c, rbio->promote);
2153 rbio->promote = NULL;
2156 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
2159 struct bch_read_bio *parent = rbio->parent;
2164 bio_put(&rbio->bio);
2173 * Only called on a top level bch_read_bio to complete an entire read request, not a split:
2176 static void bch2_rbio_done(struct bch_read_bio *rbio)
2178 if (rbio->start_time)
2179 bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
2181 bio_endio(&rbio->bio);
2184 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
2185 struct bvec_iter bvec_iter,
2186 struct bch_io_failures *failed,
2189 struct btree_trans trans;
2190 struct btree_iter iter;
2195 flags &= ~BCH_READ_LAST_FRAGMENT;
2196 flags |= BCH_READ_MUST_CLONE;
2198 bch2_bkey_buf_init(&sk);
2199 bch2_trans_init(&trans, c, 0, 0);
2201 bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
2202 rbio->read_pos, BTREE_ITER_SLOTS);
2204 rbio->bio.bi_status = 0;
2206 k = bch2_btree_iter_peek_slot(&iter);
2210 bch2_bkey_buf_reassemble(&sk, c, k);
2211 k = bkey_i_to_s_c(sk.k);
2212 bch2_trans_unlock(&trans);
2214 if (!bch2_bkey_matches_ptr(c, k,
2216 rbio->data_pos.offset -
2217 rbio->pick.crc.offset)) {
2218 /* extent we wanted to read no longer exists: */
2223 ret = __bch2_read_extent(&trans, rbio, bvec_iter,
2226 k, 0, failed, flags);
2227 if (ret == READ_RETRY)
2232 bch2_rbio_done(rbio);
2233 bch2_trans_iter_exit(&trans, &iter);
2234 bch2_trans_exit(&trans);
2235 bch2_bkey_buf_exit(&sk, c);
2238 rbio->bio.bi_status = BLK_STS_IOERR;
2242 static void bch2_rbio_retry(struct work_struct *work)
2244 struct bch_read_bio *rbio =
2245 container_of(work, struct bch_read_bio, work);
2246 struct bch_fs *c = rbio->c;
2247 struct bvec_iter iter = rbio->bvec_iter;
2248 unsigned flags = rbio->flags;
2249 subvol_inum inum = {
2250 .subvol = rbio->subvol,
2251 .inum = rbio->read_pos.inode,
2253 struct bch_io_failures failed = { .nr = 0 };
2255 trace_and_count(c, read_retry, &rbio->bio);
2257 if (rbio->retry == READ_RETRY_AVOID)
2258 bch2_mark_io_failure(&failed, &rbio->pick);
2260 rbio->bio.bi_status = 0;
2262 rbio = bch2_rbio_free(rbio);
2264 flags |= BCH_READ_IN_RETRY;
2265 flags &= ~BCH_READ_MAY_PROMOTE;
2267 if (flags & BCH_READ_NODECODE) {
2268 bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
2270 flags &= ~BCH_READ_LAST_FRAGMENT;
2271 flags |= BCH_READ_MUST_CLONE;
2273 __bch2_read(c, rbio, iter, inum, &failed, flags);
2277 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
2280 rbio->retry = retry;
2282 if (rbio->flags & BCH_READ_IN_RETRY)
2285 if (retry == READ_ERR) {
2286 rbio = bch2_rbio_free(rbio);
2288 rbio->bio.bi_status = error;
2289 bch2_rbio_done(rbio);
2291 bch2_rbio_punt(rbio, bch2_rbio_retry,
2292 RBIO_CONTEXT_UNBOUND, system_unbound_wq);
2296 static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
2297 struct bch_read_bio *rbio)
2299 struct bch_fs *c = rbio->c;
2300 u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
2301 struct bch_extent_crc_unpacked new_crc;
2302 struct btree_iter iter;
2307 if (crc_is_compressed(rbio->pick.crc))
2310 k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
2311 BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
2312 if ((ret = bkey_err(k)))
2315 if (bversion_cmp(k.k->version, rbio->version) ||
2316 !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
2319 /* Extent was merged? */
2320 if (bkey_start_offset(k.k) < data_offset ||
2321 k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
2324 if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
2325 rbio->pick.crc, NULL, &new_crc,
2326 bkey_start_offset(k.k) - data_offset, k.k->size,
2327 rbio->pick.crc.csum_type)) {
2328 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
2334 * going to be temporarily appending another checksum entry:
2336 new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
2337 sizeof(struct bch_extent_crc128));
2338 if ((ret = PTR_ERR_OR_ZERO(new)))
2341 bkey_reassemble(new, k);
2343 if (!bch2_bkey_narrow_crcs(new, new_crc))
2346 ret = bch2_trans_update(trans, &iter, new,
2347 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
2349 bch2_trans_iter_exit(trans, &iter);
2353 static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
2355 bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
2356 __bch2_rbio_narrow_crcs(&trans, rbio));
2359 /* Inner part that may run in process context */
2360 static void __bch2_read_endio(struct work_struct *work)
2362 struct bch_read_bio *rbio =
2363 container_of(work, struct bch_read_bio, work);
2364 struct bch_fs *c = rbio->c;
2365 struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
2366 struct bio *src = &rbio->bio;
2367 struct bio *dst = &bch2_rbio_parent(rbio)->bio;
2368 struct bvec_iter dst_iter = rbio->bvec_iter;
2369 struct bch_extent_crc_unpacked crc = rbio->pick.crc;
2370 struct nonce nonce = extent_nonce(rbio->version, crc);
2371 unsigned nofs_flags;
2372 struct bch_csum csum;
2375 nofs_flags = memalloc_nofs_save();
2377 /* Reset iterator for checksumming and copying bounced data: */
2379 src->bi_iter.bi_size = crc.compressed_size << 9;
2380 src->bi_iter.bi_idx = 0;
2381 src->bi_iter.bi_bvec_done = 0;
2383 src->bi_iter = rbio->bvec_iter;
2386 csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
2387 if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
2392 * We need to rework the narrow_crcs path to deliver the read completion
2393 * first, and then punt to a different workqueue, otherwise we're
2394 * holding up reads while doing btree updates which is bad for memory reclaim.
2397 if (unlikely(rbio->narrow_crcs))
2398 bch2_rbio_narrow_crcs(rbio);
2400 if (rbio->flags & BCH_READ_NODECODE)
2403 /* Adjust crc to point to subset of data we want: */
2404 crc.offset += rbio->offset_into_extent;
2405 crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
2407 if (crc_is_compressed(crc)) {
2408 ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
2412 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
2413 !c->opts.no_data_io)
2414 goto decompression_err;
2416 /* don't need to decrypt the entire bio: */
2417 nonce = nonce_add(nonce, crc.offset << 9);
2418 bio_advance(src, crc.offset << 9);
2420 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
2421 src->bi_iter.bi_size = dst_iter.bi_size;
2423 ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
2428 struct bvec_iter src_iter = src->bi_iter;
2430 bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
2434 if (rbio->promote) {
2436 * Re-encrypt data we decrypted, so it's consistent with rbio->crc:
2439 ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
2443 promote_start(rbio->promote, rbio);
2444 rbio->promote = NULL;
2447 if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
2448 rbio = bch2_rbio_free(rbio);
2449 bch2_rbio_done(rbio);
2452 memalloc_nofs_restore(nofs_flags);
2456 * Checksum error: if the bio wasn't bounced, we may have been
2457 * reading into buffers owned by userspace (that userspace can
2458 * scribble over) - retry the read, bouncing it this time:
2460 if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
2461 rbio->flags |= BCH_READ_MUST_BOUNCE;
2462 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
2466 bch_err_inum_offset_ratelimited(ca,
2467 rbio->read_pos.inode,
2468 rbio->read_pos.offset << 9,
2469 "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
2470 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
2471 csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
2473 bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2476 bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
2477 rbio->read_pos.offset << 9,
2478 "decompression error");
2479 bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
2482 bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
2483 rbio->read_pos.offset << 9,
2485 bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
2489 static void bch2_read_endio(struct bio *bio)
2491 struct bch_read_bio *rbio =
2492 container_of(bio, struct bch_read_bio, bio);
2493 struct bch_fs *c = rbio->c;
2494 struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
2495 struct workqueue_struct *wq = NULL;
2496 enum rbio_context context = RBIO_CONTEXT_NULL;
2498 if (rbio->have_ioref) {
2499 bch2_latency_acct(ca, rbio->submit_time, READ);
2500 percpu_ref_put(&ca->io_ref);
2504 rbio->bio.bi_end_io = rbio->end_io;
2506 if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
2507 rbio->read_pos.inode,
2508 rbio->read_pos.offset,
2509 "data read error: %s",
2510 bch2_blk_status_to_str(bio->bi_status))) {
2511 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
2515 if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
2516 ptr_stale(ca, &rbio->pick.ptr)) {
2517 trace_and_count(c, read_reuse_race, &rbio->bio);
2519 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
2520 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
2522 bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
2526 if (rbio->narrow_crcs ||
2528 crc_is_compressed(rbio->pick.crc) ||
2529 bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
2530 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
2531 else if (rbio->pick.crc.csum_type)
2532 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
2534 bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
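/*
 * Completion work that may block or take btree locks (CRC narrowing,
 * decompression, decryption) is punted to the unbound workqueue; plain
 * checksum verification is cheap enough for the highpri workqueue; with no
 * checksum at all the completion doesn't need to be punted to a workqueue.
 */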
2537 int __bch2_read_indirect_extent(struct btree_trans *trans,
2538 unsigned *offset_into_extent,
2539 struct bkey_buf *orig_k)
2541 struct btree_iter iter;
2546 reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
2547 *offset_into_extent;
2549 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
2550 POS(0, reflink_offset), 0);
2555 if (k.k->type != KEY_TYPE_reflink_v &&
2556 k.k->type != KEY_TYPE_indirect_inline_data) {
2557 bch_err_inum_offset_ratelimited(trans->c,
2558 orig_k->k->k.p.inode,
2559 orig_k->k->k.p.offset << 9,
2560 "%llu len %u points to nonexistent indirect extent %llu",
2561 orig_k->k->k.p.offset,
2562 orig_k->k->k.size,
2563 reflink_offset);
2564 bch2_inconsistent_error(trans->c);
2569 *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
2570 bch2_bkey_buf_reassemble(orig_k, trans->c, k);
2572 bch2_trans_iter_exit(trans, &iter);
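/*
 * Summary: a reflink pointer's idx plus the offset into the logical extent
 * gives a position in the reflink btree; the indirect extent found there
 * replaces *orig_k, and *offset_into_extent is adjusted to be relative to
 * it. A missing indirect extent means the filesystem is inconsistent.
 */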
2576 static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
2578 struct bch_extent_ptr ptr)
2580 struct bch_fs *c = trans->c;
2581 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
2582 struct btree_iter iter;
2583 struct printbuf buf = PRINTBUF;
2586 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
2587 PTR_BUCKET_POS(c, &ptr),
2590 prt_printf(&buf, "Attempting to read from stale dirty pointer:");
2591 printbuf_indent_add(&buf, 2);
2594 bch2_bkey_val_to_text(&buf, c, k);
2597 prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
2599 ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
2602 bch2_bkey_val_to_text(&buf, c, k);
2605 bch2_fs_inconsistent(c, "%s", buf.buf);
2607 bch2_trans_iter_exit(trans, &iter);
2608 printbuf_exit(&buf);
2611 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
2612 struct bvec_iter iter, struct bpos read_pos,
2613 enum btree_id data_btree, struct bkey_s_c k,
2614 unsigned offset_into_extent,
2615 struct bch_io_failures *failed, unsigned flags)
2617 struct bch_fs *c = trans->c;
2618 struct extent_ptr_decoded pick;
2619 struct bch_read_bio *rbio = NULL;
2620 struct bch_dev *ca = NULL;
2621 struct promote_op *promote = NULL;
2622 bool bounce = false, read_full = false, narrow_crcs = false;
2623 struct bpos data_pos = bkey_start_pos(k.k);
2626 if (bkey_extent_is_inline_data(k.k)) {
2627 unsigned bytes = min_t(unsigned, iter.bi_size,
2628 bkey_inline_data_bytes(k.k));
2630 swap(iter.bi_size, bytes);
2631 memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
2632 swap(iter.bi_size, bytes);
2633 bio_advance_iter(&orig->bio, &iter, bytes);
2634 zero_fill_bio_iter(&orig->bio, iter);
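/*
 * Inline extents carry their data in the key itself: copy what fits into the
 * destination bio and zero-fill the rest - no device IO is needed for this
 * read.
 */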
2638 pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
2640 /* hole or reservation - just zero fill: */
2645 bch_err_inum_offset_ratelimited(c,
2646 read_pos.inode, read_pos.offset << 9,
2647 "no device to read from");
2651 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
2654 * Stale dirty pointers are treated as IO errors, but @failed isn't
2655 * allocated unless we're in the retry path - so if we're not in the
2656 * retry path, don't check here, it'll be caught in bch2_read_endio()
2657 * and we'll end up in the retry path:
2659 if ((flags & BCH_READ_IN_RETRY) &&
2661 unlikely(ptr_stale(ca, &pick.ptr))) {
2662 read_from_stale_dirty_pointer(trans, k, pick.ptr);
2663 bch2_mark_io_failure(failed, &pick);
2668 * Unlock the iterator while the btree node's lock is still in
2669 * cache, before doing the IO:
2671 bch2_trans_unlock(trans);
2673 if (flags & BCH_READ_NODECODE) {
2675 * can happen if we retry, and the extent we were going to read
2676 * has been merged in the meantime:
2678 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
2681 iter.bi_size = pick.crc.compressed_size << 9;
2685 if (!(flags & BCH_READ_LAST_FRAGMENT) ||
2686 bio_flagged(&orig->bio, BIO_CHAIN))
2687 flags |= BCH_READ_MUST_CLONE;
2689 narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
2690 bch2_can_narrow_extent_crcs(k, pick.crc);
2692 if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
2693 flags |= BCH_READ_MUST_BOUNCE;
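/*
 * Narrowing CRCs means computing a new checksum over the data we just read;
 * that has to be done from a stable bounce buffer, since user-mapped pages
 * can be scribbled over while we're checksumming them.
 */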
2695 EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
2697 if (crc_is_compressed(pick.crc) ||
2698 (pick.crc.csum_type != BCH_CSUM_none &&
2699 (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2700 (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
2701 (flags & BCH_READ_USER_MAPPED)) ||
2702 (flags & BCH_READ_MUST_BOUNCE)))) {
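/*
 * In all of these cases the device can't be handed exactly the range the
 * caller wants: compressed extents, and checksummed or encrypted extents
 * where only a subset was requested (or where we must bounce anyway), are
 * read in full and the wanted range is copied out afterwards.
 */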
2707 if (orig->opts.promote_target)
2708 promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
2709 &rbio, &bounce, &read_full);
2712 EBUG_ON(crc_is_compressed(pick.crc));
2713 EBUG_ON(pick.crc.csum_type &&
2714 (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2715 bvec_iter_sectors(iter) != pick.crc.live_size ||
2717 offset_into_extent));
2719 data_pos.offset += offset_into_extent;
2720 pick.ptr.offset += pick.crc.offset +
2721 offset_into_extent;
2722 offset_into_extent = 0;
2723 pick.crc.compressed_size = bvec_iter_sectors(iter);
2724 pick.crc.uncompressed_size = bvec_iter_sectors(iter);
2725 pick.crc.offset = 0;
2726 pick.crc.live_size = bvec_iter_sectors(iter);
2727 offset_into_extent = 0;
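/*
 * e.g. a read of 8 sectors at offset 32 into a plain (unchecksummed,
 * uncompressed) extent: pick.ptr.offset advances by 32 sectors and the crc
 * fields are rewritten so the rest of the read path sees a self-contained
 * 8-sector read starting at offset 0.
 */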
2732 * promote already allocated bounce rbio:
2733 * promote needs to allocate a bio big enough for uncompressing
2734 * data in the write path, but we're not going to use it all
2735 * here:
2736 */
2737 EBUG_ON(rbio->bio.bi_iter.bi_size <
2738 pick.crc.compressed_size << 9);
2739 rbio->bio.bi_iter.bi_size =
2740 pick.crc.compressed_size << 9;
2741 } else if (bounce) {
2742 unsigned sectors = pick.crc.compressed_size;
2744 rbio = rbio_init(bio_alloc_bioset(NULL,
2745 DIV_ROUND_UP(sectors, PAGE_SECTORS),
2748 &c->bio_read_split),
2751 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
2752 rbio->bounce = true;
2754 } else if (flags & BCH_READ_MUST_CLONE) {
2756 * Have to clone if there were any splits, due to error
2757 * reporting issues (if a split errored, and retrying didn't
2758 * work, when it reports the error to its parent (us) we don't
2759 * know if the error was from our bio, and we should retry, or
2760 * from the whole bio, in which case we don't want to retry and
2761 * lose the error)
2762 */
2763 rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
2764 &c->bio_read_split),
2766 rbio->bio.bi_iter = iter;
2770 rbio->bio.bi_iter = iter;
2771 EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
2774 EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
2777 rbio->submit_time = local_clock();
2779 rbio->parent = orig;
2781 rbio->end_io = orig->bio.bi_end_io;
2782 rbio->bvec_iter = iter;
2783 rbio->offset_into_extent = offset_into_extent;
2784 rbio->flags = flags;
2785 rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
2786 rbio->narrow_crcs = narrow_crcs;
2790 /* XXX: only initialize this if needed */
2791 rbio->devs_have = bch2_bkey_devs(k);
2793 rbio->subvol = orig->subvol;
2794 rbio->read_pos = read_pos;
2795 rbio->data_btree = data_btree;
2796 rbio->data_pos = data_pos;
2797 rbio->version = k.k->version;
2798 rbio->promote = promote;
2799 INIT_WORK(&rbio->work, NULL);
2801 rbio->bio.bi_opf = orig->bio.bi_opf;
2802 rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
2803 rbio->bio.bi_end_io = bch2_read_endio;
2806 trace_and_count(c, read_bounce, &rbio->bio);
2808 this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
2809 bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
2812 * If it's being moved internally, we don't want to flag it as a cache
2813 * hit:
2814 */
2815 if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
2816 bch2_bucket_io_time_reset(trans, pick.ptr.dev,
2817 PTR_BUCKET_NR(ca, &pick.ptr), READ);
2819 if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
2820 bio_inc_remaining(&orig->bio);
2821 trace_and_count(c, read_split, &orig->bio);
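/*
 * Splitting: each fragment gets its own rbio, and bio_inc_remaining() bumps
 * the parent's completion count so the original bio only completes once
 * every fragment has.
 */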
2824 if (!rbio->pick.idx) {
2825 if (!rbio->have_ioref) {
2826 bch_err_inum_offset_ratelimited(c,
2827 read_pos.inode,
2828 read_pos.offset << 9,
2829 "no device to read from");
2830 bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2834 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
2835 bio_sectors(&rbio->bio));
2836 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
2838 if (unlikely(c->opts.no_data_io)) {
2839 if (likely(!(flags & BCH_READ_IN_RETRY)))
2840 bio_endio(&rbio->bio);
2842 if (likely(!(flags & BCH_READ_IN_RETRY)))
2843 submit_bio(&rbio->bio);
2845 submit_bio_wait(&rbio->bio);
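/*
 * In the retry path the read is synchronous: we wait for the bio here, then
 * run the completion inline below so the retry status can be handed back to
 * the caller.
 */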
2849 * We just submitted IO which may block, we expect relock fail
2850 * events and shouldn't count them:
2852 trans->notrace_relock_fail = true;
2854 /* Attempting reconstruct read: */
2855 if (bch2_ec_read_extent(c, rbio)) {
2856 bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2860 if (likely(!(flags & BCH_READ_IN_RETRY)))
2861 bio_endio(&rbio->bio);
2864 if (likely(!(flags & BCH_READ_IN_RETRY))) {
2869 rbio->context = RBIO_CONTEXT_UNBOUND;
2870 bch2_read_endio(&rbio->bio);
2873 rbio = bch2_rbio_free(rbio);
2875 if (ret == READ_RETRY_AVOID) {
2876 bch2_mark_io_failure(failed, &pick);
2887 if (flags & BCH_READ_IN_RETRY)
2890 orig->bio.bi_status = BLK_STS_IOERR;
2895 * won't normally happen in the BCH_READ_NODECODE
2896 * (bch2_move_extent()) path, but if we retry and the extent we wanted
2897 * to read no longer exists we have to signal that:
2899 if (flags & BCH_READ_NODECODE)
2902 zero_fill_bio_iter(&orig->bio, iter);
2904 if (flags & BCH_READ_LAST_FRAGMENT)
2905 bch2_rbio_done(orig);
2909 void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
2910 struct bvec_iter bvec_iter, subvol_inum inum,
2911 struct bch_io_failures *failed, unsigned flags)
2913 struct btree_trans trans;
2914 struct btree_iter iter;
2920 BUG_ON(flags & BCH_READ_NODECODE);
2922 bch2_bkey_buf_init(&sk);
2923 bch2_trans_init(&trans, c, 0, 0);
2925 bch2_trans_begin(&trans);
2926 iter = (struct btree_iter) { NULL };
2928 ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
2932 bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
2933 SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
2936 unsigned bytes, sectors, offset_into_extent;
2937 enum btree_id data_btree = BTREE_ID_extents;
2940 * read_extent -> io_time_reset may cause a transaction restart
2941 * without returning an error, we need to check for that here:
2943 ret = bch2_trans_relock(&trans);
2947 bch2_btree_iter_set_pos(&iter,
2948 POS(inum.inum, bvec_iter.bi_sector));
2950 k = bch2_btree_iter_peek_slot(&iter);
2955 offset_into_extent = iter.pos.offset -
2956 bkey_start_offset(k.k);
2957 sectors = k.k->size - offset_into_extent;
2959 bch2_bkey_buf_reassemble(&sk, c, k);
2961 ret = bch2_read_indirect_extent(&trans, &data_btree,
2962 &offset_into_extent, &sk);
2966 k = bkey_i_to_s_c(sk.k);
2969 * With indirect extents, the amount of data to read is the min
2970 * of the original extent and the indirect extent:
2972 sectors = min(sectors, k.k->size - offset_into_extent);
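/*
 * e.g. if the reflink pointer covers 64 more sectors but the indirect extent
 * it maps to only has 16 sectors left past offset_into_extent, this fragment
 * reads 16 sectors and the loop continues with the remainder.
 */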
2974 bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
2975 swap(bvec_iter.bi_size, bytes);
2977 if (bvec_iter.bi_size == bytes)
2978 flags |= BCH_READ_LAST_FRAGMENT;
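/*
 * A read may span several extents: each pass through the loop issues at most
 * one extent's worth of IO, and BCH_READ_LAST_FRAGMENT marks the fragment
 * that consumes whatever is left of the request.
 */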
2980 ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
2981 data_btree, k,
2982 offset_into_extent, failed, flags);
2986 if (flags & BCH_READ_LAST_FRAGMENT)
2989 swap(bvec_iter.bi_size, bytes);
2990 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
2992 ret = btree_trans_too_many_iters(&trans);
2997 bch2_trans_iter_exit(&trans, &iter);
2999 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
3000 ret == READ_RETRY ||
3001 ret == READ_RETRY_AVOID)
3004 bch2_trans_exit(&trans);
3005 bch2_bkey_buf_exit(&sk, c);
3008 bch_err_inum_offset_ratelimited(c, inum.inum,
3009 bvec_iter.bi_sector << 9,
3010 "read error %i from btree lookup", ret);
3011 rbio->bio.bi_status = BLK_STS_IOERR;
3012 bch2_rbio_done(rbio);
3016 void bch2_fs_io_exit(struct bch_fs *c)
3018 if (c->promote_table.tbl)
3019 rhashtable_destroy(&c->promote_table);
3020 mempool_exit(&c->bio_bounce_pages);
3021 bioset_exit(&c->bio_write);
3022 bioset_exit(&c->bio_read_split);
3023 bioset_exit(&c->bio_read);
3026 int bch2_fs_io_init(struct bch_fs *c)
3028 if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
3030 return -BCH_ERR_ENOMEM_bio_read_init;
3032 if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
3034 return -BCH_ERR_ENOMEM_bio_read_split_init;
3036 if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
3038 return -BCH_ERR_ENOMEM_bio_write_init;
3040 if (mempool_init_page_pool(&c->bio_bounce_pages,
3042 c->opts.btree_node_size,
3043 c->opts.encoded_extent_max) /
3045 return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
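/*
 * The bounce page pool keeps enough pages in reserve for one maximal bounce
 * buffer - the larger of a btree node and the maximum encoded extent,
 * expressed in pages.
 */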
3047 if (rhashtable_init(&c->promote_table, &bch_promote_params))
3048 return -BCH_ERR_ENOMEM_promote_table_init;