Update bcachefs sources to da7d42a9a2 bcachefs: Add new assertions for shutdown path

diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index ea0fd6310b6e09353fd0eef999abffaa90889cf5..33762e4a0f05b262588f43ab66bb739d37e620fb 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
 #include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
+#include "trace.h"
 
 #include <linux/blkdev.h>
 #include <linux/prefetch.h>
 #include <linux/random.h>
 #include <linux/sched/mm.h>
 
-#include <trace/events/bcachefs.h>
-
 const char *bch2_blk_status_to_str(blk_status_t status)
 {
        if (status == BLK_STS_REMOVED)
@@ -164,7 +163,7 @@ static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
        struct page *page;
 
        if (likely(!*using_mempool)) {
-               page = alloc_page(GFP_NOIO);
+               page = alloc_page(GFP_NOFS);
                if (unlikely(!page)) {
                        mutex_lock(&c->bio_bounce_pages_lock);
                        *using_mempool = true;
@@ -173,7 +172,7 @@ static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
                }
        } else {
 pool_alloc:
-               page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+               page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
        }
 
        return page;
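
Note: the GFP_NOIO → GFP_NOFS conversions throughout this patch drop these allocations to the weakest reclaim restriction that still cannot recurse into the filesystem. Callers that genuinely need NOIO semantics are expected to establish them with the scoped API instead; a minimal sketch of that form (hypothetical caller, not part of this patch):

    #include <linux/gfp.h>
    #include <linux/sched/mm.h>

    static struct page *alloc_bounce_page_in_noio_scope(void)
    {
            /* Inside the save/restore pair every allocation behaves as
             * GFP_NOIO, so the callee can stay with GFP_NOFS: */
            unsigned int noio_flags = memalloc_noio_save();
            struct page *page = alloc_page(GFP_NOFS);

            memalloc_noio_restore(noio_flags);
            return page;
    }
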
@@ -218,7 +217,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
 
        bch2_trans_copy_iter(&iter, extent_iter);
 
-       for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
+       for_each_btree_key_upto_continue_norestart(iter,
+                               new->k.p, BTREE_ITER_SLOTS, old, ret) {
                s64 sectors = min(new->k.p.offset, old.k->p.offset) -
                        max(bkey_start_offset(&new->k),
                            bkey_start_offset(old.k));
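
Note: bounding the walk at new->k.p means the loop only ever visits slots that can overlap the overwrite; past that point the expression above would go nonpositive. A worked example of the overlap arithmetic:

    /* new covers sectors [10, 20), old covers [15, 30):
     * sectors = min(20, 30) - max(10, 15) = 20 - 15 = 5 */
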
@@ -257,15 +257,14 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
        unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
        int ret;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
-                            SPOS(0,
-                                 extent_iter->pos.inode,
-                                 extent_iter->snapshot),
-                            BTREE_ITER_INTENT|BTREE_ITER_CACHED);
-       k = bch2_bkey_get_mut(trans, &iter);
+       k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+                             SPOS(0,
+                                  extent_iter->pos.inode,
+                                  extent_iter->snapshot),
+                             BTREE_ITER_CACHED);
        ret = PTR_ERR_OR_ZERO(k);
        if (unlikely(ret))
-               goto err;
+               return ret;
 
        if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
                k = bch2_inode_to_v3(trans, k);
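
Note: bch2_bkey_get_mut_noupdate() folds the old iter_init + bch2_bkey_get_mut pair into one call, and on failure it appears to leave no iterator for the caller to exit, which is why the goto err above becomes a plain return. Sketch of the resulting calling convention (the _noupdate suffix is taken to mean the update is not queued automatically):

    k = bch2_bkey_get_mut_noupdate(trans, &iter, btree, pos, flags);
    ret = PTR_ERR_OR_ZERO(k);
    if (ret)
            return ret;             /* nothing to unwind yet */
    /* ... mutate k, then queue it explicitly with bch2_trans_update() ... */
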
@@ -384,6 +383,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
        struct open_buckets open_buckets;
        struct bkey_s_c k;
        struct bkey_buf old, new;
+       unsigned sectors_allocated;
        bool have_reservation = false;
        bool unwritten = opts.nocow &&
            c->sb.version >= bcachefs_metadata_version_unwritten_extents;
@@ -394,6 +394,8 @@ int bch2_extent_fallocate(struct btree_trans *trans,
        closure_init_stack(&cl);
        open_buckets.nr = 0;
 retry:
+       sectors_allocated = 0;
+
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
@@ -449,16 +451,17 @@ retry:
                                &devs_have,
                                opts.data_replicas,
                                opts.data_replicas,
-                               RESERVE_none, 0, &cl, &wp);
-               if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
+                               BCH_WATERMARK_normal, 0, &cl, &wp);
+               if (ret) {
                        bch2_trans_unlock(trans);
                        closure_sync(&cl);
-                       goto retry;
-               }
-               if (ret)
+                       if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+                               goto retry;
                        return ret;
+               }
 
                sectors = min(sectors, wp->sectors_free);
+               sectors_allocated = sectors;
 
                bch2_key_resize(&e->k, sectors);
 
@@ -485,6 +488,9 @@ out:
                goto retry;
        }
 
+       if (!ret && sectors_allocated)
+               bch2_increment_clock(c, sectors_allocated, WRITE);
+
        bch2_open_buckets_put(c, &open_buckets);
        bch2_disk_reservation_put(c, &disk_res);
        bch2_bkey_buf_exit(&new, c);
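
Note: bch2_increment_clock() advances the I/O clock that paces background work (copygc, rebalance) against foreground writes. Zeroing sectors_allocated at the top of retry: keeps a restarted attempt from being charged twice; condensed, the pattern is:

    retry:
            sectors_allocated = 0;
            /* ... allocate; a blocked allocator or lock restart goes back to retry ... */
            sectors_allocated = sectors;
    out:
            if (!ret && sectors_allocated)
                    bch2_increment_clock(c, sectors_allocated, WRITE);
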
@@ -654,7 +660,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
                if (to_entry(ptr + 1) < ptrs.end) {
                        n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
-                                               GFP_NOIO, &ca->replica_set));
+                                               GFP_NOFS, &ca->replica_set));
 
                        n->bio.bi_end_io        = wbio->bio.bi_end_io;
                        n->bio.bi_private       = wbio->bio.bi_private;
@@ -705,7 +711,8 @@ static void bch2_write_done(struct closure *cl)
        struct bch_fs *c = op->c;
 
        bch2_disk_reservation_put(c, &op->res);
-       bch2_write_ref_put(c, BCH_WRITE_REF_write);
+       if (!(op->flags & BCH_WRITE_MOVE))
+               bch2_write_ref_put(c, BCH_WRITE_REF_write);
        bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
        bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
@@ -834,36 +841,30 @@ static void bch2_write_index(struct closure *cl)
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct write_point *wp = op->wp;
        struct workqueue_struct *wq = index_update_wq(op);
+       unsigned long flags;
 
        if ((op->flags & BCH_WRITE_DONE) &&
            (op->flags & BCH_WRITE_MOVE))
                bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
 
-       barrier();
-
-       /*
-        * We're not using wp->writes_lock here, so this is racey: that's ok,
-        * because this is just for diagnostic purposes, and we're running out
-        * of interrupt context here so if we were to take the lock we'd have to
-        * switch to spin_lock_irq()/irqsave(), which is not free:
-        */
+       spin_lock_irqsave(&wp->writes_lock, flags);
        if (wp->state == WRITE_POINT_waiting_io)
                __wp_update_state(wp, WRITE_POINT_waiting_work);
+       list_add_tail(&op->wp_list, &wp->writes);
+       spin_unlock_irqrestore(&wp->writes_lock, flags);
 
-       op->btree_update_ready = true;
        queue_work(wq, &wp->index_update_work);
 }
 
 static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
 {
-       op->btree_update_ready = false;
        op->wp = wp;
 
-       spin_lock(&wp->writes_lock);
-       list_add_tail(&op->wp_list, &wp->writes);
-       if (wp->state == WRITE_POINT_stopped)
+       if (wp->state == WRITE_POINT_stopped) {
+               spin_lock_irq(&wp->writes_lock);
                __wp_update_state(wp, WRITE_POINT_waiting_io);
-       spin_unlock(&wp->writes_lock);
+               spin_unlock_irq(&wp->writes_lock);
+       }
 }
 
 void bch2_write_point_do_index_updates(struct work_struct *work)
@@ -873,16 +874,12 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
        struct bch_write_op *op;
 
        while (1) {
-               spin_lock(&wp->writes_lock);
-               list_for_each_entry(op, &wp->writes, wp_list)
-                       if (op->btree_update_ready) {
-                               list_del(&op->wp_list);
-                               goto unlock;
-                       }
-               op = NULL;
-unlock:
+               spin_lock_irq(&wp->writes_lock);
+               op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+               if (op)
+                       list_del(&op->wp_list);
                wp_update_state(wp, op != NULL);
-               spin_unlock(&wp->writes_lock);
+               spin_unlock_irq(&wp->writes_lock);
 
                if (!op)
                        break;
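
Note: the old scheme queued the op at submission time and had the worker scan for btree_update_ready under a plain spinlock, tolerating the race the deleted comment describes. Now the op is appended only once its data write completes, so the worker pops in strict completion order; and because bch2_write_index() can run from bio-completion (hard-irq) context, the list moves to the irq-safe lock variants. The general rule, sketched:

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_lock);

    /* Callable from any context, including hard irq: save/restore irq state. */
    static void touch_from_any_context(void)
    {
            unsigned long flags;

            spin_lock_irqsave(&example_lock, flags);
            /* ... manipulate the shared list ... */
            spin_unlock_irqrestore(&example_lock, flags);
    }

    /* Process context only, irqs known enabled: plain _irq is cheaper. */
    static void touch_from_process_context(void)
    {
            spin_lock_irq(&example_lock);
            /* ... manipulate the shared list ... */
            spin_unlock_irq(&example_lock);
    }
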
@@ -979,7 +976,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
        pages = min(pages, BIO_MAX_VECS);
 
        bio = bio_alloc_bioset(NULL, pages, 0,
-                              GFP_NOIO, &c->bio_write);
+                              GFP_NOFS, &c->bio_write);
        wbio                    = wbio_init(bio);
        wbio->put_bio           = true;
        /* copy WRITE_SYNC flag */
@@ -1317,7 +1314,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
                BUG_ON(total_output != total_input);
 
                dst = bio_split(src, total_input >> 9,
-                               GFP_NOIO, &c->bio_write);
+                               GFP_NOFS, &c->bio_write);
                wbio_init(dst)->put_bio = true;
                /* copy WRITE_SYNC flag */
                dst->bi_opf             = src->bi_opf;
@@ -1396,7 +1393,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
                return 0;
        }
 
-       new = bch2_bkey_make_mut(trans, k);
+       new = bch2_bkey_make_mut_noupdate(trans, k);
        ret = PTR_ERR_OR_ZERO(new);
        if (ret)
                return ret;
@@ -1483,7 +1480,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_ptrs_c ptrs;
-       const struct bch_extent_ptr *ptr, *ptr2;
+       const struct bch_extent_ptr *ptr;
        struct {
                struct bpos     b;
                unsigned        gen;
@@ -1538,11 +1535,12 @@ retry:
                                                  bucket_to_u64(buckets[nr_buckets].b));
 
                        prefetch(buckets[nr_buckets].l);
-                       nr_buckets++;
 
                        if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
                                goto err_get_ioref;
 
+                       nr_buckets++;
+
                        if (ptr->unwritten)
                                op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
                }
@@ -1633,12 +1631,8 @@ err:
        }
        return;
 err_get_ioref:
-       bkey_for_each_ptr(ptrs, ptr2) {
-               if (ptr2 == ptr)
-                       break;
-
-               percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
-       }
+       for (i = 0; i < nr_buckets; i++)
+               percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
 
        /* Fall back to COW path: */
        goto out;
@@ -1647,12 +1641,11 @@ err_bucket_stale:
                bch2_bucket_nocow_unlock(&c->nocow_locks,
                                         buckets[i].b,
                                         BUCKET_NOCOW_LOCK_UPDATE);
-
-       bkey_for_each_ptr(ptrs, ptr2)
-               percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
+       for (i = 0; i < nr_buckets; i++)
+               percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
 
        /* We can retry this: */
-       ret = BCH_ERR_transaction_restart;
+       ret = -BCH_ERR_transaction_restart;
        goto out;
 }
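
Note: together with the nr_buckets++ move above, these rewritten unwind paths keep buckets[0..nr_buckets) in one-to-one correspondence with held iorefs, instead of re-walking the bkey's pointers (which may no longer describe what was actually taken). The underlying acquire-many/unwind idiom, as a self-contained sketch with hypothetical res_get()/res_put():

    #include <stdbool.h>

    struct res;
    bool res_get(struct res *);
    void res_put(struct res *);

    static int acquire_all(struct res *rs, unsigned int n)
    {
            unsigned int nr = 0;    /* counts only successful gets */

            while (nr < n) {
                    if (!res_get(&rs[nr]))
                            goto err;
                    nr++;
            }
            return 0;
    err:
            while (nr--)            /* drop exactly what is held */
                    res_put(&rs[nr]);
            return -1;
    }
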
 
@@ -1673,7 +1666,6 @@ static void __bch2_write(struct bch_write_op *op)
        }
 again:
        memset(&op->failed, 0, sizeof(op->failed));
-       op->btree_update_ready = false;
 
        do {
                struct bkey_i *key_to_write;
@@ -1704,7 +1696,7 @@ again:
                                &op->devs_have,
                                op->nr_replicas,
                                op->nr_replicas_required,
-                               op->alloc_reserve,
+                               op->watermark,
                                op->flags,
                                (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
                                              BCH_WRITE_ONLY_SPECIFIED_DEVS))
@@ -1853,7 +1845,12 @@ void bch2_write(struct closure *cl)
                goto err;
        }
 
-       if (c->opts.nochanges ||
+       if (c->opts.nochanges) {
+               op->error = -BCH_ERR_erofs_no_writes;
+               goto err;
+       }
+
+       if (!(op->flags & BCH_WRITE_MOVE) &&
            !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
                op->error = -BCH_ERR_erofs_no_writes;
                goto err;
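
Note: nochanges is now an unconditional early-out instead of sharing a branch with the ref check, and BCH_WRITE_MOVE writes skip BCH_WRITE_REF_write entirely, presumably because the move path pins the filesystem through its own reference. The tryget here pairs with the put added to bch2_write_done() earlier in this patch, gated on the same flag:

    /* acquire, in bch2_write(): */
    if (!(op->flags & BCH_WRITE_MOVE) &&
        !bch2_write_ref_tryget(c, BCH_WRITE_REF_write))
            goto err;
    /* release, in bch2_write_done(): */
    if (!(op->flags & BCH_WRITE_MOVE))
            bch2_write_ref_put(c, BCH_WRITE_REF_write);
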
@@ -1881,6 +1878,34 @@ err:
                op->end_io(op);
 }
 
+static const char * const bch2_write_flags[] = {
+#define x(f)   #f,
+       BCH_WRITE_FLAGS()
+#undef x
+       NULL
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+       prt_str(out, "pos: ");
+       bch2_bpos_to_text(out, op->pos);
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       prt_str(out, "started: ");
+       bch2_pr_time_units(out, local_clock() - op->start_time);
+       prt_newline(out);
+
+       prt_str(out, "flags: ");
+       prt_bitflags(out, bch2_write_flags, op->flags);
+       prt_newline(out);
+
+       prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
+       prt_newline(out);
+
+       printbuf_indent_sub(out, 2);
+}
+
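
Note: bch2_write_flags is generated from the BCH_WRITE_FLAGS() x-macro, so the name table that prt_bitflags() indexes can never drift out of sync with the flag enum. The idiom, as a self-contained illustration with made-up flag names:

    #include <stdio.h>

    #define MY_FLAGS()      \
            x(ALLOC_NOWAIT) \
            x(CACHED)       \
            x(DONE)

    enum {                          /* bit positions */
    #define x(f)    FLAG_##f,
            MY_FLAGS()
    #undef x
    };

    static const char * const flag_names[] = {
    #define x(f)    #f,
            MY_FLAGS()
    #undef x
            NULL
    };

    int main(void)
    {
            unsigned int flags = (1U << FLAG_CACHED) | (1U << FLAG_DONE);

            for (unsigned int i = 0; flag_names[i]; i++)
                    if (flags & (1U << i))
                            printf("%s ", flag_names[i]);
            putchar('\n');          /* prints: CACHED DONE */
            return 0;
    }
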
 /* Cache promotion on read */
 
 struct promote_op {
@@ -1988,7 +2013,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
        if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
                return NULL;
 
-       op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+       op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
        if (!op)
                goto err;
 
@@ -2001,7 +2026,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
         */
        *rbio = kzalloc(sizeof(struct bch_read_bio) +
                        sizeof(struct bio_vec) * pages,
-                       GFP_NOIO);
+                       GFP_NOFS);
        if (!*rbio)
                goto err;
 
@@ -2009,7 +2034,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
        bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
 
        if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
-                                GFP_NOIO))
+                                GFP_NOFS))
                goto err;
 
        (*rbio)->bounce         = true;
@@ -2032,14 +2057,17 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
                                .write_flags    = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
                        },
                        btree_id, k);
-       if (ret == -BCH_ERR_nocow_lock_blocked) {
+       /*
+        * possible errors: -BCH_ERR_nocow_lock_blocked,
+        * -BCH_ERR_ENOSPC_disk_reservation:
+        */
+       if (ret) {
                ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
                                        bch_promote_params);
                BUG_ON(ret);
                goto err;
        }
 
-       BUG_ON(ret);
        op->write.op.end_io = promote_done;
 
        return op;
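
Note: previously only -BCH_ERR_nocow_lock_blocked unwound the hash-table entry and any other error tripped the BUG_ON(ret) below; now every failure takes the unwind path, and the assertion moves onto rhashtable_remove_fast() itself, i.e. the entry we inserted must still be present. Sketch of the pairing, assuming the insertion earlier in __promote_alloc() is rhashtable_lookup_insert_fast() (the matching remove suggests so):

    ret = rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
                                        bch_promote_params);
    if (ret)
            goto err;               /* never inserted: nothing to remove */

    /* ... any later failure must undo the insertion: */
    BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
                                  bch_promote_params));
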
@@ -2286,9 +2314,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        if (crc_is_compressed(rbio->pick.crc))
                return 0;
 
-       bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos,
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(&iter);
+       k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
        if ((ret = bkey_err(k)))
                goto out;
 
@@ -2524,10 +2551,8 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
        reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
                *offset_into_extent;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
-                            POS(0, reflink_offset),
-                            BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(&iter);
+       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
+                              POS(0, reflink_offset), 0);
        ret = bkey_err(k);
        if (ret)
                goto err;
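
Note: both conversions in this region replace the same two-call sequence; per the removed lines, bch2_bkey_get_iter() is shorthand for:

    bch2_trans_iter_init(trans, &iter, btree_id, pos, flags);
    k = bch2_btree_iter_peek_slot(&iter);
    /* caller still checks bkey_err(k) and exits the iterator */

The reflink call also drops BTREE_ITER_SLOTS from its flags, presumably because the helper's peek_slot makes slot semantics implicit.
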
@@ -2724,7 +2749,7 @@ get_bio:
                rbio = rbio_init(bio_alloc_bioset(NULL,
                                                  DIV_ROUND_UP(sectors, PAGE_SECTORS),
                                                  0,
-                                                 GFP_NOIO,
+                                                 GFP_NOFS,
                                                  &c->bio_read_split),
                                 orig->opts);
 
@@ -2740,7 +2765,7 @@ get_bio:
                 * from the whole bio, in which case we don't want to retry and
                 * lose the error)
                 */
-               rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
+               rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
                                                 &c->bio_read_split),
                                 orig->opts);
                rbio->bio.bi_iter = iter;
@@ -3006,18 +3031,26 @@ void bch2_fs_io_exit(struct bch_fs *c)
 int bch2_fs_io_init(struct bch_fs *c)
 {
        if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
-                       BIOSET_NEED_BVECS) ||
-           bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
-                       BIOSET_NEED_BVECS) ||
-           bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
-                       BIOSET_NEED_BVECS) ||
-           mempool_init_page_pool(&c->bio_bounce_pages,
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_read_init;
+
+       if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+       if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_write_init;
+
+       if (mempool_init_page_pool(&c->bio_bounce_pages,
                                   max_t(unsigned,
                                         c->opts.btree_node_size,
                                         c->opts.encoded_extent_max) /
-                                  PAGE_SIZE, 0) ||
-           rhashtable_init(&c->promote_table, &bch_promote_params))
-               return -ENOMEM;
+                                  PAGE_SIZE, 0))
+               return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+       if (rhashtable_init(&c->promote_table, &bch_promote_params))
+               return -BCH_ERR_ENOMEM_promote_table_init;
 
        return 0;
 }
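
Note: unchaining the initializers gives every failure site its own -BCH_ERR_ENOMEM_* code, so a caller can report exactly which pool or table failed while the code still compares equal to -ENOMEM through bcachefs's error-class machinery (the real table lives in errcode.h). A self-contained illustration of the scheme, with invented codes:

    #include <errno.h>
    #include <stdio.h>

    enum {
            ERR_START = 2048,       /* above the classic errno range */
            ERR_bio_read_init,
            ERR_bio_write_init,
            ERR_promote_table_init,
    };

    /* Collapse a private code back to the errno it refines: */
    static int err_class(int err)
    {
            switch (-err) {
            case ERR_bio_read_init:
            case ERR_bio_write_init:
            case ERR_promote_table_init:
                    return -ENOMEM;
            default:
                    return err;
            }
    }

    int main(void)
    {
            int ret = -ERR_bio_write_init;  /* precise internal code */

            printf("reported as %d, classed as -ENOMEM? %d\n",
                   err_class(ret), err_class(ret) == -ENOMEM);
            return 0;
    }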