git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/journal.c
Update bcachefs sources to 799716df00 bcachefs: Delete an incorrect bch2_trans_unlock()
[bcachefs-tools-debian] / libbcachefs / journal.c
index 1cbca187cb15d2829cb1c8e09c694c4131f86c0d..433c97844f36f7058f5501600e6651f256706add 100644 (file)
@@ -17,8 +17,7 @@
 #include "journal_reclaim.h"
 #include "journal_sb.h"
 #include "journal_seq_blacklist.h"
-
-#include <trace/events/bcachefs.h>
+#include "trace.h"
 
 #define x(n)   #n,
 static const char * const bch2_journal_watermarks[] = {
@@ -68,13 +67,75 @@ journal_seq_to_buf(struct journal *j, u64 seq)
 
 static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
 {
-       INIT_LIST_HEAD(&p->list);
-       INIT_LIST_HEAD(&p->key_cache_list);
+       unsigned i;
+       for (i = 0; i < ARRAY_SIZE(p->list); i++)
+               INIT_LIST_HEAD(&p->list[i]);
        INIT_LIST_HEAD(&p->flushed);
        atomic_set(&p->count, count);
        p->devs.nr = 0;
 }
 
+/*
+ * Detect a stuck journal and trigger shutdown. The journal can get stuck for
+ * several reasons (blocked I/O, journal reservation lockup, etc.). Since this
+ * is a fatal error with unpredictable characteristics, be conservative before
+ * deciding to shut things down.
+ *
+ * The journal is considered stuck when it appears full and we can neither
+ * commit btree transactions, discard journal buckets, nor acquire a priority
+ * (reserved watermark) reservation. Returns true if stuck; only the first
+ * caller to observe it (->err_seq still unset) logs and raises the fatal error.
+ */
+static inline bool
+journal_error_check_stuck(struct journal *j, int error, unsigned flags)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       bool stuck = false;
+       struct printbuf buf = PRINTBUF;
+
+       if (!(error == JOURNAL_ERR_journal_full ||
+             error == JOURNAL_ERR_journal_pin_full) ||
+           nr_unwritten_journal_entries(j) ||
+           (flags & JOURNAL_WATERMARK_MASK) != JOURNAL_WATERMARK_reserved)
+               return stuck;
+
+       spin_lock(&j->lock);
+
+       if (j->can_discard) {
+               spin_unlock(&j->lock);
+               return stuck;
+       }
+
+       stuck = true;
+
+       /*
+        * The journal shutdown path will also set ->err_seq, but claim it here
+        * first, under j->lock, to serialize against concurrent failures so
+        * that only one of them emits the error report below.
+        */
+       if (j->err_seq) {
+               spin_unlock(&j->lock);
+               return stuck;
+       }
+       j->err_seq = journal_cur_seq(j);
+       spin_unlock(&j->lock);
+
+       bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",
+               bch2_journal_errors[error]);
+       bch2_journal_debug_to_text(&buf, j);
+       bch_err(c, "%s", buf.buf);
+
+       printbuf_reset(&buf);
+       bch2_journal_pins_to_text(&buf, j);
+       bch_err(c, "Journal pins:\n%s", buf.buf);
+       printbuf_exit(&buf);
+
+       bch2_fatal_error(c);
+       dump_stack();
+
+       return stuck;
+}
+
 /* journal entry close/open: */
 
 void __bch2_journal_buf_put(struct journal *j)
@@ -162,6 +223,7 @@ void bch2_journal_halt(struct journal *j)
        __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
        if (!j->err_seq)
                j->err_seq = journal_cur_seq(j);
+       journal_wake(j);
        spin_unlock(&j->lock);
 }
 
@@ -225,7 +287,7 @@ static int journal_entry_open(struct journal *j)
        if (!fifo_free(&j->pin))
                return JOURNAL_ERR_journal_pin_full;
 
-       if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1)
+       if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
                return JOURNAL_ERR_max_in_flight;
 
        BUG_ON(!j->cur_entry_sectors);
@@ -362,6 +424,12 @@ retry:
 
        spin_lock(&j->lock);
 
+       /* check once more in case somebody else shut things down... */
+       if (bch2_journal_error(j)) {
+               spin_unlock(&j->lock);
+               return -BCH_ERR_erofs_journal_err;
+       }
+
        /*
         * Recheck after taking the lock, so we don't race with another thread
         * that just did journal_entry_open() and call journal_entry_close()
@@ -409,28 +477,8 @@ unlock:
 
        if (!ret)
                goto retry;
-
-       if ((ret == JOURNAL_ERR_journal_full ||
-            ret == JOURNAL_ERR_journal_pin_full) &&
-           !can_discard &&
-           !nr_unwritten_journal_entries(j) &&
-           (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) {
-               struct printbuf buf = PRINTBUF;
-
-               bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (ret %s)",
-                       bch2_journal_errors[ret]);
-
-               bch2_journal_debug_to_text(&buf, j);
-               bch_err(c, "%s", buf.buf);
-
-               printbuf_reset(&buf);
-               bch2_journal_pins_to_text(&buf, j);
-               bch_err(c, "Journal pins:\n%s", buf.buf);
-
-               printbuf_exit(&buf);
-               bch2_fatal_error(c);
-               dump_stack();
-       }
+       if (journal_error_check_stuck(j, ret, flags))
+               ret = -BCH_ERR_journal_res_get_blocked;
 
        /*
         * Journal is full - can't rely on reclaim from work item due to
@@ -758,26 +806,18 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
        u64 *new_bucket_seq = NULL, *new_buckets = NULL;
        struct open_bucket **ob = NULL;
        long *bu = NULL;
-       unsigned i, nr_got = 0, nr_want = nr - ja->nr;
-       unsigned old_nr                 = ja->nr;
-       unsigned old_discard_idx        = ja->discard_idx;
-       unsigned old_dirty_idx_ondisk   = ja->dirty_idx_ondisk;
-       unsigned old_dirty_idx          = ja->dirty_idx;
-       unsigned old_cur_idx            = ja->cur_idx;
+       unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
        int ret = 0;
 
-       if (c) {
-               bch2_journal_flush_all_pins(&c->journal);
-               bch2_journal_block(&c->journal);
-       }
+       BUG_ON(nr <= ja->nr);
 
        bu              = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
        ob              = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
        new_buckets     = kcalloc(nr, sizeof(u64), GFP_KERNEL);
        new_bucket_seq  = kcalloc(nr, sizeof(u64), GFP_KERNEL);
        if (!bu || !ob || !new_buckets || !new_bucket_seq) {
-               ret = -ENOMEM;
-               goto err_unblock;
+               ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+               goto err_free;
        }
 
        for (nr_got = 0; nr_got < nr_want; nr_got++) {
@@ -788,87 +828,97 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                                break;
                        }
                } else {
-                       ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none,
-                                              false, cl);
+                       ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, cl);
                        ret = PTR_ERR_OR_ZERO(ob[nr_got]);
                        if (ret)
                                break;
 
+                       ret = bch2_trans_run(c,
+                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                                               ob[nr_got]->bucket, BCH_DATA_journal,
+                                               ca->mi.bucket_size));
+                       if (ret) {
+                               bch2_open_bucket_put(c, ob[nr_got]);
+                               bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret));
+                               break;
+                       }
+
                        bu[nr_got] = ob[nr_got]->bucket;
                }
        }
 
        if (!nr_got)
-               goto err_unblock;
+               goto err_free;
 
-       /*
-        * We may be called from the device add path, before the new device has
-        * actually been added to the running filesystem:
-        */
-       if (!new_fs)
-               spin_lock(&c->journal.lock);
+       /* Don't return an error if we successfully allocated some buckets: */
+       ret = 0;
+
+       if (c) {
+               bch2_journal_flush_all_pins(&c->journal);
+               bch2_journal_block(&c->journal);
+               mutex_lock(&c->sb_lock);
+       }
 
        memcpy(new_buckets,     ja->buckets,    ja->nr * sizeof(u64));
        memcpy(new_bucket_seq,  ja->bucket_seq, ja->nr * sizeof(u64));
-       swap(new_buckets,       ja->buckets);
-       swap(new_bucket_seq,    ja->bucket_seq);
+
+       BUG_ON(ja->discard_idx > ja->nr);
+
+       pos = ja->discard_idx ?: ja->nr;
+
+       memmove(new_buckets + pos + nr_got,
+               new_buckets + pos,
+               sizeof(new_buckets[0]) * (ja->nr - pos));
+       memmove(new_bucket_seq + pos + nr_got,
+               new_bucket_seq + pos,
+               sizeof(new_bucket_seq[0]) * (ja->nr - pos));
 
        for (i = 0; i < nr_got; i++) {
-               unsigned pos = ja->discard_idx ?: ja->nr;
-               long b = bu[i];
-
-               __array_insert_item(ja->buckets,                ja->nr, pos);
-               __array_insert_item(ja->bucket_seq,             ja->nr, pos);
-               ja->nr++;
-
-               ja->buckets[pos] = b;
-               ja->bucket_seq[pos] = 0;
-
-               if (pos <= ja->discard_idx)
-                       ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
-               if (pos <= ja->dirty_idx_ondisk)
-                       ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
-               if (pos <= ja->dirty_idx)
-                       ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
-               if (pos <= ja->cur_idx)
-                       ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+               new_buckets[pos + i] = bu[i];
+               new_bucket_seq[pos + i] = 0;
        }
 
-       ret = bch2_journal_buckets_to_sb(c, ca);
-       if (ret) {
-               /* Revert: */
-               swap(new_buckets,       ja->buckets);
-               swap(new_bucket_seq,    ja->bucket_seq);
-               ja->nr                  = old_nr;
-               ja->discard_idx         = old_discard_idx;
-               ja->dirty_idx_ondisk    = old_dirty_idx_ondisk;
-               ja->dirty_idx           = old_dirty_idx;
-               ja->cur_idx             = old_cur_idx;
-       }
+       nr = ja->nr + nr_got;
+
+       ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
+       if (ret)
+               goto err_unblock;
 
        if (!new_fs)
-               spin_unlock(&c->journal.lock);
+               bch2_write_super(c);
 
+       /* Commit: */
        if (c)
-               bch2_journal_unblock(&c->journal);
+               spin_lock(&c->journal.lock);
 
-       if (ret)
-               goto err;
+       swap(new_buckets,       ja->buckets);
+       swap(new_bucket_seq,    ja->bucket_seq);
+       ja->nr = nr;
 
-       if (!new_fs) {
-               for (i = 0; i < nr_got; i++) {
-                       ret = bch2_trans_run(c,
-                               bch2_trans_mark_metadata_bucket(&trans, ca,
-                                               bu[i], BCH_DATA_journal,
-                                               ca->mi.bucket_size));
-                       if (ret) {
-                               bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret);
-                               goto err;
-                       }
-               }
+       if (pos <= ja->discard_idx)
+               ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
+       if (pos <= ja->dirty_idx_ondisk)
+               ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
+       if (pos <= ja->dirty_idx)
+               ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
+       if (pos <= ja->cur_idx)
+               ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
+
+       if (c)
+               spin_unlock(&c->journal.lock);
+err_unblock:
+       if (c) {
+               bch2_journal_unblock(&c->journal);
+               mutex_unlock(&c->sb_lock);
        }
-err:
-       if (ob && !new_fs)
+
+       if (ret && !new_fs)
+               for (i = 0; i < nr_got; i++)
+                       bch2_trans_run(c,
+                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                                               bu[i], BCH_DATA_free, 0));
+err_free:
+       if (!new_fs)
                for (i = 0; i < nr_got; i++)
                        bch2_open_bucket_put(c, ob[i]);
 
@@ -876,12 +926,7 @@ err:
        kfree(new_buckets);
        kfree(ob);
        kfree(bu);
-
        return ret;
-err_unblock:
-       if (c)
-               bch2_journal_unblock(&c->journal);
-       goto err;
 }
 
 /*
@@ -893,56 +938,58 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
 {
        struct journal_device *ja = &ca->journal;
        struct closure cl;
-       unsigned current_nr;
        int ret = 0;
 
-       /* don't handle reducing nr of buckets yet: */
-       if (nr < ja->nr)
-               return 0;
-
        closure_init_stack(&cl);
 
-       while (ja->nr != nr && (ret == 0 || ret == -BCH_ERR_bucket_alloc_blocked)) {
-               struct disk_reservation disk_res = { 0, 0 };
+       down_write(&c->state_lock);
 
-               closure_sync(&cl);
+       /* don't handle reducing nr of buckets yet: */
+       if (nr < ja->nr)
+               goto unlock;
 
-               mutex_lock(&c->sb_lock);
-               current_nr = ja->nr;
+       while (ja->nr < nr) {
+               struct disk_reservation disk_res = { 0, 0 };
 
                /*
                 * note: journal buckets aren't really counted as _sectors_ used yet, so
                 * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
                 * when space used goes up without a reservation - but we do need the
                 * reservation to ensure we'll actually be able to allocate:
+                *
+                * XXX: that's not right, disk reservations only ensure a
+                * filesystem-wide allocation will succeed, this is a device
+                * specific allocation - we can hang here:
                 */
 
                ret = bch2_disk_reservation_get(c, &disk_res,
                                                bucket_to_sector(ca, nr - ja->nr), 1, 0);
-               if (ret) {
-                       mutex_unlock(&c->sb_lock);
-                       return ret;
-               }
+               if (ret)
+                       break;
 
                ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
 
                bch2_disk_reservation_put(c, &disk_res);
 
-               if (ja->nr != current_nr)
-                       bch2_write_super(c);
-               mutex_unlock(&c->sb_lock);
+               closure_sync(&cl);
+
+               if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
+                       break;
        }
 
+       if (ret)
+               bch_err(c, "%s: err %s", __func__, bch2_err_str(ret));
+unlock:
+       up_write(&c->state_lock);
        return ret;
 }
 
 int bch2_dev_journal_alloc(struct bch_dev *ca)
 {
        unsigned nr;
-       int ret;
 
        if (dynamic_fault("bcachefs:add:journal_alloc"))
-               return -ENOMEM;
+               return -BCH_ERR_ENOMEM_set_nr_journal_buckets;
 
        /* 1/128th of the device by default: */
        nr = ca->mi.nbuckets >> 7;
@@ -956,15 +1003,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
                     min(1 << 13,
                         (1 << 24) / ca->mi.bucket_size));
 
-       if (ca->fs)
-               mutex_lock(&ca->fs->sb_lock);
-
-       ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
-
-       if (ca->fs)
-               mutex_unlock(&ca->fs->sb_lock);
-
-       return ret;
+       return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
 }
 
 /* startup/shutdown: */
@@ -980,7 +1019,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
             seq++) {
                struct journal_buf *buf = journal_seq_to_buf(j, seq);
 
-               if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx))
+               if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
                        ret = true;
        }
        spin_unlock(&j->lock);
@@ -1042,7 +1081,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
                init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
                if (!j->pin.data) {
                        bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
-                       return -ENOMEM;
+                       return -BCH_ERR_ENOMEM_journal_pin_fifo;
                }
        }
 
@@ -1136,19 +1175,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
        ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
        if (!ja->bucket_seq)
-               return -ENOMEM;
+               return -BCH_ERR_ENOMEM_dev_journal_init;
 
        nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
 
        ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
        if (!ca->journal.bio)
-               return -ENOMEM;
+               return -BCH_ERR_ENOMEM_dev_journal_init;
 
        bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
 
        ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
        if (!ja->buckets)
-               return -ENOMEM;
+               return -BCH_ERR_ENOMEM_dev_journal_init;
 
        if (journal_buckets_v2) {
                unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
@@ -1202,7 +1241,7 @@ int bch2_fs_journal_init(struct journal *j)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
 
        if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
-               ret = -ENOMEM;
+               ret = -BCH_ERR_ENOMEM_journal_pin_fifo;
                goto out;
        }
 
@@ -1210,7 +1249,7 @@ int bch2_fs_journal_init(struct journal *j)
                j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
                j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
                if (!j->buf[i].data) {
-                       ret = -ENOMEM;
+                       ret = -BCH_ERR_ENOMEM_journal_buf;
                        goto out;
                }
        }
@@ -1356,6 +1395,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
 {
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *pin;
+       unsigned i;
 
        spin_lock(&j->lock);
        *seq = max(*seq, j->pin.front);
@@ -1373,15 +1413,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
        prt_newline(out);
        printbuf_indent_add(out, 2);
 
-       list_for_each_entry(pin, &pin_list->list, list) {
-               prt_printf(out, "\t%px %ps", pin, pin->flush);
-               prt_newline(out);
-       }
-
-       list_for_each_entry(pin, &pin_list->key_cache_list, list) {
-               prt_printf(out, "\t%px %ps", pin, pin->flush);
-               prt_newline(out);
-       }
+       for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+               list_for_each_entry(pin, &pin_list->list[i], list) {
+                       prt_printf(out, "\t%px %ps", pin, pin->flush);
+                       prt_newline(out);
+               }
 
        if (!list_empty(&pin_list->flushed)) {
                prt_printf(out, "flushed:");