git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/journal.c
New upstream snapshot
[bcachefs-tools-debian] / libbcachefs / journal.c
index ac4071fc4e80f05d9ec73f0ab85761dcfb98ee6d..158df42e5e10487caca016cf52478ab5377e5152 100644 (file)
@@ -88,8 +88,6 @@ static void bch2_journal_buf_init(struct journal *j)
        buf->must_flush = false;
        buf->separate_flush = false;
 
-       memset(buf->has_inode, 0, sizeof(buf->has_inode));
-
        memset(buf->data, 0, sizeof(*buf->data));
        buf->data->seq  = cpu_to_le64(journal_cur_seq(j));
        buf->data->u64s = 0;
@@ -109,7 +107,12 @@ void bch2_journal_halt(struct journal *j)
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
-       j->err_seq = journal_cur_seq(j);
+       /*
+        * XXX: we're not using j->lock here because this can be called from
+        * interrupt context, so this can race with journal_write_done()
+        */
+       if (!j->err_seq)
+               j->err_seq = journal_cur_seq(j);
        journal_wake(j);
        closure_wake_up(&journal_cur_buf(j)->wait);
 }
@@ -308,7 +311,7 @@ static int journal_entry_open(struct journal *j)
 
        mod_delayed_work(c->io_complete_wq,
                         &j->write_work,
-                        msecs_to_jiffies(j->write_delay_ms));
+                        msecs_to_jiffies(c->opts.journal_flush_delay));
        journal_wake(j);
        return 0;
 }
@@ -335,55 +338,6 @@ static void journal_write_work(struct work_struct *work)
        journal_entry_close(j);
 }
 
-/*
- * Given an inode number, if that inode number has data in the journal that
- * hasn't yet been flushed, return the journal sequence number that needs to be
- * flushed:
- */
-u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
-{
-       size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
-       union journal_res_state s;
-       unsigned i;
-       u64 seq;
-
-
-       spin_lock(&j->lock);
-       seq = journal_cur_seq(j);
-       s = READ_ONCE(j->reservations);
-       i = s.idx;
-
-       while (1) {
-               if (test_bit(h, j->buf[i].has_inode))
-                       goto out;
-
-               if (i == s.unwritten_idx)
-                       break;
-
-               i = (i - 1) & JOURNAL_BUF_MASK;
-               seq--;
-       }
-
-       seq = 0;
-out:
-       spin_unlock(&j->lock);
-
-       return seq;
-}
-
-void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq)
-{
-       size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
-       struct journal_buf *buf;
-
-       spin_lock(&j->lock);
-
-       if ((buf = journal_seq_to_buf(j, seq)))
-               set_bit(h, buf->has_inode);
-
-       spin_unlock(&j->lock);
-}
-
 static int __journal_res_get(struct journal *j, struct journal_res *res,
                             unsigned flags)
 {
@@ -602,7 +556,10 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 
        spin_lock(&j->lock);
 
-       BUG_ON(seq > journal_cur_seq(j));
+       if (WARN_ONCE(seq > journal_cur_seq(j),
+                     "requested to flush journal seq %llu, but currently at %llu",
+                     seq, journal_cur_seq(j)))
+               goto out;
 
        /* Recheck under lock: */
        if (j->err_seq && seq >= j->err_seq) {
@@ -669,6 +626,12 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
        u64 start_time = local_clock();
        int ret, ret2;
 
+       /*
+        * Don't update time_stats when @seq is already flushed:
+        */
+       if (seq <= j->flushed_seq_ondisk)
+               return 0;
+
        ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
 
        if (!ret)
@@ -679,6 +642,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
 
 int bch2_journal_meta(struct journal *j)
 {
+       struct journal_buf *buf;
        struct journal_res res;
        int ret;
 
@@ -688,6 +652,10 @@ int bch2_journal_meta(struct journal *j)
        if (ret)
                return ret;
 
+       buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+       buf->must_flush = true;
+       set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
        bch2_journal_res_put(j, &res);
 
        return bch2_journal_flush_seq(j, res.seq);
@@ -737,6 +705,44 @@ int bch2_journal_flush(struct journal *j)
        return bch2_journal_flush_seq(j, seq);
 }
 
+/*
+ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * @seq; returns true if every unwritten entry before @seq was marked noflush
+ */
+bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       u64 unwritten_seq;
+       bool ret = false;
+
+       if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
+               return false;
+
+       if (seq <= c->journal.flushed_seq_ondisk)
+               return false;
+
+       spin_lock(&j->lock);
+       if (seq <= c->journal.flushed_seq_ondisk)
+               goto out;
+
+       for (unwritten_seq = last_unwritten_seq(j);
+            unwritten_seq < seq;
+            unwritten_seq++) {
+               struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
+
+               /* journal write is already in flight, and was a flush write: */
+               if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush)
+                       goto out;
+
+               buf->noflush = true;
+       }
+
+       ret = true;
+out:
+       spin_unlock(&j->lock);
+       return ret;
+}
+
 /* block/unlock the journal: */
 
 void bch2_journal_unblock(struct journal *j)
@@ -807,11 +813,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                long b;
 
                if (new_fs) {
-                       if (c)
-                               percpu_down_read(&c->mark_lock);
                        b = bch2_bucket_alloc_new_fs(ca);
                        if (b < 0) {
-                               percpu_up_read(&c->mark_lock);
                                ret = -ENOSPC;
                                goto err;
                        }
@@ -825,7 +828,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                                goto err;
                        }
 
-                       b = sector_to_bucket(ca, ob->ptr.offset);
+                       b = ob->bucket;
                }
 
                if (c)
@@ -859,14 +862,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                if (c)
                        spin_unlock(&c->journal.lock);
 
-               if (new_fs) {
-                       bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
-                                                 ca->mi.bucket_size,
-                                                 gc_phase(GC_PHASE_SB),
-                                                 0);
-                       if (c)
-                               percpu_up_read(&c->mark_lock);
-               } else {
+               if (!new_fs) {
                        ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
                                bch2_trans_mark_metadata_bucket(&trans, ca,
                                                b, BCH_DATA_journal,
@@ -1032,10 +1028,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        j->replay_journal_seq   = last_seq;
        j->replay_journal_seq_end = cur_seq;
        j->last_seq_ondisk      = last_seq;
+       j->flushed_seq_ondisk   = cur_seq - 1;
        j->pin.front            = last_seq;
        j->pin.back             = cur_seq;
        atomic64_set(&j->seq, cur_seq - 1);
 
+       if (list_empty(journal_entries))
+               j->last_empty_seq = cur_seq - 1;
+
        fifo_for_each_entry_ptr(p, &j->pin, seq)
                journal_pin_list_init(p, 1);
 
@@ -1048,6 +1048,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
                if (seq < last_seq)
                        continue;
 
+               if (journal_entry_empty(&i->j))
+                       j->last_empty_seq = le64_to_cpu(i->j.seq);
+
                p = journal_seq_pin(j, seq);
 
                p->devs.nr = 0;
@@ -1055,6 +1058,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
                        bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
        }
 
+       if (list_empty(journal_entries))
+               j->last_empty_seq = cur_seq;
+
        spin_lock(&j->lock);
 
        set_bit(JOURNAL_STARTED, &j->flags);
@@ -1144,9 +1150,6 @@ int bch2_fs_journal_init(struct journal *j)
 
        lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
 
-       j->write_delay_ms       = 1000;
-       j->reclaim_delay_ms     = 100;
-
        atomic64_set(&j->reservations.counter,
                ((union journal_res_state)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
@@ -1178,44 +1181,29 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        union journal_res_state s;
        struct bch_dev *ca;
+       unsigned long now = jiffies;
        unsigned i;
 
        rcu_read_lock();
        s = READ_ONCE(j->reservations);
 
-       pr_buf(out,
-              "active journal entries:\t%llu\n"
-              "seq:\t\t\t%llu\n"
-              "last_seq:\t\t%llu\n"
-              "last_seq_ondisk:\t%llu\n"
-              "flushed_seq_ondisk:\t%llu\n"
-              "prereserved:\t\t%u/%u\n"
-              "each entry reserved:\t%u\n"
-              "nr flush writes:\t%llu\n"
-              "nr noflush writes:\t%llu\n"
-              "nr direct reclaim:\t%llu\n"
-              "nr background reclaim:\t%llu\n"
-              "reclaim kicked:\t\t%u\n"
-              "reclaim runs in:\t%u ms\n"
-              "current entry sectors:\t%u\n"
-              "current entry error:\t%u\n"
-              "current entry:\t\t",
-              fifo_used(&j->pin),
-              journal_cur_seq(j),
-              journal_last_seq(j),
-              j->last_seq_ondisk,
-              j->flushed_seq_ondisk,
-              j->prereserved.reserved,
-              j->prereserved.remaining,
-              j->entry_u64s_reserved,
-              j->nr_flush_writes,
-              j->nr_noflush_writes,
-              j->nr_direct_reclaim,
-              j->nr_background_reclaim,
-              j->reclaim_kicked,
-              jiffies_to_msecs(j->next_reclaim - jiffies),
-              j->cur_entry_sectors,
-              j->cur_entry_error);
+       pr_buf(out, "active journal entries:\t%llu\n",  fifo_used(&j->pin));
+       pr_buf(out, "seq:\t\t\t%llu\n",                 journal_cur_seq(j));
+       pr_buf(out, "last_seq:\t\t%llu\n",              journal_last_seq(j));
+       pr_buf(out, "last_seq_ondisk:\t%llu\n",         j->last_seq_ondisk);
+       pr_buf(out, "flushed_seq_ondisk:\t%llu\n",      j->flushed_seq_ondisk);
+       pr_buf(out, "prereserved:\t\t%u/%u\n",          j->prereserved.reserved, j->prereserved.remaining);
+       pr_buf(out, "each entry reserved:\t%u\n",       j->entry_u64s_reserved);
+       pr_buf(out, "nr flush writes:\t%llu\n",         j->nr_flush_writes);
+       pr_buf(out, "nr noflush writes:\t%llu\n",       j->nr_noflush_writes);
+       pr_buf(out, "nr direct reclaim:\t%llu\n",       j->nr_direct_reclaim);
+       pr_buf(out, "nr background reclaim:\t%llu\n",   j->nr_background_reclaim);
+       pr_buf(out, "reclaim kicked:\t\t%u\n",          j->reclaim_kicked);
+       pr_buf(out, "reclaim runs in:\t%u ms\n",        time_after(j->next_reclaim, now)
+              ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+       pr_buf(out, "current entry sectors:\t%u\n",     j->cur_entry_sectors);
+       pr_buf(out, "current entry error:\t%u\n",       j->cur_entry_error);
+       pr_buf(out, "current entry:\t\t");
 
        switch (s.cur_entry_offset) {
        case JOURNAL_ENTRY_ERROR_VAL:
@@ -1225,15 +1213,11 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
                pr_buf(out, "closed\n");
                break;
        default:
-               pr_buf(out, "%u/%u\n",
-                      s.cur_entry_offset,
-                      j->cur_entry_u64s);
+               pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
                break;
        }
 
-       pr_buf(out,
-              "current entry:\t\tidx %u refcount %u\n",
-              s.idx, journal_state_count(s, s.idx));
+       pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx));
 
        i = s.idx;
        while (i != s.unwritten_idx) {
@@ -1273,22 +1257,14 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
                if (!ja->nr)
                        continue;
 
-               pr_buf(out,
-                      "dev %u:\n"
-                      "\tnr\t\t%u\n"
-                      "\tbucket size\t%u\n"
-                      "\tavailable\t%u:%u\n"
-                      "\tdiscard_idx\t%u\n"
-                      "\tdirty_ondisk\t%u (seq %llu)\n"
-                      "\tdirty_idx\t%u (seq %llu)\n"
-                      "\tcur_idx\t\t%u (seq %llu)\n",
-                      i, ja->nr, ca->mi.bucket_size,
-                      bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
-                      ja->sectors_free,
-                      ja->discard_idx,
-                      ja->dirty_idx_ondisk,    ja->bucket_seq[ja->dirty_idx_ondisk],
-                      ja->dirty_idx,           ja->bucket_seq[ja->dirty_idx],
-                      ja->cur_idx,             ja->bucket_seq[ja->cur_idx]);
+               pr_buf(out, "dev %u:\n",                i);
+               pr_buf(out, "\tnr\t\t%u\n",             ja->nr);
+               pr_buf(out, "\tbucket size\t%u\n",      ca->mi.bucket_size);
+               pr_buf(out, "\tavailable\t%u:%u\n",     bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+               pr_buf(out, "\tdiscard_idx\t%u\n",      ja->discard_idx);
+               pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk,    ja->bucket_seq[ja->dirty_idx_ondisk]);
+               pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx,              ja->bucket_seq[ja->dirty_idx]);
+               pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx,                ja->bucket_seq[ja->cur_idx]);
        }
 
        rcu_read_unlock();