git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 3e93567c51 bcachefs: Switch to local_clock() for fastpath...
author Kent Overstreet <kent.overstreet@linux.dev>
Wed, 12 Oct 2022 20:29:56 +0000 (16:29 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sat, 15 Oct 2022 05:04:30 +0000 (01:04 -0400)
27 files changed:
.bcachefs_revision
Makefile
include/linux/mean_and_variance.h [new file with mode: 0644]
libbcachefs/backpointers.c
libbcachefs/backpointers.h
libbcachefs/btree_gc.c
libbcachefs/btree_iter.c
libbcachefs/btree_key_cache.c
libbcachefs/btree_locking.c
libbcachefs/btree_locking.h
libbcachefs/ec.c
libbcachefs/fs-common.c
libbcachefs/fs-io.c
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/move.c
libbcachefs/movinggc.c
libbcachefs/quota.c
libbcachefs/super.c
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/xattr.c
linux/int_sqrt.c [new file with mode: 0644]
linux/mean_and_variance.c [new file with mode: 0644]
linux/six.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 2c908e6b7d9a5887af285bdd5ccc3559f5545122..11194e5efb90b48b446ab2e8652aa18f5d2f5057 100644 (file)
@@ -1 +1 @@
-6ee8a33cee5dfb74a1fb6ff348578fd43aae3a14
+3e93567c5196ef0c80e2ac3c08295130d858dfd6
diff --git a/Makefile b/Makefile
index a5a74fedef110475421f9883b82ec0f6bb46566e..d460a6d3d0b3e6ab742ec670f950b65c2fb403ad 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -199,6 +199,12 @@ update-bcachefs-sources:
        git add include/linux/printbuf.h
        cp $(LINUX_DIR)/lib/printbuf.c linux/
        git add linux/printbuf.c
+       cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
+       git add linux/mean_and_variance.c
+       cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
+       git add include/linux/mean_and_variance.h
+       cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
+       git add linux/int_sqrt.c
        cp $(LINUX_DIR)/scripts/Makefile.compiler ./
        git add Makefile.compiler
        $(RM) libbcachefs/*.mod.c
diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h
new file mode 100644 (file)
index 0000000..3d62abe
--- /dev/null
+++ b/include/linux/mean_and_variance.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef MEAN_AND_VARIANCE_H_
+#define MEAN_AND_VARIANCE_H_
+
+#include <linux/types.h>
+#include <linux/limits.h>
+#include <linux/math64.h>
+#include <linux/printbuf.h>
+
+#define SQRT_U64_MAX 4294967295ULL
+
+
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+typedef unsigned __int128 u128;
+
+static inline u128 u64_to_u128(u64 a)
+{
+       return (u128)a;
+}
+
+static inline u64 u128_to_u64(u128 a)
+{
+       return (u64)a;
+}
+
+static inline u64 u128_shr64_to_u64(u128 a)
+{
+       return (u64)(a >> 64);
+}
+
+static inline u128 u128_add(u128 a, u128 b)
+{
+       return a + b;
+}
+
+static inline u128 u128_sub(u128 a, u128 b)
+{
+       return a - b;
+}
+
+static inline u128 u128_shl(u128 i, s8 shift)
+{
+       return i << shift;
+}
+
+static inline u128 u128_shl64_add(u64 a, u64 b)
+{
+       return ((u128)a << 64) + b;
+}
+
+static inline u128 u128_square(u64 i)
+{
+       return (u128) i * i;
+}
+
+#else
+
+typedef struct {
+       u64 hi, lo;
+} u128;
+
+static inline u128 u64_to_u128(u64 a)
+{
+       return (u128){ .lo = a };
+}
+
+static inline u64 u128_to_u64(u128 a)
+{
+       return a.lo;
+}
+
+static inline u64 u128_shr64_to_u64(u128 a)
+{
+       return a.hi;
+}
+
+static inline u128 u128_add(u128 a, u128 b)
+{
+       u128 c;
+
+       c.lo = a.lo + b.lo;
+       c.hi = a.hi + b.hi + (c.lo < a.lo);
+       return c;
+}
+
+static inline u128 u128_sub(u128 a, u128 b)
+{
+       u128 c;
+
+       c.lo = a.lo - b.lo;
+       c.hi = a.hi - b.hi - (c.lo > a.lo);
+       return c;
+}
+
+static inline u128 u128_shl(u128 i, s8 shift)
+{
+       u128 r;
+
+       if (shift < 64) {
+               r.lo = i.lo << shift;
+               r.hi = (i.hi << shift) | (i.lo >> (64 - shift));
+       } else {
+               r.hi = i.lo << (shift - 64);
+               r.lo = 0;
+       }
+       return r;
+}
+
+static inline u128 u128_shl64_add(u64 a, u64 b)
+{
+       return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b));
+}
+
+static inline u128 u128_square(u64 i)
+{
+       u128 r;
+       u64  h = i >> 32, l = i & (u64)U32_MAX;
+
+       r =             u128_shl(u64_to_u128(h*h), 64);
+       r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
+       r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
+       r = u128_add(r,          u64_to_u128(l*l));
+       return r;
+}
+
+#endif
+
+static inline u128 u128_div(u128 n, u64 d)
+{
+       u128 r;
+       u64 rem;
+       u64 hi = u128_shr64_to_u64(n);
+       u64 lo = u128_to_u64(n);
+       u64  h =  hi & ((u64)U32_MAX  << 32);
+       u64  l = (hi &  (u64)U32_MAX) << 32;
+
+       r =             u128_shl(u64_to_u128(div64_u64_rem(h,                d, &rem)), 64);
+       r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l  + (rem << 32), d, &rem)), 32));
+       r = u128_add(r,          u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
+       return r;
+}
+
+struct mean_and_variance {
+       s64 n;
+       s64 sum;
+       u128 sum_squares;
+};
+
+/* exponentially weighted variant */
+struct mean_and_variance_weighted {
+       bool init;
+       u8 w;
+       s64 mean;
+       u64 variance;
+};
+
+inline s64 fast_divpow2(s64 n, u8 d);
+
+struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1);
+       s64              mean_and_variance_get_mean(struct mean_and_variance s);
+       u64              mean_and_variance_get_variance(struct mean_and_variance s1);
+       u32              mean_and_variance_get_stddev(struct mean_and_variance s);
+
+struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1);
+       s64                       mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
+       u64                       mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
+       u32                       mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+
+#endif // MEAN_AND_VARIANCE_H_
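
Aside on the fallback above: u128_add() detects carry out of the low word with the comparison (c.lo < a.lo), which is true exactly when the 64-bit addition wrapped, and u128_sub() detects borrow the same way. A standalone userspace sketch (not part of this commit; names are illustrative) that checks the trick against a compiler-native __int128:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct u128_emul { uint64_t hi, lo; };

/* same carry trick as the kernel fallback above */
static struct u128_emul emul_add(struct u128_emul a, struct u128_emul b)
{
	struct u128_emul c;

	c.lo = a.lo + b.lo;
	c.hi = a.hi + b.hi + (c.lo < a.lo);
	return c;
}

int main(void)
{
	uint64_t vals[] = { 0, 1, 3, UINT64_MAX - 1, UINT64_MAX };

	for (unsigned i = 0; i < 5; i++)
		for (unsigned j = 0; j < 5; j++) {
			unsigned __int128 ref = (unsigned __int128) vals[i] + vals[j];
			struct u128_emul r = emul_add((struct u128_emul) { .lo = vals[i] },
						      (struct u128_emul) { .lo = vals[j] });

			assert(r.lo == (uint64_t) ref);
			assert(r.hi == (uint64_t) (ref >> 64));
		}
	printf("carry logic matches __int128\n");
	return 0;
}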
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index b0e30594c4877ddeaafa822e310670394e251ece..a537768c4600b8c32bfa7b9a4280177f7c13b7a3 100644 (file)
@@ -414,7 +414,8 @@ err:
 int bch2_get_next_backpointer(struct btree_trans *trans,
                              struct bpos bucket, int gen,
                              u64 *bp_offset,
-                             struct bch_backpointer *dst)
+                             struct bch_backpointer *dst,
+                             unsigned iter_flags)
 {
        struct bch_fs *c = trans->c;
        struct bpos bp_pos, bp_end_pos;
@@ -1023,7 +1024,7 @@ static int check_one_backpointer(struct btree_trans *trans,
        struct printbuf buf = PRINTBUF;
        int ret;
 
-       ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp);
+       ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp, 0);
        if (ret || *bp_offset == U64_MAX)
                return ret;
 
diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h
index fe42af296e9ca301854aa0d61e778147d2a03540..1c97e364c53200170d89485752124c6c15bd420b 100644 (file)
@@ -25,7 +25,7 @@ int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *,
 int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *,
                                struct bch_backpointer, struct bkey_s_c);
 int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
-                             u64 *, struct bch_backpointer *);
+                             u64 *, struct bch_backpointer *, unsigned);
 struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
                                         struct bpos, u64, struct bch_backpointer);
 struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index fd89165e08a98ea639618639035b542265fa450d..a4d6998fcd00fb97735d3e6b75e8b51e5a1c0f0e 100644 (file)
@@ -1979,10 +1979,10 @@ int bch2_gc_gens(struct bch_fs *c)
                                        NULL, NULL,
                                        BTREE_INSERT_NOFAIL,
                                gc_btree_gens_key(&trans, &iter, k));
-                       if (ret) {
+                       if (ret && ret != -EROFS)
                                bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret));
+                       if (ret)
                                goto err;
-                       }
                }
 
        ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
@@ -1992,10 +1992,10 @@ int bch2_gc_gens(struct bch_fs *c)
                        NULL, NULL,
                        BTREE_INSERT_NOFAIL,
                bch2_alloc_write_oldest_gen(&trans, &iter, k));
-       if (ret) {
+       if (ret && ret != -EROFS)
                bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret));
+       if (ret)
                goto err;
-       }
 
        c->gc_gens_btree        = 0;
        c->gc_gens_pos          = POS_MIN;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index af658390bd679f9dd56de95b1e75c327c896c6eb..0dfde9fad2975786c5c1ea7a85550cdbcc79fe61 100644 (file)
@@ -772,7 +772,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
 
        bch2_bkey_buf_init(&tmp);
 
-       while (nr && !ret) {
+       while (nr-- && !ret) {
                if (!bch2_btree_node_relock(trans, path, path->level))
                        break;
 
@@ -807,7 +807,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
 
        bch2_bkey_buf_init(&tmp);
 
-       while (nr && !ret) {
+       while (nr-- && !ret) {
                if (!bch2_btree_node_relock(trans, path, path->level))
                        break;
 
@@ -2386,6 +2386,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                }
 
                k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+               if (unlikely(!k.k))
+                       goto out_no_locked;
        } else {
                struct bpos next;
 
@@ -2783,7 +2785,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)
 
        if (!trans->restarted &&
            (need_resched() ||
-            ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+            local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
                bch2_trans_unlock(trans);
                cond_resched();
                bch2_trans_relock(trans);
@@ -2793,7 +2795,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)
        if (trans->restarted)
                bch2_btree_path_traverse_all(trans);
 
-       trans->last_begin_time = ktime_get_ns();
+       trans->last_begin_time = local_clock();
        return trans->restart_count;
 }
 
@@ -2850,7 +2852,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
        memset(trans, 0, sizeof(*trans));
        trans->c                = c;
        trans->fn               = fn;
-       trans->last_begin_time  = ktime_get_ns();
+       trans->last_begin_time  = local_clock();
        trans->fn_idx           = bch2_trans_get_fn_idx(trans, c, fn);
        trans->locking_wait.task = current;
        closure_init_stack(&trans->ref);
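
Note on the ktime_get_ns() → local_clock() conversions above: local_clock() reads the per-CPU scheduler clock, which is much cheaper on the fastpath but only guaranteed monotonic within one CPU. That is acceptable here because the timestamp is only compared against a coarse hold-time threshold. A hedged userspace analogue of the pattern (Linux assumed; CLOCK_MONOTONIC_COARSE stands in for the cheap clock, and the threshold value is illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define MAX_LOCK_HOLD_TIME_NS	(10 * 1000 * 1000ULL)	/* illustrative threshold */

/* stand-in for the kernel's local_clock(): fast, coarse, good enough
 * for threshold checks on the same CPU/thread */
static uint64_t cheap_clock_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC_COARSE, &ts);
	return (uint64_t) ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* small errors from a non-global clock only shift *when* we yield,
 * never correctness */
static bool held_too_long(uint64_t begin_time)
{
	return cheap_clock_ns() - begin_time > MAX_LOCK_HOLD_TIME_NS;
}

int main(void)
{
	return held_too_long(cheap_clock_ns());	/* 0: just started */
}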
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 35e941949f49aa07d32b10ec1f5bed8c9a461d7d..958feac49563b638b9f578c5f9b8116ab713f3f2 100644 (file)
@@ -112,6 +112,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
        BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
 
        if (!ck->c.lock.readers) {
+#ifdef __KERNEL__
                preempt_disable();
                f = this_cpu_ptr(bc->pcpu_freed);
 
@@ -136,6 +137,11 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
                        list_move_tail(&ck->list, &bc->freed_nonpcpu);
                        mutex_unlock(&bc->lock);
                }
+#else
+               mutex_lock(&bc->lock);
+               list_move_tail(&ck->list, &bc->freed_nonpcpu);
+               mutex_unlock(&bc->lock);
+#endif
        } else {
                mutex_lock(&bc->lock);
                list_move_tail(&ck->list, &bc->freed_pcpu);
@@ -174,6 +180,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path)
        bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
 
        if (!pcpu_readers) {
+#ifdef __KERNEL__
                preempt_disable();
                f = this_cpu_ptr(bc->pcpu_freed);
                if (f->nr)
@@ -196,6 +203,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path)
                        preempt_enable();
                        mutex_unlock(&bc->lock);
                }
+#else
+               mutex_lock(&bc->lock);
+               if (!list_empty(&bc->freed_nonpcpu)) {
+                       ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+                       list_del_init(&ck->list);
+               }
+               mutex_unlock(&bc->lock);
+#endif
        } else {
                mutex_lock(&bc->lock);
                if (!list_empty(&bc->freed_pcpu)) {
@@ -228,6 +243,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path)
                return ck;
        }
 
+       /* GFP_NOFS because we're holding btree locks: */
        ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
        if (likely(ck)) {
                INIT_LIST_HEAD(&ck->list);
@@ -252,6 +268,7 @@ bkey_cached_reuse(struct btree_key_cache *c)
        struct bkey_cached *ck;
        unsigned i;
 
+       mutex_lock(&c->lock);
        rcu_read_lock();
        tbl = rht_dereference_rcu(c->table.tbl, &c->table);
        for (i = 0; i < tbl->size; i++)
@@ -259,13 +276,14 @@ bkey_cached_reuse(struct btree_key_cache *c)
                        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
                            bkey_cached_lock_for_evict(ck)) {
                                bkey_cached_evict(c, ck);
-                               rcu_read_unlock();
-                               return ck;
+                               goto out;
                        }
                }
+       ck = NULL;
+out:
        rcu_read_unlock();
-
-       return NULL;
+       mutex_unlock(&c->lock);
+       return ck;
 }
 
 static struct bkey_cached *
@@ -759,12 +777,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
        unsigned start, flags;
        int srcu_idx;
 
-       /* Return -1 if we can't do anything right now */
-       if (sc->gfp_mask & __GFP_FS)
-               mutex_lock(&bc->lock);
-       else if (!mutex_trylock(&bc->lock))
-               return -1;
-
+       mutex_lock(&bc->lock);
        srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
        flags = memalloc_nofs_save();
 
@@ -869,7 +882,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
        struct bkey_cached *ck, *n;
        struct rhash_head *pos;
        unsigned i;
+#ifdef __KERNEL__
        int cpu;
+#endif
 
        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);
@@ -886,6 +901,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
                        }
        rcu_read_unlock();
 
+#ifdef __KERNEL__
        for_each_possible_cpu(cpu) {
                struct btree_key_cache_freelist *f =
                        per_cpu_ptr(bc->pcpu_freed, cpu);
@@ -895,6 +911,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
                        list_add(&ck->list, &bc->freed_nonpcpu);
                }
        }
+#endif
 
        list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
 
@@ -910,10 +927,15 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
                kmem_cache_free(bch2_key_cache, ck);
        }
 
-       BUG_ON(atomic_long_read(&bc->nr_dirty) &&
-              !bch2_journal_error(&c->journal) &&
-              test_bit(BCH_FS_WAS_RW, &c->flags));
-       BUG_ON(atomic_long_read(&bc->nr_keys));
+       if (atomic_long_read(&bc->nr_dirty) &&
+           !bch2_journal_error(&c->journal) &&
+           test_bit(BCH_FS_WAS_RW, &c->flags))
+               panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
+                     atomic_long_read(&bc->nr_dirty));
+
+       if (atomic_long_read(&bc->nr_keys))
+               panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
+                     atomic_long_read(&bc->nr_keys));
 
        mutex_unlock(&bc->lock);
 
@@ -943,9 +965,11 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
        struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
        int ret;
 
+#ifdef __KERNEL__
        bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
        if (!bc->pcpu_freed)
                return -ENOMEM;
+#endif
 
        ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params);
        if (ret)
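
The new #ifdef __KERNEL__ blocks above exist because bcachefs-tools compiles this file in userspace, where the kernel's percpu freelist machinery is unavailable, so those builds take the mutex-protected freed_nonpcpu path unconditionally. A minimal sketch of the shape of that compile-time split (illustrative types, not the kernel's):

#include <stdbool.h>
#include <stddef.h>

struct freelist { size_t nr; };

static bool freelist_try_get(struct freelist *shared)
{
#ifdef __KERNEL__
	/* kernel build: would consult a this_cpu_ptr() freelist first,
	 * keeping the shared lock off the fast path */
#endif
	/* userspace build (bcachefs-tools): only the shared list,
	 * which the real code protects with bc->lock */
	if (shared->nr) {
		shared->nr--;
		return true;
	}
	return false;
}

int main(void)
{
	struct freelist fl = { .nr = 1 };

	return freelist_try_get(&fl) ? 0 : 1;
}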
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index 9a525d34c7f46a28a8f59291a70435437fe6a50b..93a6ebed3aba25ebf2873845d02d9b5da0510a3d 100644 (file)
@@ -94,6 +94,37 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
        prt_newline(out);
 }
 
+static void lock_graph_up(struct lock_graph *g)
+{
+       closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+       closure_get(&trans->ref);
+
+       g->g[g->nr++] = (struct trans_waiting_for_lock) {
+               .trans          = trans,
+               .node_want      = trans->locking,
+               .lock_want      = trans->locking_wait.lock_want,
+       };
+}
+
+static bool lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+       struct trans_waiting_for_lock *i;
+
+       for (i = g->g + 1; i < g->g + g->nr; i++)
+               if (i->trans->locking != i->node_want ||
+                   i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+                       while (g->g + g->nr > i)
+                               lock_graph_up(g);
+                       return true;
+               }
+
+       return false;
+}
+
 static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
 {
        if (i == g->g) {
@@ -106,40 +137,42 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
        }
 }
 
-static noinline int break_cycle(struct lock_graph *g)
+static int btree_trans_abort_preference(struct btree_trans *trans)
 {
-       struct trans_waiting_for_lock *i;
-
-       /*
-        * We'd like to prioritize aborting transactions that have done less
-        * work - but it appears breaking cycles by telling other transactions
-        * to abort may still be buggy:
-        */
-#if 0
-       for (i = g->g; i < g->g + g->nr; i++) {
-               if (i->trans->lock_may_not_fail ||
-                   i->trans->locking_wait.lock_want == SIX_LOCK_write)
-                       continue;
+       if (trans->lock_may_not_fail)
+               return 0;
+       if (trans->locking_wait.lock_want == SIX_LOCK_write)
+               return 1;
+       if (!trans->in_traverse_all)
+               return 2;
+       return 3;
+}
 
-               return abort_lock(g, i);
-       }
+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
+{
+       struct trans_waiting_for_lock *i, *abort = NULL;
+       unsigned best = 0, pref;
+       int ret;
 
-       for (i = g->g; i < g->g + g->nr; i++) {
-               if (i->trans->lock_may_not_fail ||
-                   !i->trans->in_traverse_all)
-                       continue;
+       if (lock_graph_remove_non_waiters(g))
+               return 0;
 
-               return abort_lock(g, i);
+       /* Only checking, for debugfs: */
+       if (cycle) {
+               print_cycle(cycle, g);
+               ret = -1;
+               goto out;
        }
-#endif
-       for (i = g->g; i < g->g + g->nr; i++) {
-               if (i->trans->lock_may_not_fail)
-                       continue;
 
-               return abort_lock(g, i);
+       for (i = g->g; i < g->g + g->nr; i++) {
+               pref = btree_trans_abort_preference(i->trans);
+               if (pref > best) {
+                       abort = i;
+                       best = pref;
+               }
        }
 
-       {
+       if (unlikely(!best)) {
                struct bch_fs *c = g->g->trans->c;
                struct printbuf buf = PRINTBUF;
 
@@ -162,21 +195,13 @@ static noinline int break_cycle(struct lock_graph *g)
                printbuf_exit(&buf);
                BUG();
        }
-}
-
-static void lock_graph_pop(struct lock_graph *g)
-{
-       closure_put(&g->g[--g->nr].trans->ref);
-}
-
-static void lock_graph_pop_above(struct lock_graph *g, struct trans_waiting_for_lock *above,
-                                struct printbuf *cycle)
-{
-       if (g->nr > 1 && cycle)
-               print_chain(cycle, g);
 
-       while (g->g + g->nr > above)
-               lock_graph_pop(g);
+       ret = abort_lock(g, abort);
+out:
+       if (ret)
+               while (g->nr)
+                       lock_graph_up(g);
+       return ret;
 }
 
 static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
@@ -184,67 +209,23 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
 {
        struct btree_trans *orig_trans = g->g->trans;
        struct trans_waiting_for_lock *i;
-       int ret = 0;
-
-       for (i = g->g; i < g->g + g->nr; i++) {
-               if (i->trans->locking != i->node_want) {
-                       lock_graph_pop_above(g, i - 1, cycle);
-                       return 0;
-               }
 
-               if (i->trans == trans) {
-                       if (cycle) {
-                               /* Only checking: */
-                               print_cycle(cycle, g);
-                               ret = -1;
-                       } else {
-                               ret = break_cycle(g);
-                       }
-
-                       if (ret)
-                               goto deadlock;
-                       /*
-                        * If we didn't abort (instead telling another
-                        * transaction to abort), keep checking:
-                        */
-               }
-       }
+       for (i = g->g; i < g->g + g->nr; i++)
+               if (i->trans == trans)
+                       return break_cycle(g, cycle);
 
        if (g->nr == ARRAY_SIZE(g->g)) {
                if (orig_trans->lock_may_not_fail)
                        return 0;
 
+               while (g->nr)
+                       lock_graph_up(g);
                trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
-               ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
-               goto deadlock;
+               return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
        }
 
-       closure_get(&trans->ref);
-
-       g->g[g->nr++] = (struct trans_waiting_for_lock) {
-               .trans          = trans,
-               .node_want      = trans->locking,
-               .lock_want      = trans->locking_wait.lock_want,
-       };
-
+       lock_graph_down(g, trans);
        return 0;
-deadlock:
-       lock_graph_pop_above(g, g->g, cycle);
-       return ret;
-}
-
-static noinline void lock_graph_remove_non_waiters(struct lock_graph *g,
-                                                  struct printbuf *cycle)
-{
-       struct trans_waiting_for_lock *i;
-
-       for (i = g->g + 1; i < g->g + g->nr; i++)
-               if (i->trans->locking != i->node_want ||
-                   i->trans->locking_wait.start_time != i[-1].lock_start_time) {
-                       lock_graph_pop_above(g, i - 1, cycle);
-                       return;
-               }
-       BUG();
 }
 
 static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
@@ -266,8 +247,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
        }
 
        g.nr = 0;
-       ret = lock_graph_descend(&g, trans, cycle);
-       BUG_ON(ret);
+       lock_graph_down(&g, trans);
 next:
        if (!g.nr)
                return 0;
@@ -295,7 +275,7 @@ next:
                        b = &READ_ONCE(path->l[top->level].b)->c;
 
                        if (unlikely(IS_ERR_OR_NULL(b))) {
-                               lock_graph_remove_non_waiters(&g, cycle);
+                               BUG_ON(!lock_graph_remove_non_waiters(&g));
                                goto next;
                        }
 
@@ -321,7 +301,7 @@ next:
                                raw_spin_unlock(&b->lock.wait_lock);
 
                                if (ret)
-                                       return ret < 0 ? ret : 0;
+                                       return ret;
                                goto next;
 
                        }
@@ -331,7 +311,7 @@ next:
 
        if (g.nr > 1 && cycle)
                print_chain(cycle, &g);
-       lock_graph_pop(&g);
+       lock_graph_up(&g);
        goto next;
 }
 
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index d91b42bf1de15ad7516db4532dd78bd8590b15ad..bf8d1880673b224a09390e911dc50dfff03bb3b5 100644 (file)
@@ -88,7 +88,7 @@ static inline void mark_btree_node_locked(struct btree_trans *trans,
 {
        mark_btree_node_locked_noreset(path, level, type);
 #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-       path->l[level].lock_taken_time = ktime_get_ns();
+       path->l[level].lock_taken_time = local_clock();
 #endif
 }
 
@@ -120,7 +120,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
        if (s)
                __bch2_time_stats_update(&s->lock_hold_times,
                                         path->l[level].lock_taken_time,
-                                        ktime_get_ns());
+                                        local_clock());
 #endif
 }
 
@@ -260,7 +260,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
            btree_node_lock_increment(trans, b, level, type) ||
            !(ret = btree_node_lock_nopath(trans, b, type))) {
 #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
-               path->l[b->level].lock_taken_time = ktime_get_ns();
+               path->l[b->level].lock_taken_time = local_clock();
 #endif
        }
 
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index d3fa2d7ae1a2a860438563dfed81123300f2022c..dfe37965d5165c6d823b99a253a04b9f6870528c 100644 (file)
@@ -872,7 +872,9 @@ retry:
        while (1) {
                bch2_trans_begin(trans);
 
-               ret = bch2_get_next_backpointer(trans, bucket_pos, bucket.gen, &bp_offset, &bp);
+               ret = bch2_get_next_backpointer(trans, bucket_pos, bucket.gen,
+                                               &bp_offset, &bp,
+                                               BTREE_ITER_CACHED);
                if (ret)
                        break;
                if (bp_offset == U64_MAX)
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 53ffc684223cf30486693f615b6de27054de347b..e9dd1d13ec7e97d39acf2b2472092934b06508b5 100644 (file)
@@ -212,6 +212,11 @@ int bch2_link_trans(struct btree_trans *trans,
        if (ret)
                goto err;
 
+       if (bch2_reinherit_attrs(inode_u, dir_u)) {
+               ret = -EXDEV;
+               goto err;
+       }
+
        dir_u->bi_mtime = dir_u->bi_ctime = now;
 
        dir_hash = bch2_hash_info_init(c, dir_u);
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 2ea6e79f9f0ac146b3243f91d966dd485e0e9793..02ef3430a30b69f4d321d162f3b2d52b52fedfb4 100644 (file)
@@ -1527,7 +1527,7 @@ out:
        if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
                ret = bch2_page_state_set(c, inode_inum(inode), &page, 1);
                if (ret)
-                       goto out;
+                       goto err;
        }
 
        ret = bch2_page_reservation_get(c, inode, page, res,
@@ -3102,6 +3102,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
        inode_dio_wait(&inode->v);
        bch2_pagecache_block_get(&inode->ei_pagecache_lock);
 
+       ret = file_modified(file);
+       if (ret)
+               goto err;
+
        if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
                ret = bchfs_fallocate(inode, mode, offset, len);
        else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
@@ -3112,8 +3116,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
                ret = bchfs_fcollapse_finsert(inode, offset, len, false);
        else
                ret = -EOPNOTSUPP;
-
-
+err:
        bch2_pagecache_block_put(&inode->ei_pagecache_lock);
        inode_unlock(&inode->v);
        percpu_ref_put(&c->writes);
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index bab0707bc2f4f003395b2ece347f919b5979e64c..2bb680827b44763783f663e6c2a5b223852430e4 100644 (file)
@@ -26,6 +26,9 @@ struct flags_set {
        unsigned                flags;
 
        unsigned                projid;
+
+       bool                    set_projinherit;
+       bool                    projinherit;
 };
 
 static int bch2_inode_flags_set(struct bch_inode_info *inode,
@@ -50,6 +53,11 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
            (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
                return -EINVAL;
 
+       if (s->set_projinherit) {
+               bi->bi_fields_set &= ~(1 << Inode_opt_project);
+               bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
+       }
+
        bi->bi_flags &= ~s->mask;
        bi->bi_flags |= newflags;
 
@@ -107,6 +115,10 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
        struct fsxattr fa = { 0 };
 
        fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+
+       if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
+               fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
+
        fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
 
        return copy_to_user(arg, &fa, sizeof(fa));
@@ -138,6 +150,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
        if (copy_from_user(&fa, arg, sizeof(fa)))
                return -EFAULT;
 
+       s.set_projinherit = true;
+       s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
+       fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
+
        s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
        if (fa.fsx_xflags)
                return -EOPNOTSUPP;
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 57e6e21896e1e41c76b0e7d144da5d52765b8589..bf82737d4ec2deb7927a650bb904c2698942cd5f 100644 (file)
@@ -419,7 +419,7 @@ static int bch2_mknod(struct user_namespace *mnt_userns,
                              (subvol_inum) { 0 }, 0);
 
        if (IS_ERR(inode))
-               return PTR_ERR(inode);
+               return bch2_err_class(PTR_ERR(inode));
 
        d_instantiate(dentry, &inode->v);
        return 0;
@@ -529,7 +529,7 @@ static int bch2_symlink(struct user_namespace *mnt_userns,
        inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
        if (unlikely(IS_ERR(inode)))
-               return PTR_ERR(inode);
+               return bch2_err_class(PTR_ERR(inode));
 
        inode_lock(&inode->v);
        ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
@@ -838,7 +838,7 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 
        if (IS_ERR(inode))
-               return PTR_ERR(inode);
+               return bch2_err_class(PTR_ERR(inode));
 
        d_mark_tmpfile(dentry, &inode->v);
        d_instantiate(dentry, &inode->v);
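
All three fs.c hunks above fix the same bug: __bch2_create() can return bcachefs-private error codes, which must be collapsed by bch2_err_class() to standard errnos before being handed to the VFS. A hedged sketch of the idea; the code value and helper below are invented for illustration, not bcachefs's real ones:

#include <errno.h>

/* hypothetical private error code, outside the errno range */
#define ERR_PRIVATE_BASE	2048
#define ERR_NOSPC_SB_QUOTA	(ERR_PRIVATE_BASE + 1)

/* collapse a private code to the public errno class it belongs to,
 * at the boundary where generic code takes over */
static int err_class(int err)
{
	return err == -ERR_NOSPC_SB_QUOTA ? -ENOSPC : err;
}

int main(void)
{
	return err_class(-ERR_NOSPC_SB_QUOTA) == -ENOSPC ? 0 : 1;
}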
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 253a6ae201598aa9198c93bdf3514241279589cb..68113a08f1b9c4a42cab61b57277c9fb4bafe537 100644 (file)
 
 #include <trace/events/bcachefs.h>
 
+static struct nonce journal_nonce(const struct jset *jset)
+{
+       return (struct nonce) {{
+               [0] = 0,
+               [1] = ((__le32 *) &jset->seq)[0],
+               [2] = ((__le32 *) &jset->seq)[1],
+               [3] = BCH_NONCE_JOURNAL,
+       }};
+}
+
+static bool jset_csum_good(struct bch_fs *c, struct jset *j)
+{
+       return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
+               !bch2_crc_cmp(j->csum,
+                             csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
+}
+
 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
 {
        return (seq - c->journal_entries_base_seq) & (~0U >> 1);
@@ -59,8 +76,7 @@ struct journal_list {
  */
 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
                             struct journal_ptr entry_ptr,
-                            struct journal_list *jlist, struct jset *j,
-                            bool bad)
+                            struct journal_list *jlist, struct jset *j)
 {
        struct genradix_iter iter;
        struct journal_replay **_i, *i, *dup;
@@ -111,38 +127,53 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
         */
        dup = *_i;
        if (dup) {
-               if (dup->bad) {
-                       /* we'll replace @dup: */
-               } else if (bad) {
+               if (bytes == vstruct_bytes(&dup->j) &&
+                   !memcmp(j, &dup->j, bytes)) {
                        i = dup;
                        goto found;
-               } else {
-                       fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
-                                   memcmp(j, &dup->j, bytes), c,
-                                   "found duplicate but non identical journal entries (seq %llu)",
-                                   le64_to_cpu(j->seq));
+               }
+
+               if (!entry_ptr.csum_good) {
                        i = dup;
                        goto found;
                }
-       }
 
+               if (!dup->csum_good)
+                       goto replace;
+
+               fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
+                        le64_to_cpu(j->seq));
+               i = dup;
+               goto found;
+       }
+replace:
        i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
        if (!i)
                return -ENOMEM;
 
-       i->nr_ptrs       = 0;
-       i->bad          = bad;
+       i->nr_ptrs      = 0;
+       i->csum_good    = entry_ptr.csum_good;
        i->ignore       = false;
        memcpy(&i->j, j, bytes);
+       i->ptrs[i->nr_ptrs++] = entry_ptr;
 
        if (dup) {
-               i->nr_ptrs = dup->nr_ptrs;
-               memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
+               if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
+                       bch_err(c, "found too many copies of journal entry %llu",
+                               le64_to_cpu(i->j.seq));
+                       dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
+               }
+
+               /* The first ptr should represent the jset we kept: */
+               memcpy(i->ptrs + i->nr_ptrs,
+                      dup->ptrs,
+                      sizeof(dup->ptrs[0]) * dup->nr_ptrs);
+               i->nr_ptrs += dup->nr_ptrs;
                __journal_replay_free(c, dup);
        }
 
-
        *_i = i;
+       return 0;
 found:
        for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
                if (ptr->dev == ca->dev_idx) {
@@ -164,16 +195,6 @@ fsck_err:
        return ret;
 }
 
-static struct nonce journal_nonce(const struct jset *jset)
-{
-       return (struct nonce) {{
-               [0] = 0,
-               [1] = ((__le32 *) &jset->seq)[0],
-               [2] = ((__le32 *) &jset->seq)[1],
-               [3] = BCH_NONCE_JOURNAL,
-       }};
-}
-
 /* this fills in a range with empty jset_entries: */
 static void journal_entry_null_range(void *start, void *end)
 {
@@ -715,12 +736,8 @@ fsck_err:
 static int jset_validate(struct bch_fs *c,
                         struct bch_dev *ca,
                         struct jset *jset, u64 sector,
-                        unsigned bucket_sectors_left,
-                        unsigned sectors_read,
                         int write)
 {
-       size_t bytes = vstruct_bytes(jset);
-       struct bch_csum csum;
        unsigned version;
        int ret = 0;
 
@@ -737,21 +754,7 @@ static int jset_validate(struct bch_fs *c,
                        sector, le64_to_cpu(jset->seq),
                        version)) {
                /* don't try to continue: */
-               return EINVAL;
-       }
-
-       if (bytes > (sectors_read << 9) &&
-           sectors_read < bucket_sectors_left)
-               return JOURNAL_ENTRY_REREAD;
-
-       if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
-                                c, jset, NULL,
-                       "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
-                       ca ? ca->name : c->name,
-                       sector, le64_to_cpu(jset->seq), bytes)) {
-               ret = JOURNAL_ENTRY_BAD;
-               le32_add_cpu(&jset->u64s,
-                            -((bytes - (bucket_sectors_left << 9)) / 8));
+               return -EINVAL;
        }
 
        if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
@@ -759,28 +762,9 @@ static int jset_validate(struct bch_fs *c,
                        "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
-                       JSET_CSUM_TYPE(jset))) {
-               ret = JOURNAL_ENTRY_BAD;
-               goto csum_done;
-       }
-
-       if (write)
-               goto csum_done;
-
-       csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
-       if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum),
-                                c, jset, NULL,
-                                "%s sector %llu seq %llu: journal checksum bad",
-                                ca ? ca->name : c->name,
-                                sector, le64_to_cpu(jset->seq)))
+                       JSET_CSUM_TYPE(jset)))
                ret = JOURNAL_ENTRY_BAD;
 
-       ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
-                    jset->encrypted_start,
-                    vstruct_end(jset) - (void *) jset->encrypted_start);
-       bch2_fs_fatal_err_on(ret, c,
-                       "error decrypting journal entry: %i", ret);
-csum_done:
        /* last_seq is ignored when JSET_NO_FLUSH is true */
        if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
                                 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
@@ -791,16 +775,52 @@ csum_done:
                jset->last_seq = jset->seq;
                return JOURNAL_ENTRY_BAD;
        }
+
+       ret = jset_validate_entries(c, jset, write);
 fsck_err:
        return ret;
 }
 
-static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
+static int jset_validate_early(struct bch_fs *c,
+                        struct bch_dev *ca,
+                        struct jset *jset, u64 sector,
+                        unsigned bucket_sectors_left,
+                        unsigned sectors_read)
 {
-       unsigned sectors = vstruct_sectors(jset, c->block_bits);
+       size_t bytes = vstruct_bytes(jset);
+       unsigned version;
+       int write = READ;
+       int ret = 0;
+
+       if (le64_to_cpu(jset->magic) != jset_magic(c))
+               return JOURNAL_ENTRY_NONE;
+
+       version = le32_to_cpu(jset->version);
+       if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
+                                 version < bcachefs_metadata_version_min) ||
+                                version >= bcachefs_metadata_version_max,
+                                c, jset, NULL,
+                       "%s sector %llu seq %llu: unknown journal entry version %u",
+                       ca ? ca->name : c->name,
+                       sector, le64_to_cpu(jset->seq),
+                       version)) {
+               /* don't try to continue: */
+               return -EINVAL;
+       }
 
-       return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
-               jset_validate_entries(c, jset, WRITE);
+       if (bytes > (sectors_read << 9) &&
+           sectors_read < bucket_sectors_left)
+               return JOURNAL_ENTRY_REREAD;
+
+       if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
+                                c, jset, NULL,
+                       "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+                       ca ? ca->name : c->name,
+                       sector, le64_to_cpu(jset->seq), bytes))
+               le32_add_cpu(&jset->u64s,
+                            -((bytes - (bucket_sectors_left << 9)) / 8));
+fsck_err:
+       return ret;
 }
 
 struct journal_read_buf {
@@ -839,7 +859,7 @@ static int journal_read_bucket(struct bch_dev *ca,
        unsigned sectors, sectors_read = 0;
        u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
            end = offset + ca->mi.bucket_size;
-       bool saw_bad = false;
+       bool saw_bad = false, csum_good;
        int ret = 0;
 
        pr_debug("reading %u", bucket);
@@ -878,9 +898,8 @@ reread:
                        j = buf->data;
                }
 
-               ret = jset_validate(c, ca, j, offset,
-                                   end - offset, sectors_read,
-                                   READ);
+               ret = jset_validate_early(c, ca, j, offset,
+                                   end - offset, sectors_read);
                switch (ret) {
                case 0:
                        sectors = vstruct_sectors(j, c->block_bits);
@@ -896,17 +915,13 @@ reread:
                case JOURNAL_ENTRY_NONE:
                        if (!saw_bad)
                                return 0;
-                       sectors = block_sectors(c);
-                       goto next_block;
-               case JOURNAL_ENTRY_BAD:
-                       saw_bad = true;
                        /*
                         * On checksum error we don't really trust the size
                         * field of the journal entry we read, so try reading
                         * again at next block boundary:
                         */
                        sectors = block_sectors(c);
-                       break;
+                       goto next_block;
                default:
                        return ret;
                }
@@ -922,14 +937,25 @@ reread:
 
                ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
+               csum_good = jset_csum_good(c, j);
+               if (!csum_good)
+                       saw_bad = true;
+
+               ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+                            j->encrypted_start,
+                            vstruct_end(j) - (void *) j->encrypted_start);
+               bch2_fs_fatal_err_on(ret, c,
+                               "error decrypting journal entry: %i", ret);
+
                mutex_lock(&jlist->lock);
                ret = journal_entry_add(c, ca, (struct journal_ptr) {
+                                       .csum_good      = csum_good,
                                        .dev            = ca->dev_idx,
                                        .bucket         = bucket,
                                        .bucket_offset  = offset -
                                                bucket_to_sector(ca, ja->buckets[bucket]),
                                        .sector         = offset,
-                                       }, jlist, j, ret != 0);
+                                       }, jlist, j);
                mutex_unlock(&jlist->lock);
 
                switch (ret) {
@@ -1128,6 +1154,19 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
                        *start_seq = le64_to_cpu(i->j.seq) + 1;
 
                if (!JSET_NO_FLUSH(&i->j)) {
+                       int write = READ;
+                       if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
+                                                c, &i->j, NULL,
+                                                "invalid journal entry: last_seq > seq (%llu > %llu)",
+                                                le64_to_cpu(i->j.last_seq),
+                                                le64_to_cpu(i->j.seq)))
+                               i->j.last_seq = i->j.seq;
+
+                       pr_info("last flush %llu-%llu csum good %u",
+                               le64_to_cpu(i->j.last_seq),
+                               le64_to_cpu(i->j.seq),
+                               i->csum_good);
+
                        last_seq        = le64_to_cpu(i->j.last_seq);
                        *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
                        break;
@@ -1231,7 +1270,21 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
                if (!i || i->ignore)
                        continue;
 
-               ret = jset_validate_entries(c, &i->j, READ);
+               for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+
+                       if (!i->ptrs[ptr].csum_good)
+                               printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n",
+                                      ca->name, i->ptrs[ptr].sector,
+                                      le64_to_cpu(i->j.seq),
+                                      i->csum_good ? " (had good copy on another device)" : "");
+               }
+
+               ret = jset_validate(c,
+                                   bch_dev_bkey_exists(c, i->ptrs[0].dev),
+                                   &i->j,
+                                   i->ptrs[0].sector,
+                                   READ);
                if (ret)
                        goto err;
 
@@ -1667,7 +1720,7 @@ void bch2_journal_write(struct closure *cl)
                validate_before_checksum = true;
 
        if (validate_before_checksum &&
-           jset_validate_for_write(c, jset))
+           jset_validate(c, NULL, jset, 0, WRITE))
                goto err;
 
        ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
@@ -1681,7 +1734,7 @@ void bch2_journal_write(struct closure *cl)
                                  journal_nonce(jset), jset);
 
        if (!validate_before_checksum &&
-           jset_validate_for_write(c, jset))
+           jset_validate(c, NULL, jset, 0, WRITE))
                goto err;
 
        sectors = vstruct_sectors(jset, c->block_bits);
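
The reworked journal_entry_add() earlier in this file's diff resolves duplicate journal entries, i.e. the same seq read from several devices: byte-identical copies are merged, a candidate with a bad checksum loses to whatever is already held, and a good candidate replaces a bad incumbent; two good but differing copies are an fsck error. A condensed sketch of that decision, with a simplified struct:

#include <stdbool.h>
#include <string.h>

struct copy {
	bool		csum_good;
	size_t		bytes;
	const void	*data;
};

/* true if candidate @cand should replace the already-held @dup */
static bool should_replace(const struct copy *dup, const struct copy *cand)
{
	if (cand->bytes == dup->bytes &&
	    !memcmp(cand->data, dup->data, cand->bytes))
		return false;	/* identical: just record another pointer */
	if (!cand->csum_good)
		return false;	/* keep what we already have */
	if (!dup->csum_good)
		return true;	/* verified copy beats unverified one */
	return false;		/* both good but different: fsck error */
}

int main(void)
{
	struct copy held = { .csum_good = false, .bytes = 8, .data = "AAAAAAA" };
	struct copy cand = { .csum_good = true,  .bytes = 8, .data = "BBBBBBB" };

	return should_replace(&held, &cand) ? 0 : 1;
}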
diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h
index 1a91f2c0a26ce1abf14def7555b1428994cd9587..2f8bbf06b28951301d8c0469eed1e0f934adf18b 100644 (file)
@@ -8,6 +8,7 @@
  */
 struct journal_replay {
        struct journal_ptr {
+               bool            csum_good;
                u8              dev;
                u32             bucket;
                u32             bucket_offset;
@@ -15,8 +16,7 @@ struct journal_replay {
        }                       ptrs[BCH_REPLICAS_MAX];
        unsigned                nr_ptrs;
 
-       /* checksum error, but we may want to try using it anyways: */
-       bool                    bad;
+       bool                    csum_good;
        bool                    ignore;
        /* must be last: */
        struct jset             j;
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 4f4dfaa7bfb7cc30084a63fbfa5049e9f09229c3..55fdacad9b9818e8cadf2173f8b57de0fdab69d9 100644 (file)
@@ -628,7 +628,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                bch2_trans_begin(&trans);
 
                ret = bch2_get_next_backpointer(&trans, bucket, gen,
-                                               &bp_offset, &bp);
+                                               &bp_offset, &bp,
+                                               BTREE_ITER_CACHED);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 35958c6bb4a6d3278ec6c81b233bcedf4fdcfe72..044eca879afced62016f63076c6fe47f39eba069 100644 (file)
@@ -162,7 +162,7 @@ static int bch2_copygc(struct bch_fs *c)
 
        bch2_moving_ctxt_exit(&ctxt);
 
-       if (ret < 0)
+       if (ret < 0 && ret != -EROFS)
                bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
 
        trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0);
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index c12d715fb75813cefad722dcb9005daccd3c3e5b..ad7130a1469167ea5e165ebd0d8806e82dd96dd6 100644 (file)
@@ -95,6 +95,113 @@ void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
 #include <linux/fs.h>
 #include <linux/quota.h>
 
+static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
+{
+       printbuf_tabstops_reset(out);
+       printbuf_tabstop_push(out, 20);
+
+       prt_str(out, "i_fieldmask");
+       prt_tab(out);
+       prt_printf(out, "%x", i->i_fieldmask);
+       prt_newline(out);
+
+       prt_str(out, "i_flags");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_flags);
+       prt_newline(out);
+
+       prt_str(out, "i_spc_timelimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_spc_timelimit);
+       prt_newline(out);
+
+       prt_str(out, "i_ino_timelimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_ino_timelimit);
+       prt_newline(out);
+
+       prt_str(out, "i_rt_spc_timelimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_rt_spc_timelimit);
+       prt_newline(out);
+
+       prt_str(out, "i_spc_warnlimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_spc_warnlimit);
+       prt_newline(out);
+
+       prt_str(out, "i_ino_warnlimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_ino_warnlimit);
+       prt_newline(out);
+
+       prt_str(out, "i_rt_spc_warnlimit");
+       prt_tab(out);
+       prt_printf(out, "%u", i->i_rt_spc_warnlimit);
+       prt_newline(out);
+}
+
+static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
+{
+       printbuf_tabstops_reset(out);
+       printbuf_tabstop_push(out, 20);
+
+       prt_str(out, "d_fieldmask");
+       prt_tab(out);
+       prt_printf(out, "%x", q->d_fieldmask);
+       prt_newline(out);
+
+       prt_str(out, "d_spc_hardlimit");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_spc_hardlimit);
+       prt_newline(out);
+
+       prt_str(out, "d_spc_softlimit");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_spc_softlimit);
+       prt_newline(out);
+
+       prt_str(out, "d_ino_hardlimit");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_ino_hardlimit);
+       prt_newline(out);
+
+       prt_str(out, "d_ino_softlimit");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_ino_softlimit);
+       prt_newline(out);
+
+       prt_str(out, "d_space");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_space);
+       prt_newline(out);
+
+       prt_str(out, "d_ino_count");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_ino_count);
+       prt_newline(out);
+
+       prt_str(out, "d_ino_timer");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_ino_timer);
+       prt_newline(out);
+
+       prt_str(out, "d_spc_timer");
+       prt_tab(out);
+       prt_printf(out, "%llu", q->d_spc_timer);
+       prt_newline(out);
+
+       prt_str(out, "d_ino_warns");
+       prt_tab(out);
+       prt_printf(out, "%i", q->d_ino_warns);
+       prt_newline(out);
+
+       prt_str(out, "d_spc_warns");
+       prt_tab(out);
+       prt_printf(out, "%i", q->d_spc_warns);
+       prt_newline(out);
+}
+
 static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
 {
        qtypes >>= i;
@@ -413,6 +520,26 @@ void bch2_fs_quota_init(struct bch_fs *c)
                mutex_init(&c->quotas[i].lock);
 }
 
+static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
+{
+       struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb);
+
+       if (sb_quota)
+               return sb_quota;
+
+       sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64));
+       if (sb_quota) {
+               unsigned qtype, qc;
+
+               for (qtype = 0; qtype < QTYP_NR; qtype++)
+                       for (qc = 0; qc < Q_COUNTERS; qc++)
+                               sb_quota->q[qtype].c[qc].timelimit =
+                                       cpu_to_le32(7 * 24 * 60 * 60);
+       }
+
+       return sb_quota;
+}
+
 static void bch2_sb_quota_read(struct bch_fs *c)
 {
        struct bch_sb_field_quota *sb_quota;
@@ -471,12 +598,19 @@ advance:
 
 int bch2_fs_quota_read(struct bch_fs *c)
 {
+       struct bch_sb_field_quota *sb_quota;
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
        mutex_lock(&c->sb_lock);
+       sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+       if (!sb_quota) {
+               mutex_unlock(&c->sb_lock);
+               return -BCH_ERR_ENOSPC_sb_quota;
+       }
+
        bch2_sb_quota_read(c);
        mutex_unlock(&c->sb_lock);
 
@@ -500,6 +634,8 @@ int bch2_fs_quota_read(struct bch_fs *c)
 static int bch2_quota_enable(struct super_block        *sb, unsigned uflags)
 {
        struct bch_fs *c = sb->s_fs_info;
+       struct bch_sb_field_quota *sb_quota;
+       int ret = 0;
 
        if (sb->s_flags & SB_RDONLY)
                return -EROFS;
@@ -519,6 +655,12 @@ static int bch2_quota_enable(struct super_block    *sb, unsigned uflags)
                return -EINVAL;
 
        mutex_lock(&c->sb_lock);
+       sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+       if (!sb_quota) {
+               ret = -BCH_ERR_ENOSPC_sb_quota;
+               goto unlock;
+       }
+
        if (uflags & FS_QUOTA_UDQ_ENFD)
                SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
 
@@ -529,9 +671,10 @@ static int bch2_quota_enable(struct super_block    *sb, unsigned uflags)
                SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
 
        bch2_write_super(c);
+unlock:
        mutex_unlock(&c->sb_lock);
 
-       return 0;
+       return bch2_err_class(ret);
 }
 
 static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
@@ -643,6 +786,15 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
        struct bch_fs *c = sb->s_fs_info;
        struct bch_sb_field_quota *sb_quota;
        struct bch_memquota_type *q;
+       int ret = 0;
+
+       if (0) {
+               struct printbuf buf = PRINTBUF;
+
+               qc_info_to_text(&buf, info);
+               pr_info("setting:\n%s", buf.buf);
+               printbuf_exit(&buf);
+       }
 
        if (sb->s_flags & SB_RDONLY)
                return -EROFS;
@@ -660,12 +812,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
        q = &c->quotas[type];
 
        mutex_lock(&c->sb_lock);
-       sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+       sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
        if (!sb_quota) {
-               sb_quota = bch2_sb_resize_quota(&c->disk_sb,
-                                       sizeof(*sb_quota) / sizeof(u64));
-               if (!sb_quota)
-                       return -BCH_ERR_ENOSPC_sb_quota;
+               ret = -BCH_ERR_ENOSPC_sb_quota;
+               goto unlock;
        }
 
        if (info->i_fieldmask & QC_SPC_TIMER)
@@ -687,9 +837,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
        bch2_sb_quota_read(c);
 
        bch2_write_super(c);
+unlock:
        mutex_unlock(&c->sb_lock);
 
-       return 0;
+       return bch2_err_class(ret);
 }
 
 /* Get/set individual quotas: */
@@ -794,6 +945,14 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
        struct bkey_i_quota new_quota;
        int ret;
 
+       if (0) {
+               struct printbuf buf = PRINTBUF;
+
+               qc_dqblk_to_text(&buf, qdq);
+               pr_info("setting:\n%s", buf.buf);
+               printbuf_exit(&buf);
+       }
+
        if (sb->s_flags & SB_RDONLY)
                return -EROFS;
 
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 9df08289a004069845468805bfddaa861001514a..3f674bf061ff40554e38c5eb4666a4764be750fa 100644 (file)
@@ -895,6 +895,12 @@ int bch2_fs_start(struct bch_fs *c)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
+       for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
+               mutex_lock(&c->btree_transaction_stats[i].lock);
+               bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
+               mutex_unlock(&c->btree_transaction_stats[i].lock);
+       }
+
        ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
                ? bch2_fs_recovery(c)
                : bch2_fs_initialize(c);
index d19193508c6dc3cdfdaded288a805c203357fd2e..f08215af359f83766132965f7178f64875400742 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/sched/clock.h>
+#include <linux/mean_and_variance.h>
 
 #include "eytzinger.h"
 #include "util.h"
@@ -323,38 +324,44 @@ static void bch2_time_stats_update_one(struct time_stats *stats,
 {
        u64 duration, freq;
 
-       duration        = time_after64(end, start)
-               ? end - start : 0;
-       freq            = time_after64(end, stats->last_event)
-               ? end - stats->last_event : 0;
-
-       stats->count++;
-
-       stats->average_duration = stats->average_duration
-               ? ewma_add(stats->average_duration, duration, 6)
-               : duration;
-
-       stats->average_frequency = stats->average_frequency
-               ? ewma_add(stats->average_frequency, freq, 6)
-               : freq;
-
-       stats->max_duration = max(stats->max_duration, duration);
-
-       stats->last_event = end;
+       if (time_after64(end, start)) {
+               duration = end - start;
+               stats->duration_stats = mean_and_variance_update(stats->duration_stats,
+                                                                duration);
+               stats->duration_stats_weighted = mean_and_variance_weighted_update(
+                       stats->duration_stats_weighted,
+                       duration);
+               stats->max_duration = max(stats->max_duration, duration);
+               stats->min_duration = min(stats->min_duration, duration);
+               bch2_quantiles_update(&stats->quantiles, duration);
+       }
 
-       bch2_quantiles_update(&stats->quantiles, duration);
+       if (time_after64(end, stats->last_event)) {
+               freq = end - stats->last_event;
+               stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq);
+               stats->freq_stats_weighted = mean_and_variance_weighted_update(
+                       stats->freq_stats_weighted,
+                       freq);
+               stats->max_freq = max(stats->max_freq, freq);
+               stats->min_freq = min(stats->min_freq, freq);
+               stats->last_event = end;
+       }
 }
 
 void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
 {
        unsigned long flags;
 
+       WARN_RATELIMIT(!stats->min_duration || !stats->min_freq,
+                      "time_stats: min_duration = %llu, min_freq = %llu",
+                      stats->min_duration, stats->min_freq);
+
        if (!stats->buffer) {
                spin_lock_irqsave(&stats->lock, flags);
                bch2_time_stats_update_one(stats, start, end);
 
-               if (stats->average_frequency < 32 &&
-                   stats->count > 1024)
+               if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
+                   stats->duration_stats.n > 1024)
                        stats->buffer =
                                alloc_percpu_gfp(struct time_stat_buffer,
                                                 GFP_ATOMIC);
@@ -389,12 +396,15 @@ void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
 
 static const struct time_unit {
        const char      *name;
-       u32             nsecs;
+       u64             nsecs;
 } time_units[] = {
-       { "ns",         1               },
-       { "us",         NSEC_PER_USEC   },
-       { "ms",         NSEC_PER_MSEC   },
-       { "sec",        NSEC_PER_SEC    },
+       { "ns",         1                },
+       { "us",         NSEC_PER_USEC    },
+       { "ms",         NSEC_PER_MSEC    },
+       { "s",          NSEC_PER_SEC     },
+       { "m",          NSEC_PER_SEC * 60},
+       { "h",          NSEC_PER_SEC * 3600},
+       { "eon",        U64_MAX          },
 };
 
 static const struct time_unit *pick_time_units(u64 ns)
@@ -414,38 +424,117 @@ static void pr_time_units(struct printbuf *out, u64 ns)
 {
        const struct time_unit *u = pick_time_units(ns);
 
-       prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+       prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
+       prt_tab_rjust(out);
+       prt_printf(out, "%s", u->name);
+}
+
+#define TABSTOP_SIZE 12
+
+static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
+{
+       prt_printf(out, "%s", name);
+       prt_tab(out);
+       pr_time_units(out, ns);
+       prt_newline(out);
 }
 
 void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
 {
        const struct time_unit *u;
-       u64 freq = READ_ONCE(stats->average_frequency);
-       u64 q, last_q = 0;
+       s64 f_mean = 0, d_mean = 0;
+       u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
        int i;
+       /* avoid divide by zero */
+       if (stats->freq_stats.n) {
+               f_mean = mean_and_variance_get_mean(stats->freq_stats);
+               f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+               d_mean = mean_and_variance_get_mean(stats->duration_stats);
+               d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+       }
 
-       prt_printf(out, "count:\t\t%llu",
-                        stats->count);
+       printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
+       prt_printf(out, "count:");
+       prt_tab(out);
+       prt_printf(out, "%llu ",
+                        stats->duration_stats.n);
+       printbuf_tabstop_pop(out);
        prt_newline(out);
-       prt_printf(out, "rate:\t\t%llu/sec",
-              freq ?  div64_u64(NSEC_PER_SEC, freq) : 0);
+
+       printbuf_tabstops_reset(out);
+
+       printbuf_tabstop_push(out, out->indent + 20);
+       printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+       printbuf_tabstop_push(out, 0);
+       printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+
+       prt_tab(out);
+       prt_printf(out, "since mount");
+       prt_tab_rjust(out);
+       prt_tab(out);
+       prt_printf(out, "recent");
+       prt_tab_rjust(out);
        prt_newline(out);
 
-       prt_printf(out, "frequency:\t");
-       pr_time_units(out, freq);
+       printbuf_tabstops_reset(out);
+       printbuf_tabstop_push(out, out->indent + 20);
+       printbuf_tabstop_push(out, TABSTOP_SIZE);
+       printbuf_tabstop_push(out, 2);
+       printbuf_tabstop_push(out, TABSTOP_SIZE);
 
+       prt_printf(out, "duration of events");
        prt_newline(out);
-       prt_printf(out, "avg duration:\t");
-       pr_time_units(out, stats->average_duration);
+       printbuf_indent_add(out, 2);
+
+       pr_name_and_units(out, "min:", stats->min_duration);
+       pr_name_and_units(out, "max:", stats->max_duration);
 
+       prt_printf(out, "mean:");
+       prt_tab(out);
+       pr_time_units(out, d_mean);
+       prt_tab(out);
+       pr_time_units(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
        prt_newline(out);
-       prt_printf(out, "max duration:\t");
-       pr_time_units(out, stats->max_duration);
+
+       prt_printf(out, "stddev:");
+       prt_tab(out);
+       pr_time_units(out, d_stddev);
+       prt_tab(out);
+       pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+
+       printbuf_indent_sub(out, 2);
+       prt_newline(out);
+
+       prt_printf(out, "time between events");
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       pr_name_and_units(out, "min:", stats->min_freq);
+       pr_name_and_units(out, "max:", stats->max_freq);
+
+       prt_printf(out, "mean:");
+       prt_tab(out);
+       pr_time_units(out, f_mean);
+       prt_tab(out);
+       pr_time_units(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+       prt_newline(out);
+
+       prt_printf(out, "stddev:");
+       prt_tab(out);
+       pr_time_units(out, f_stddev);
+       prt_tab(out);
+       pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+
+       printbuf_indent_sub(out, 2);
+       prt_newline(out);
+
+       printbuf_tabstops_reset(out);
 
        i = eytzinger0_first(NR_QUANTILES);
        u = pick_time_units(stats->quantiles.entries[i].m);
 
-       prt_newline(out);
        prt_printf(out, "quantiles (%s):\t", u->name);
        eytzinger0_for_each(i, NR_QUANTILES) {
                bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
@@ -467,6 +556,10 @@ void bch2_time_stats_exit(struct time_stats *stats)
 void bch2_time_stats_init(struct time_stats *stats)
 {
        memset(stats, 0, sizeof(*stats));
+       stats->duration_stats_weighted.w = 8;
+       stats->freq_stats_weighted.w = 8;
+       stats->min_duration = U64_MAX;
+       stats->min_freq = U64_MAX;
        spin_lock_init(&stats->lock);
 }
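+
+/*
+ * Illustrative usage of this API (a sketch, not part of the original
+ * patch); stats must be initialized via bch2_time_stats_init() before
+ * the first update:
+ *
+ *	struct time_stats stats;
+ *
+ *	bch2_time_stats_init(&stats);
+ *
+ *	u64 start = local_clock();
+ *	... timed section ...
+ *	__bch2_time_stats_update(&stats, start, local_clock());
+ *
+ *	struct printbuf buf = PRINTBUF;
+ *
+ *	bch2_time_stats_to_text(&buf, &stats);
+ *	printbuf_exit(&buf);
+ *	bch2_time_stats_exit(&stats);
+ */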
 
index a7f68e1755aa26d60fa3d98da2bc08d2d5a31ac4..846e6024a80b377c2eb81b92e2209d2166941123 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
+#include <linux/mean_and_variance.h>
 
 struct closure;
 
@@ -380,14 +381,18 @@ struct time_stat_buffer {
 
 struct time_stats {
        spinlock_t      lock;
-       u64             count;
        /* all fields are in nanoseconds */
-       u64             average_duration;
-       u64             average_frequency;
        u64             max_duration;
+       u64             min_duration;
+       u64             max_freq;
+       u64             min_freq;
        u64             last_event;
        struct quantiles quantiles;
 
+       struct mean_and_variance          duration_stats;
+       struct mean_and_variance_weighted duration_stats_weighted;
+       struct mean_and_variance          freq_stats;
+       struct mean_and_variance_weighted freq_stats_weighted;
        struct time_stat_buffer __percpu *buffer;
 };
 
index 6a5be6c9e1cac0a6c1888dd6e8f92ea109732e31..4fc1c3afab691065f4d6e3cc83394acdb6ec7a71 100644 (file)
@@ -371,8 +371,10 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
 {
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       int ret;
 
-       return bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+       ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+       return bch2_err_class(ret);
 }
 
 static int bch2_xattr_set_handler(const struct xattr_handler *handler,
@@ -384,11 +386,13 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+       int ret;
 
-       return bch2_trans_do(c, NULL, NULL, 0,
+       ret = bch2_trans_do(c, NULL, NULL, 0,
                        bch2_xattr_set(&trans, inode_inum(inode), &hash,
                                       name, value, size,
                                       handler->flags, flags));
+       return bch2_err_class(ret);
 }
 
 static const struct xattr_handler bch_xattr_user_handler = {
diff --git a/linux/int_sqrt.c b/linux/int_sqrt.c
new file mode 100644 (file)
index 0000000..a8170bb
--- /dev/null
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com>
+ *
+ *  Based on the shift-and-subtract algorithm for computing integer
+ *  square root from Guy L. Steele.
+ */
+
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+
+/**
+ * int_sqrt - computes the integer square root
+ * @x: integer of which to calculate the sqrt
+ *
+ * Computes: floor(sqrt(x))
+ */
+unsigned long int_sqrt(unsigned long x)
+{
+       unsigned long b, m, y = 0;
+
+       if (x <= 1)
+               return x;
+
+       m = 1UL << (__fls(x) & ~1UL);
+       while (m != 0) {
+               b = y + m;
+               y >>= 1;
+
+               if (x >= b) {
+                       x -= b;
+                       y += m;
+               }
+               m >>= 2;
+       }
+
+       return y;
+}
+EXPORT_SYMBOL(int_sqrt);
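+
+/*
+ * Worked example (illustrative): int_sqrt(36) == 6 and int_sqrt(35) == 5;
+ * the result is always floor(sqrt(x)), never rounded up.
+ */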
+
+#if BITS_PER_LONG < 64
+/**
+ * int_sqrt64 - strongly typed variant of int_sqrt(), for when a 64 bit
+ * input is expected.
+ * @x: 64bit integer of which to calculate the sqrt
+ */
+u32 int_sqrt64(u64 x)
+{
+       u64 b, m, y = 0;
+
+       if (x <= ULONG_MAX)
+               return int_sqrt((unsigned long) x);
+
+       m = 1ULL << ((fls64(x) - 1) & ~1ULL);
+       while (m != 0) {
+               b = y + m;
+               y >>= 1;
+
+               if (x >= b) {
+                       x -= b;
+                       y += m;
+               }
+               m >>= 2;
+       }
+
+       return y;
+}
+EXPORT_SYMBOL(int_sqrt64);
+#endif
diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c
new file mode 100644 (file)
index 0000000..643e311
--- /dev/null
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Functions for incremental mean and variance.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Copyright © 2022 Daniel B. Hill
+ *
+ * Author: Daniel B. Hill <daniel@gluo.nz>
+ *
+ * Description:
+ *
+ * This includes incremental algorithms for mean and variance calculation.
+ *
+ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ *
+ * Create a struct, and if it is the weighted variant set the w field (weight = 2^k).
+ *
+ * Use mean_and_variance[_weighted]_update() on the struct to update its state.
+ *
+ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and
+ * variance; some computation is deferred to these functions for performance reasons.
+ *
+ * See lib/math/mean_and_variance_test.c for examples of usage.
+ *
+ * DO NOT access the mean and variance fields of the weighted variants directly.
+ * DO NOT change the weight after calling update.
+ */
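+
+/*
+ * Usage sketch (illustrative only, mirroring the description above):
+ *
+ *	struct mean_and_variance mv = {};
+ *	struct mean_and_variance_weighted mvw = { .w = 8 };
+ *
+ *	mv  = mean_and_variance_update(mv, sample);
+ *	mvw = mean_and_variance_weighted_update(mvw, sample);
+ *
+ *	s64 mean   = mean_and_variance_get_mean(mv);
+ *	u32 stddev = mean_and_variance_get_stddev(mv);
+ */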
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+#include <linux/mean_and_variance.h>
+#include <linux/module.h>
+#include <linux/printbuf.h>
+
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator.
+ *
+ * note: this rounds towards 0.
+ */
+inline s64 fast_divpow2(s64 n, u8 d)
+{
+       return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
+}
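+
+/*
+ * e.g. fast_divpow2(-7, 1) == -3 (truncates towards zero, like C integer
+ * division), whereas a bare arithmetic shift -7 >> 1 == -4.
+ */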
+
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1
+ * and return it.
+ * @s1: the mean_and_variance to update.
+ * @v1: the new sample.
+ *
+ * see linked pdf equation 12.
+ */
+struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1)
+{
+       struct mean_and_variance s2;
+       u64 v2 = abs(v1);
+
+       s2.n           = s1.n + 1;
+       s2.sum         = s1.sum + v1;
+       s2.sum_squares = u128_add(s1.sum_squares, u128_square(v2));
+       return s2;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_update);
+
+/**
+ * mean_and_variance_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_get_mean(struct mean_and_variance s)
+{
+       return div64_s64(s.sum, s.n);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
+
+/**
+ * mean_and_variance_get_variance() -  get variance from @s1
+ *
+ * see linked pdf equation 12.
+ */
+u64 mean_and_variance_get_variance(struct mean_and_variance s1)
+{
+       u128 s2 = u128_div(s1.sum_squares, s1.n);
+       u64  s3 = abs(mean_and_variance_get_mean(s1));
+
+       return u128_to_u64(u128_sub(s2, u128_square(s3)));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
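+
+/*
+ * Worked example (illustrative): after updates with samples 2 and 4,
+ * n = 2, sum = 6 and sum_squares = 20, so the mean is 6/2 = 3 and the
+ * variance is 20/2 - 3^2 = 1.
+ */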
+
+/**
+ * mean_and_variance_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_get_stddev(struct mean_and_variance s)
+{
+       return int_sqrt64(mean_and_variance_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
+
+/**
+ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
+ * @s1: the mean_and_variance_weighted to update.
+ * @x:  the new sample.
+ *
+ * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
+ * values are stored bitshifted for performance and added precision.
+ */
+struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1,
+                                                                   s64 x)
+{
+       struct mean_and_variance_weighted s2;
+       // previous weighted variance.
+       u64 var_w0 = s1.variance;
+       u8 w = s2.w = s1.w;
+       // new value weighted.
+       s64 x_w = x << w;
+       s64 diff_w = x_w - s1.mean;
+       s64 diff = fast_divpow2(diff_w, w);
+       // new mean weighted.
+       s64 u_w1     = s1.mean + diff;
+
+       BUG_ON(w % 2 != 0);
+
+       if (!s1.init) {
+               s2.mean = x_w;
+               s2.variance = 0;
+       } else {
+               s2.mean = u_w1;
+               s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
+       }
+       s2.init = true;
+
+       return s2;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
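+
+/*
+ * e.g. with w = 2, the first sample 3 is stored bitshifted, as mean = 3 << 2 = 12;
+ * mean_and_variance_weighted_get_mean() shifts it back down to 3.
+ */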
+
+/**
+ * mean_and_variance_weighted_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+{
+       return fast_divpow2(s.mean, s.w);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
+
+/**
+ * mean_and_variance_weighted_get_variance() - get variance from @s
+ */
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+{
+       // always positive, no need for fast_divpow2()
+       return s.variance >> s.w;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
+
+/**
+ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+{
+       return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
index b11660af245b8fc281da7a3d3d94affb16a48de9..39f7ea79fdb17d121b5dfa3f80dffe5bf3bb041b 100644 (file)
@@ -148,6 +148,14 @@ static int __do_six_trylock_type(struct six_lock *lock,
                        atomic64_add(__SIX_VAL(write_locking, 1),
                                     &lock->state.counter);
                        smp_mb__after_atomic();
+               } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) {
+                       atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write),
+                                    &lock->state.counter);
+                       /*
+                        * pairs with barrier after unlock and before checking
+                        * for readers in unlock path
+                        */
+                       smp_mb__after_atomic();
                }
 
                ret = !pcpu_read_count(lock);
@@ -162,9 +170,6 @@ static int __do_six_trylock_type(struct six_lock *lock,
                if (ret || try)
                        v -= __SIX_VAL(write_locking, 1);
 
-               if (!ret && !try && !(lock->state.waiters & (1 << SIX_LOCK_write)))
-                       v += __SIX_VAL(waiters, 1 << SIX_LOCK_write);
-
                if (try && !ret) {
                        old.v = atomic64_add_return(v, &lock->state.counter);
                        if (old.waiters & (1 << SIX_LOCK_read))