Update bcachefs sources to ad68801b93 bcachefs: Use pcpu mode of six locks for interi...
author     Kent Overstreet <kent.overstreet@gmail.com>
           Thu, 25 Mar 2021 02:13:00 +0000 (22:13 -0400)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Thu, 25 Mar 2021 02:13:00 +0000 (22:13 -0400)
.bcachefs_revision
include/linux/six.h
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_update_interior.c
libbcachefs/extents.c
libbcachefs/move.c
linux/six.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index f045aac7b201495a24ccc6378d11a44e3a77ff0f..976139a36bdd3f422a9af7f0039c7d38b2600771 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-c7defb5793039b55066e8e9d41e76bae826a7894
+ad68801b939cdda0530f54cd07b3212e98fe1d75
diff --git a/include/linux/six.h b/include/linux/six.h
index a16e94f482e972fe7fdafbd2f82b55cbc41e0b2d..0e6df059341ff0c6c50680b8b9116905df70efd5 100644
--- a/include/linux/six.h
+++ b/include/linux/six.h
@@ -80,7 +80,8 @@ union six_lock_state {
        };
 
        struct {
-               unsigned        read_lock:28;
+               unsigned        read_lock:27;
+               unsigned        write_locking:1;
                unsigned        intent_lock:1;
                unsigned        waiters:3;
                /*
@@ -107,6 +108,7 @@ struct six_lock {
        unsigned                intent_lock_recurse;
        struct task_struct      *owner;
        struct optimistic_spin_queue osq;
+       unsigned __percpu       *readers;
 
        raw_spinlock_t          wait_lock;
        struct list_head        wait_list[2];
@@ -194,4 +196,7 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
 
 void six_lock_wakeup_all(struct six_lock *);
 
+void six_lock_pcpu_free(struct six_lock *);
+void six_lock_pcpu_alloc(struct six_lock *);
+
 #endif /* _LINUX_SIX_H */
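
A usage note on the two new functions declared above: they switch a six lock in
and out of percpu-reader mode at runtime. A minimal sketch of the intended call
pattern, modeled on the btree_cache.c hunks below (struct cache_node and its
helpers are illustrative names, not part of this commit):

	#include <linux/six.h>
	#include <linux/slab.h>

	struct cache_node {
		struct six_lock lock;
		unsigned        level;
	};

	static void set_node_level(struct cache_node *n, unsigned level)
	{
		n->level = level;

		/*
		 * Interior nodes (level > 0) are read-locked far more often
		 * than they are write-locked, so percpu-reader mode pays off
		 * there; leaves stay in plain atomic mode. Allocation failure
		 * is benign: lock->readers stays NULL and the lock simply
		 * keeps using atomic mode.
		 */
		if (level)
			six_lock_pcpu_alloc(&n->lock);
		else
			six_lock_pcpu_free(&n->lock);
	}

	static void destroy_node(struct cache_node *n)
	{
		/* the percpu counters must be released before the object: */
		six_lock_pcpu_free(&n->lock);
		kfree(n);
	}

Note that both calls assume the lock is not held: six_lock_pcpu_free() sums the
percpu counters and BUG()s if any readers remain (see linux/six.c at the end of
this diff).
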
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 89b3b50907bc84357b767ee15053de92cf202a76..fc76e78806f93f43cd21cdcbfde380305ae4a356 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -146,6 +146,11 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
        b->c.level      = level;
        b->c.btree_id   = id;
 
+       if (level)
+               six_lock_pcpu_alloc(&b->c.lock);
+       else
+               six_lock_pcpu_free(&b->c.lock);
+
        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
        if (!ret)
@@ -386,6 +391,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
        while (!list_empty(&bc->freed)) {
                b = list_first_entry(&bc->freed, struct btree, list);
                list_del(&b->list);
+               six_lock_pcpu_free(&b->c.lock);
                kfree(b);
        }
 
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 5b8ec9407ae0b0c91d8d66a72dc1a5249ce1a680..6d5ed774d226a2ed95f0ecee7c6a425438c2ee1d 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -167,7 +167,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
        const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
+       struct extent_ptr_decoded p = { 0 };
        bool do_update = false;
        int ret = 0;
 
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 4c0e3d7c8ddff3a30ea9eb235e637291e098e0b4..a661bc0cf98aad8ee227ae6e42efeb19769c2960 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -988,6 +988,11 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
        list_del_init(&b->list);
        mutex_unlock(&c->btree_cache.lock);
 
+       if (b->c.level)
+               six_lock_pcpu_alloc(&b->c.lock);
+       else
+               six_lock_pcpu_free(&b->c.lock);
+
        mutex_lock(&c->btree_root_lock);
        BUG_ON(btree_node_root(c, b) &&
               (b->c.level < btree_node_root(c, b)->c.level ||
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 3c9c36c82d29f256c321d53f2adb2e493ce804ae..a7e0408213a9506d8b6d16fc4059d0de68ac0d3e 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -632,7 +632,7 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
+       struct extent_ptr_decoded p = { 0 };
        unsigned replicas = 0;
 
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index b614b1512f15a6f6182886fd5800308494d47b6f..732e2dbbea87678deafdd5eeb97dccbb5bb46d2d 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -883,7 +883,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
        ret = bch2_move_btree(c,
                              0,                POS_MIN,
                              BTREE_ID_NR,      POS_MAX,
-                             rewrite_old_nodes_pred, c, stats) ?: ret;
+                             rewrite_old_nodes_pred, c, stats);
        if (!ret) {
                mutex_lock(&c->sb_lock);
                c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE;
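
A side note on the hunk above, since the idiom being removed is easy to
misread: GNU C's "a ?: b" evaluates to a when a is nonzero and to b otherwise.
The old code thus kept an earlier error in ret even when this
bch2_move_btree() call succeeded; after the change, ret reflects only this
call, so the compat flag is set whenever the scan itself succeeds. A minimal
illustration (step_a/step_b are hypothetical):

	int ret = step_a();		/* may already hold an error */

	/* before: step_b() succeeding could not clear step_a()'s error */
	ret = step_b() ?: ret;

	/* after: only step_b()'s result decides */
	ret = step_b();
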
diff --git a/linux/six.c b/linux/six.c
index 49d46ed2e18e9b8c5916e4f59e39b3ce24aa205b..532800443398ccef305ada1287bc781116b2aa47 100644
--- a/linux/six.c
+++ b/linux/six.c
@@ -2,6 +2,7 @@
 
 #include <linux/export.h>
 #include <linux/log2.h>
+#include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -41,7 +42,7 @@ struct six_lock_vals {
 #define LOCK_VALS {                                                    \
        [SIX_LOCK_read] = {                                             \
                .lock_val       = __SIX_VAL(read_lock, 1),              \
-               .lock_fail      = __SIX_LOCK_HELD_write,                \
+               .lock_fail      = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
                .unlock_val     = -__SIX_VAL(read_lock, 1),             \
                .held_mask      = __SIX_LOCK_HELD_read,                 \
                .unlock_wakeup  = SIX_LOCK_write,                       \
@@ -76,36 +77,195 @@ static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
        }
 }
 
+static inline unsigned pcpu_read_count(struct six_lock *lock)
+{
+       unsigned read_count = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               read_count += *per_cpu_ptr(lock->readers, cpu);
+       return read_count;
+}
+
+struct six_lock_waiter {
+       struct list_head        list;
+       struct task_struct      *task;
+};
+
+/* This is probably up there with the more evil things I've done */
+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+                                  union six_lock_state state,
+                                  unsigned waitlist_id)
+{
+       if (waitlist_id == SIX_LOCK_write) {
+               if (state.write_locking && !state.read_lock) {
+                       struct task_struct *p = READ_ONCE(lock->owner);
+                       if (p)
+                               wake_up_process(p);
+               }
+       } else {
+               struct list_head *wait_list = &lock->wait_list[waitlist_id];
+               struct six_lock_waiter *w, *next;
+
+               if (!(state.waiters & (1 << waitlist_id)))
+                       return;
+
+               clear_bit(waitlist_bitnr(waitlist_id),
+                         (unsigned long *) &lock->state.v);
+
+               raw_spin_lock(&lock->wait_lock);
+
+               list_for_each_entry_safe(w, next, wait_list, list) {
+                       list_del_init(&w->list);
+
+                       if (wake_up_process(w->task) &&
+                           waitlist_id != SIX_LOCK_read) {
+                               if (!list_empty(wait_list))
+                                       set_bit(waitlist_bitnr(waitlist_id),
+                                               (unsigned long *) &lock->state.v);
+                               break;
+                       }
+               }
+
+               raw_spin_unlock(&lock->wait_lock);
+       }
+}
+
 static __always_inline bool do_six_trylock_type(struct six_lock *lock,
-                                               enum six_lock_type type)
+                                               enum six_lock_type type,
+                                               bool try)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
-       union six_lock_state old;
-       u64 v = READ_ONCE(lock->state.v);
+       union six_lock_state old, new;
+       bool ret;
+       u64 v;
 
        EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+       EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
 
-       do {
-               old.v = v;
+       EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
 
-               EBUG_ON(type == SIX_LOCK_write &&
-                       ((old.v & __SIX_LOCK_HELD_write) ||
-                        !(old.v & __SIX_LOCK_HELD_intent)));
+       /*
+        * Percpu reader mode:
+        *
+        * The basic idea behind this algorithm is that you can implement a lock
+        * between two threads without any atomics, just memory barriers:
+        *
+        * For two threads you'll need two variables, one variable for "thread a
+        * has the lock" and another for "thread b has the lock".
+        *
+        * To take the lock, a thread sets its variable indicating that it holds
+        * the lock, then issues a full memory barrier, then reads from the
+        * other thread's variable to check if the other thread thinks it has
+        * the lock. If we raced, we backoff and retry/sleep.
+        */
 
-               if (old.v & l[type].lock_fail)
-                       return false;
-       } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-                               old.v,
-                               old.v + l[type].lock_val)) != old.v);
+       if (type == SIX_LOCK_read && lock->readers) {
+retry:
+               preempt_disable();
+               this_cpu_inc(*lock->readers); /* signal that we own lock */
 
-       six_set_owner(lock, type, old);
-       return true;
+               smp_mb();
+
+               old.v = READ_ONCE(lock->state.v);
+               ret = !(old.v & l[type].lock_fail);
+
+               this_cpu_sub(*lock->readers, !ret);
+               preempt_enable();
+
+               /*
+                * If we failed because a writer was trying to take the
+                * lock, issue a wakeup because we might have caused a
+                * spurious trylock failure:
+                */
+               if (old.write_locking) {
+                       struct task_struct *p = READ_ONCE(lock->owner);
+
+                       if (p)
+                               wake_up_process(p);
+               }
+
+               /*
+                * If we failed from the lock path and the waiting bit wasn't
+                * set, set it:
+                */
+               if (!try && !ret) {
+                       v = old.v;
+
+                       do {
+                               new.v = old.v = v;
+
+                               if (!(old.v & l[type].lock_fail))
+                                       goto retry;
+
+                               if (new.waiters & (1 << type))
+                                       break;
+
+                               new.waiters |= 1 << type;
+                       } while ((v = atomic64_cmpxchg(&lock->state.counter,
+                                                      old.v, new.v)) != old.v);
+               }
+       } else if (type == SIX_LOCK_write && lock->readers) {
+               if (try) {
+                       atomic64_add(__SIX_VAL(write_locking, 1),
+                                    &lock->state.counter);
+                       smp_mb__after_atomic();
+               }
+
+               ret = !pcpu_read_count(lock);
+
+               /*
+                * On success, we increment lock->seq; also we clear
+                * write_locking unless we failed from the lock path:
+                */
+               v = 0;
+               if (ret)
+                       v += __SIX_VAL(seq, 1);
+               if (ret || try)
+                       v -= __SIX_VAL(write_locking, 1);
+
+               if (try && !ret) {
+                       old.v = atomic64_add_return(v, &lock->state.counter);
+                       six_lock_wakeup(lock, old, SIX_LOCK_read);
+               } else {
+                       atomic64_add(v, &lock->state.counter);
+               }
+       } else {
+               v = READ_ONCE(lock->state.v);
+               do {
+                       new.v = old.v = v;
+
+                       if (!(old.v & l[type].lock_fail)) {
+                               new.v += l[type].lock_val;
+
+                               if (type == SIX_LOCK_write)
+                                       new.write_locking = 0;
+                       } else if (!try && type != SIX_LOCK_write &&
+                                  !(new.waiters & (1 << type)))
+                               new.waiters |= 1 << type;
+                       else
+                               break; /* waiting bit already set */
+               } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+                                       old.v, new.v)) != old.v);
+
+               ret = !(old.v & l[type].lock_fail);
+       }
+
+       if (ret)
+               six_set_owner(lock, type, old);
+
+       EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
+       EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
+
+       return ret;
 }
 
 __always_inline __flatten
 static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
 {
-       if (!do_six_trylock_type(lock, type))
+       if (!do_six_trylock_type(lock, type, true))
                return false;
 
        if (type != SIX_LOCK_write)
@@ -119,8 +279,40 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
 {
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state old;
-       u64 v = READ_ONCE(lock->state.v);
+       u64 v;
+
+       EBUG_ON(type == SIX_LOCK_write);
 
+       if (type == SIX_LOCK_read &&
+           lock->readers) {
+               bool ret;
+
+               preempt_disable();
+               this_cpu_inc(*lock->readers);
+
+               smp_mb();
+
+               old.v = READ_ONCE(lock->state.v);
+               ret = !(old.v & l[type].lock_fail) && old.seq == seq;
+
+               this_cpu_sub(*lock->readers, !ret);
+               preempt_enable();
+
+               /*
+                * Similar to the lock path, we may have caused a spurious write
+                * lock fail and need to issue a wakeup:
+                */
+               if (old.write_locking) {
+                       struct task_struct *p = READ_ONCE(lock->owner);
+
+                       if (p)
+                               wake_up_process(p);
+               }
+
+               return ret;
+       }
+
+       v = READ_ONCE(lock->state.v);
        do {
                old.v = v;
 
@@ -136,14 +328,6 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
        return true;
 }
 
-struct six_lock_waiter {
-       struct list_head        list;
-       struct task_struct      *task;
-};
-
-/* This is probably up there with the more evil things I've done */
-#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
-
 #ifdef CONFIG_LOCK_SPIN_ON_OWNER
 
 static inline int six_can_spin_on_owner(struct six_lock *lock)
@@ -218,7 +402,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
                if (owner && !six_spin_on_owner(lock, owner))
                        break;
 
-               if (do_six_trylock_type(lock, type)) {
+               if (do_six_trylock_type(lock, type, false)) {
                        osq_unlock(&lock->osq);
                        preempt_enable();
                        return true;
@@ -270,18 +454,22 @@ noinline
 static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
                                    six_lock_should_sleep_fn should_sleep_fn, void *p)
 {
-       const struct six_lock_vals l[] = LOCK_VALS;
-       union six_lock_state old, new;
+       union six_lock_state old;
        struct six_lock_waiter wait;
        int ret = 0;
-       u64 v;
+
+       if (type == SIX_LOCK_write) {
+               EBUG_ON(lock->state.write_locking);
+               atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
+               smp_mb__after_atomic();
+       }
 
        ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
        if (ret)
-               return ret;
+               goto out_before_sleep;
 
        if (six_optimistic_spin(lock, type))
-               return 0;
+               goto out_before_sleep;
 
        lock_contended(&lock->dep_map, _RET_IP_);
 
@@ -298,32 +486,16 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
                        raw_spin_unlock(&lock->wait_lock);
                }
 
-               ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
-               if (ret)
+               if (do_six_trylock_type(lock, type, false))
                        break;
 
-               v = READ_ONCE(lock->state.v);
-               do {
-                       new.v = old.v = v;
-
-                       if (!(old.v & l[type].lock_fail))
-                               new.v += l[type].lock_val;
-                       else if (!(new.waiters & (1 << type)))
-                               new.waiters |= 1 << type;
-                       else
-                               break; /* waiting bit already set */
-               } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-                                       old.v, new.v)) != old.v);
-
-               if (!(old.v & l[type].lock_fail))
+               ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+               if (ret)
                        break;
 
                schedule();
        }
 
-       if (!ret)
-               six_set_owner(lock, type, old);
-
        __set_current_state(TASK_RUNNING);
 
        if (!list_empty_careful(&wait.list)) {
@@ -331,6 +503,12 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
                list_del_init(&wait.list);
                raw_spin_unlock(&lock->wait_lock);
        }
+out_before_sleep:
+       if (ret && type == SIX_LOCK_write) {
+               old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
+                                           &lock->state.counter);
+               six_lock_wakeup(lock, old, SIX_LOCK_read);
+       }
 
        return ret;
 }
@@ -344,7 +522,7 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
        if (type != SIX_LOCK_write)
                six_acquire(&lock->dep_map, 0);
 
-       ret = do_six_trylock_type(lock, type) ? 0
+       ret = do_six_trylock_type(lock, type, true) ? 0
                : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
 
        if (ret && type != SIX_LOCK_write)
@@ -355,54 +533,12 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
        return ret;
 }
 
-static inline void six_lock_wakeup(struct six_lock *lock,
-                                  union six_lock_state state,
-                                  unsigned waitlist_id)
-{
-       struct list_head *wait_list = &lock->wait_list[waitlist_id];
-       struct six_lock_waiter *w, *next;
-
-       if (waitlist_id == SIX_LOCK_write && state.read_lock)
-               return;
-
-       if (!(state.waiters & (1 << waitlist_id)))
-               return;
-
-       clear_bit(waitlist_bitnr(waitlist_id),
-                 (unsigned long *) &lock->state.v);
-
-       if (waitlist_id == SIX_LOCK_write) {
-               struct task_struct *p = READ_ONCE(lock->owner);
-
-               if (p)
-                       wake_up_process(p);
-               return;
-       }
-
-       raw_spin_lock(&lock->wait_lock);
-
-       list_for_each_entry_safe(w, next, wait_list, list) {
-               list_del_init(&w->list);
-
-               if (wake_up_process(w->task) &&
-                   waitlist_id != SIX_LOCK_read) {
-                       if (!list_empty(wait_list))
-                               set_bit(waitlist_bitnr(waitlist_id),
-                                       (unsigned long *) &lock->state.v);
-                       break;
-               }
-       }
-
-       raw_spin_unlock(&lock->wait_lock);
-}
-
 __always_inline __flatten
 static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state state;
 
-       EBUG_ON(!(lock->state.v & l[type].held_mask));
        EBUG_ON(type == SIX_LOCK_write &&
                !(lock->state.v & __SIX_LOCK_HELD_intent));
 
@@ -420,8 +556,17 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
                lock->owner = NULL;
        }
 
-       state.v = atomic64_add_return_release(l[type].unlock_val,
-                                             &lock->state.counter);
+       if (type == SIX_LOCK_read &&
+           lock->readers) {
+               smp_mb(); /* unlock barrier */
+               this_cpu_dec(*lock->readers);
+               state.v = READ_ONCE(lock->state.v);
+       } else {
+               EBUG_ON(!(lock->state.v & l[type].held_mask));
+               state.v = atomic64_add_return_release(l[type].unlock_val,
+                                                     &lock->state.counter);
+       }
+
        six_lock_wakeup(lock, state, l[type].unlock_wakeup);
 }
 
@@ -467,26 +612,28 @@ EXPORT_SYMBOL_GPL(six_lock_downgrade);
 
 bool six_lock_tryupgrade(struct six_lock *lock)
 {
-       const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state old, new;
        u64 v = READ_ONCE(lock->state.v);
 
        do {
                new.v = old.v = v;
 
-               EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask));
-
-               new.v += l[SIX_LOCK_read].unlock_val;
-
-               if (new.v & l[SIX_LOCK_intent].lock_fail)
+               if (new.intent_lock)
                        return false;
 
-               new.v += l[SIX_LOCK_intent].lock_val;
+               if (!lock->readers) {
+                       EBUG_ON(!new.read_lock);
+                       new.read_lock--;
+               }
+
+               new.intent_lock = 1;
        } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
                                old.v, new.v)) != old.v);
 
+       if (lock->readers)
+               this_cpu_dec(*lock->readers);
+
        six_set_owner(lock, SIX_LOCK_intent, old);
-       six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup);
 
        return true;
 }
@@ -518,16 +665,22 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
 
-       EBUG_ON(type == SIX_LOCK_write);
        six_acquire(&lock->dep_map, 0);
 
        /* XXX: assert already locked, and that we don't overflow: */
 
        switch (type) {
        case SIX_LOCK_read:
-               atomic64_add(l[type].lock_val, &lock->state.counter);
+               if (lock->readers) {
+                       this_cpu_inc(*lock->readers);
+               } else {
+                       EBUG_ON(!lock->state.read_lock &&
+                               !lock->state.intent_lock);
+                       atomic64_add(l[type].lock_val, &lock->state.counter);
+               }
                break;
        case SIX_LOCK_intent:
+               EBUG_ON(!lock->state.intent_lock);
                lock->intent_lock_recurse++;
                break;
        case SIX_LOCK_write:
@@ -551,3 +704,24 @@ void six_lock_wakeup_all(struct six_lock *lock)
        raw_spin_unlock(&lock->wait_lock);
 }
 EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+void six_lock_pcpu_free(struct six_lock *lock)
+{
+       BUG_ON(lock->readers && pcpu_read_count(lock));
+       BUG_ON(lock->state.read_lock);
+
+       free_percpu(lock->readers);
+       lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
+
+void six_lock_pcpu_alloc(struct six_lock *lock)
+{
+       BUG_ON(lock->readers && pcpu_read_count(lock));
+       BUG_ON(lock->state.read_lock);
+#ifdef __KERNEL__
+       if (!lock->readers)
+               lock->readers = alloc_percpu(unsigned);
+#endif
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
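
The long comment added in do_six_trylock_type() describes the protocol behind
percpu reader mode: each side publishes its claim, issues a full barrier, then
checks whether the other side already holds the lock. A standalone userspace
sketch of that store-then-check pattern, using C11 atomics in place of the
kernel's this_cpu ops and smp_mb(), with one shared counter standing in for
the summed percpu counters (illustrative only, not kernel code):

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_uint reader_count;	/* stands in for pcpu_read_count() */
	static atomic_bool write_locking;	/* stands in for the write_locking bit */

	static bool try_read_lock(void)
	{
		/* publish "I hold the lock"; a seq_cst RMW doubles as smp_mb() */
		atomic_fetch_add(&reader_count, 1);

		if (atomic_load(&write_locking)) {
			/* raced with a writer: back off */
			atomic_fetch_sub(&reader_count, 1);
			return false;
		}
		return true;
	}

	static bool try_write_lock(void)
	{
		atomic_store(&write_locking, true);		/* publish intent */
		atomic_thread_fence(memory_order_seq_cst);	/* like smp_mb__after_atomic() */

		if (atomic_load(&reader_count)) {
			atomic_store(&write_locking, false);	/* readers won */
			return false;
		}
		return true;
	}

With every access seq_cst, at least one of two racing threads is guaranteed to
see the other's store and back off; both may fail, in which case (as the kernel
comment says) the loser retries or sleeps, and a failed reader must wake any
writer it may have spuriously caused to fail, which is what the
old.write_locking checks in the patch handle.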