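/*
 * Shared/intent/exclusive ("six") locks: like a reader/writer lock, but with
 * an additional intermediate state, intent. As encoded in LOCK_VALS below:
 *
 *  - read (shared) may be held by any number of tasks, and is blocked only
 *    by a write lock;
 *  - intent may be held by at most one task at a time, but does not block
 *    (or get blocked by) readers;
 *  - write (exclusive) requires intent to already be held, and is blocked by
 *    readers.
 *
 * The lock state also includes a sequence number, bumped whenever the lock is
 * write locked or write unlocked; six_relock_type() uses it to retake a lock
 * only if the lock hasn't been write locked since the sequence number was
 * read.
 */
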
#include <linux/sched.h>
#include <linux/sched/rt.h>

#include "six.h"

#define six_acquire(l, t)       lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
#define six_release(l)          lock_release(l, 0, _RET_IP_)

#define __SIX_LOCK_HELD_read            __SIX_VAL(read_lock, ~0)
#define __SIX_LOCK_HELD_intent          __SIX_VAL(intent_lock, ~0)
#define __SIX_LOCK_HELD_write           __SIX_VAL(seq, 1)

struct six_lock_vals {
        /* Value we add to the lock in order to take the lock: */
        u64                     lock_val;

        /* If the lock has this value (used as a mask), taking the lock fails: */
        u64                     lock_fail;

        /* Value we add to the lock in order to release the lock: */
        u64                     unlock_val;

        /* Mask that indicates lock is held for this type: */
        u64                     held_mask;

        /* Waitlist we wakeup when releasing the lock: */
        enum six_lock_type      unlock_wakeup;
};

#define LOCK_VALS {                                                     \
        [SIX_LOCK_read] = {                                             \
                .lock_val       = __SIX_VAL(read_lock, 1),              \
                .lock_fail      = __SIX_LOCK_HELD_write,                \
                .unlock_val     = -__SIX_VAL(read_lock, 1),             \
                .held_mask      = __SIX_LOCK_HELD_read,                 \
                .unlock_wakeup  = SIX_LOCK_write,                       \
        },                                                              \
        [SIX_LOCK_intent] = {                                           \
                .lock_val       = __SIX_VAL(intent_lock, 1),            \
                .lock_fail      = __SIX_LOCK_HELD_intent,               \
                .unlock_val     = -__SIX_VAL(intent_lock, 1),           \
                .held_mask      = __SIX_LOCK_HELD_intent,               \
                .unlock_wakeup  = SIX_LOCK_intent,                      \
        },                                                              \
        [SIX_LOCK_write] = {                                            \
                .lock_val       = __SIX_VAL(seq, 1),                    \
                .lock_fail      = __SIX_LOCK_HELD_read,                 \
                .unlock_val     = __SIX_VAL(seq, 1),                    \
                .held_mask      = __SIX_LOCK_HELD_write,                \
                .unlock_wakeup  = SIX_LOCK_read,                        \
        },                                                              \
}

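/*
 * The owner is only tracked while the lock is held for intent; it's used by
 * six_optimistic_spin() to spin while the current intent holder is running:
 */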
static void six_set_owner(struct six_lock *lock, enum six_lock_type type)
{
        if (type == SIX_LOCK_intent)
                lock->owner = current;
}

static void six_clear_owner(struct six_lock *lock, enum six_lock_type type)
{
        if (type == SIX_LOCK_intent)
                lock->owner = NULL;
}

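/*
 * Core trylock: add lock_val to the lock state with a cmpxchg loop, failing
 * if any of the lock_fail bits for this lock type are set. For write locks,
 * intent must already be held:
 */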
static inline bool __six_trylock_type(struct six_lock *lock,
                                      enum six_lock_type type)
{
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state old;
        u64 v = READ_ONCE(lock->state.v);

        do {
                old.v = v;

                EBUG_ON(type == SIX_LOCK_write &&
                        ((old.v & __SIX_LOCK_HELD_write) ||
                         !(old.v & __SIX_LOCK_HELD_intent)));

                if (old.v & l[type].lock_fail)
                        return false;
        } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
                                old.v,
                                old.v + l[type].lock_val)) != old.v);
        return true;
}

bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
{
        bool ret = __six_trylock_type(lock, type);

        if (ret) {
                six_acquire(&lock->dep_map, 1);
                six_set_owner(lock, type);
        }

        return ret;
}

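/*
 * Retake a lock of the given type, but only if the sequence number still
 * matches @seq - i.e. the lock hasn't been write locked since @seq was read:
 */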
bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
                     unsigned seq)
{
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state old;
        u64 v = READ_ONCE(lock->state.v);

        do {
                old.v = v;

                if (old.seq != seq || old.v & l[type].lock_fail)
                        return false;
        } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
                                old.v,
                                old.v + l[type].lock_val)) != old.v);

        six_acquire(&lock->dep_map, 1);
        six_set_owner(lock, type);
        return true;
}

struct six_lock_waiter {
        struct list_head        list;
        struct task_struct      *task;
};

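/*
 * waitlist_bitnr() converts a lock type into the bit number of its waiters
 * bit within the lock state, for use with set_bit()/clear_bit() on
 * lock->state.v:
 */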
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))

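/*
 * Optimistic spinning: rather than sleeping immediately on contention, spin
 * (under the OSQ lock) while the task holding the intent lock is still
 * running on another CPU, retrying the trylock each iteration. Not used for
 * write locks:
 */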
static inline int six_can_spin_on_owner(struct six_lock *lock)
{
        struct task_struct *owner;
        int retval = 1;

        if (need_resched())
                return 0;

        rcu_read_lock();
        owner = READ_ONCE(lock->owner);
        if (owner)
                retval = owner->on_cpu;
        rcu_read_unlock();
        /*
         * If lock->owner is not set, the owner may have just acquired the
         * lock and not set the owner yet, or the lock may have been released.
         */
        return retval;
}

static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner)
{
        bool ret = true;

        rcu_read_lock();
        while (lock->owner == owner) {
                /*
                 * Ensure we emit the owner->on_cpu dereference _after_
                 * checking lock->owner still matches owner. If that fails,
                 * owner might point to freed memory. If it still matches,
                 * the rcu_read_lock() ensures the memory stays valid.
                 */
                barrier();

                if (!owner->on_cpu || need_resched()) {
                        ret = false;
                        break;
                }

                cpu_relax_lowlatency();
        }
        rcu_read_unlock();

        return ret;
}

static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
{
        struct task_struct *task = current;

        if (type == SIX_LOCK_write)
                return false;

        preempt_disable();
        if (!six_can_spin_on_owner(lock))
                goto fail;

        if (!osq_lock(&lock->osq))
                goto fail;

        while (1) {
                struct task_struct *owner;

                /*
                 * If there's an owner, wait for it to either
                 * release the lock or go to sleep.
                 */
                owner = READ_ONCE(lock->owner);
                if (owner && !six_spin_on_owner(lock, owner))
                        break;

                if (__six_trylock_type(lock, type)) {
                        osq_unlock(&lock->osq);
                        preempt_enable();
                        return true;
                }

                /*
                 * When there's no owner, we might have preempted between the
                 * owner acquiring the lock and setting the owner field. If
                 * we're an RT task, that will live-lock because we won't let
                 * the owner complete.
                 */
                if (!owner && (need_resched() || rt_task(task)))
                        break;

                /*
                 * The cpu_relax_lowlatency() call is a compiler barrier which
                 * forces everything in this loop to be re-loaded. We don't
                 * need memory barriers as we'll eventually observe the right
                 * values at the cost of a few extra spins.
                 */
                cpu_relax_lowlatency();
        }

        osq_unlock(&lock->osq);
fail:
        preempt_enable();

        /*
         * If we fell out of the spin path because of need_resched(),
         * reschedule now, before we try-lock again. This avoids getting
         * scheduled out right after we obtained the lock.
         */
        if (need_resched())
                schedule();

        return false;
}

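/*
 * Take a lock, blocking if necessary: first the trylock fast path, then
 * optimistic spinning; if both fail, add ourselves to the wait list for this
 * lock type, set the corresponding waiters bit in the lock state, and sleep
 * in TASK_UNINTERRUPTIBLE until an unlock wakes us:
 */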
void six_lock_type(struct six_lock *lock, enum six_lock_type type)
{
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state old, new;
        struct six_lock_waiter wait;
        u64 v;

        six_acquire(&lock->dep_map, 0);

        if (__six_trylock_type(lock, type))
                goto done;

        if (six_optimistic_spin(lock, type))
                goto done;

        lock_contended(&lock->dep_map, _RET_IP_);

        INIT_LIST_HEAD(&wait.list);
        wait.task = current;

        while (1) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (list_empty_careful(&wait.list)) {
                        raw_spin_lock(&lock->wait_lock);
                        list_add_tail(&wait.list, &lock->wait_list[type]);
                        raw_spin_unlock(&lock->wait_lock);
                }

                v = READ_ONCE(lock->state.v);
                do {
                        new.v = old.v = v;

                        if (!(old.v & l[type].lock_fail))
                                new.v += l[type].lock_val;
                        else if (!(new.waiters & (1 << type)))
                                new.waiters |= 1 << type;
                        else
                                break; /* waiting bit already set */
                } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
                                        old.v, new.v)) != old.v);

                if (!(old.v & l[type].lock_fail))
                        break;

                schedule();
        }

        __set_current_state(TASK_RUNNING);

        if (!list_empty_careful(&wait.list)) {
                raw_spin_lock(&lock->wait_lock);
                list_del_init(&wait.list);
                raw_spin_unlock(&lock->wait_lock);
        }
done:
        lock_acquired(&lock->dep_map, _RET_IP_);
        six_set_owner(lock, type);
}

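/*
 * Wake waiters on @waitlist_id, if the lock state allows them to proceed:
 * read waiters are all woken; for the other waitlists we stop after the
 * first waiter that was actually woken, re-setting the waiters bit if any
 * remain:
 */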
static inline void six_lock_wakeup(struct six_lock *lock,
                                   union six_lock_state state,
                                   unsigned waitlist_id)
{
        struct list_head *wait_list = &lock->wait_list[waitlist_id];
        struct six_lock_waiter *w, *next;

        if (waitlist_id == SIX_LOCK_write && state.read_lock)
                return;

        if (!(state.waiters & (1 << waitlist_id)))
                return;

        clear_bit(waitlist_bitnr(waitlist_id),
                  (unsigned long *) &lock->state.v);

        raw_spin_lock(&lock->wait_lock);

        list_for_each_entry_safe(w, next, wait_list, list) {
                list_del_init(&w->list);

                if (wake_up_process(w->task) &&
                    waitlist_id != SIX_LOCK_read) {
                        if (!list_empty(wait_list))
                                set_bit(waitlist_bitnr(waitlist_id),
                                        (unsigned long *) &lock->state.v);
                        break;
                }
        }

        raw_spin_unlock(&lock->wait_lock);
}

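/*
 * Drop a lock: apply unlock_val to the lock state (decrementing the read or
 * intent count, or bumping the sequence number for write), then wake the
 * waiters that this unlock may allow to proceed (unlock_wakeup in LOCK_VALS):
 */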
void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state state;

        six_clear_owner(lock, type);

        EBUG_ON(!(lock->state.v & l[type].held_mask));
        EBUG_ON(type == SIX_LOCK_write &&
                !(lock->state.v & __SIX_LOCK_HELD_intent));

        state.v = atomic64_add_return_release(l[type].unlock_val,
                                              &lock->state.counter);
        six_release(&lock->dep_map);
        six_lock_wakeup(lock, state, l[type].unlock_wakeup);
}

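/*
 * Atomically convert a held lock from one type to another, without dropping
 * it in between - fails if the new type can't be taken immediately:
 */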
bool six_trylock_convert(struct six_lock *lock,
                         enum six_lock_type from,
                         enum six_lock_type to)
{
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state old, new;
        u64 v = READ_ONCE(lock->state.v);

        do {
                new.v = old.v = v;
                new.v += l[from].unlock_val;

                if (new.v & l[to].lock_fail)
                        return false;
        } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
                                old.v,
                                new.v + l[to].lock_val)) != old.v);

        six_clear_owner(lock, from);
        six_set_owner(lock, to);

        six_lock_wakeup(lock, new, l[from].unlock_wakeup);

        return true;
}

/*
 * Increment read/intent lock count, assuming we already have it read or intent
 * locked:
 */
void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
{
        const struct six_lock_vals l[] = LOCK_VALS;

        EBUG_ON(type == SIX_LOCK_write);
        six_acquire(&lock->dep_map, 0);

        /* XXX: assert already locked, and that we don't overflow: */

        atomic64_add(l[type].lock_val, &lock->state.counter);
}

/* Convert from intent to read: */
void six_lock_downgrade(struct six_lock *lock)
{
        six_lock_increment(lock, SIX_LOCK_read);
        six_unlock_intent(lock);
}