// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "replicas.h"

#include <linux/sort.h>
#include <trace/events/bcachefs.h>

static inline bool same_leaf_as_prev(struct btree_trans *trans,
				     unsigned sorted_idx)
{
	struct btree_insert_entry *i = trans->updates +
		trans->updates_sorted[sorted_idx];
	struct btree_insert_entry *prev = sorted_idx
		? trans->updates + trans->updates_sorted[sorted_idx - 1]
		: NULL;

	return prev &&
		i->iter->l[0].b == prev->iter->l[0].b;
}

#define trans_for_each_update_sorted(_trans, _i, _iter)			\
	for (_iter = 0;							\
	     _iter < _trans->nr_updates &&				\
	     (_i = _trans->updates + _trans->updates_sorted[_iter], 1);	\
	     _iter++)

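/*
 * Usage sketch (mirrors btree_trans_lock_write() below) - visit a
 * transaction's updates in btree iterator order rather than the order they
 * were queued; do_once_per_leaf() is illustrative only, not a real function:
 *
 *	struct btree_insert_entry *i;
 *	unsigned iter;
 *
 *	trans_for_each_update_sorted(trans, i, iter)
 *		if (!same_leaf_as_prev(trans, iter))
 *			do_once_per_leaf(trans, i);
 */
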
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
					    struct btree_iter *iter)
{
	bch2_btree_node_lock_write(b, iter);

	if (unlikely(btree_node_just_written(b)) &&
	    bch2_btree_post_write_cleanup(c, b))
		bch2_btree_iter_reinit_node(iter, b);

	/*
	 * If the last bset has been written, or if it's gotten too big - start
	 * a new bset to insert into:
	 */
	if (want_new_bset(c, b))
		bch2_btree_init_next(c, b, iter);
}

static void btree_trans_lock_write(struct btree_trans *trans, bool lock)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;
	unsigned iter;

	trans_for_each_update_sorted(trans, i, iter) {
		if (same_leaf_as_prev(trans, iter))
			continue;

		if (lock)
			bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
		else
			bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
	}
}

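/*
 * Insertion sort of trans->updates_sorted by btree iterator position; the
 * number of updates per transaction is small, and sorting lets us take node
 * write locks in a consistent order:
 */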
static inline void btree_trans_sort_updates(struct btree_trans *trans)
{
	struct btree_insert_entry *l, *r;
	unsigned nr = 0, pos;

	trans_for_each_update(trans, l) {
		for (pos = 0; pos < nr; pos++) {
			r = trans->updates + trans->updates_sorted[pos];

			if (btree_iter_cmp(l->iter, r->iter) <= 0)
				break;
		}

		memmove(&trans->updates_sorted[pos + 1],
			&trans->updates_sorted[pos],
			(nr - pos) * sizeof(trans->updates_sorted[0]));

		trans->updates_sorted[pos] = l - trans->updates;
		nr++;
	}

	BUG_ON(nr != trans->nr_updates);
}

/* Inserting into a given leaf node (last stage of insert): */

/* Handle overwrites and do insert, for non extents: */
bool bch2_btree_bset_insert_key(struct btree_iter *iter,
				struct btree *b,
				struct btree_node_iter *node_iter,
				struct bkey_i *insert)
{
	const struct bkey_format *f = &b->format;
	struct bkey_packed *k;
	unsigned clobber_u64s;

	EBUG_ON(btree_node_just_written(b));
	EBUG_ON(bset_written(b, btree_bset_last(b)));
	EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
	EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
		bkey_cmp(insert->k.p, b->data->max_key) > 0);

	k = bch2_btree_node_iter_peek_all(node_iter, b);
	if (k && !bkey_cmp_packed(b, k, &insert->k)) {
		BUG_ON(bkey_whiteout(k));

		if (!bkey_written(b, k) &&
		    bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
		    !bkey_whiteout(&insert->k)) {
			k->type = insert->k.type;
			memcpy_u64s(bkeyp_val(f, k), &insert->v,
				    bkey_val_u64s(&insert->k));
			return true;
		}

		insert->k.needs_whiteout = k->needs_whiteout;

		btree_account_key_drop(b, k);

		if (k >= btree_bset_last(b)->start) {
			clobber_u64s = k->u64s;

			/*
			 * If we're deleting, and the key we're deleting doesn't
			 * need a whiteout (it wasn't overwriting a key that had
			 * been written to disk) - just delete it:
			 */
			if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
				bch2_bset_delete(b, k, clobber_u64s);
				bch2_btree_node_iter_fix(iter, b, node_iter,
							 k, clobber_u64s, 0);
				return true;
			}

			goto overwrite;
		}

		k->type = KEY_TYPE_deleted;
		bch2_btree_node_iter_fix(iter, b, node_iter, k,
					 k->u64s, k->u64s);

		if (bkey_whiteout(&insert->k)) {
			reserve_whiteout(b, k);
			return true;
		} else {
			k->needs_whiteout = false;
		}
	} else {
		/*
		 * Deleting, but the key to delete wasn't found - nothing to do:
		 */
		if (bkey_whiteout(&insert->k))
			return false;

		insert->k.needs_whiteout = false;
	}

	k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
	clobber_u64s = 0;
overwrite:
	bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
	bch2_btree_node_iter_fix(iter, b, node_iter, k,
				 clobber_u64s, k->u64s);
	return true;
}

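/*
 * Journal pin flush callbacks: btree nodes double buffer their writes
 * (struct btree has writes[0] and writes[1]), hence one callback per
 * write index:
 */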
static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
			       unsigned i, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct btree_write *w = container_of(pin, struct btree_write, journal);
	struct btree *b = container_of(w, struct btree, writes[i]);

	btree_node_lock_type(c, b, SIX_LOCK_read);
	bch2_btree_node_write_cond(c, b,
		(btree_current_write(b) == w && w->journal.seq == seq));
	six_unlock_read(&b->lock);
}

static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	return __btree_node_flush(j, pin, 0, seq);
}

static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	return __btree_node_flush(j, pin, 1, seq);
}

static inline void __btree_journal_key(struct btree_trans *trans,
				       enum btree_id btree_id,
				       struct bkey_i *insert)
{
	struct journal *j = &trans->c->journal;
	u64 seq = trans->journal_res.seq;
	bool needs_whiteout = insert->k.needs_whiteout;

	/* ick - needs_whiteout is in-memory btree state, don't journal it: */
	insert->k.needs_whiteout = false;
	bch2_journal_add_keys(j, &trans->journal_res,
			      btree_id, insert);
	insert->k.needs_whiteout = needs_whiteout;

	bch2_journal_set_has_inode(j, &trans->journal_res,
				   insert->k.p.inode);

	if (trans->journal_seq)
		*trans->journal_seq = seq;
}

void bch2_btree_journal_key(struct btree_trans *trans,
			   struct btree_iter *iter,
			   struct bkey_i *insert)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree *b = iter->l[0].b;
	struct btree_write *w = btree_current_write(b);

	EBUG_ON(iter->level || b->level);
	EBUG_ON(trans->journal_res.ref !=
		!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));

	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
		__btree_journal_key(trans, iter->btree_id, insert);
		btree_bset_last(b)->journal_seq =
			cpu_to_le64(trans->journal_res.seq);
	}

	if (unlikely(!journal_pin_active(&w->journal))) {
		u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
			? trans->journal_res.seq
			: j->replay_journal_seq;

		bch2_journal_pin_add(j, seq, &w->journal,
				     btree_node_write_idx(b) == 0
				     ? btree_node_flush0
				     : btree_node_flush1);
	}

	if (unlikely(!btree_node_dirty(b)))
		set_btree_node_dirty(b);
}

static void bch2_insert_fixup_key(struct btree_trans *trans,
				  struct btree_insert_entry *insert)
{
	struct btree_iter *iter = insert->iter;
	struct btree_iter_level *l = &iter->l[0];

	EBUG_ON(iter->level);
	EBUG_ON(insert->k->k.u64s >
		bch_btree_keys_u64s_remaining(trans->c, l->b));

	if (bch2_btree_bset_insert_key(iter, l->b, &l->iter,
				       insert->k))
		bch2_btree_journal_key(trans, iter, insert->k);
}

/**
 * btree_insert_key_leaf - insert a key into a leaf node
 */
static void btree_insert_key_leaf(struct btree_trans *trans,
				  struct btree_insert_entry *insert)
{
	struct bch_fs *c = trans->c;
	struct btree_iter *iter = insert->iter;
	struct btree *b = iter->l[0].b;
	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;

	if (!btree_node_is_extents(b))
		bch2_insert_fixup_key(trans, insert);
	else
		bch2_insert_fixup_extent(trans, insert);

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
		bch2_btree_iter_reinit_node(iter, b);

	trace_btree_insert_key(c, b, insert->k);
}

/* Normal update interface: */

static inline void btree_insert_entry_checks(struct btree_trans *trans,
					     struct btree_insert_entry *i)
{
	struct bch_fs *c = trans->c;

	BUG_ON(i->iter->level);
	BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
	EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
		bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0);
	EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
		!(trans->flags & BTREE_INSERT_ATOMIC));

	BUG_ON(debug_check_bkeys(c) &&
	       !bkey_deleted(&i->k->k) &&
	       bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id));
}

static int bch2_trans_journal_preres_get(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;
	unsigned u64s = 0;
	int ret;

	/* XXX: journal preres accounting is currently disabled: */
	trans_for_each_update(trans, i)
		if (0)
			u64s += jset_u64s(i->k->k.u64s);

	if (!u64s)
		return 0;

	ret = bch2_journal_preres_get(&c->journal,
			&trans->journal_preres, u64s,
			JOURNAL_RES_GET_NONBLOCK);
	if (ret != -EAGAIN)
		return ret;

	bch2_trans_unlock(trans);

	ret = bch2_journal_preres_get(&c->journal,
			&trans->journal_preres, u64s, 0);
	if (ret)
		return ret;

	if (!bch2_trans_relock(trans)) {
		trace_trans_restart_journal_preres_get(trans->ip);
		return -EINTR;
	}

	return 0;
}

static int bch2_trans_journal_res_get(struct btree_trans *trans,
				      unsigned flags)
{
	struct bch_fs *c = trans->c;
	int ret;

	if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
		flags |= JOURNAL_RES_GET_RESERVED;

	ret = bch2_journal_res_get(&c->journal, &trans->journal_res,
				   trans->journal_u64s, flags);

	return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
}

static enum btree_insert_ret
btree_key_can_insert(struct btree_trans *trans,
		     struct btree_insert_entry *insert,
		     unsigned *u64s)
{
	struct bch_fs *c = trans->c;
	struct btree *b = insert->iter->l[0].b;
	enum btree_insert_ret ret;

	if (unlikely(btree_node_fake(b)))
		return BTREE_INSERT_BTREE_NODE_FULL;

	ret = !btree_node_is_extents(b)
		? BTREE_INSERT_OK
		: bch2_extent_can_insert(trans, insert, u64s);
	if (ret)
		return ret;

	if (*u64s > bch_btree_keys_u64s_remaining(c, b))
		return BTREE_INSERT_BTREE_NODE_FULL;

	return BTREE_INSERT_OK;
}

static int btree_trans_check_can_insert(struct btree_trans *trans,
					struct btree_insert_entry **stopped_at)
{
	struct btree_insert_entry *i;
	unsigned iter, u64s = 0;
	int ret;

	trans_for_each_update_sorted(trans, i, iter) {
		/* Multiple inserts might go to same leaf: */
		if (!same_leaf_as_prev(trans, iter))
			u64s = 0;

		u64s += i->k->k.u64s;
		ret = btree_key_can_insert(trans, i, &u64s);
		if (ret) {
			*stopped_at = i;
			return ret;
		}
	}

	return 0;
}

static inline void do_btree_insert_one(struct btree_trans *trans,
				       struct btree_insert_entry *insert)
{
	btree_insert_key_leaf(trans, insert);
}

static inline bool update_triggers_transactional(struct btree_trans *trans,
						 struct btree_insert_entry *i)
{
	return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) &&
		(i->iter->btree_id == BTREE_ID_EXTENTS ||
		 i->iter->btree_id == BTREE_ID_INODES ||
		 i->iter->btree_id == BTREE_ID_REFLINK);
}

static inline bool update_has_triggers(struct btree_trans *trans,
				       struct btree_insert_entry *i)
{
	return likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
		btree_node_type_needs_gc(i->iter->btree_id);
}

/*
 * Get journal reservation, take write locks, and attempt to do btree update(s):
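 *
 * Roughly, in order: run transactional triggers (which may append more
 * updates to the transaction), sort updates and write lock the leaf nodes,
 * check that the inserts will fit, get the journal reservation, run
 * in-memory mark/GC triggers, then do the actual btree inserts.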
 */
static inline int do_btree_insert_at(struct btree_trans *trans,
				     struct btree_insert_entry **stopped_at)
{
	struct bch_fs *c = trans->c;
	struct bch_fs_usage *fs_usage = NULL;
	struct btree_insert_entry *i;
	struct btree_iter *iter;
	unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE
		? BCH_BUCKET_MARK_BUCKET_INVALIDATE
		: 0;
	int ret;

	trans_for_each_update(trans, i)
		BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);

	/*
	 * note: running triggers will append more updates to the list of
	 * updates as we're walking it:
	 */
	trans_for_each_update(trans, i)
		if (update_has_triggers(trans, i) &&
		    update_triggers_transactional(trans, i)) {
			ret = bch2_trans_mark_update(trans, i->iter, i->k);
			if (ret == -EINTR)
				trace_trans_restart_mark(trans->ip);
			if (ret)
				goto out_clear_replicas;
		}

	trans_for_each_iter(trans, iter) {
		if (iter->nodes_locked != iter->nodes_intent_locked) {
			BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
			BUG_ON(trans->iters_live & (1ULL << iter->idx));
			__bch2_btree_iter_unlock(iter);
		}
	}

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		trans_for_each_update(trans, i)
			btree_insert_entry_checks(trans, i);
	bch2_btree_trans_verify_locks(trans);

	/*
	 * No more updates can be added - sort updates so we can take write
	 * locks in the correct order:
	 */
	btree_trans_sort_updates(trans);

	btree_trans_lock_write(trans, true);

	if (race_fault()) {
		ret = -EINTR;
		trace_trans_restart_fault_inject(trans->ip);
		goto out;
	}

	/*
	 * Check if the insert will fit in the leaf node with the write lock
	 * held, otherwise another thread could write the node changing the
	 * amount of space available:
	 */
	ret = btree_trans_check_can_insert(trans, stopped_at);
	if (ret)
		goto out;

	trans_for_each_update(trans, i) {
		if (!btree_node_type_needs_gc(i->iter->btree_id))
			continue;

		if (!fs_usage) {
			percpu_down_read(&c->mark_lock);
			fs_usage = bch2_fs_usage_scratch_get(c);
		}

		if (!bch2_bkey_replicas_marked_locked(c,
			bkey_i_to_s_c(i->k), true)) {
			ret = BTREE_INSERT_NEED_MARK_REPLICAS;
			goto out;
		}
	}

	/*
	 * Don't get journal reservation until after we know insert will
	 * succeed:
	 */
	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
		trans->journal_u64s = 0;

		trans_for_each_update(trans, i)
			trans->journal_u64s += jset_u64s(i->k->k.u64s);

		ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK);
		if (ret)
			goto out;
	}

	if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
		if (journal_seq_verify(c))
			trans_for_each_update(trans, i)
				i->k->k.version.lo = trans->journal_res.seq;
		else if (inject_invalid_keys(c))
			trans_for_each_update(trans, i)
				i->k->k.version = MAX_VERSION;
	}

	trans_for_each_update(trans, i)
		if (update_has_triggers(trans, i) &&
		    !update_triggers_transactional(trans, i))
			bch2_mark_update(trans, i, fs_usage, mark_flags);

	if (fs_usage && trans->fs_usage_deltas)
		bch2_replicas_delta_list_apply(c, fs_usage,
					       trans->fs_usage_deltas);

	if (fs_usage)
		bch2_trans_fs_usage_apply(trans, fs_usage);

	if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
	    unlikely(c->gc_pos.phase))
		trans_for_each_update(trans, i)
			if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
				bch2_mark_update(trans, i, NULL,
						 mark_flags|
						 BCH_BUCKET_MARK_GC);

	trans_for_each_update(trans, i)
		do_btree_insert_one(trans, i);
out:
	BUG_ON(ret &&
	       (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) &&
	       trans->journal_res.ref);

	btree_trans_lock_write(trans, false);

	if (fs_usage) {
		bch2_fs_usage_scratch_put(c, fs_usage);
		percpu_up_read(&c->mark_lock);
	}

	bch2_journal_res_put(&c->journal, &trans->journal_res);
out_clear_replicas:
	if (trans->fs_usage_deltas) {
		memset(&trans->fs_usage_deltas->fs_usage, 0,
		       sizeof(trans->fs_usage_deltas->fs_usage));
		trans->fs_usage_deltas->used = 0;
	}

	return ret;
}

static noinline
int bch2_trans_commit_error(struct btree_trans *trans,
			    struct btree_insert_entry *i,
			    int ret)
{
	struct bch_fs *c = trans->c;
	unsigned flags = trans->flags;

	/*
	 * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
	 * update; if we haven't done anything yet it doesn't apply
	 */
	flags &= ~BTREE_INSERT_NOUNLOCK;

	switch (ret) {
	case BTREE_INSERT_BTREE_NODE_FULL:
		ret = bch2_btree_split_leaf(c, i->iter, flags);

		/*
		 * if the split succeeded without dropping locks the insert will
		 * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
		 * caller peeked() and is overwriting won't have changed)
		 */
#if 0
		/*
		 * XXX:
		 * split -> btree node merging (of parent node) might still drop
		 * locks when we're not passing it BTREE_INSERT_NOUNLOCK
		 *
		 * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that
		 * will inhibit merging - but we don't have a reliable way yet
		 * (do we?) of checking if we dropped locks in this path
		 */
		if (!ret)
			goto retry;
#endif

		/*
		 * don't care if we got ENOSPC because we told split it
		 * couldn't block:
		 */
		if (!ret ||
		    ret == -EINTR ||
		    (flags & BTREE_INSERT_NOUNLOCK)) {
			trace_trans_restart_btree_node_split(trans->ip);
			ret = -EINTR;
		}
		break;
	case BTREE_INSERT_ENOSPC:
		ret = -ENOSPC;
		break;
	case BTREE_INSERT_NEED_MARK_REPLICAS:
		bch2_trans_unlock(trans);

		trans_for_each_update(trans, i) {
			ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k));
			if (ret)
				return ret;
		}

		if (bch2_trans_relock(trans))
			return 0;

		trace_trans_restart_mark_replicas(trans->ip);
		ret = -EINTR;
		break;
	case BTREE_INSERT_NEED_JOURNAL_RES:
		bch2_trans_unlock(trans);

		ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
		if (ret)
			return ret;

		if (bch2_trans_relock(trans))
			return 0;

		trace_trans_restart_journal_res_get(trans->ip);
		ret = -EINTR;
		break;
	default:
		BUG_ON(ret >= 0);
		break;
	}

	if (ret == -EINTR) {
		int ret2 = bch2_btree_iter_traverse_all(trans);

		if (ret2) {
			trace_trans_restart_traverse(trans->ip);
			return ret2;
		}

		/*
		 * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
		 * dropped locks:
		 */
		if (!(flags & BTREE_INSERT_ATOMIC))
			return 0;

		trace_trans_restart_atomic(trans->ip);
	}

	return ret;
}

/**
 * __bch2_trans_commit - insert keys at given iterator positions
 *
 * This is the main entry point for btree updates.
 *
 * Return values:
 * -EINTR: locking changed, this function should be called again. Only returned
 *  if passed BTREE_INSERT_ATOMIC.
 * -EROFS: filesystem read only
 * -EIO: journal or btree node IO error
 */
static int __bch2_trans_commit(struct btree_trans *trans,
			       struct btree_insert_entry **stopped_at)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;
	unsigned iter;
	int ret;

	trans_for_each_update(trans, i) {
		if (!bch2_btree_iter_upgrade(i->iter, 1)) {
			trace_trans_restart_upgrade(trans->ip);
			ret = -EINTR;
			goto err;
		}

		ret = btree_iter_err(i->iter);
		if (ret)
			goto err;
	}

	ret = do_btree_insert_at(trans, stopped_at);
	if (unlikely(ret))
		goto err;

	if (trans->flags & BTREE_INSERT_NOUNLOCK)
		trans->nounlock = true;

	trans_for_each_update_sorted(trans, i, iter)
		if (!same_leaf_as_prev(trans, iter))
			bch2_foreground_maybe_merge(c, i->iter,
						    0, trans->flags);

	trans->nounlock = false;

	trans_for_each_update(trans, i)
		bch2_btree_iter_downgrade(i->iter);
err:
	/* make sure we didn't drop or screw up locks: */
	bch2_btree_trans_verify_locks(trans);

	return ret;
}

int bch2_trans_commit(struct btree_trans *trans,
		      struct disk_reservation *disk_res,
		      u64 *journal_seq,
		      unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i = NULL;
	struct btree_iter *iter;
	unsigned orig_nr_updates	= trans->nr_updates;
	unsigned orig_mem_top		= trans->mem_top;
	int ret = 0;

	if (!trans->nr_updates)
		goto out_noupdates;

	/* for the sake of sanity: */
	BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC));

	if (flags & BTREE_INSERT_GC_LOCK_HELD)
		lockdep_assert_held(&c->gc_lock);

	if (!trans->commit_start)
		trans->commit_start = local_clock();

	memset(&trans->journal_res, 0, sizeof(trans->journal_res));
	memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
	trans->disk_res		= disk_res;
	trans->journal_seq	= journal_seq;
	trans->flags		= flags;

	if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
		     !percpu_ref_tryget(&c->writes))) {
		if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
			return -EROFS;

		bch2_trans_unlock(trans);

		ret = bch2_fs_read_write_early(c);
		if (ret)
			return ret;

		percpu_ref_get(&c->writes);

		if (!bch2_trans_relock(trans)) {
			ret = -EINTR;
			goto err;
		}
	}
retry:
	ret = bch2_trans_journal_preres_get(trans);
	if (ret)
		goto err;

	ret = __bch2_trans_commit(trans, &i);
	if (ret)
		goto err;
out:
	bch2_journal_preres_put(&c->journal, &trans->journal_preres);

	if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
		percpu_ref_put(&c->writes);
out_noupdates:
	if (!ret && trans->commit_start) {
		bch2_time_stats_update(&c->times[BCH_TIME_btree_update],
				       trans->commit_start);
		trans->commit_start = 0;
	}

	BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);

	trans_for_each_iter(trans, iter)
		iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;

	if (!ret) {
		bch2_trans_unlink_iters(trans);
		trans->iters_touched = 0;
	}
	trans->nr_updates	= 0;
	trans->mem_top		= 0;

	return ret;
err:
	ret = bch2_trans_commit_error(trans, i, ret);

	/* free updates and memory used by triggers, they'll be reexecuted: */
	trans->nr_updates	= orig_nr_updates;
	trans->mem_top		= orig_mem_top;

	/* can't loop if it was passed in and we changed it: */
	if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret)
		ret = -EINTR;

	if (!ret)
		goto retry;

	goto out;
}

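/*
 * Typical calling convention - a sketch, not a verbatim caller (see
 * bch2_btree_insert() and bch2_btree_delete_at_range() below for real
 * examples); id, pos and k stand in for the caller's btree, position and key:
 *
 *	bch2_trans_init(&trans, c, 0, 0);
 * retry:
 *	bch2_trans_begin(&trans);
 *	iter = bch2_trans_get_iter(&trans, id, pos, BTREE_ITER_INTENT);
 *	bch2_trans_update(&trans, iter, k);
 *	ret = bch2_trans_commit(&trans, NULL, NULL, 0);
 *	if (ret == -EINTR)
 *		goto retry;
 *	bch2_trans_exit(&trans);
 */
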
/**
 * bch2_btree_insert - insert a single key into a given btree
 * @c:			pointer to struct bch_fs
 * @id:			btree to insert into
 * @k:			key to insert
 * @disk_res:		disk reservation, or NULL
 * @journal_seq:	if non NULL, filled in with the journal sequence number
 * @flags:		BTREE_INSERT_* flags
 */
int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
		     struct bkey_i *k,
		     struct disk_reservation *disk_res,
		     u64 *journal_seq, int flags)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k),
				   BTREE_ITER_INTENT);

	bch2_trans_update(&trans, iter, k);

	ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags);
	if (ret == -EINTR)
		goto retry;
	bch2_trans_exit(&trans);

	return ret;
}

int bch2_btree_delete_at_range(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bpos end,
			       u64 *journal_seq)
{
	struct bkey_s_c k;
	int ret = 0;
retry:
	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k)) &&
	       bkey_cmp(iter->pos, end) < 0) {
		struct bkey_i delete;

		bkey_init(&delete.k);

		/*
		 * For extents, iter.pos won't necessarily be the same as
		 * bkey_start_pos(k.k) (for non extents they always will be the
		 * same). It's important that we delete starting from iter.pos
		 * because the range we want to delete could start in the middle
		 * of k.
		 *
		 * (bch2_btree_iter_peek() does guarantee that iter.pos >=
		 * bkey_start_pos(k.k)).
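		 *
		 * e.g. if k spans [0, 100) and the range to delete starts at
		 * 50, iter.pos (and thus the start of the delete) will be 50,
		 * leaving [0, 50) intact.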
		 */
		delete.k.p = iter->pos;

		if (iter->flags & BTREE_ITER_IS_EXTENTS) {
			unsigned max_sectors =
				KEY_SIZE_MAX & (~0 << trans->c->block_bits);

			/* create the biggest key we can */
			bch2_key_resize(&delete.k, max_sectors);
			bch2_cut_back(end, &delete.k);

			ret = bch2_extent_trim_atomic(&delete, iter);
			if (ret)
				break;
		}

		bch2_trans_update(trans, iter, &delete);
		ret = bch2_trans_commit(trans, NULL, journal_seq,
					BTREE_INSERT_ATOMIC|
					BTREE_INSERT_NOFAIL);
		if (ret)
			break;

		bch2_trans_cond_resched(trans);
	}

	if (ret == -EINTR) {
		ret = 0;
		goto retry;
	}

	return ret;
}

int bch2_btree_delete_at(struct btree_trans *trans,
			 struct btree_iter *iter, unsigned flags)
{
	struct bkey_i k;

	bkey_init(&k.k);
	k.k.p = iter->pos;

	bch2_trans_update(trans, iter, &k);
	return bch2_trans_commit(trans, NULL, NULL,
				 BTREE_INSERT_NOFAIL|
				 BTREE_INSERT_USE_RESERVE|flags);
}

/*
 * bch2_btree_delete_range - delete everything within a given range
 *
 * Range is a half open interval - [start, end)
 */
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
			    struct bpos start, struct bpos end,
			    u64 *journal_seq)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	int ret = 0;

	/*
	 * XXX: whether we need mem/more iters depends on whether this btree id
	 * has triggers
	 */
	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);

	iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT);

	ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq);
	ret = bch2_trans_exit(&trans) ?: ret;

	BUG_ON(ret == -EINTR);
	return ret;
}
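
/*
 * Example - a sketch only, with inum standing in for a caller's inode
 * number: delete every extent belonging to one inode:
 *
 *	ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
 *				      POS(inum, 0),
 *				      POS(inum + 1, 0),
 *				      &journal_seq);
 */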