// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "data_update.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "io.h"
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "subvolume.h"
#include "trace.h"

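/*
 * Tracepoint helper: emit the move_extent_finish event with the key formatted
 * as text; the formatting work is skipped entirely when the tracepoint is
 * disabled.
 */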
static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
{
	if (trace_move_extent_finish_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		trace_move_extent_finish(c, buf.buf);
		printbuf_exit(&buf);
	}
}

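/*
 * Tracepoint helper for the failure paths: describe why the index update did
 * no work - which pointers were supposed to be rewritten, which rewrites were
 * actually found in the key now in the btree, plus the old/new/wrote/insert
 * keys - and emit it via the move_extent_fail event.
 */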
static void trace_move_extent_fail2(struct data_update *m,
			 struct bkey_s_c new,
			 struct bkey_s_c wrote,
			 struct bkey_i *insert,
			 const char *msg)
{
	struct bch_fs *c = m->op.c;
	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
	const union bch_extent_entry *entry;
	struct bch_extent_ptr *ptr;
	struct extent_ptr_decoded p;
	struct printbuf buf = PRINTBUF;
	unsigned i, rewrites_found = 0;

	if (!trace_move_extent_fail_enabled())
		return;

	prt_str(&buf, msg);

	if (insert) {
		i = 0;
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
			struct bkey_s new_s;
			new_s.k = (void *) new.k;
			new_s.v = (void *) new.v;

			if (((1U << i) & m->data_opts.rewrite_ptrs) &&
			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
			    !ptr->cached)
				rewrites_found |= 1U << i;
			i++;
		}
	}

	prt_printf(&buf, "\nrewrite ptrs:   %u%u%u%u",
		   (m->data_opts.rewrite_ptrs & (1 << 0)) != 0,
		   (m->data_opts.rewrite_ptrs & (1 << 1)) != 0,
		   (m->data_opts.rewrite_ptrs & (1 << 2)) != 0,
		   (m->data_opts.rewrite_ptrs & (1 << 3)) != 0);

	prt_printf(&buf, "\nrewrites found: %u%u%u%u",
		   (rewrites_found & (1 << 0)) != 0,
		   (rewrites_found & (1 << 1)) != 0,
		   (rewrites_found & (1 << 2)) != 0,
		   (rewrites_found & (1 << 3)) != 0);

	prt_str(&buf, "\nold:    ");
	bch2_bkey_val_to_text(&buf, c, old);

	prt_str(&buf, "\nnew:    ");
	bch2_bkey_val_to_text(&buf, c, new);

	prt_str(&buf, "\nwrote:  ");
	bch2_bkey_val_to_text(&buf, c, wrote);

	if (insert) {
		prt_str(&buf, "\ninsert: ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
	}

	trace_move_extent_fail(c, buf.buf);
	printbuf_exit(&buf);
}

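/*
 * Core of the data update path: for each key written by a data update, redo
 * the index update in a btree transaction - drop the pointers being rewritten
 * from the existing extent, drop conflicting and excess replicas, then splice
 * in the pointers we just wrote. If the extent changed too much while the
 * write was in flight, the update is dropped (the "nowork" path) and the race
 * is accounted in the move stats.
 */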
static int __bch2_data_update_index_update(struct btree_trans *trans,
					   struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_iter iter;
	struct data_update *m =
		container_of(op, struct data_update, op);
	struct keylist *keys = &op->insert_keys;
	struct bkey_buf _new, _insert;
	int ret = 0;

	bch2_bkey_buf_init(&_new);
	bch2_bkey_buf_init(&_insert);
	bch2_bkey_buf_realloc(&_insert, c, U8_MAX);

	bch2_trans_iter_init(trans, &iter, m->btree_id,
			     bkey_start_pos(&bch2_keylist_front(keys)->k),
			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	while (1) {
		struct bkey_s_c k;
		struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
		struct bkey_i *insert = NULL;
		struct bkey_i_extent *new;
		const union bch_extent_entry *entry_c;
		union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		struct bch_extent_ptr *ptr;
		const struct bch_extent_ptr *ptr_c;
		struct bpos next_pos;
		bool should_check_enospc;
		s64 i_sectors_delta = 0, disk_sectors_delta = 0;
		unsigned rewrites_found = 0, durability, i;

		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		new = bkey_i_to_extent(bch2_keylist_front(keys));

		if (!bch2_extents_match(k, old)) {
			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
						NULL, "no match:");
			goto nowork;
		}

		bkey_reassemble(_insert.k, k);
		insert = _insert.k;

		bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
		new = bkey_i_to_extent(_new.k);
		bch2_cut_front(iter.pos, &new->k_i);

		bch2_cut_front(iter.pos,	insert);
		bch2_cut_back(new->k.p,		insert);
		bch2_cut_back(insert->k.p,	&new->k_i);

		/*
		 * @old: extent that we read from
		 * @insert: key that we're going to update, initialized from
		 * extent currently in btree - same as @old unless we raced with
		 * other updates
		 * @new: extent with new pointers that we'll be adding to @insert
		 *
		 * First, drop rewrite_ptrs from @new:
		 */
		i = 0;
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
			if (((1U << i) & m->data_opts.rewrite_ptrs) &&
			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
			    !ptr->cached) {
				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
				/*
				 * See comment below:
				bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
				*/
				rewrites_found |= 1U << i;
			}
			i++;
		}

		if (m->data_opts.rewrite_ptrs &&
		    !rewrites_found &&
		    bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
			goto nowork;
		}

		/*
		 * A replica that we just wrote might conflict with a replica
		 * that we want to keep, due to racing with another move:
		 */
restart_drop_conflicting_replicas:
		extent_for_each_ptr(extent_i_to_s(new), ptr)
			if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
			    !ptr_c->cached) {
				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
				goto restart_drop_conflicting_replicas;
			}

		if (!bkey_val_u64s(&new->k)) {
			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
			goto nowork;
		}

		/* Now, drop pointers that conflict with what we just wrote: */
		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
			if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);

		durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
			bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));

		/* Now, drop excess replicas: */
restart_drop_extra_replicas:
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
			unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);

			if (!p.ptr.cached &&
			    durability - ptr_durability >= m->op.opts.data_replicas) {
				durability -= ptr_durability;
				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
				/*
				 * Currently, we're dropping unneeded replicas
				 * instead of marking them as cached, since
				 * cached data in stripe buckets prevents them
				 * from being reused:
				bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
				 */
				goto restart_drop_extra_replicas;
			}
		}

		/* Finally, add the pointers we just wrote: */
		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
			bch2_extent_ptr_decoded_append(insert, &p);

		bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
		bch2_extent_normalize(c, bkey_i_to_s(insert));

		ret = bch2_sum_sector_overwrites(trans, &iter, insert,
						 &should_check_enospc,
						 &i_sectors_delta,
						 &disk_sectors_delta);
		if (ret)
			goto err;

		if (disk_sectors_delta > (s64) op->res.sectors) {
			ret = bch2_disk_reservation_add(c, &op->res,
						disk_sectors_delta - op->res.sectors,
						!should_check_enospc
						? BCH_DISK_RESERVATION_NOFAIL : 0);
			if (ret)
				goto out;
		}

		next_pos = insert->k.p;

		ret =   bch2_insert_snapshot_whiteouts(trans, m->btree_id,
						k.k->p, bkey_start_pos(&insert->k)) ?:
			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
						k.k->p, insert->k.p);
		if (ret)
			goto err;

		ret   = bch2_trans_update(trans, &iter, insert,
				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
			bch2_trans_commit(trans, &op->res,
				NULL,
				BTREE_INSERT_NOCHECK_RW|
				BTREE_INSERT_NOFAIL|
				m->data_opts.btree_insert_flags);
		if (!ret) {
			bch2_btree_iter_set_pos(&iter, next_pos);

			this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
			trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i));
		}
err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
		if (ret)
			break;
next:
		while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
			bch2_keylist_pop_front(keys);
			if (bch2_keylist_empty(keys))
				goto out;
		}
		continue;
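		/*
		 * nowork: the extent we were rewriting changed while the data
		 * write was in flight; account the race and carry on with the
		 * next key.
		 */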
nowork:
		if (m->ctxt && m->ctxt->stats) {
			BUG_ON(k.k->p.offset <= iter.pos.offset);
			atomic64_inc(&m->ctxt->stats->keys_raced);
			atomic64_add(k.k->p.offset - iter.pos.offset,
				     &m->ctxt->stats->sectors_raced);
		}

		this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);

		bch2_btree_iter_advance(&iter);
		goto next;
	}
out:
	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&_insert, c);
	bch2_bkey_buf_exit(&_new, c);
	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
	return ret;
}

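/*
 * Write path entry point: called when a data update's write completes, runs
 * __bch2_data_update_index_update() in its own btree transaction.
 */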
int bch2_data_update_index_update(struct bch_write_op *op)
{
	return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op));
}

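/*
 * Called when the read half of a data update completes: the write bio has
 * taken ownership of the pages that were read into, so take the checksum
 * state from the read and kick off the write.
 */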
void bch2_data_update_read_done(struct data_update *m,
				struct bch_extent_crc_unpacked crc)
{
	/* write bio must own pages: */
	BUG_ON(!m->op.wbio.bio.bi_vcnt);

	m->op.crc = crc;
	m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;

	closure_call(&m->op.cl, bch2_write, NULL, NULL);
}

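/*
 * Tear down a data update: drop the nocow locks and device refs taken in
 * bch2_data_update_init(), then free the key buffer, disk reservation and
 * write bio pages.
 */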
void bch2_data_update_exit(struct data_update *update)
{
	struct bch_fs *c = update->op.c;
	struct bkey_ptrs_c ptrs =
		bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
	const struct bch_extent_ptr *ptr;

	bkey_for_each_ptr(ptrs, ptr) {
		if (c->opts.nocow_enabled)
			bch2_bucket_nocow_unlock(&c->nocow_locks,
						 PTR_BUCKET_POS(c, ptr), 0);
		percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref);
	}

	bch2_bkey_buf_exit(&update->k, c);
	bch2_disk_reservation_put(c, &update->op.res);
	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
}

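/*
 * Updating an unwritten extent doesn't require moving any data: just allocate
 * new space and replace the old reservation with unwritten extents pointing
 * at it, going through the normal index update path.
 */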
void bch2_update_unwritten_extent(struct btree_trans *trans,
				  struct data_update *update)
{
	struct bch_fs *c = update->op.c;
	struct bio *bio = &update->op.wbio.bio;
	struct bkey_i_extent *e;
	struct write_point *wp;
	struct bch_extent_ptr *ptr;
	struct closure cl;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	closure_init_stack(&cl);
	bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);

	while (bio_sectors(bio)) {
		unsigned sectors = bio_sectors(bio);

		bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
				     BTREE_ITER_SLOTS);
		ret = lockrestart_do(trans, ({
			k = bch2_btree_iter_peek_slot(&iter);
			bkey_err(k);
		}));
		bch2_trans_iter_exit(trans, &iter);

		if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
			break;

		e = bkey_extent_init(update->op.insert_keys.top);
		e->k.p = update->op.pos;

		ret = bch2_alloc_sectors_start_trans(trans,
				update->op.target,
				false,
				update->op.write_point,
				&update->op.devs_have,
				update->op.nr_replicas,
				update->op.nr_replicas,
				update->op.watermark,
				0, &cl, &wp);
		if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
			bch2_trans_unlock(trans);
			closure_sync(&cl);
			continue;
		}

		if (ret)
			return;

		sectors = min(sectors, wp->sectors_free);

		bch2_key_resize(&e->k, sectors);

		bch2_open_bucket_get(c, wp, &update->op.open_buckets);
		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
		bch2_alloc_sectors_done(c, wp);

		bio_advance(bio, sectors << 9);
		update->op.pos.offset += sectors;

		extent_for_each_ptr(extent_i_to_s(e), ptr)
			ptr->unwritten = true;
		bch2_keylist_push(&update->op.insert_keys);

		ret = __bch2_data_update_index_update(trans, &update->op);

		bch2_open_buckets_put(c, &update->op.open_buckets);

		if (ret)
			break;
	}

	if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) {
		bch2_trans_unlock(trans);
		closure_sync(&cl);
	}
}

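/*
 * Set up a data update: take a ref on each device the extent lives on,
 * initialize the write op (replica counts, devices to avoid, checksum and
 * nonce for encrypted extents, compression), take nocow locks on the source
 * buckets if in nocow mode, and reserve space for any extra replicas being
 * added. Unwritten extents can't go through the normal write path and are
 * flagged with -BCH_ERR_unwritten_extent_update so the caller can use
 * bch2_update_unwritten_extent() instead.
 */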
int bch2_data_update_init(struct btree_trans *trans,
			  struct moving_context *ctxt,
			  struct data_update *m,
			  struct write_point_specifier wp,
			  struct bch_io_opts io_opts,
			  struct data_update_opts data_opts,
			  enum btree_id btree_id,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	const struct bch_extent_ptr *ptr;
	unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
	unsigned ptrs_locked = 0;
	int ret;

	bch2_bkey_buf_init(&m->k);
	bch2_bkey_buf_reassemble(&m->k, c, k);
	m->btree_id	= btree_id;
	m->data_opts	= data_opts;

	bch2_write_op_init(&m->op, c, io_opts);
	m->op.pos	= bkey_start_pos(k.k);
	m->op.version	= k.k->version;
	m->op.target	= data_opts.target;
	m->op.write_point = wp;
	m->op.nr_replicas = 0;
	m->op.flags	|= BCH_WRITE_PAGES_STABLE|
		BCH_WRITE_PAGES_OWNED|
		BCH_WRITE_DATA_ENCODED|
		BCH_WRITE_MOVE|
		m->data_opts.write_flags;
	m->op.compression_opt	= io_opts.background_compression ?: io_opts.compression;
	m->op.watermark		= m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;

	bkey_for_each_ptr(ptrs, ptr)
		percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);

	i = 0;
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		bool locked;

		if (((1U << i) & m->data_opts.rewrite_ptrs)) {
			BUG_ON(p.ptr.cached);

			if (crc_is_compressed(p.crc))
				reserve_sectors += k.k->size;

			m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
		} else if (!p.ptr.cached) {
			bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
		}

		/*
		 * op->csum_type is normally initialized from the fs/file's
		 * current options - but if an extent is encrypted, we require
		 * that it stays encrypted:
		 */
		if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
			m->op.nonce	= p.crc.nonce + p.crc.offset;
			m->op.csum_type = p.crc.csum_type;
		}

		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
			m->op.incompressible = true;

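		/*
		 * In nocow mode, take the bucket nocow lock on each bucket
		 * we'll be reading from, so in-place (nocow) writes can't race
		 * with the data being moved. With a moving_context we first
		 * wait for in-flight reads to drain before blocking on the
		 * lock.
		 */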
		if (c->opts.nocow_enabled) {
			if (ctxt) {
				move_ctxt_wait_event(ctxt, trans,
						(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
									  PTR_BUCKET_POS(c, &p.ptr), 0)) ||
						!atomic_read(&ctxt->read_sectors));

				if (!locked)
					bch2_bucket_nocow_lock(&c->nocow_locks,
							       PTR_BUCKET_POS(c, &p.ptr), 0);
			} else {
				if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
							       PTR_BUCKET_POS(c, &p.ptr), 0)) {
					ret = -BCH_ERR_nocow_lock_blocked;
					goto err;
				}
			}
			ptrs_locked |= (1U << i);
		}

		i++;
	}

	if (reserve_sectors) {
		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
				m->data_opts.extra_replicas
				? 0
				: BCH_DISK_RESERVATION_NOFAIL);
		if (ret)
			goto err;
	}

	m->op.nr_replicas += m->data_opts.extra_replicas;
	m->op.nr_replicas_required = m->op.nr_replicas;

	BUG_ON(!m->op.nr_replicas);

	/* Special handling required: */
	if (bkey_extent_is_unwritten(k))
		return -BCH_ERR_unwritten_extent_update;
	return 0;
err:
	i = 0;
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if ((1U << i) & ptrs_locked)
			bch2_bucket_nocow_unlock(&c->nocow_locks,
						 PTR_BUCKET_POS(c, &p.ptr), 0);
		percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref);
		i++;
	}

	bch2_bkey_buf_exit(&m->k, c);
	bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
	return ret;
}

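/*
 * Cached pointers can't be rewritten, only dropped: turn any rewrite_ptrs
 * entries that refer to cached pointers into kill_ptrs entries.
 */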
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr;
	unsigned i = 0;

	bkey_for_each_ptr(ptrs, ptr) {
		if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
			opts->kill_ptrs |= 1U << i;
			opts->rewrite_ptrs ^= 1U << i;
		}

		i++;
	}
}