1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * io_misc.c - fallocate, fpunch, truncate:
4  */
5
6 #include "bcachefs.h"
7 #include "alloc_foreground.h"
8 #include "bkey_buf.h"
9 #include "btree_update.h"
10 #include "buckets.h"
11 #include "clock.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "extent_update.h"
15 #include "inode.h"
16 #include "io_misc.h"
17 #include "io_write.h"
18 #include "logged_ops.h"
19 #include "rebalance.h"
20 #include "subvolume.h"
21
22 /* Overwrites whatever was present with zeroes: */
23 int bch2_extent_fallocate(struct btree_trans *trans,
24                           subvol_inum inum,
25                           struct btree_iter *iter,
26                           u64 sectors,
27                           struct bch_io_opts opts,
28                           s64 *i_sectors_delta,
29                           struct write_point_specifier write_point)
30 {
31         struct bch_fs *c = trans->c;
32         struct disk_reservation disk_res = { 0 };
33         struct closure cl;
34         struct open_buckets open_buckets = { 0 };
35         struct bkey_s_c k;
36         struct bkey_buf old, new;
37         unsigned sectors_allocated = 0;
38         bool have_reservation = false;
39         bool unwritten = opts.nocow &&
40             c->sb.version >= bcachefs_metadata_version_unwritten_extents;
41         int ret;
42
43         bch2_bkey_buf_init(&old);
44         bch2_bkey_buf_init(&new);
45         closure_init_stack(&cl);
46
47         k = bch2_btree_iter_peek_slot(iter);
48         ret = bkey_err(k);
49         if (ret)
50                 return ret;
51
52         sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
53
54         if (!have_reservation) {
55                 unsigned new_replicas =
56                         max(0, (int) opts.data_replicas -
57                             (int) bch2_bkey_nr_ptrs_fully_allocated(k));
58                 /*
59                  * Get a disk reservation before (in the nocow case) calling
60                  * into the allocator:
61                  */
62                 ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
63                 if (unlikely(ret))
64                         goto err;
65
66                 bch2_bkey_buf_reassemble(&old, c, k);
67         }
68
69         if (have_reservation) {
70                 if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
71                         goto err;
72
73                 bch2_key_resize(&new.k->k, sectors);
74         } else if (!unwritten) {
75                 struct bkey_i_reservation *reservation;
76
77                 bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
78                 reservation = bkey_reservation_init(new.k);
79                 reservation->k.p = iter->pos;
80                 bch2_key_resize(&reservation->k, sectors);
81                 reservation->v.nr_replicas = opts.data_replicas;
82         } else {
83                 struct bkey_i_extent *e;
84                 struct bch_devs_list devs_have;
85                 struct write_point *wp;
86                 struct bch_extent_ptr *ptr;
87
88                 devs_have.nr = 0;
89
90                 bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
91
92                 e = bkey_extent_init(new.k);
93                 e->k.p = iter->pos;
94
95                 ret = bch2_alloc_sectors_start_trans(trans,
96                                 opts.foreground_target,
97                                 false,
98                                 write_point,
99                                 &devs_have,
100                                 opts.data_replicas,
101                                 opts.data_replicas,
102                                 BCH_WATERMARK_normal, 0, &cl, &wp);
103                 if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
104                         ret = -BCH_ERR_transaction_restart_nested;
105                 if (ret)
106                         goto err;
107
108                 sectors = min_t(u64, sectors, wp->sectors_free);
109                 sectors_allocated = sectors;
110
111                 bch2_key_resize(&e->k, sectors);
112
113                 bch2_open_bucket_get(c, wp, &open_buckets);
114                 bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
115                 bch2_alloc_sectors_done(c, wp);
116
117                 extent_for_each_ptr(extent_i_to_s(e), ptr)
118                         ptr->unwritten = true;
119         }
120
121         have_reservation = true;
122
123         ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
124                                  0, i_sectors_delta, true);
125 err:
126         if (!ret && sectors_allocated)
127                 bch2_increment_clock(c, sectors_allocated, WRITE);
128
129         bch2_open_buckets_put(c, &open_buckets);
130         bch2_disk_reservation_put(c, &disk_res);
131         bch2_bkey_buf_exit(&new, c);
132         bch2_bkey_buf_exit(&old, c);
133
134         if (closure_nr_remaining(&cl) != 1) {
135                 bch2_trans_unlock(trans);
136                 closure_sync(&cl);
137         }
138
139         return ret;
140 }
141
/*
 * Returns -BCH_ERR_transaction_restart if we had to drop locks:
 */
/*
 * Delete extents on @iter's inode from the iterator's position up to @end,
 * accumulating the change in allocated sectors in *i_sectors_delta.
 *
 * Transaction restarts are handled internally by restarting the loop; the
 * last restart error is preserved in ret2 so the caller can tell that locks
 * were dropped (see the comment above).
 */
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
		   subvol_inum inum, u64 end,
		   s64 *i_sectors_delta)
{
	struct bch_fs *c	= trans->c;
	/* Largest key size that stays block aligned: */
	unsigned max_sectors	= KEY_SIZE_MAX & (~0 << c->block_bits);
	struct bpos end_pos = POS(inum.inum, end);
	struct bkey_s_c k;
	int ret = 0, ret2 = 0;
	u32 snapshot;

	while (!ret ||
	       bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		struct disk_reservation disk_res =
			bch2_disk_reservation_init(c, 0);
		struct bkey_i delete;

		/* Remember that we had to restart (i.e. dropped locks): */
		if (ret)
			ret2 = ret;

		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			continue;

		bch2_btree_iter_set_snapshot(iter, snapshot);

		/*
		 * peek_upto() doesn't have ideal semantics for extents:
		 */
		k = bch2_btree_iter_peek_upto(iter, end_pos);
		if (!k.k)
			break;

		ret = bkey_err(k);
		if (ret)
			continue;

		/* Whiteout covering as much of [pos, end) as possible: */
		bkey_init(&delete.k);
		delete.k.p = iter->pos;

		/* create the biggest key we can */
		bch2_key_resize(&delete.k, max_sectors);
		bch2_cut_back(end_pos, &delete);

		ret = bch2_extent_update(trans, inum, iter, &delete,
				&disk_res, 0, i_sectors_delta, false);
		bch2_disk_reservation_put(c, &disk_res);
	}

	return ret ?: ret2;
}
198
199 int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
200                 s64 *i_sectors_delta)
201 {
202         struct btree_trans *trans = bch2_trans_get(c);
203         struct btree_iter iter;
204         int ret;
205
206         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
207                              POS(inum.inum, start),
208                              BTREE_ITER_INTENT);
209
210         ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
211
212         bch2_trans_iter_exit(trans, &iter);
213         bch2_trans_put(trans);
214
215         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
216                 ret = 0;
217
218         return ret;
219 }
220
221 /* truncate: */
222
223 void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
224 {
225         struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
226
227         prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
228         prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
229         prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
230 }
231
232 static int truncate_set_isize(struct btree_trans *trans,
233                               subvol_inum inum,
234                               u64 new_i_size)
235 {
236         struct btree_iter iter = { NULL };
237         struct bch_inode_unpacked inode_u;
238         int ret;
239
240         ret   = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
241                 (inode_u.bi_size = new_i_size, 0) ?:
242                 bch2_inode_write(trans, &iter, &inode_u);
243
244         bch2_trans_iter_exit(trans, &iter);
245         return ret;
246 }
247
/*
 * Execute (or resume after a crash) a logged truncate: first commit the new
 * i_size, then punch all extents past the (block aligned) new EOF. The
 * logged op key is deleted at the end regardless of success, via
 * bch2_logged_op_finish().
 *
 * @i_sectors_delta may be NULL (recovery path).
 */
static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
					    struct bkey_i *op_k,
					    u64 *i_sectors_delta)
{
	struct bch_fs *c = trans->c;
	struct btree_iter fpunch_iter;
	struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
	u64 new_i_size = le64_to_cpu(op->v.new_i_size);
	int ret;

	/* i_size must be updated before we start deleting extents: */
	ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			truncate_set_isize(trans, inum, new_i_size));
	if (ret)
		goto err;

	/* Punch from the first block boundary at/after new_i_size to EOF: */
	bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
			     POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
			     BTREE_ITER_INTENT);
	ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
	bch2_trans_iter_exit(trans, &fpunch_iter);

	/* Restarts were handled inside bch2_fpunch_at(): */
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		ret = 0;
err:
	bch2_logged_op_finish(trans, op_k);
	return ret;
}
276
/*
 * Recovery entry point: resume a truncate found in the logged ops btree.
 * No i_sectors_delta reporting in this path.
 */
int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
{
	return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
}
281
282 int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
283 {
284         struct bkey_i_logged_op_truncate op;
285
286         bkey_logged_op_truncate_init(&op.k_i);
287         op.v.subvol     = cpu_to_le32(inum.subvol);
288         op.v.inum       = cpu_to_le64(inum.inum);
289         op.v.new_i_size = cpu_to_le64(new_i_size);
290
291         /*
292          * Logged ops aren't atomic w.r.t. snapshot creation: creating a
293          * snapshot while they're in progress, then crashing, will result in the
294          * resume only proceeding in one of the snapshots
295          */
296         down_read(&c->snapshot_create_lock);
297         int ret = bch2_trans_run(c,
298                 bch2_logged_op_start(trans, &op.k_i) ?:
299                 __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
300         up_read(&c->snapshot_create_lock);
301
302         return ret;
303 }
304
305 /* finsert/fcollapse: */
306
307 void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
308 {
309         struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
310
311         prt_printf(out, "subvol=%u",            le32_to_cpu(op.v->subvol));
312         prt_printf(out, " inum=%llu",           le64_to_cpu(op.v->inum));
313         prt_printf(out, " dst_offset=%lli",     le64_to_cpu(op.v->dst_offset));
314         prt_printf(out, " src_offset=%llu",     le64_to_cpu(op.v->src_offset));
315 }
316
317 static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
318 {
319         struct btree_iter iter;
320         struct bch_inode_unpacked inode_u;
321         int ret;
322
323         offset  <<= 9;
324         len     <<= 9;
325
326         ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
327         if (ret)
328                 return ret;
329
330         if (len > 0) {
331                 if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
332                         ret = -EFBIG;
333                         goto err;
334                 }
335
336                 if (offset >= inode_u.bi_size) {
337                         ret = -EINVAL;
338                         goto err;
339                 }
340         }
341
342         inode_u.bi_size += len;
343         inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
344
345         ret = bch2_inode_write(trans, &iter, &inode_u);
346 err:
347         bch2_trans_iter_exit(trans, &iter);
348         return ret;
349 }
350
/*
 * Execute (or resume after a crash) a logged finsert/fcollapse: shift every
 * extent past src_offset by shift = dst_offset - src_offset sectors
 * (positive shift = insert, negative = collapse), keeping i_size and the
 * logged op key up to date so the operation survives a crash.
 *
 * The state machine (op->v.state) has three phases: start (adjust i_size /
 * punch the collapsed range), shift_extents (the main loop), finish.
 * @i_sectors_delta may be NULL (recovery path).
 */
static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
					   struct bkey_i *op_k,
					   u64 *i_sectors_delta)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
	struct bch_io_opts opts;
	u64 dst_offset = le64_to_cpu(op->v.dst_offset);
	u64 src_offset = le64_to_cpu(op->v.src_offset);
	s64 shift = dst_offset - src_offset;
	u64 len = abs(shift);
	/* pos: persisted progress cursor for the shift loop */
	u64 pos = le64_to_cpu(op->v.pos);
	bool insert = shift > 0;
	int ret = 0;

	ret = bch2_inum_opts_get(trans, inum, &opts);
	if (ret)
		return ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, 0),
			     BTREE_ITER_INTENT);

	switch (op->v.state) {
case LOGGED_OP_FINSERT_start:
	op->v.state = LOGGED_OP_FINSERT_shift_extents;

	if (insert) {
		/* finsert: grow i_size before shifting extents up: */
		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
				adjust_i_size(trans, inum, src_offset, len) ?:
				bch2_logged_op_update(trans, &op->k_i));
		if (ret)
			goto err;
	} else {
		/* fcollapse: first punch out the range being collapsed: */
		bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));

		ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto err;

		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
				bch2_logged_op_update(trans, &op->k_i));
	}

	fallthrough;
case LOGGED_OP_FINSERT_shift_extents:
	while (1) {
		struct disk_reservation disk_res =
			bch2_disk_reservation_init(c, 0);
		struct bkey_i delete, *copy;
		struct bkey_s_c k;
		struct bpos src_pos = POS(inum.inum, src_offset);
		u32 snapshot;

		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto btree_err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);
		bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));

		/* insert walks backwards from the end; collapse walks forwards: */
		k = insert
			? bch2_btree_iter_peek_prev(&iter)
			: bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
		if ((ret = bkey_err(k)))
			goto btree_err;

		/* Done once we've moved everything past src_offset: */
		if (!k.k ||
		    k.k->p.inode != inum.inum ||
		    bkey_le(k.k->p, POS(inum.inum, src_offset)))
			break;

		copy = bch2_bkey_make_mut_noupdate(trans, k);
		if ((ret = PTR_ERR_OR_ZERO(copy)))
			goto btree_err;

		if (insert &&
		    bkey_lt(bkey_start_pos(k.k), src_pos)) {
			bch2_cut_front(src_pos, copy);

			/* Splitting compressed extent? */
			bch2_disk_reservation_add(c, &disk_res,
					copy->k.size *
					bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
					BCH_DISK_RESERVATION_NOFAIL);
		}

		/* Whiteout over the extent's old location: */
		bkey_init(&delete.k);
		delete.k.p = copy->k.p;
		delete.k.p.snapshot = snapshot;
		delete.k.size = copy->k.size;

		copy->k.p.offset += shift;
		copy->k.p.snapshot = snapshot;

		/* Persist progress so a crash resumes from this position: */
		op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);

		/* delete + re-insert + op update, all in one commit: */
		ret =   bch2_bkey_set_needs_rebalance(c, copy,
					opts.background_target,
					opts.background_compression) ?:
			bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
			bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
			bch2_logged_op_update(trans, &op->k_i) ?:
			bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
btree_err:
		bch2_disk_reservation_put(c, &disk_res);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			goto err;

		/* Only advance pos after a successful commit: */
		pos = le64_to_cpu(op->v.pos);
	}

	op->v.state = LOGGED_OP_FINSERT_finish;

	if (!insert) {
		/* fcollapse: shrink i_size now that extents have moved down: */
		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
				adjust_i_size(trans, inum, src_offset, shift) ?:
				bch2_logged_op_update(trans, &op->k_i));
	} else {
		/* We need an inode update to update bi_journal_seq for fsync: */
		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
				adjust_i_size(trans, inum, 0, 0) ?:
				bch2_logged_op_update(trans, &op->k_i));
	}

	break;
case LOGGED_OP_FINSERT_finish:
	break;
	}
err:
	bch2_logged_op_finish(trans, op_k);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
492
/*
 * Recovery entry point: resume a finsert/fcollapse found in the logged ops
 * btree. No i_sectors_delta reporting in this path.
 */
int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
{
	return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
}
497
498 int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
499                            u64 offset, u64 len, bool insert,
500                            s64 *i_sectors_delta)
501 {
502         struct bkey_i_logged_op_finsert op;
503         s64 shift = insert ? len : -len;
504
505         bkey_logged_op_finsert_init(&op.k_i);
506         op.v.subvol     = cpu_to_le32(inum.subvol);
507         op.v.inum       = cpu_to_le64(inum.inum);
508         op.v.dst_offset = cpu_to_le64(offset + shift);
509         op.v.src_offset = cpu_to_le64(offset);
510         op.v.pos        = cpu_to_le64(insert ? U64_MAX : offset);
511
512         /*
513          * Logged ops aren't atomic w.r.t. snapshot creation: creating a
514          * snapshot while they're in progress, then crashing, will result in the
515          * resume only proceeding in one of the snapshots
516          */
517         down_read(&c->snapshot_create_lock);
518         int ret = bch2_trans_run(c,
519                 bch2_logged_op_start(trans, &op.k_i) ?:
520                 __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
521         up_read(&c->snapshot_create_lock);
522
523         return ret;
524 }