bch2_congested_acct(ca, io_latency, now, rw);
- __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+ __time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}
#endif
struct btree_iter iter;
struct bkey_i *k;
struct bkey_i_inode_v3 *inode;
+ /*
+ * Crazy performance optimization:
+ * Every extent update needs to also update the inode: the inode trigger
+ * will set bi->journal_seq to the journal sequence number of this
+ * transaction - for fsync.
+ *
+ * But if that's the only reason we're updating the inode (we're not
+ * updating bi_size or bi_sectors), then we don't need the inode update
+ * to be journalled - if we crash, the bi_journal_seq update will be
+ * lost, but that's fine.
+ */
unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
int ret;
inode = bkey_i_to_inode_v3(k);
- if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
+ if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
new_i_size > le64_to_cpu(inode->v.bi_size)) {
inode->v.bi_size = cpu_to_le64(new_i_size);
inode_update_flags = 0;
i_sectors_delta) ?:
bch2_trans_update(trans, iter, k, 0) ?:
bch2_trans_commit(trans, disk_res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc);
if (unlikely(ret))
return ret;
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_bkey_set_needs_rebalance(c, sk.k,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
bool nocow)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
- const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
- struct bch_dev *ca;
BUG_ON(c->opts.nochanges);
bkey_for_each_ptr(ptrs, ptr) {
- BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
- !c->devs[ptr->dev]);
+ BUG_ON(!bch2_dev_exists2(c, ptr->dev));
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (to_entry(ptr + 1) < ptrs.end) {
n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
EBUG_ON(op->open_buckets.nr);
- bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+ time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
bch2_disk_reservation_put(c, &op->res);
if (!(op->flags & BCH_WRITE_MOVE))
bch_err_inum_offset_ratelimited(c,
insert->k.p.inode, insert->k.p.offset << 9,
- "write error while doing btree update: %s",
+ "%s write error while doing btree update: %s",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
}
__wp_update_state(wp, state);
}
-static void bch2_write_index(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_write_index)
{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ closure_type(op, struct bch_write_op, cl);
struct write_point *wp = op->wp;
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
op->pos.inode,
wbio->inode_offset << 9,
"data write error: %s",
* checksum:
*/
csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- if (bch2_crc_cmp(op->crc.csum, csum))
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
return -EIO;
ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
*_dst = dst;
return more;
csum_err:
- bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+ bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user");
ret = -EIO;
err:
if (to_wbio(dst)->bounce)
static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
- const struct bch_extent_ptr *ptr;
- struct bkey_i *k;
for_each_keylist_key(&op->insert_keys, k) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
bkey_for_each_ptr(ptrs, ptr)
bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr),
- BUCKET_NOCOW_LOCK_UPDATE);
+ PTR_BUCKET_POS(c, ptr),
+ BUCKET_NOCOW_LOCK_UPDATE);
}
}
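Note that the dropped "const struct bch_extent_ptr *ptr;" declarations above are intentional: bkey_for_each_ptr() (and darray_for_each() used later in this patch) now declare their own loop cursor, so callers no longer need one at the top of the function. A rough, self-contained sketch of the pattern, with a hypothetical array_for_each() macro standing in for the real bcachefs helpers:

	/*
	 * Illustration only: a for-each macro that declares its cursor inside
	 * the for statement, giving each use a correctly-typed, loop-scoped
	 * variable with no separate declaration in the caller.
	 */
	#define array_for_each(_arr, _nr, _i)					\
		for (typeof(&(_arr)[0]) _i = (_arr); _i < (_arr) + (_nr); _i++)

	int vals[4] = { 1, 2, 3, 4 }, sum = 0;

	array_for_each(vals, 4, i)
		sum += *i;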
struct bkey_s_c k,
u64 new_i_size)
{
- struct bkey_i *new;
- struct bkey_ptrs ptrs;
- struct bch_extent_ptr *ptr;
- int ret;
-
if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
/* trace this */
return 0;
}
- new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
bch2_cut_front(bkey_start_pos(&orig->k), new);
bch2_cut_back(orig->k.p, new);
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_for_each_ptr(ptrs, ptr)
ptr->unwritten = 0;
{
struct bch_fs *c = op->c;
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i *orig;
- struct bkey_s_c k;
- int ret;
for_each_keylist_key(&op->insert_keys, orig) {
- ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_INTENT, k,
- NULL, NULL, BTREE_INSERT_NOFAIL, ({
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
bch_err_inum_offset_ratelimited(c,
insert->k.p.inode, insert->k.p.offset << 9,
- "write error while doing btree update: %s",
+ "%s write error while doing btree update: %s",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
}
bch2_nocow_write_convert_unwritten(op);
}
-static void bch2_nocow_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_nocow_write_done)
{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ closure_type(op, struct bch_write_op, cl);
__bch2_nocow_write_done(op);
bch2_write_done(cl);
}
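The bch2_write_index(), bch2_nocow_write_done() and bch2_write() conversions switch these closure callbacks to the CLOSURE_CALLBACK()/closure_type() helpers, which turn each callback into a workqueue function and recover the containing write op from the embedded closure. Roughly (see include/linux/closure.h for the authoritative definitions), the helpers expand along these lines:

	/* Approximate expansion, for orientation only: */
	#define CLOSURE_CALLBACK(name)	void name(struct work_struct *ws)

	#define closure_type(name, type, member)				\
		struct closure *cl = container_of(ws, struct closure, work);	\
		type *name = container_of(cl, type, member)

This is also why the direct invocation later in this patch passes &op->cl.work rather than &op->cl.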
+struct bucket_to_lock {
+ struct bpos b;
+ unsigned gen;
+ struct nocow_lock_bucket *l;
+};
+
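With struct bucket_to_lock pulled out, the on-stack buckets[BCH_REPLICAS_MAX] array below becomes a DARRAY_PREALLOCATED() that keeps three entries inline and only touches the allocator when an extent carries more pointers than that. A minimal sketch of the idea (bucket_darray and bucket_darray_push() are hypothetical names for illustration; the real implementation is bcachefs's darray.h):

	/*
	 * Illustration of the preallocated-darray idea, not the darray.h
	 * implementation: entries live in a small inline buffer until it
	 * fills up, after which they are copied to a heap allocation.
	 */
	struct bucket_darray {
		size_t				nr, size;
		struct bucket_to_lock		*data;
		struct bucket_to_lock		preallocated[3];
	};

	static int bucket_darray_push(struct bucket_darray *d,
				      struct bucket_to_lock v, gfp_t gfp)
	{
		if (!d->data) {			/* first use: point at inline storage */
			d->data = d->preallocated;
			d->size = ARRAY_SIZE(d->preallocated);
		}

		if (d->nr == d->size) {		/* grow onto the heap */
			size_t new_size = d->size * 2;
			struct bucket_to_lock *n =
				kmalloc_array(new_size, sizeof(*n), gfp);

			if (!n)
				return -ENOMEM;
			memcpy(n, d->data, d->nr * sizeof(*n));
			if (d->data != d->preallocated)
				kfree(d->data);
			d->data = n;
			d->size = new_size;
		}

		d->data[d->nr++] = v;
		return 0;
	}

darray_for_each(buckets, i) then hands the loop body a pointer cursor, which is why the lock/unlock loops below use i->b and i->l rather than buckets[i].b.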
static void bch2_nocow_write(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
- struct {
- struct bpos b;
- unsigned gen;
- struct nocow_lock_bucket *l;
- } buckets[BCH_REPLICAS_MAX];
- unsigned nr_buckets = 0;
+ DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
u32 snapshot;
- int ret, i;
+ struct bucket_to_lock *stale_at;
+ int ret;
if (op->flags & BCH_WRITE_MOVE)
return;
+ darray_init(&buckets);
trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
while (1) {
struct bio *bio = &op->wbio.bio;
- nr_buckets = 0;
+ buckets.nr = 0;
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
break;
if (bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- k.k->u64s))
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ k.k->u64s))
break;
/* Get iorefs before dropping btree locks: */
- ptrs = bch2_bkey_ptrs_c(k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
- buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
- buckets[nr_buckets].gen = ptr->gen;
- buckets[nr_buckets].l =
- bucket_nocow_lock(&c->nocow_locks,
- bucket_to_u64(buckets[nr_buckets].b));
-
- prefetch(buckets[nr_buckets].l);
+ struct bpos b = PTR_BUCKET_POS(c, ptr);
+ struct nocow_lock_bucket *l =
+ bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
+ prefetch(l);
if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
goto err_get_ioref;
- nr_buckets++;
+ /* XXX allocating memory with btree locks held - rare */
+ darray_push_gfp(&buckets, ((struct bucket_to_lock) {
+ .b = b, .gen = ptr->gen, .l = l,
+ }), GFP_KERNEL|__GFP_NOFAIL);
if (ptr->unwritten)
op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
- for (i = 0; i < nr_buckets; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
- struct nocow_lock_bucket *l = buckets[i].l;
- bool stale;
+ darray_for_each(buckets, i) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode);
- __bch2_bucket_nocow_lock(&c->nocow_locks, l,
- bucket_to_u64(buckets[i].b),
+ __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
+ bucket_to_u64(i->b),
BUCKET_NOCOW_LOCK_UPDATE);
rcu_read_lock();
- stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
+ bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen);
rcu_read_unlock();
- if (unlikely(stale))
+ if (unlikely(stale)) {
+ stale_at = i;
goto err_bucket_stale;
+ }
}
bio = &op->wbio.bio;
if (ret) {
bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "%s: btree lookup error %s",
- __func__, bch2_err_str(ret));
+ op->pos.inode, op->pos.offset << 9,
+ "%s: btree lookup error %s", __func__, bch2_err_str(ret));
op->error = ret;
op->flags |= BCH_WRITE_DONE;
}
bch2_trans_put(trans);
+ darray_exit(&buckets);
/* fallback to cow write path? */
if (!(op->flags & BCH_WRITE_DONE)) {
op->insert_keys.top = op->insert_keys.keys;
} else if (op->flags & BCH_WRITE_SYNC) {
closure_sync(&op->cl);
- bch2_nocow_write_done(&op->cl);
+ bch2_nocow_write_done(&op->cl.work);
} else {
/*
* XXX
}
return;
err_get_ioref:
- for (i = 0; i < nr_buckets; i++)
- percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
+ darray_for_each(buckets, i)
+ percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref);
/* Fall back to COW path: */
goto out;
err_bucket_stale:
- while (i >= 0) {
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- buckets[i].b,
- BUCKET_NOCOW_LOCK_UPDATE);
- --i;
+ darray_for_each(buckets, i) {
+ bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
+ if (i == stale_at)
+ break;
}
- for (i = 0; i < nr_buckets; i++)
- percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
/* We can retry this: */
ret = -BCH_ERR_transaction_restart;
- goto out;
+ goto err_get_ioref;
}
static void __bch2_write(struct bch_write_op *op)
op->flags |= BCH_WRITE_DONE;
if (ret < 0) {
+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "%s(): %s error: %s", __func__,
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
+ bch2_err_str(ret));
op->error = ret;
break;
}
* If op->discard is true, instead of inserting the data it invalidates the
* region of the cache represented by op->bio and op->inode.
*/
-void bch2_write(struct closure *cl)
+CLOSURE_CALLBACK(bch2_write)
{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ closure_type(op, struct bch_write_op, cl);
struct bio *bio = &op->wbio.bio;
struct bch_fs *c = op->c;
unsigned data_len;
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(bio)->put_bio = false;
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
- "misaligned write");
+ "%s write error: misaligned write",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user");
op->error = -EIO;
goto err;
}