Message ID | 20231219000815.2739120-7-bvanassche@acm.org |
---|---|
State | Superseded |
Headers | show |
Series | Pass data lifetime information to SCSI disk devices | expand |
On Mon, Dec 18, 2023 at 04:07:39PM -0800, Bart Van Assche wrote: > Write hints applied with F_SET_RW_HINT on a block device affect the > shmem inode only. Propagate these hints to the block device inode > because that is the inode used when writing back dirty pages. What shmem inode? > @@ -317,6 +318,9 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, > > inode_lock(inode); > inode->i_write_hint = hint; > + apply_whint = inode->i_fop->apply_whint; > + if (apply_whint) > + apply_whint(file, hint); Setting the hint in file->f_mapping->inode is the right thing here, not adding a method.
On 12/27/23 23:12, Christoph Hellwig wrote: > On Mon, Dec 18, 2023 at 04:07:39PM -0800, Bart Van Assche wrote: >> Write hints applied with F_SET_RW_HINT on a block device affect the >> shmem inode only. Propagate these hints to the block device inode >> because that is the inode used when writing back dirty pages. > > What shmem inode? The inode associated with the /dev file, e.g. /dev/sda. That is another inode than the inode associated with the struct block_device instance. Without this patch, when opening /dev/sda and calling fcntl(), the shmem inode is modified but the struct block_device inode not. I think that the code path for allocation of the shmem inode is as follows: shmem_mknod() shmem_get_inode() __shmem_get_inode() new_inode(sb) alloc_inode(sb) ops->alloc_inode(sb) = shmem_alloc_inode(sb) inode_init_always(sb, inode) inode->i_mapping = &inode->i_data; >> @@ -317,6 +318,9 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, >> >> inode_lock(inode); >> inode->i_write_hint = hint; >> + apply_whint = inode->i_fop->apply_whint; >> + if (apply_whint) >> + apply_whint(file, hint); > > Setting the hint in file->f_mapping->inode is the right thing here, > not adding a method. Is my understanding correct that the only way to reach the struct block_device instance from the shmem code is by dereferencing file->private_data? Shouldn't all dereferences of that pointer happen in source file block/fops.c since the file->private_data pointer is assigned in that file? Please note that suggestions to improve this patch are definitely welcome. As you probably know I'm not that familiar with the filesystem code. Thanks, Bart.
On Thu, Dec 28, 2023 at 02:41:59PM -0800, Bart Van Assche wrote: > On 12/27/23 23:12, Christoph Hellwig wrote: >> On Mon, Dec 18, 2023 at 04:07:39PM -0800, Bart Van Assche wrote: >>> Write hints applied with F_SET_RW_HINT on a block device affect the >>> shmem inode only. Propagate these hints to the block device inode >>> because that is the inode used when writing back dirty pages. >> >> What shmem inode? > > The inode associated with the /dev file, e.g. /dev/sda. That is another > inode than the inode associated with the struct block_device instance. > Without this patch, when opening /dev/sda and calling fcntl(), the shmem > inode is modified but the struct block_device inode not. I think that > the code path for allocation of the shmem inode is as follows: So the block device node. That can sit on any file system (or at least any Unix-y file system that supports device nodes). >>> @@ -317,6 +318,9 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, >>> inode_lock(inode); >>> inode->i_write_hint = hint; >>> + apply_whint = inode->i_fop->apply_whint; >>> + if (apply_whint) >>> + apply_whint(file, hint); >> >> Setting the hint in file->f_mapping->inode is the right thing here, >> not adding a method. > > Is my understanding correct that the only way to reach the struct > block_device instance from the shmem code is by dereferencing > file->private_data? No. See blkdev_open: filp->f_mapping = handle->bdev->bd_inode->i_mapping; So you can use file->f_mapping->inode as I said in my previous mail.
On 1/3/24 01:02, Christoph Hellwig wrote:
> So you can use file->f_mapping->inode as I said in my previous mail.
Since struct address_space does not have a member with the name "inode",
I assume that you meant "host" instead of "inode"? If so, how about
modifying patch 06 of this series as shown below? With the patch below
my tests still pass.
Thanks,
Bart.
---
block/fops.c | 11 -----------
fs/fcntl.c | 15 +++++++++++----
include/linux/fs.h | 1 -
3 files changed, 11 insertions(+), 16 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index 138b388b5cb1..787ce52bc2c6 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -620,16 +620,6 @@ static int blkdev_release(struct inode *inode,
struct file *filp)
return 0;
}
-static void blkdev_apply_whint(struct file *file, enum rw_hint hint)
-{
- struct bdev_handle *handle = file->private_data;
- struct inode *bd_inode = handle->bdev->bd_inode;
-
- inode_lock(bd_inode);
- bd_inode->i_write_hint = hint;
- inode_unlock(bd_inode);
-}
-
static ssize_t
blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
@@ -864,7 +854,6 @@ const struct file_operations def_blk_fops = {
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
- .apply_whint = blkdev_apply_whint,
};
static __init int blkdev_init(void)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 18407bf5bb9b..cfb52c3a4577 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -306,7 +306,6 @@ static long fcntl_get_rw_hint(struct file *file,
unsigned int cmd,
static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
unsigned long arg)
{
- void (*apply_whint)(struct file *, enum rw_hint);
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
u64 hint;
@@ -318,11 +317,19 @@ static long fcntl_set_rw_hint(struct file *file,
unsigned int cmd,
inode_lock(inode);
inode->i_write_hint = hint;
- apply_whint = inode->i_fop->apply_whint;
- if (apply_whint)
- apply_whint(file, hint);
inode_unlock(inode);
+ /*
+ * file->f_mapping->host may differ from inode. As an example,
+ * blkdev_open() modifies file->f_mapping.
+ */
+ if (file->f_mapping->host != inode) {
+ inode = file->f_mapping->host;
+ inode_lock(inode);
+ inode->i_write_hint = hint;
+ inode_unlock(inode);
+ }
+
return 0;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 293017ea2466..a08014b68d6e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1944,7 +1944,6 @@ struct file_operations {
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
unsigned int poll_flags);
- void (*apply_whint)(struct file *, enum rw_hint hint);
} __randomize_layout;
/* Wrap a directory iterator that needs exclusive inode access */
On 1/18/24 10:51, Kanchan Joshi wrote: > Are you considering to change this so that hint is set only on one inode > (and not on two)? > IOW, should not this fragment be like below: > > --- a/fs/fcntl.c > +++ b/fs/fcntl.c > @@ -306,7 +306,6 @@ static long fcntl_get_rw_hint(struct file *file, > unsigned int cmd, > static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, > unsigned long arg) > { > - void (*apply_whint)(struct file *, enum rw_hint); > struct inode *inode = file_inode(file); > u64 __user *argp = (u64 __user *)arg; > u64 hint; > @@ -316,11 +315,15 @@ static long fcntl_set_rw_hint(struct file *file, > unsigned int cmd, > if (!rw_hint_valid(hint)) > return -EINVAL; > > + /* > + * file->f_mapping->host may differ from inode. As an example > + * blkdev_open() modifies file->f_mapping > + */ > + if (file->f_mapping->host != inode) > + inode = file->f_mapping->host; > + > inode_lock(inode); > inode->i_write_hint = hint; > - apply_whint = inode->i_fop->apply_whint; > - if (apply_whint) > - apply_whint(file, hint); > inode_unlock(inode); I think the above proposal would introduce a bug: it would break the F_GET_RW_HINT implementation. Thanks, Bart.
On 1/19/2024 12:24 AM, Bart Van Assche wrote: > On 1/18/24 10:51, Kanchan Joshi wrote: >> Are you considering to change this so that hint is set only on one inode >> (and not on two)? >> IOW, should not this fragment be like below: >> >> --- a/fs/fcntl.c >> +++ b/fs/fcntl.c >> @@ -306,7 +306,6 @@ static long fcntl_get_rw_hint(struct file *file, >> unsigned int cmd, >> static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, >> unsigned long arg) >> { >> - void (*apply_whint)(struct file *, enum rw_hint); >> struct inode *inode = file_inode(file); >> u64 __user *argp = (u64 __user *)arg; >> u64 hint; >> @@ -316,11 +315,15 @@ static long fcntl_set_rw_hint(struct file *file, >> unsigned int cmd, >> if (!rw_hint_valid(hint)) >> return -EINVAL; >> >> + /* >> + * file->f_mapping->host may differ from inode. As an example >> + * blkdev_open() modifies file->f_mapping >> + */ >> + if (file->f_mapping->host != inode) >> + inode = file->f_mapping->host; >> + >> inode_lock(inode); >> inode->i_write_hint = hint; >> - apply_whint = inode->i_fop->apply_whint; >> - if (apply_whint) >> - apply_whint(file, hint); >> inode_unlock(inode); > > I think the above proposal would introduce a bug: it would break the > F_GET_RW_HINT implementation. Right. I expected to keep the exact change in GET, too, but that will not be free from the side-effect. The buffered-write path (block_write_full_page) picks the hint from one inode, and the direct-write path (__blkdev_direct_IO_simple) picks the hint from a different inode. So, updating both seems needed here.
On 1/19/2024 7:26 PM, Kanchan Joshi wrote: > On 1/19/2024 12:24 AM, Bart Van Assche wrote: >> On 1/18/24 10:51, Kanchan Joshi wrote: >>> Are you considering to change this so that hint is set only on one inode >>> (and not on two)? >>> IOW, should not this fragment be like below: >>> >>> --- a/fs/fcntl.c >>> +++ b/fs/fcntl.c >>> @@ -306,7 +306,6 @@ static long fcntl_get_rw_hint(struct file *file, >>> unsigned int cmd, >>> static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, >>> unsigned long arg) >>> { >>> - void (*apply_whint)(struct file *, enum rw_hint); >>> struct inode *inode = file_inode(file); >>> u64 __user *argp = (u64 __user *)arg; >>> u64 hint; >>> @@ -316,11 +315,15 @@ static long fcntl_set_rw_hint(struct file *file, >>> unsigned int cmd, >>> if (!rw_hint_valid(hint)) >>> return -EINVAL; >>> >>> + /* >>> + * file->f_mapping->host may differ from inode. As an example >>> + * blkdev_open() modifies file->f_mapping >>> + */ >>> + if (file->f_mapping->host != inode) >>> + inode = file->f_mapping->host; >>> + >>> inode_lock(inode); >>> inode->i_write_hint = hint; >>> - apply_whint = inode->i_fop->apply_whint; >>> - if (apply_whint) >>> - apply_whint(file, hint); >>> inode_unlock(inode); >> >> I think the above proposal would introduce a bug: it would break the >> F_GET_RW_HINT implementation. > > Right. I expected to keep the exact change in GET, too, but that will > not be free from the side-effect. > The buffered-write path (block_write_full_page) picks the hint from one > inode, and the direct-write path (__blkdev_direct_IO_simple) picks the > hint from a different inode. > So, updating both seems needed here. I stand corrected. It's possible to do away with two updates. The direct-io code (patch 8) should rather be changed to pick the hint from bdev inode (and not from file inode). With that change, this patch only need to set the hint into only one inode (bdev one). What do you think?
On 1/22/24 01:31, Kanchan Joshi wrote: > On 1/19/2024 7:26 PM, Kanchan Joshi wrote: >> On 1/19/2024 12:24 AM, Bart Van Assche wrote: >>> I think the above proposal would introduce a bug: it would break the >>> F_GET_RW_HINT implementation. >> >> Right. I expected to keep the exact change in GET, too, but that will >> not be free from the side-effect. >> The buffered-write path (block_write_full_page) picks the hint from one >> inode, and the direct-write path (__blkdev_direct_IO_simple) picks the >> hint from a different inode. >> So, updating both seems needed here. > > I stand corrected. It's possible to do away with two updates. > The direct-io code (patch 8) should rather be changed to pick the hint > from bdev inode (and not from file inode). > With that change, this patch only need to set the hint into only one > inode (bdev one). What do you think? I think that would break direct I/O submitted by a filesystem. Bart.
On 1/23/2024 1:39 AM, Bart Van Assche wrote: > On 1/22/24 01:31, Kanchan Joshi wrote: >> On 1/19/2024 7:26 PM, Kanchan Joshi wrote: >>> On 1/19/2024 12:24 AM, Bart Van Assche wrote: >>>> I think the above proposal would introduce a bug: it would break the >>>> F_GET_RW_HINT implementation. >>> >>> Right. I expected to keep the exact change in GET, too, but that will >>> not be free from the side-effect. >>> The buffered-write path (block_write_full_page) picks the hint from one >>> inode, and the direct-write path (__blkdev_direct_IO_simple) picks the >>> hint from a different inode. >>> So, updating both seems needed here. >> >> I stand corrected. It's possible to do away with two updates. >> The direct-io code (patch 8) should rather be changed to pick the hint >> from bdev inode (and not from file inode). >> With that change, this patch only need to set the hint into only one >> inode (bdev one). What do you think? > > I think that would break direct I/O submitted by a filesystem. > By breakage do you mean not being able to set/get the hint correctly? I tested with XFS and Ext4 direct I/O. No breakage.
On 1/23/24 04:16, Kanchan Joshi wrote: > On 1/23/2024 1:39 AM, Bart Van Assche wrote: >> On 1/22/24 01:31, Kanchan Joshi wrote: >>> On 1/19/2024 7:26 PM, Kanchan Joshi wrote: >>>> On 1/19/2024 12:24 AM, Bart Van Assche wrote: >>>>> I think the above proposal would introduce a bug: it would break the >>>>> F_GET_RW_HINT implementation. >>>> >>>> Right. I expected to keep the exact change in GET, too, but that will >>>> not be free from the side-effect. >>>> The buffered-write path (block_write_full_page) picks the hint from one >>>> inode, and the direct-write path (__blkdev_direct_IO_simple) picks the >>>> hint from a different inode. >>>> So, updating both seems needed here. >>> >>> I stand corrected. It's possible to do away with two updates. >>> The direct-io code (patch 8) should rather be changed to pick the hint >>> from bdev inode (and not from file inode). >>> With that change, this patch only need to set the hint into only one >>> inode (bdev one). What do you think? >> >> I think that would break direct I/O submitted by a filesystem. > > By breakage do you mean not being able to set/get the hint correctly? > I tested with XFS and Ext4 direct I/O. No breakage. The approach that you proposed is wrong from a conceptual point of view. Zero, one or more block devices can be associated with a filesystem. It would be wrong to try to access all associated block devices from inside the F_SET_RW_HINT implementation. I don't think that there is any API in the Linux kernel for iterating over all the block devices associated with a filesystem. Bart.
diff --git a/block/fops.c b/block/fops.c index 787ce52bc2c6..138b388b5cb1 100644 --- a/block/fops.c +++ b/block/fops.c @@ -620,6 +620,16 @@ static int blkdev_release(struct inode *inode, struct file *filp) return 0; } +static void blkdev_apply_whint(struct file *file, enum rw_hint hint) +{ + struct bdev_handle *handle = file->private_data; + struct inode *bd_inode = handle->bdev->bd_inode; + + inode_lock(bd_inode); + bd_inode->i_write_hint = hint; + inode_unlock(bd_inode); +} + static ssize_t blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) { @@ -854,6 +864,7 @@ const struct file_operations def_blk_fops = { .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, + .apply_whint = blkdev_apply_whint, }; static __init int blkdev_init(void) diff --git a/fs/fcntl.c b/fs/fcntl.c index fc73c5fae43c..18407bf5bb9b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -306,6 +306,7 @@ static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, unsigned long arg) { + void (*apply_whint)(struct file *, enum rw_hint); struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint; @@ -317,6 +318,9 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, inode_lock(inode); inode->i_write_hint = hint; + apply_whint = inode->i_fop->apply_whint; + if (apply_whint) + apply_whint(file, hint); inode_unlock(inode); return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index a08014b68d6e..293017ea2466 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1944,6 +1944,7 @@ struct file_operations { int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); + void (*apply_whint)(struct file *, enum rw_hint hint); } __randomize_layout; /* Wrap a directory iterator that needs exclusive inode access */
Write hints applied with F_SET_RW_HINT on a block device affect the shmem inode only. Propagate these hints to the block device inode because that is the inode used when writing back dirty pages. Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Christian Brauner <brauner@kernel.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Jeff Layton <jlayton@kernel.org> Cc: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Bart Van Assche <bvanassche@acm.org> --- block/fops.c | 11 +++++++++++ fs/fcntl.c | 4 ++++ include/linux/fs.h | 1 + 3 files changed, 16 insertions(+)