diff mbox series

[RFC,RESEND,1/1] fs/namespace: defer free_mount from namespace_unlock

Message ID 20230119211455.498968-2-echanude@redhat.com
State New
Headers show
Series fs/namespace: defer free_mount from namespace_unlock | expand

Commit Message

Eric Chanudet Jan. 19, 2023, 9:14 p.m. UTC
From: Alexander Larsson <alexl@redhat.com>

Use call_rcu to defer releasing the umount'ed or detached filesystem
when calling namepsace_unlock().

Calling synchronize_rcu_expedited() has a significant cost on RT kernel
that default to rcupdate.rcu_normal_after_boot=1.

For example, on a 6.2-rt1 kernel:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
           0.07464 +- 0.00396 seconds time elapsed  ( +-  5.31% )

With this change applied:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
        0.00162604 +- 0.00000637 seconds time elapsed  ( +-  0.39% )

Waiting for the grace period before completing the syscall does not seem
mandatory. The struct mount umount'ed are queued up for release in a
separate list and no longer accessible to following syscalls.

Signed-off-by: Alexander Larsson <alexl@redhat.com>
Signed-off-by: Eric Chanudet <echanude@redhat.com>
---
 fs/namespace.c | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

Comments

Al Viro Jan. 19, 2023, 10:09 p.m. UTC | #1
On Thu, Jan 19, 2023 at 04:14:55PM -0500, Eric Chanudet wrote:
> From: Alexander Larsson <alexl@redhat.com>
> 
> Use call_rcu to defer releasing the umount'ed or detached filesystem
> when calling namepsace_unlock().
> 
> Calling synchronize_rcu_expedited() has a significant cost on RT kernel
> that default to rcupdate.rcu_normal_after_boot=1.
> 
> For example, on a 6.2-rt1 kernel:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>            0.07464 +- 0.00396 seconds time elapsed  ( +-  5.31% )
> 
> With this change applied:
> perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
>         0.00162604 +- 0.00000637 seconds time elapsed  ( +-  0.39% )
> 
> Waiting for the grace period before completing the syscall does not seem
> mandatory. The struct mount umount'ed are queued up for release in a
> separate list and no longer accessible to following syscalls.

Again, NAK.  If a filesystem is expected to be shut down by umount(2),
userland expects it to have been already shut down by the time the
syscall returns.

It's not just visibility in namespace; it's "can I pull the disk out?".
Or "can the shutdown get to taking the network down?", for that matter.
Alexander Larsson Jan. 20, 2023, 8:43 a.m. UTC | #2
On Thu, Jan 19, 2023 at 11:09 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Thu, Jan 19, 2023 at 04:14:55PM -0500, Eric Chanudet wrote:
> > From: Alexander Larsson <alexl@redhat.com>
> >
> > Use call_rcu to defer releasing the umount'ed or detached filesystem
> > when calling namepsace_unlock().
> >
> > Calling synchronize_rcu_expedited() has a significant cost on RT kernel
> > that default to rcupdate.rcu_normal_after_boot=1.
> >
> > For example, on a 6.2-rt1 kernel:
> > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
> >            0.07464 +- 0.00396 seconds time elapsed  ( +-  5.31% )
> >
> > With this change applied:
> > perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
> >         0.00162604 +- 0.00000637 seconds time elapsed  ( +-  0.39% )
> >
> > Waiting for the grace period before completing the syscall does not seem
> > mandatory. The struct mount umount'ed are queued up for release in a
> > separate list and no longer accessible to following syscalls.
>
> Again, NAK.  If a filesystem is expected to be shut down by umount(2),
> userland expects it to have been already shut down by the time the
> syscall returns.
>
> It's not just visibility in namespace; it's "can I pull the disk out?".
> Or "can the shutdown get to taking the network down?", for that matter.

In the usecase we're worrying about, all the unmounts are lazy (i.e.
MNT_DETACH). What about delaying the destroy in that case? That seems
in line with the expected behaviour of lazy shutdown. I.e. you can't
rely on it to be settled anyway.
diff mbox series

Patch

diff --git a/fs/namespace.c b/fs/namespace.c
index ab467ee58341..11d219a6e83c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -44,6 +44,11 @@  static unsigned int m_hash_shift __read_mostly;
 static unsigned int mp_hash_mask __read_mostly;
 static unsigned int mp_hash_shift __read_mostly;
 
+struct mount_delayed_release {
+	struct rcu_head rcu;
+	struct hlist_head release_list;
+};
+
 static __initdata unsigned long mhash_entries;
 static int __init set_mhash_entries(char *str)
 {
@@ -1582,11 +1587,31 @@  int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
-static void namespace_unlock(void)
+static void free_mounts(struct hlist_head *mount_list)
 {
-	struct hlist_head head;
 	struct hlist_node *p;
 	struct mount *m;
+
+	hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
+		hlist_del(&m->mnt_umount);
+		mntput(&m->mnt);
+	}
+}
+
+static void delayed_mount_release(struct rcu_head *head)
+{
+	struct mount_delayed_release *drelease =
+		container_of(head, struct mount_delayed_release, rcu);
+
+	free_mounts(&drelease->release_list);
+	kfree(drelease);
+}
+
+static void namespace_unlock(void)
+{
+	struct hlist_head head;
+	struct mount_delayed_release *drelease;
+
 	LIST_HEAD(list);
 
 	hlist_move_list(&unmounted, &head);
@@ -1599,12 +1624,15 @@  static void namespace_unlock(void)
 	if (likely(hlist_empty(&head)))
 		return;
 
-	synchronize_rcu_expedited();
-
-	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
-		hlist_del(&m->mnt_umount);
-		mntput(&m->mnt);
+	drelease = kmalloc(sizeof(*drelease), GFP_KERNEL);
+	if (unlikely(!drelease)) {
+		synchronize_rcu_expedited();
+		free_mounts(&head);
+		return;
 	}
+
+	hlist_move_list(&head, &drelease->release_list);
+	call_rcu(&drelease->rcu, delayed_mount_release);
 }
 
 static inline void namespace_lock(void)