
[v4,6/7] mm/maps: read proc/pid/maps under per-vma lock

Message ID 20250604231151.799834-7-surenb@google.com
State New
Series use per-vma locks for /proc/pid/maps reads and PROCMAP_QUERY

Commit Message

Suren Baghdasaryan June 4, 2025, 11:11 p.m. UTC
With maple_tree supporting vma tree traversal under RCU and per-vma
locks, /proc/pid/maps can be read while holding individual vma locks
instead of locking the entire address space.
A completely lockless approach would be quite complex, the main issue being
that get_vma_name() uses callbacks which might not work correctly with a
stable vma copy and instead require the original (unstable) vma.
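
(As an illustration, one of the callbacks involved is the ->name hook in
struct vm_operations_struct, which takes the vma pointer itself:

	const char *(*name)(struct vm_area_struct *vma);

so such callbacks may chase pointers from the original vma that a snapshot
copy cannot faithfully reproduce.)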
When per-vma lock acquisition fails, we take the mmap_lock for reading,
lock the vma, release the mmap_lock and continue. This guarantees that the
reader makes forward progress even under lock contention. It will interfere
with the writer, but only for the very short time it takes to acquire the
per-vma lock, and only when there was contention on the vma the reader is
interested in.
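
In condensed form (error handling omitted), the fallback implemented by
get_next_vma() in the patch below is:

	/* Speculative path: lock the vma found under RCU. */
	vma = trylock_vma(priv, vma, last_pos, /* mm_unstable = */ true);
	if (!vma) {
		/* Contention: briefly take mmap_lock to stabilize the lookup. */
		mmap_read_lock(priv->mm);
		vma_iter_init(&priv->iter, priv->mm, last_pos);
		vma = vma_next(&priv->iter);
		if (vma)
			vma = trylock_vma(priv, vma, last_pos, false);
		mmap_read_unlock(priv->mm);
	}
	/* Proceed holding only the per-vma lock. */
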
One case requiring special handling is when the vma changes between the
time it was found and the time it got locked. A problematic case would
be if the vma got shrunk so that its start moved higher in the address
space and a new vma was installed at the beginning:

reader found:               |--------VMA A--------|
VMA is modified:            |-VMA B-|----VMA A----|
reader locks modified VMA A
reader reports VMA A:       |  gap  |----VMA A----|

This would result in reporting a gap in the address space that does not
exist. To prevent this, we retry the lookup after locking the vma, but only
when we observe such a gap and detect that the address space was changed
after we found the vma.
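
The check is inexpensive: the sequence count sampled via
mmap_lock_speculate_try_begin() before the lookup tells us whether a writer
was, or has since been, active. Excerpted from trylock_vma() in the patch
below:

	if (mm_unstable && last_pos < vma->vm_start) {
		/* Verify only if the address space changed since vma lookup. */
		if ((priv->mm_wr_seq & 1) ||
		    mmap_lock_speculate_retry(priv->mm, priv->mm_wr_seq)) {
			/* Redo the lookup; a mismatch means the vma is stale. */
			vma_iter_init(&priv->iter, priv->mm, last_pos);
			if (vma != vma_next(&priv->iter))
				goto err;
		}
	}
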
This change is designed to reduce mmap_lock contention and prevent a
process reading /proc/pid/maps files (often a low priority task, such
as monitoring/data collection services) from blocking address space
updates.

Note that this change has a userspace-visible disadvantage: it allows for
sub-page data tearing, as opposed to the previous mechanism where data
tearing could happen only between pages of generated output data. Since
current userspace considers data tearing between pages acceptable, we
assume it will be able to handle sub-page data tearing as well.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 fs/proc/internal.h |   6 ++
 fs/proc/task_mmu.c | 177 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 175 insertions(+), 8 deletions(-)

Patch

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 96122e91c645..3728c9012687 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -379,6 +379,12 @@  struct proc_maps_private {
 	struct task_struct *task;
 	struct mm_struct *mm;
 	struct vma_iterator iter;
+	loff_t last_pos;
+#ifdef CONFIG_PER_VMA_LOCK
+	bool mmap_locked;
+	unsigned int mm_wr_seq;
+	struct vm_area_struct *locked_vma;
+#endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *task_mempolicy;
 #endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 27972c0749e7..36d883c4f394 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -127,13 +127,172 @@  static void release_task_mempolicy(struct proc_maps_private *priv)
 }
 #endif
 
-static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
-						loff_t *ppos)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static struct vm_area_struct *trylock_vma(struct proc_maps_private *priv,
+					  struct vm_area_struct *vma,
+					  unsigned long last_pos,
+					  bool mm_unstable)
+{
+	vma = vma_start_read(priv->mm, vma);
+	if (IS_ERR_OR_NULL(vma))
+		return NULL;
+
+	/* Check if the vma we locked is the right one. */
+	if (unlikely(vma->vm_mm != priv->mm))
+		goto err;
+
+	/* vma should not be ahead of the last search position. */
+	if (unlikely(last_pos >= vma->vm_end))
+		goto err;
+
+	/*
+	 * vma ahead of last search position is possible but we need to
+	 * verify that it was not shrunk after we found it, and another
+	 * vma has not been installed ahead of it. Otherwise we might
+	 * observe a gap that should not be there.
+	 */
+	if (mm_unstable && last_pos < vma->vm_start) {
+		/* Verify only if the address space changed since vma lookup. */
+		if ((priv->mm_wr_seq & 1) ||
+		    mmap_lock_speculate_retry(priv->mm, priv->mm_wr_seq)) {
+			vma_iter_init(&priv->iter, priv->mm, last_pos);
+			if (vma != vma_next(&priv->iter))
+				goto err;
+		}
+	}
+
+	priv->locked_vma = vma;
+
+	return vma;
+err:
+	vma_end_read(vma);
+	return NULL;
+}
+
+
+static void unlock_vma(struct proc_maps_private *priv)
+{
+	if (priv->locked_vma) {
+		vma_end_read(priv->locked_vma);
+		priv->locked_vma = NULL;
+	}
+}
+
+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool lock_content(struct seq_file *m,
+				struct proc_maps_private *priv)
+{
+	/*
+	 * smaps and numa_maps perform page table walk, therefore require
+	 * mmap_lock but maps can be read with locked vma only.
+	 */
+	if (m->op != &proc_pid_maps_op) {
+		if (mmap_read_lock_killable(priv->mm))
+			return false;
+
+		priv->mmap_locked = true;
+	} else {
+		rcu_read_lock();
+		priv->locked_vma = NULL;
+		priv->mmap_locked = false;
+	}
+
+	return true;
+}
+
+static inline void unlock_content(struct proc_maps_private *priv)
+{
+	if (priv->mmap_locked) {
+		mmap_read_unlock(priv->mm);
+	} else {
+		unlock_vma(priv);
+		rcu_read_unlock();
+	}
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+					   loff_t last_pos)
 {
-	struct vm_area_struct *vma = vma_next(&priv->iter);
+	struct vm_area_struct *vma;
+	int ret;
+
+	if (priv->mmap_locked)
+		return vma_next(&priv->iter);
+
+	unlock_vma(priv);
+	/*
+	 * Record sequence number ahead of vma lookup.
+	 * Odd seqcount means address space modification is in progress.
+	 */
+	mmap_lock_speculate_try_begin(priv->mm, &priv->mm_wr_seq);
+	vma = vma_next(&priv->iter);
+	if (!vma)
+		return NULL;
+
+	vma = trylock_vma(priv, vma, last_pos, true);
+	if (vma)
+		return vma;
+
+	/* Address space got modified, vma might be stale. Re-lock and retry */
+	rcu_read_unlock();
+	ret = mmap_read_lock_killable(priv->mm);
+	rcu_read_lock();
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* Lookup the vma at the last position again under mmap_read_lock */
+	vma_iter_init(&priv->iter, priv->mm, last_pos);
+	vma = vma_next(&priv->iter);
+	if (vma) {
+		vma = trylock_vma(priv, vma, last_pos, false);
+		WARN_ON(!vma); /* mm is stable, has to succeed */
+	}
+	mmap_read_unlock(priv->mm);
+
+	return vma;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
 
+static inline bool lock_content(struct seq_file *m,
+				struct proc_maps_private *priv)
+{
+	return mmap_read_lock_killable(priv->mm) == 0;
+}
+
+static inline void unlock_content(struct proc_maps_private *priv)
+{
+	mmap_read_unlock(priv->mm);
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+					   loff_t last_pos)
+{
+	return vma_next(&priv->iter);
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma;
+
+	vma = get_next_vma(priv, *ppos);
+	if (IS_ERR(vma))
+		return vma;
+
+	/* Store previous position to be able to restart if needed */
+	priv->last_pos = *ppos;
 	if (vma) {
-		*ppos = vma->vm_start;
+		/*
+		 * Track the end of the reported vma to ensure position changes
+		 * even if previous vma was merged with the next vma and we
+		 * found the extended vma with the same vm_start.
+		 */
+		*ppos = vma->vm_end;
 	} else {
 		*ppos = -2UL;
 		vma = get_gate_vma(priv->mm);
@@ -163,19 +322,21 @@  static void *m_start(struct seq_file *m, loff_t *ppos)
 		return NULL;
 	}
 
-	if (mmap_read_lock_killable(mm)) {
+	if (!lock_content(m, priv)) {
 		mmput(mm);
 		put_task_struct(priv->task);
 		priv->task = NULL;
 		return ERR_PTR(-EINTR);
 	}
 
+	if (last_addr > 0)
+		*ppos = last_addr = priv->last_pos;
 	vma_iter_init(&priv->iter, mm, last_addr);
 	hold_task_mempolicy(priv);
 	if (last_addr == -2UL)
 		return get_gate_vma(mm);
 
-	return proc_get_vma(priv, ppos);
+	return proc_get_vma(m, ppos);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
@@ -184,7 +345,7 @@  static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
 		*ppos = -1UL;
 		return NULL;
 	}
-	return proc_get_vma(m->private, ppos);
+	return proc_get_vma(m, ppos);
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -196,7 +357,7 @@  static void m_stop(struct seq_file *m, void *v)
 		return;
 
 	release_task_mempolicy(priv);
-	mmap_read_unlock(mm);
+	unlock_content(priv);
 	mmput(mm);
 	put_task_struct(priv->task);
 	priv->task = NULL;