diff mbox series

[RFC] drm/msm: Add GPU memory traces

Message ID 20240301185346.10412-1-robdclark@gmail.com
State New
Headers show
Series [RFC] drm/msm: Add GPU memory traces | expand

Commit Message

Rob Clark March 1, 2024, 6:53 p.m. UTC
From: Rob Clark <robdclark@chromium.org>

Perfetto can use these traces to track global and per-process GPU memory
usage.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
I realized the tracepoint that perfetto uses to show GPU memory usage
globally and per-process was already upstream, but with no users.

This overlaps a bit with fdinfo, but ftrace is a lighter-weight
mechanism and fits better with perfetto (plus it is already supported in
trace_processor and the perfetto UI, whereas something fdinfo-based would
require new code to be added in perfetto).

We could probably do this more globally (i.e. in drm_gem_get/put_pages() and
drm_gem_handle_create_tail()/drm_gem_object_release_handle()) if folks
prefer.  Not sure where that leaves the TTM drivers.

 drivers/gpu/drm/msm/Kconfig   |  1 +
 drivers/gpu/drm/msm/msm_drv.h |  5 +++++
 drivers/gpu/drm/msm/msm_gem.c | 37 +++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/msm/msm_gpu.h |  8 ++++++++
 4 files changed, 51 insertions(+)

Comments

Rob Clark March 5, 2024, 2:03 a.m. UTC | #1
On Mon, Mar 4, 2024 at 5:38 PM Gurchetan Singh
<gurchetansingh@chromium.org> wrote:
>
>
>
>
> On Fri, Mar 1, 2024 at 10:54 AM Rob Clark <robdclark@gmail.com> wrote:
>>
>> From: Rob Clark <robdclark@chromium.org>
>>
>> Perfetto can use these traces to track global and per-process GPU memory
>> usage.
>>
>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>> ---
>> I realized the tracepoint that perfetto uses to show GPU memory usage
>> globally and per-process was already upstream, but with no users.
>>
>> This overlaps a bit with fdinfo, but ftrace is a lighter weight
>> mechanism and fits better with perfetto (plus is already supported in
>> trace_processor and perfetto UI, whereas something fdinfo based would
>> require new code to be added in perfetto.
>>
>> We could probably do this more globally (ie. drm_gem_get/put_pages() and
>> drm_gem_handle_create_tail()/drm_gem_object_release_handle() if folks
>> prefer.  Not sure where that leaves the TTM drivers.
>>
>>  drivers/gpu/drm/msm/Kconfig   |  1 +
>>  drivers/gpu/drm/msm/msm_drv.h |  5 +++++
>>  drivers/gpu/drm/msm/msm_gem.c | 37 +++++++++++++++++++++++++++++++++++
>>  drivers/gpu/drm/msm/msm_gpu.h |  8 ++++++++
>>  4 files changed, 51 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
>> index f202f26adab2..e4c912fcaf22 100644
>> --- a/drivers/gpu/drm/msm/Kconfig
>> +++ b/drivers/gpu/drm/msm/Kconfig
>> @@ -33,6 +33,7 @@ config DRM_MSM
>>         select PM_OPP
>>         select NVMEM
>>         select PM_GENERIC_DOMAINS
>> +       select TRACE_GPU_MEM
>>         help
>>           DRM/KMS driver for MSM/snapdragon.
>>
>> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
>> index 16a7cbc0b7dd..cb8f7e804b5b 100644
>> --- a/drivers/gpu/drm/msm/msm_drv.h
>> +++ b/drivers/gpu/drm/msm/msm_drv.h
>> @@ -137,6 +137,11 @@ struct msm_drm_private {
>>         struct msm_rd_state *hangrd;   /* debugfs to dump hanging submits */
>>         struct msm_perf_state *perf;
>>
>> +       /**
>> +        * total_mem: Total/global amount of memory backing GEM objects.
>> +        */
>> +       atomic64_t total_mem;
>> +
>>         /**
>>          * List of all GEM objects (mainly for debugfs, protected by obj_lock
>>          * (acquire before per GEM object lock)
>> diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
>> index 175ee4ab8a6f..e04c4af5d154 100644
>> --- a/drivers/gpu/drm/msm/msm_gem.c
>> +++ b/drivers/gpu/drm/msm/msm_gem.c
>> @@ -12,6 +12,9 @@
>>  #include <linux/pfn_t.h>
>>
>>  #include <drm/drm_prime.h>
>> +#include <drm/drm_file.h>
>> +
>> +#include <trace/events/gpu_mem.h>
>>
>>  #include "msm_drv.h"
>>  #include "msm_fence.h"
>> @@ -33,6 +36,34 @@ static bool use_pages(struct drm_gem_object *obj)
>>         return !msm_obj->vram_node;
>>  }
>>
>> +static void update_device_mem(struct msm_drm_private *priv, ssize_t size)
>> +{
>> +       uint64_t total_mem = atomic64_add_return(size, &priv->total_mem);
>> +       trace_gpu_mem_total(0, 0, total_mem);
>> +}
>> +
>> +static void update_ctx_mem(struct drm_file *file, ssize_t size)
>> +{
>> +       struct msm_file_private *ctx = file->driver_priv;
>> +       uint64_t ctx_mem = atomic64_add_return(size, &ctx->ctx_mem);
>> +
>> +       rcu_read_lock(); /* Locks file->pid! */
>> +       trace_gpu_mem_total(0, pid_nr(file->pid), ctx_mem);
>> +       rcu_read_unlock();
>> +
>> +}
>> +
>> +static int msm_gem_open(struct drm_gem_object *obj, struct drm_file *file)
>> +{
>> +       update_ctx_mem(file, obj->size);
>> +       return 0;
>> +}
>> +
>> +static void msm_gem_close(struct drm_gem_object *obj, struct drm_file *file)
>> +{
>> +       update_ctx_mem(file, -obj->size);
>> +}
>> +
>>  /*
>>   * Cache sync.. this is a bit over-complicated, to fit dma-mapping
>>   * API.  Really GPU cache is out of scope here (handled on cmdstream)
>> @@ -156,6 +187,8 @@ static struct page **get_pages(struct drm_gem_object *obj)
>>                         return p;
>>                 }
>>
>> +               update_device_mem(dev->dev_private, obj->size);
>> +
>>                 msm_obj->pages = p;
>>
>>                 msm_obj->sgt = drm_prime_pages_to_sg(obj->dev, p, npages);
>> @@ -209,6 +242,8 @@ static void put_pages(struct drm_gem_object *obj)
>>                         msm_obj->sgt = NULL;
>>                 }
>>
>> +               update_device_mem(obj->dev->dev_private, -obj->size);
>> +
>>                 if (use_pages(obj))
>>                         drm_gem_put_pages(obj, msm_obj->pages, true, false);
>>                 else
>> @@ -1118,6 +1153,8 @@ static const struct vm_operations_struct vm_ops = {
>>
>>  static const struct drm_gem_object_funcs msm_gem_object_funcs = {
>>         .free = msm_gem_free_object,
>> +       .open = msm_gem_open,
>> +       .close = msm_gem_close,
>>         .pin = msm_gem_prime_pin,
>>         .unpin = msm_gem_prime_unpin,
>>         .get_sg_table = msm_gem_prime_get_sg_table,
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
>> index 2bfcb222e353..f7d2a7d6f8cc 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.h
>> +++ b/drivers/gpu/drm/msm/msm_gpu.h
>> @@ -428,6 +428,14 @@ struct msm_file_private {
>>          * level.
>>          */
>>         struct drm_sched_entity *entities[NR_SCHED_PRIORITIES * MSM_GPU_MAX_RINGS];
>> +
>> +       /**
>> +        * ctx_mem:
>> +        *
>> +        * Total amount of memory of GEM buffers with handles attached for
>> +        * this context.
>> +        */
>> +       atomic64_t ctx_mem;
>>  };
>
>
>
> Just for added context, past discussions on TRACE_GPU_MEM:
>
> https://lists.freedesktop.org/archives/dri-devel/2021-October/328260.html
> https://lists.freedesktop.org/archives/dri-devel/2021-January/295120.html
>
> Some have even suggested deleting the tracepoint altogether.
>
> Personally, I think we should land an internal user in a non-breaking way, since userspace (Perfetto) already depends on it.  Right now, we're in limbo for multiple years ...

For better or for worse, the tracepoint has already landed, and honestly I
don't see any real problem with it.  It definitely seems like a valid
option to land support in-driver first and later refactor toward more
shared code.  We already have the uAPI and the userspace consuming it,
so there doesn't seem to be any debate there.  Maybe there is
something from the original series that could be recycled for
something less driver-specific.

Re: some of the discussion about cgroups, I think that is a
non sequitur, because (AFAICT) perfetto wants a global view of PIDs etc.
(at least I'm not really sure what the value of system tracing is if
it isn't, you know, system-level — I deliberately avoided using
virtual PIDs for that reason).

BR,
-R

>>
>>  /**
>> --
>> 2.44.0
>>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
index f202f26adab2..e4c912fcaf22 100644
--- a/drivers/gpu/drm/msm/Kconfig
+++ b/drivers/gpu/drm/msm/Kconfig
@@ -33,6 +33,7 @@  config DRM_MSM
 	select PM_OPP
 	select NVMEM
 	select PM_GENERIC_DOMAINS
+	select TRACE_GPU_MEM
 	help
 	  DRM/KMS driver for MSM/snapdragon.
 
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 16a7cbc0b7dd..cb8f7e804b5b 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -137,6 +137,11 @@  struct msm_drm_private {
 	struct msm_rd_state *hangrd;   /* debugfs to dump hanging submits */
 	struct msm_perf_state *perf;
 
+	/**
+	 * total_mem: Total/global amount of memory backing GEM objects.
+	 */
+	atomic64_t total_mem;
+
 	/**
 	 * List of all GEM objects (mainly for debugfs, protected by obj_lock
 	 * (acquire before per GEM object lock)
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 175ee4ab8a6f..e04c4af5d154 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -12,6 +12,9 @@ 
 #include <linux/pfn_t.h>
 
 #include <drm/drm_prime.h>
+#include <drm/drm_file.h>
+
+#include <trace/events/gpu_mem.h>
 
 #include "msm_drv.h"
 #include "msm_fence.h"
@@ -33,6 +36,34 @@  static bool use_pages(struct drm_gem_object *obj)
 	return !msm_obj->vram_node;
 }
 
+static void update_device_mem(struct msm_drm_private *priv, ssize_t size)
+{
+	uint64_t total_mem = atomic64_add_return(size, &priv->total_mem);
+	trace_gpu_mem_total(0, 0, total_mem);
+}
+
+static void update_ctx_mem(struct drm_file *file, ssize_t size)
+{
+	struct msm_file_private *ctx = file->driver_priv;
+	uint64_t ctx_mem = atomic64_add_return(size, &ctx->ctx_mem);
+
+	rcu_read_lock(); /* Locks file->pid! */
+	trace_gpu_mem_total(0, pid_nr(file->pid), ctx_mem);
+	rcu_read_unlock();
+
+}
+
+static int msm_gem_open(struct drm_gem_object *obj, struct drm_file *file)
+{
+	update_ctx_mem(file, obj->size);
+	return 0;
+}
+
+static void msm_gem_close(struct drm_gem_object *obj, struct drm_file *file)
+{
+	update_ctx_mem(file, -obj->size);
+}
+
 /*
  * Cache sync.. this is a bit over-complicated, to fit dma-mapping
  * API.  Really GPU cache is out of scope here (handled on cmdstream)
@@ -156,6 +187,8 @@  static struct page **get_pages(struct drm_gem_object *obj)
 			return p;
 		}
 
+		update_device_mem(dev->dev_private, obj->size);
+
 		msm_obj->pages = p;
 
 		msm_obj->sgt = drm_prime_pages_to_sg(obj->dev, p, npages);
@@ -209,6 +242,8 @@  static void put_pages(struct drm_gem_object *obj)
 			msm_obj->sgt = NULL;
 		}
 
+		update_device_mem(obj->dev->dev_private, -obj->size);
+
 		if (use_pages(obj))
 			drm_gem_put_pages(obj, msm_obj->pages, true, false);
 		else
@@ -1118,6 +1153,8 @@  static const struct vm_operations_struct vm_ops = {
 
 static const struct drm_gem_object_funcs msm_gem_object_funcs = {
 	.free = msm_gem_free_object,
+	.open = msm_gem_open,
+	.close = msm_gem_close,
 	.pin = msm_gem_prime_pin,
 	.unpin = msm_gem_prime_unpin,
 	.get_sg_table = msm_gem_prime_get_sg_table,
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 2bfcb222e353..f7d2a7d6f8cc 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -428,6 +428,14 @@  struct msm_file_private {
 	 * level.
 	 */
 	struct drm_sched_entity *entities[NR_SCHED_PRIORITIES * MSM_GPU_MAX_RINGS];
+
+	/**
+	 * ctx_mem:
+	 *
+	 * Total amount of memory of GEM buffers with handles attached for
+	 * this context.
+	 */
+	atomic64_t ctx_mem;
 };
 
 /**