diff mbox series

[v3] drm/sched: Fix kernel NULL pointer dereference error

Message ID 20221017143006.2419-1-Arvind.Yadav@amd.com
State New
Headers show
Series [v3] drm/sched: Fix kernel NULL pointer dereference error | expand

Commit Message

Arvind Yadav Oct. 17, 2022, 2:30 p.m. UTC
- This is purely a timing issue: sometimes the job is freed
before the job is done.
To fix this, move the 'dma_fence_cb' callback from the
job (struct drm_sched_job) to the scheduler fence (struct drm_sched_fence).

- Added the drm_sched_fence_set_parent() function (and the other *_parent_cb
helpers) in sched_fence.c. Moved parent fence initialization and callback
installation into it (this is just cleanup).


BUG: kernel NULL pointer dereference, address: 0000000000000088
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] PREEMPT SMP NOPTI
 CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1
 Arvind : [dma_fence_default_wait _START] timeout = -1
 Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018
 RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched]
 Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0
 RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087
 RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018
 RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000
 RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
 R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708
 FS:  0000000000000000(0000) GS:ffff8a48a0a80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0
 Call Trace:
  <IRQ>
  drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
  dma_fence_signal_timestamp_locked+0x7e/0x110
  dma_fence_signal+0x31/0x60
  amdgpu_fence_process+0xc4/0x140 [amdgpu]
  gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu]
  amdgpu_irq_dispatch+0xb7/0x210 [amdgpu]
  amdgpu_ih_process+0x86/0x100 [amdgpu]
  amdgpu_irq_handler+0x24/0x60 [amdgpu]
  __handle_irq_event_percpu+0x4b/0x190
  handle_irq_event_percpu+0x15/0x50
  handle_irq_event+0x39/0x60
  handle_edge_irq+0xaf/0x210
  __common_interrupt+0x6e/0x110
  common_interrupt+0xc1/0xe0
  </IRQ>
  <TASK>

Signed-off-by: Arvind Yadav <Arvind.Yadav@amd.com>
---

Changes in v2: Moved the 'dma_fence_cb' callback from the
job (struct drm_sched_job) to the scheduler fence (struct drm_sched_fence)
instead of adding a NULL check for s_fence.

Changes in v3: Added the drm_sched_fence_set_parent() function (and the other
*_parent_cb helpers) in sched_fence.c. Moved parent fence initialization and
callback installation into it (this is just cleanup).

---
 drivers/gpu/drm/scheduler/sched_fence.c | 53 +++++++++++++++++++++++++
 drivers/gpu/drm/scheduler/sched_main.c  | 38 +++++-------------
 include/drm/gpu_scheduler.h             | 12 +++++-
 3 files changed, 72 insertions(+), 31 deletions(-)

Comments

Yadav, Arvind Oct. 18, 2022, 12:20 p.m. UTC | #1
On 10/17/2022 8:20 PM, Christian König wrote:
> Am 17.10.22 um 16:30 schrieb Arvind Yadav:
>> -This is purely a timing issue. Here, sometimes Job free
>> is happening before the job is done.
>> To fix this issue moving 'dma_fence_cb' callback from
>> job(struct drm_sched_job) to scheduler fence (struct drm_sched_fence).
>>
>> - Added drm_sched_fence_set_parent() function(and others *_parent_cb)
>> in sched_fence.c. Moved parent fence intilization and callback
>> installation into this (this just cleanup).
>>
>>
>> BUG: kernel NULL pointer dereference, address: 0000000000000088
>>   #PF: supervisor read access in kernel mode
>>   #PF: error_code(0x0000) - not-present page
>>   PGD 0 P4D 0
>>   Oops: 0000 [#1] PREEMPT SMP NOPTI
>>   CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1
>>   Arvind : [dma_fence_default_wait _START] timeout = -1
>>   Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018
>>   RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched]
>>   Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66 
>> 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 
>> <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0
>>   RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087
>>   RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018
>>   RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000
>>   RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000
>>   R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
>>   R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708
>>   FS:  0000000000000000(0000) GS:ffff8a48a0a80000(0000) 
>> knlGS:0000000000000000
>>   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>   CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0
>>   Call Trace:
>>    <IRQ>
>>    drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
>>    dma_fence_signal_timestamp_locked+0x7e/0x110
>>    dma_fence_signal+0x31/0x60
>>    amdgpu_fence_process+0xc4/0x140 [amdgpu]
>>    gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu]
>>    amdgpu_irq_dispatch+0xb7/0x210 [amdgpu]
>>    amdgpu_ih_process+0x86/0x100 [amdgpu]
>>    amdgpu_irq_handler+0x24/0x60 [amdgpu]
>>    __handle_irq_event_percpu+0x4b/0x190
>>    handle_irq_event_percpu+0x15/0x50
>>    handle_irq_event+0x39/0x60
>>    handle_edge_irq+0xaf/0x210
>>    __common_interrupt+0x6e/0x110
>>    common_interrupt+0xc1/0xe0
>>    </IRQ>
>>    <TASK>
>>
>> Signed-off-by: Arvind Yadav <Arvind.Yadav@amd.com>
>> ---
>>
>> Changes in v2: Moving 'dma_fence_cb' callback from
>> job(struct drm_sched_job) to scheduler fence(struct drm_sched_fence)
>> instead of adding NULL check for s_fence.
>>
>> Changes in v3: Added drm_sched_fence_set_parent() function(and others 
>> *_parent_cb)
>> in sched_fence.c. Moved parent fence intilization and callback
>> installation into this (this just cleanup).
>>
>> ---
>>   drivers/gpu/drm/scheduler/sched_fence.c | 53 +++++++++++++++++++++++++
>>   drivers/gpu/drm/scheduler/sched_main.c  | 38 +++++-------------
>>   include/drm/gpu_scheduler.h             | 12 +++++-
>>   3 files changed, 72 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_fence.c 
>> b/drivers/gpu/drm/scheduler/sched_fence.c
>> index 7fd869520ef2..f6808f363261 100644
>> --- a/drivers/gpu/drm/scheduler/sched_fence.c
>> +++ b/drivers/gpu/drm/scheduler/sched_fence.c
>> @@ -77,6 +77,59 @@ static void drm_sched_fence_free_rcu(struct 
>> rcu_head *rcu)
>>       if (!WARN_ON_ONCE(!fence))
>>           kmem_cache_free(sched_fence_slab, fence);
>>   }
>
> Please add an empty line here.
I will fix this in the next version of the patch.
>
>> +/**
>> + * drm_sched_job_done_cb - the callback for a done job
>> + * @f: fence
>> + * @cb: fence callbacks
>> + */
>> +static void drm_sched_job_done_cb(struct dma_fence *f, struct 
>> dma_fence_cb *cb)
>
> Probably best to rename this to something like 
> drm_sched_fence_parent_cb().
>
I will rename it in the next version of the patch.
>> +{
>> +    struct drm_sched_fence *s_fence = container_of(cb, struct 
>> drm_sched_fence,
>> +                               cb);
>> +    struct drm_gpu_scheduler *sched = s_fence->sched;
>> +
>> +    atomic_dec(&sched->hw_rq_count);
>> +    atomic_dec(sched->score);
>> +
>> +    dma_fence_get(&s_fence->finished);
>
> We should probably make sure that this reference is taken before 
> installing the callback.

Here we are signaling the finished fence, and dma_fence_signal() already
checks the reference.

So we do not need to check it here.

>
>> +    drm_sched_fence_finished(s_fence);
>> +    dma_fence_put(&s_fence->finished);
>> +    wake_up_interruptible(&sched->wake_up_worker);
>> +}
>> +
>> +int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
>> +                  struct drm_sched_fence *s_fence)
>> +{
>> +    return dma_fence_add_callback(fence, &s_fence->cb,
>> +                      drm_sched_job_done_cb);
>> +}
>> +
>> +bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence)
>> +{
>> +    return dma_fence_remove_callback(s_fence->parent,
>> +                     &s_fence->cb);
>> +}
>
> Do we really need separate functions for that?
>
We can use drm_sched_fence_set_parent() alone, but then we need to add an
extra NULL check before adding the callback; otherwise
dma_fence_add_callback() will throw a warning message every time.

If I add the NULL check, we will never get any callback warning message for
setting a NULL parent fence.

So I have kept separate functions.
>> +/**
>> + * drm_sched_fence_set_parent - set the parent fence and add the 
>> callback
>> + * fence: pointer to the hw fence
>> + * @s_fence: pointer to the fence
>
> Reverse the parameter order, s_fence is the object we work on.
I will change the order in the next version of the patch.
>
>> + *
>> + * Set the parent fence and intall the callback for a done job.
>
> You need to document that we take the reference of the parent fence.
>
>> + */
>> +int drm_sched_fence_set_parent(struct dma_fence *fence,
>> +                   struct drm_sched_fence *s_fence)
>> +{
>> +    if (s_fence->parent &&
>> +       dma_fence_remove_callback(s_fence->parent, &s_fence->cb))
>> +        dma_fence_put(s_fence->parent);
>> +
>> +    s_fence->parent = dma_fence_get(fence);
>> +    /* Drop for original kref_init of the fence */
>> +    dma_fence_put(fence);
>
> This leaks the reference to the old parent and the get/put dance is 
> not optimal either.
>
> Better do something like this.
>
> /* We keep the reference of the parent fence here. */
> swap(s_fence->parent, fence);
> dma_fence_put(fence);
>
>
I will change this in the next version of the patch.
>> +    return dma_fence_add_callback(fence, &s_fence->cb,
>> +                      drm_sched_job_done_cb);
>> +}
>
> When installing the callback fails we usually call the callback 
> function instead of returning the error.
>
>
I will call the drm_sched_job_done_cb(NULL, &s_fence->cb) callback.


>
>>     /**
>>    * drm_sched_fence_free - free up an uninitialized fence
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
>> b/drivers/gpu/drm/scheduler/sched_main.c
>> index 4cc59bae38dd..cfb52e15f5b0 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -253,13 +253,12 @@ drm_sched_rq_select_entity_fifo(struct 
>> drm_sched_rq *rq)
>>     /**
>>    * drm_sched_job_done - complete a job
>> - * @s_job: pointer to the job which is done
>> + * @s_fence: pointer to the fence of a done job
>>    *
>>    * Finish the job's fence and wake up the worker thread.
>>    */
>> -static void drm_sched_job_done(struct drm_sched_job *s_job)
>> +static void drm_sched_job_done(struct drm_sched_fence *s_fence)
>>   {
>> -    struct drm_sched_fence *s_fence = s_job->s_fence;
>>       struct drm_gpu_scheduler *sched = s_fence->sched;
>>         atomic_dec(&sched->hw_rq_count);
>> @@ -273,18 +272,6 @@ static void drm_sched_job_done(struct 
>> drm_sched_job *s_job)
>>       wake_up_interruptible(&sched->wake_up_worker);
>>   }
>>   -/**
>> - * drm_sched_job_done_cb - the callback for a done job
>> - * @f: fence
>> - * @cb: fence callbacks
>> - */
>> -static void drm_sched_job_done_cb(struct dma_fence *f, struct 
>> dma_fence_cb *cb)
>> -{
>> -    struct drm_sched_job *s_job = container_of(cb, struct 
>> drm_sched_job, cb);
>> -
>> -    drm_sched_job_done(s_job);
>> -}
>> -
>>   /**
>>    * drm_sched_dependency_optimized - test if the dependency can be 
>> optimized
>>    *
>> @@ -505,8 +492,7 @@ void drm_sched_stop(struct drm_gpu_scheduler 
>> *sched, struct drm_sched_job *bad)
>>       list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list,
>>                        list) {
>>           if (s_job->s_fence->parent &&
>> - dma_fence_remove_callback(s_job->s_fence->parent,
>> -                          &s_job->cb)) {
>> + drm_sched_fence_remove_parent_cb(s_job->s_fence)) {
>>               dma_fence_put(s_job->s_fence->parent);
>>               s_job->s_fence->parent = NULL;
>
> Better just call drm_sched_fence_set_parent() with NULL here to clear 
> the currently installed parent.
>
> This moves all this dance into the scheduler fence code.
>
>> atomic_dec(&sched->hw_rq_count);
>> @@ -576,15 +562,14 @@ void drm_sched_start(struct drm_gpu_scheduler 
>> *sched, bool full_recovery)
>>               continue;
>>             if (fence) {
>> -            r = dma_fence_add_callback(fence, &s_job->cb,
>> -                           drm_sched_job_done_cb);
>> +            r = drm_sched_fence_add_parent_cb(fence, s_job->s_fence);
>>               if (r == -ENOENT)
>> -                drm_sched_job_done(s_job);
>> +                drm_sched_job_done(s_job->s_fence);
>>               else if (r)
>>                   DRM_DEV_ERROR(sched->dev, "fence add callback 
>> failed (%d)\n",
>
> Completely nuke that here. All of this should be done in the single 
> drm_sched_fence_set_parent() function.
>
> And an error message is completely superfluous. We just need to handle 
> the case that the callback can't be installed because the fence is 
> already signaled.
>
I will make the changes as per your review comments. Thank you for the review.

Thanks,

~Arvind

> Regards,
> Christian.
>
>>                         r);
>>           } else
>> -            drm_sched_job_done(s_job);
>> +            drm_sched_job_done(s_job->s_fence);
>>       }
>>         if (full_recovery) {
>> @@ -1049,14 +1034,9 @@ static int drm_sched_main(void *param)
>>           drm_sched_fence_scheduled(s_fence);
>>             if (!IS_ERR_OR_NULL(fence)) {
>> -            s_fence->parent = dma_fence_get(fence);
>> -            /* Drop for original kref_init of the fence */
>> -            dma_fence_put(fence);
>> -
>> -            r = dma_fence_add_callback(fence, &sched_job->cb,
>> -                           drm_sched_job_done_cb);
>> +            r = drm_sched_fence_set_parent(fence, s_fence);
>>               if (r == -ENOENT)
>> -                drm_sched_job_done(sched_job);
>> +                drm_sched_job_done(s_fence);
>>               else if (r)
>>                   DRM_DEV_ERROR(sched->dev, "fence add callback 
>> failed (%d)\n",
>>                         r);
>> @@ -1064,7 +1044,7 @@ static int drm_sched_main(void *param)
>>               if (IS_ERR(fence))
>>                   dma_fence_set_error(&s_fence->finished, 
>> PTR_ERR(fence));
>>   -            drm_sched_job_done(sched_job);
>> +            drm_sched_job_done(s_fence);
>>           }
>>             wake_up(&sched->job_scheduled);
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 1f7d9dd1a444..7258e2fa195f 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -281,6 +281,10 @@ struct drm_sched_fence {
>>            * @owner: job owner for debugging
>>            */
>>       void                *owner;
>> +    /**
>> +     * @cb: callback
>> +     */
>> +    struct dma_fence_cb cb;
>>   };
>>     struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>> @@ -300,7 +304,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct 
>> dma_fence *f);
>>    *         be scheduled further.
>>    * @s_priority: the priority of the job.
>>    * @entity: the entity to which this job belongs.
>> - * @cb: the callback for the parent fence in s_fence.
>>    *
>>    * A job is created by the driver using drm_sched_job_init(), and
>>    * should call drm_sched_entity_push_job() once it wants the scheduler
>> @@ -325,7 +328,6 @@ struct drm_sched_job {
>>       atomic_t            karma;
>>       enum drm_sched_priority        s_priority;
>>       struct drm_sched_entity         *entity;
>> -    struct dma_fence_cb        cb;
>>       /**
>>        * @dependencies:
>>        *
>> @@ -559,6 +561,12 @@ void drm_sched_fence_free(struct drm_sched_fence 
>> *fence);
>>   void drm_sched_fence_scheduled(struct drm_sched_fence *fence);
>>   void drm_sched_fence_finished(struct drm_sched_fence *fence);
>>   +int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
>> +                  struct drm_sched_fence *s_fence);
>> +bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence);
>> +int drm_sched_fence_set_parent(struct dma_fence *fence,
>> +                   struct drm_sched_fence *s_fence);
>> +
>>   unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler 
>> *sched);
>>   void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
>>                           unsigned long remaining);
>
Christian König Oct. 18, 2022, 12:34 p.m. UTC | #2
Am 18.10.22 um 14:20 schrieb Yadav, Arvind:
> [SNIP]
>>
>>> +    drm_sched_fence_finished(s_fence);
>>> +    dma_fence_put(&s_fence->finished);
>>> +    wake_up_interruptible(&sched->wake_up_worker);
>>> +}
>>> +
>>> +int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
>>> +                  struct drm_sched_fence *s_fence)
>>> +{
>>> +    return dma_fence_add_callback(fence, &s_fence->cb,
>>> +                      drm_sched_job_done_cb);
>>> +}
>>> +
>>> +bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence)
>>> +{
>>> +    return dma_fence_remove_callback(s_fence->parent,
>>> +                     &s_fence->cb);
>>> +}
>>
>> Do we really need separate functions for that?
>>
> We can use  'drm_sched_fence_set_parent' but we need to add extra NULL 
> check before
>
> adding in the callback otherwise add-callback will throw the warning 
> message every time.
>
> If I add NULL check then will never get any callback warning message 
> for setting NULL parent fence.
>
> So I have kept separate functions.

I would rather have a single function and allow the parent fence
to be set to NULL.

Alternatively we could have drm_sched_fence_set_parent() and
drm_sched_fence_clear_parent() functions if you really think it's cleaner
that way.

>>> atomic_dec(&sched->hw_rq_count);
>>> @@ -576,15 +562,14 @@ void drm_sched_start(struct drm_gpu_scheduler 
>>> *sched, bool full_recovery)
>>>               continue;
>>>             if (fence) {
>>> -            r = dma_fence_add_callback(fence, &s_job->cb,
>>> -                           drm_sched_job_done_cb);
>>> +            r = drm_sched_fence_add_parent_cb(fence, s_job->s_fence);
>>>               if (r == -ENOENT)
>>> -                drm_sched_job_done(s_job);
>>> +                drm_sched_job_done(s_job->s_fence);
>>>               else if (r)
>>>                   DRM_DEV_ERROR(sched->dev, "fence add callback 
>>> failed (%d)\n",
>>
>> Completely nuke that here. All of this should be done in the single 
>> drm_sched_fence_set_parent() function.
>>
>> And an error message is completely superfluous. We just need to 
>> handle the case that the callback can't be installed because the 
>> fence is already signaled.
>>
> I will do the changes as per your review comments, Thank you for the 
> review.

Just to clarify, you should nuke the error message. Error handling is 
rather pointless here.

Thanks,
Christian.

>
> Thanks,
>
> ~Arvind
>
>> Regards,
>> Christian.
>>
>>>                         r);
>>>           } else
>>> -            drm_sched_job_done(s_job);
>>> +            drm_sched_job_done(s_job->s_fence);
>>>       }
>>>         if (full_recovery) {
>>> @@ -1049,14 +1034,9 @@ static int drm_sched_main(void *param)
>>>           drm_sched_fence_scheduled(s_fence);
>>>             if (!IS_ERR_OR_NULL(fence)) {
>>> -            s_fence->parent = dma_fence_get(fence);
>>> -            /* Drop for original kref_init of the fence */
>>> -            dma_fence_put(fence);
>>> -
>>> -            r = dma_fence_add_callback(fence, &sched_job->cb,
>>> -                           drm_sched_job_done_cb);
>>> +            r = drm_sched_fence_set_parent(fence, s_fence);
>>>               if (r == -ENOENT)
>>> -                drm_sched_job_done(sched_job);
>>> +                drm_sched_job_done(s_fence);
>>>               else if (r)
>>>                   DRM_DEV_ERROR(sched->dev, "fence add callback 
>>> failed (%d)\n",
>>>                         r);
>>> @@ -1064,7 +1044,7 @@ static int drm_sched_main(void *param)
>>>               if (IS_ERR(fence))
>>> dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>>>   -            drm_sched_job_done(sched_job);
>>> +            drm_sched_job_done(s_fence);
>>>           }
>>>             wake_up(&sched->job_scheduled);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index 1f7d9dd1a444..7258e2fa195f 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -281,6 +281,10 @@ struct drm_sched_fence {
>>>            * @owner: job owner for debugging
>>>            */
>>>       void                *owner;
>>> +    /**
>>> +     * @cb: callback
>>> +     */
>>> +    struct dma_fence_cb cb;
>>>   };
>>>     struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>>> @@ -300,7 +304,6 @@ struct drm_sched_fence 
>>> *to_drm_sched_fence(struct dma_fence *f);
>>>    *         be scheduled further.
>>>    * @s_priority: the priority of the job.
>>>    * @entity: the entity to which this job belongs.
>>> - * @cb: the callback for the parent fence in s_fence.
>>>    *
>>>    * A job is created by the driver using drm_sched_job_init(), and
>>>    * should call drm_sched_entity_push_job() once it wants the 
>>> scheduler
>>> @@ -325,7 +328,6 @@ struct drm_sched_job {
>>>       atomic_t            karma;
>>>       enum drm_sched_priority        s_priority;
>>>       struct drm_sched_entity         *entity;
>>> -    struct dma_fence_cb        cb;
>>>       /**
>>>        * @dependencies:
>>>        *
>>> @@ -559,6 +561,12 @@ void drm_sched_fence_free(struct 
>>> drm_sched_fence *fence);
>>>   void drm_sched_fence_scheduled(struct drm_sched_fence *fence);
>>>   void drm_sched_fence_finished(struct drm_sched_fence *fence);
>>>   +int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
>>> +                  struct drm_sched_fence *s_fence);
>>> +bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence 
>>> *s_fence);
>>> +int drm_sched_fence_set_parent(struct dma_fence *fence,
>>> +                   struct drm_sched_fence *s_fence);
>>> +
>>>   unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler 
>>> *sched);
>>>   void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
>>>                           unsigned long remaining);
>>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
index 7fd869520ef2..f6808f363261 100644
--- a/drivers/gpu/drm/scheduler/sched_fence.c
+++ b/drivers/gpu/drm/scheduler/sched_fence.c
@@ -77,6 +77,59 @@  static void drm_sched_fence_free_rcu(struct rcu_head *rcu)
 	if (!WARN_ON_ONCE(!fence))
 		kmem_cache_free(sched_fence_slab, fence);
 }
+/**
+ * drm_sched_job_done_cb - the callback for a done job
+ * @f: fence
+ * @cb: fence callbacks
+ */
+static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
+{
+	struct drm_sched_fence *s_fence = container_of(cb, struct drm_sched_fence,
+						       cb);
+	struct drm_gpu_scheduler *sched = s_fence->sched;
+
+	atomic_dec(&sched->hw_rq_count);
+	atomic_dec(sched->score);
+
+	dma_fence_get(&s_fence->finished);
+	drm_sched_fence_finished(s_fence);
+	dma_fence_put(&s_fence->finished);
+	wake_up_interruptible(&sched->wake_up_worker);
+}
+
+int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
+				  struct drm_sched_fence *s_fence)
+{
+	return dma_fence_add_callback(fence, &s_fence->cb,
+				      drm_sched_job_done_cb);
+}
+
+bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence)
+{
+	return dma_fence_remove_callback(s_fence->parent,
+					 &s_fence->cb);
+}
+
+/**
+ * drm_sched_fence_set_parent - set the parent fence and add the callback
+ * @fence: pointer to the hw fence
+ * @s_fence: pointer to the fence
+ *
+ * Set the parent fence and install the callback for a done job.
+ */
+int drm_sched_fence_set_parent(struct dma_fence *fence,
+			       struct drm_sched_fence *s_fence)
+{
+	if (s_fence->parent &&
+	   dma_fence_remove_callback(s_fence->parent, &s_fence->cb))
+		dma_fence_put(s_fence->parent);
+
+	s_fence->parent = dma_fence_get(fence);
+	/* Drop for original kref_init of the fence */
+	dma_fence_put(fence);
+	return dma_fence_add_callback(fence, &s_fence->cb,
+				      drm_sched_job_done_cb);
+}
 
 /**
  * drm_sched_fence_free - free up an uninitialized fence
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 4cc59bae38dd..cfb52e15f5b0 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -253,13 +253,12 @@  drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
 
 /**
  * drm_sched_job_done - complete a job
- * @s_job: pointer to the job which is done
+ * @s_fence: pointer to the fence of a done job
  *
  * Finish the job's fence and wake up the worker thread.
  */
-static void drm_sched_job_done(struct drm_sched_job *s_job)
+static void drm_sched_job_done(struct drm_sched_fence *s_fence)
 {
-	struct drm_sched_fence *s_fence = s_job->s_fence;
 	struct drm_gpu_scheduler *sched = s_fence->sched;
 
 	atomic_dec(&sched->hw_rq_count);
@@ -273,18 +272,6 @@  static void drm_sched_job_done(struct drm_sched_job *s_job)
 	wake_up_interruptible(&sched->wake_up_worker);
 }
 
-/**
- * drm_sched_job_done_cb - the callback for a done job
- * @f: fence
- * @cb: fence callbacks
- */
-static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
-{
-	struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
-
-	drm_sched_job_done(s_job);
-}
-
 /**
  * drm_sched_dependency_optimized - test if the dependency can be optimized
  *
@@ -505,8 +492,7 @@  void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 	list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list,
 					 list) {
 		if (s_job->s_fence->parent &&
-		    dma_fence_remove_callback(s_job->s_fence->parent,
-					      &s_job->cb)) {
+		    drm_sched_fence_remove_parent_cb(s_job->s_fence)) {
 			dma_fence_put(s_job->s_fence->parent);
 			s_job->s_fence->parent = NULL;
 			atomic_dec(&sched->hw_rq_count);
@@ -576,15 +562,14 @@  void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 			continue;
 
 		if (fence) {
-			r = dma_fence_add_callback(fence, &s_job->cb,
-						   drm_sched_job_done_cb);
+			r = drm_sched_fence_add_parent_cb(fence, s_job->s_fence);
 			if (r == -ENOENT)
-				drm_sched_job_done(s_job);
+				drm_sched_job_done(s_job->s_fence);
 			else if (r)
 				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
 					  r);
 		} else
-			drm_sched_job_done(s_job);
+			drm_sched_job_done(s_job->s_fence);
 	}
 
 	if (full_recovery) {
@@ -1049,14 +1034,9 @@  static int drm_sched_main(void *param)
 		drm_sched_fence_scheduled(s_fence);
 
 		if (!IS_ERR_OR_NULL(fence)) {
-			s_fence->parent = dma_fence_get(fence);
-			/* Drop for original kref_init of the fence */
-			dma_fence_put(fence);
-
-			r = dma_fence_add_callback(fence, &sched_job->cb,
-						   drm_sched_job_done_cb);
+			r = drm_sched_fence_set_parent(fence, s_fence);
 			if (r == -ENOENT)
-				drm_sched_job_done(sched_job);
+				drm_sched_job_done(s_fence);
 			else if (r)
 				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
 					  r);
@@ -1064,7 +1044,7 @@  static int drm_sched_main(void *param)
 			if (IS_ERR(fence))
 				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
 
-			drm_sched_job_done(sched_job);
+			drm_sched_job_done(s_fence);
 		}
 
 		wake_up(&sched->job_scheduled);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 1f7d9dd1a444..7258e2fa195f 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -281,6 +281,10 @@  struct drm_sched_fence {
          * @owner: job owner for debugging
          */
 	void				*owner;
+	/**
+	 * @cb: callback
+	 */
+	struct dma_fence_cb cb;
 };
 
 struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
@@ -300,7 +304,6 @@  struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
  *         be scheduled further.
  * @s_priority: the priority of the job.
  * @entity: the entity to which this job belongs.
- * @cb: the callback for the parent fence in s_fence.
  *
  * A job is created by the driver using drm_sched_job_init(), and
  * should call drm_sched_entity_push_job() once it wants the scheduler
@@ -325,7 +328,6 @@  struct drm_sched_job {
 	atomic_t			karma;
 	enum drm_sched_priority		s_priority;
 	struct drm_sched_entity         *entity;
-	struct dma_fence_cb		cb;
 	/**
 	 * @dependencies:
 	 *
@@ -559,6 +561,12 @@  void drm_sched_fence_free(struct drm_sched_fence *fence);
 void drm_sched_fence_scheduled(struct drm_sched_fence *fence);
 void drm_sched_fence_finished(struct drm_sched_fence *fence);
 
+int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
+				  struct drm_sched_fence *s_fence);
+bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence);
+int drm_sched_fence_set_parent(struct dma_fence *fence,
+			       struct drm_sched_fence *s_fence);
+
 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
 void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
 		                unsigned long remaining);