diff mbox series

[1/2] drm/amdgpu: Don't query CE and UE errors

Message ID 20210512170302.64951-1-luben.tuikov@amd.com
State New
Headers show
Series [1/2] drm/amdgpu: Don't query CE and UE errors | expand

Commit Message

Luben Tuikov May 12, 2021, 5:03 p.m. UTC
On QUERY2 IOCTL don't query counts of correctable
and uncorrectable errors, since when RAS is
enabled and supported on Vega20 server boards,
this takes an insurmountably long time, in O(n^3),
which slows the system down to the point of it
being unusable when we have a GUI up.

Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2")
Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Cc: stable@vger.kernel.org
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 26 ++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

Comments

Luben Tuikov May 13, 2021, 7:37 p.m. UTC | #1
On 2021-05-13 3:56 a.m., Christian König wrote:
>
> Am 12.05.21 um 19:03 schrieb Luben Tuikov:
>> On QUERY2 IOCTL don't query counts of correctable
>> and uncorrectable errors, since when RAS is
>> enabled and supported on Vega20 server boards,
>> this takes insurmountably long time, in O(n^3),
>> which slows the system down to the point of it
>> being unusable when we have GUI up.
>>
>> Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2")
>> Cc: Alexander Deucher <Alexander.Deucher@amd.com>
>> Cc: stable@vger.kernel.org
>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 26 ++++++++++++-------------
>>   1 file changed, 13 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index 01fe60fedcbe..d481a33f4eaf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -363,19 +363,19 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>>   		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
>>   
>>   	/*query ue count*/
>> -	ras_counter = amdgpu_ras_query_error_count(adev, false);
>> -	/*ras counter is monotonic increasing*/
>> -	if (ras_counter != ctx->ras_counter_ue) {
>> -		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
>> -		ctx->ras_counter_ue = ras_counter;
>> -	}
>> -
>> -	/*query ce count*/
>> -	ras_counter = amdgpu_ras_query_error_count(adev, true);
>> -	if (ras_counter != ctx->ras_counter_ce) {
>> -		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
>> -		ctx->ras_counter_ce = ras_counter;
>> -	}
>> +	/* ras_counter = amdgpu_ras_query_error_count(adev, false); */
>> +	/* /\*ras counter is monotonic increasing*\/ */
>> +	/* if (ras_counter != ctx->ras_counter_ue) { */
>> +	/* 	out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */
>> +	/* 	ctx->ras_counter_ue = ras_counter; */
>> +	/* } */
>> +
>> +	/* /\*query ce count*\/ */
>> +	/* ras_counter = amdgpu_ras_query_error_count(adev, true); */
>> +	/* if (ras_counter != ctx->ras_counter_ce) { */
>> +	/* 	out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */
>> +	/* 	ctx->ras_counter_ce = ras_counter; */
>> +	/* } */
> Please completely drop the code. We usually don't keep commented-out
> code in the driver.

1. Alex suggested this when we chatted--this is why it is commented.
2. He suggested the same thing last night and 2.5 hours before your email,
    I posted a patch in which the code is commented out--did you not see it?
    It's threaded, it appears above, 2.5 hours before your email.

Regards,
Luben

>
> With that done the patch is Reviewed-by: Christian König 
> <christian.koenig@amd.com>
>
> Christian.
>
>>   
>>   	mutex_unlock(&mgr->lock);
>>   	return 0;
Christian König May 14, 2021, 7:31 a.m. UTC | #2
Am 13.05.21 um 21:37 schrieb Luben Tuikov:
> On 2021-05-13 3:56 a.m., Christian König wrote:

>> Am 12.05.21 um 19:03 schrieb Luben Tuikov:

>>> On QUERY2 IOCTL don't query counts of correctable

>>> and uncorrectable errors, since when RAS is

>>> enabled and supported on Vega20 server boards,

>>> this takes insurmountably long time, in O(n^3),

>>> which slows the system down to the point of it

>>> being unusable when we have GUI up.

>>>

>>> Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2")

>>> Cc: Alexander Deucher <Alexander.Deucher@amd.com>

>>> Cc: stable@vger.kernel.org

>>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>

>>> ---

>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 26 ++++++++++++-------------

>>>    1 file changed, 13 insertions(+), 13 deletions(-)

>>>

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

>>> index 01fe60fedcbe..d481a33f4eaf 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

>>> @@ -363,19 +363,19 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,

>>>    		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;

>>>    

>>>    	/*query ue count*/

>>> -	ras_counter = amdgpu_ras_query_error_count(adev, false);

>>> -	/*ras counter is monotonic increasing*/

>>> -	if (ras_counter != ctx->ras_counter_ue) {

>>> -		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;

>>> -		ctx->ras_counter_ue = ras_counter;

>>> -	}

>>> -

>>> -	/*query ce count*/

>>> -	ras_counter = amdgpu_ras_query_error_count(adev, true);

>>> -	if (ras_counter != ctx->ras_counter_ce) {

>>> -		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;

>>> -		ctx->ras_counter_ce = ras_counter;

>>> -	}

>>> +	/* ras_counter = amdgpu_ras_query_error_count(adev, false); */

>>> +	/* /\*ras counter is monotonic increasing*\/ */

>>> +	/* if (ras_counter != ctx->ras_counter_ue) { */

>>> +	/* 	out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */

>>> +	/* 	ctx->ras_counter_ue = ras_counter; */

>>> +	/* } */

>>> +

>>> +	/* /\*query ce count*\/ */

>>> +	/* ras_counter = amdgpu_ras_query_error_count(adev, true); */

>>> +	/* if (ras_counter != ctx->ras_counter_ce) { */

>>> +	/* 	out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */

>>> +	/* 	ctx->ras_counter_ce = ras_counter; */

>>> +	/* } */

>> Please completely drop the code. We usually don't keep commented out

>> code in the driver.

> 1. Alex suggested this when we chatted--this is why it is commented.


Sounds like a misunderstanding to me; usually it is Alex who insists on
dropping the code.

> 2. He suggested the same thing last night and 2.5 hours before your email,

>      I posted a patch in which the code is commented out--did you not see it?

>      It's threaded, it appears above, 2.5 hours before your email.


Sorry for the redundancy; I hadn't seen that in my inbox yet when I
wrote the reply.

Regards,
Christian.

>

> Regards,

> Luben

>

>> With that done the patch is Reviewed-by: Christian König

>> <christian.koenig@amd.com>

>>

>> Christian.

>>

>>>    

>>>    	mutex_unlock(&mgr->lock);

>>>    	return 0;
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 01fe60fedcbe..d481a33f4eaf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -363,19 +363,19 @@  static int amdgpu_ctx_query2(struct amdgpu_device *adev,
 		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
 
 	/*query ue count*/
-	ras_counter = amdgpu_ras_query_error_count(adev, false);
-	/*ras counter is monotonic increasing*/
-	if (ras_counter != ctx->ras_counter_ue) {
-		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
-		ctx->ras_counter_ue = ras_counter;
-	}
-
-	/*query ce count*/
-	ras_counter = amdgpu_ras_query_error_count(adev, true);
-	if (ras_counter != ctx->ras_counter_ce) {
-		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
-		ctx->ras_counter_ce = ras_counter;
-	}
+	/* ras_counter = amdgpu_ras_query_error_count(adev, false); */
+	/* /\*ras counter is monotonic increasing*\/ */
+	/* if (ras_counter != ctx->ras_counter_ue) { */
+	/* 	out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */
+	/* 	ctx->ras_counter_ue = ras_counter; */
+	/* } */
+
+	/* /\*query ce count*\/ */
+	/* ras_counter = amdgpu_ras_query_error_count(adev, true); */
+	/* if (ras_counter != ctx->ras_counter_ce) { */
+	/* 	out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */
+	/* 	ctx->ras_counter_ce = ras_counter; */
+	/* } */
 
 	mutex_unlock(&mgr->lock);
 	return 0;