Skip to content

Commit

Permalink
drm/amdgpu: Add reset_context flag for host FLR
Browse files Browse the repository at this point in the history
[ Upstream commit 25c0119 ]

There are other reset sources that pass NULL as the job pointer, such as
amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if
the FLR comes from the host does not work.

Add a flag in reset_context to explicitly mark a host-triggered reset, and
set this flag when we receive a host reset notification.

Signed-off-by: Yunxiang Li <[email protected]>
Reviewed-by: Emily Deng <[email protected]>
Reviewed-by: Zhigang Luo <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
Stable-dep-of: 6e4aa08 ("drm/amdgpu: Fix amdgpu_device_reset_sriov retry logic")
Signed-off-by: Sasha Levin <[email protected]>
  • Loading branch information
yunxiali authored and gregkh committed Sep 12, 2024
1 parent 1f49070 commit 3adb4ae
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 5 deletions.
13 changes: 8 additions & 5 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -5057,13 +5057,13 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
* amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
*
* @adev: amdgpu_device pointer
* @from_hypervisor: request from hypervisor
* @reset_context: amdgpu reset context pointer
*
* do VF FLR and reinitialize Asic
* return 0 means succeeded otherwise failed
*/
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
bool from_hypervisor)
struct amdgpu_reset_context *reset_context)
{
int r;
struct amdgpu_hive_info *hive = NULL;
Expand All @@ -5072,12 +5072,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
retry:
amdgpu_amdkfd_pre_reset(adev);

if (from_hypervisor)
if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
r = amdgpu_virt_request_full_gpu(adev, true);
else
} else {
r = amdgpu_virt_reset_gpu(adev);
}
if (r)
return r;

amdgpu_ras_set_fed(adev, false);
amdgpu_irq_gpu_reset_resume_helper(adev);

Expand Down Expand Up @@ -5831,7 +5834,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* Actual ASIC resets if needed.*/
/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
r = amdgpu_device_reset_sriov(adev, job ? false : true);
r = amdgpu_device_reset_sriov(adev, reset_context);
if (r)
adev->asic_reset_res = r;

Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS {
AMDGPU_NEED_FULL_RESET = 0,
AMDGPU_SKIP_HW_RESET = 1,
AMDGPU_SKIP_COREDUMP = 2,
AMDGPU_HOST_FLR = 3,
};

struct amdgpu_reset_context {
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_HOST_FLR, &reset_context.flags);

amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_HOST_FLR, &reset_context.flags);

amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_HOST_FLR, &reset_context.flags);

amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
Expand Down

0 comments on commit 3adb4ae

Please sign in to comment.