Skip to content
/ linux Public
forked from torvalds/linux

Commit

Permalink
drm/amdgpu/gfx12: fallback to driver reset compute queue directly
Browse files Browse the repository at this point in the history
Since the MES FW resets kernel compute queue always failed, this
may caused by the KIQ failed to process unmap KCQ. So, before MES
FW work properly that will fallback to driver executes dequeue and
resets SPI directly. Besides, rework the ring reset function and make
the busy ring type reset in each function respectively.

Acked-by: Vitaly Prosyak <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
  • Loading branch information
alexdeucher committed Sep 2, 2024
1 parent 2480599 commit 8fe4fde
Showing 1 changed file with 79 additions and 14 deletions.
93 changes: 79 additions & 14 deletions drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
Original file line number Diff line number Diff line change
Expand Up @@ -2916,13 +2916,13 @@ static int gfx_v12_0_gfx_mqd_init(struct amdgpu_device *adev, void *m,
return 0;
}

static int gfx_v12_0_gfx_init_queue(struct amdgpu_ring *ring)
static int gfx_v12_0_kgq_init_queue(struct amdgpu_ring *ring, bool reset)
{
struct amdgpu_device *adev = ring->adev;
struct v12_gfx_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.gfx_ring[0];

if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(*mqd));
mutex_lock(&adev->srbm_mutex);
soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
Expand Down Expand Up @@ -2958,7 +2958,7 @@ static int gfx_v12_0_cp_async_gfx_ring_resume(struct amdgpu_device *adev)

r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
if (!r) {
r = gfx_v12_0_gfx_init_queue(ring);
r = gfx_v12_0_kgq_init_queue(ring, false);
amdgpu_bo_kunmap(ring->mqd_obj);
ring->mqd_ptr = NULL;
}
Expand Down Expand Up @@ -3262,13 +3262,13 @@ static int gfx_v12_0_kiq_init_queue(struct amdgpu_ring *ring)
return 0;
}

static int gfx_v12_0_kcq_init_queue(struct amdgpu_ring *ring)
static int gfx_v12_0_kcq_init_queue(struct amdgpu_ring *ring, bool reset)
{
struct amdgpu_device *adev = ring->adev;
struct v12_compute_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0];

if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(*mqd));
mutex_lock(&adev->srbm_mutex);
soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
Expand Down Expand Up @@ -3332,7 +3332,7 @@ static int gfx_v12_0_kcq_resume(struct amdgpu_device *adev)
goto done;
r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
if (!r) {
r = gfx_v12_0_kcq_init_queue(ring);
r = gfx_v12_0_kcq_init_queue(ring, false);
amdgpu_bo_kunmap(ring->mqd_obj);
ring->mqd_ptr = NULL;
}
Expand Down Expand Up @@ -5158,18 +5158,83 @@ static void gfx_v12_ip_dump(void *handle)
amdgpu_gfx_off_ctrl(adev, true);
}

static int gfx_v12_0_reset_ring(struct amdgpu_ring *ring, unsigned int vmid)
static int gfx_v12_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
{
struct amdgpu_device *adev = ring->adev;
int r;

r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid);
if (r)
if (r) {
dev_err(adev->dev, "reset via MES failed %d\n", r);
return r;
}

/* reset the ring */
ring->wptr = 0;
*ring->wptr_cpu_addr = 0;
amdgpu_ring_clear_ring(ring);
r = amdgpu_bo_reserve(ring->mqd_obj, false);
if (unlikely(r != 0)) {
dev_err(adev->dev, "fail to resv mqd_obj\n");
return r;
}
r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
if (!r) {
r = gfx_v12_0_kgq_init_queue(ring, true);
amdgpu_bo_kunmap(ring->mqd_obj);
ring->mqd_ptr = NULL;
}
amdgpu_bo_unreserve(ring->mqd_obj);
if (r) {
DRM_ERROR("fail to unresv mqd_obj\n");
return r;
}

r = amdgpu_mes_map_legacy_queue(adev, ring);
if (r) {
dev_err(adev->dev, "failed to remap kgq\n");
return r;
}

return amdgpu_ring_test_ring(ring);
}

static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring, unsigned int vmid)
{
struct amdgpu_device *adev = ring->adev;
int r, i;

gfx_v12_0_set_safe_mode(adev, 0);
mutex_lock(&adev->srbm_mutex);
soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0x2);
WREG32_SOC15(GC, 0, regSPI_COMPUTE_QUEUE_RESET, 0x1);
for (i = 0; i < adev->usec_timeout; i++) {
if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
break;
udelay(1);
}
soc24_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
gfx_v12_0_unset_safe_mode(adev, 0);

r = amdgpu_bo_reserve(ring->mqd_obj, false);
if (unlikely(r != 0)) {
DRM_ERROR("fail to resv mqd_obj\n");
return r;
}
r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
if (!r) {
r = gfx_v12_0_kcq_init_queue(ring, true);
amdgpu_bo_kunmap(ring->mqd_obj);
ring->mqd_ptr = NULL;
}
amdgpu_bo_unreserve(ring->mqd_obj);
if (r) {
DRM_ERROR("fail to unresv mqd_obj\n");
return r;
}
r = amdgpu_mes_map_legacy_queue(adev, ring);
if (r) {
dev_err(adev->dev, "failed to remap kcq\n");
return r;
}

return amdgpu_ring_test_ring(ring);
}
Expand Down Expand Up @@ -5236,7 +5301,7 @@ static const struct amdgpu_ring_funcs gfx_v12_0_ring_funcs_gfx = {
.emit_reg_write_reg_wait = gfx_v12_0_ring_emit_reg_write_reg_wait,
.soft_recovery = gfx_v12_0_ring_soft_recovery,
.emit_mem_sync = gfx_v12_0_emit_mem_sync,
.reset = gfx_v12_0_reset_ring,
.reset = gfx_v12_0_reset_kgq,
};

static const struct amdgpu_ring_funcs gfx_v12_0_ring_funcs_compute = {
Expand Down Expand Up @@ -5271,7 +5336,7 @@ static const struct amdgpu_ring_funcs gfx_v12_0_ring_funcs_compute = {
.emit_reg_write_reg_wait = gfx_v12_0_ring_emit_reg_write_reg_wait,
.soft_recovery = gfx_v12_0_ring_soft_recovery,
.emit_mem_sync = gfx_v12_0_emit_mem_sync,
.reset = gfx_v12_0_reset_ring,
.reset = gfx_v12_0_reset_kcq,
};

static const struct amdgpu_ring_funcs gfx_v12_0_ring_funcs_kiq = {
Expand Down

0 comments on commit 8fe4fde

Please sign in to comment.