Skip to content

Commit

Permalink
Tweak synchronous memcpy implementation (ROCm#1809)
Browse files Browse the repository at this point in the history
The existing one can have issues on certain systems, therefore this limits use of direct memcpy via largeBAR to sizes where it is unequivocally better.

Also addresses SWDEV-220030 and SWDEV-222237.
  • Loading branch information
AlexVlx authored Feb 18, 2020
1 parent 92cc29a commit 9b4f39e
Showing 1 changed file with 43 additions and 62 deletions.
105 changes: 43 additions & 62 deletions src/hip_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ __device__ uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];
namespace hip_internal {

namespace {

inline
const char* hsa_to_string(hsa_status_t err) noexcept
{
Expand Down Expand Up @@ -149,13 +148,14 @@ namespace {
const_cast<void*>(p), &r, nullptr, nullptr, nullptr),
__FILE__, __func__, __LINE__);

r.size = is_large_BAR || (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) ?
UINT32_MAX : sizeof(hsa_amd_pointer_info_t);
if (is_large_BAR) r.size = UINT32_MAX;
else if (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) r.size = INT32_MAX;

return r;
}

constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages.
constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages.
constexpr size_t max_std_memcpy_sz{8 * 1024}; // 8 KiB.

thread_local const std::unique_ptr<void, void (*)(void *)> staging_buffer{
[]() {
Expand Down Expand Up @@ -202,8 +202,8 @@ namespace {
} // Unnamed namespace.

inline
void do_copy(void* __restrict dst, const void* __restrict src, std::size_t n,
hsa_agent_t da, hsa_agent_t sa) {
void do_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_agent_t da, hsa_agent_t sa) {
hsa_signal_silent_store_relaxed(copy_signal, 1);
throwing_result_check(
hsa_amd_memory_async_copy(dst, da, src, sa, n, 0, nullptr, copy_signal),
Expand All @@ -224,10 +224,10 @@ void do_std_memcpy(
inline
void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_amd_pointer_info_t si) {
// TODO: characterise direct largeBAR reads from agent-allocated memory.
// if (si.size == UINT32_MAX) {
// return do_std_memcpy(dst, src, n);
// }
if (si.size == INT32_MAX) return do_std_memcpy(dst, src, n);
if (si.size == UINT32_MAX && n <= max_std_memcpy_sz) {
return do_std_memcpy(dst, src, n);
}

const auto di{info(dst)};

Expand Down Expand Up @@ -256,16 +256,17 @@ void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n,
inline
void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_amd_pointer_info_t di) {
if (di.size == UINT32_MAX) {
if (di.size == INT32_MAX) return do_std_memcpy(dst, src, n);
if (di.size == UINT32_MAX && n <= max_std_memcpy_sz) {
return do_std_memcpy(dst, src, n);
}

const auto si{info(const_cast<void*>(src))};

if (si.type == HSA_EXT_POINTER_TYPE_LOCKED) {
src = static_cast<char*>(si.agentBaseAddress) +
(static_cast<const char*>(src) -
static_cast<char*>(si.hostBaseAddress));
(static_cast<const char*>(src) -
static_cast<char*>(si.hostBaseAddress));
do_copy(dst, src, n, di.agentOwner, di.agentOwner);
}
else if (n <= staging_sz) {
Expand All @@ -288,53 +289,30 @@ void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n,
inline
void generic_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_amd_pointer_info_t di, hsa_amd_pointer_info_t si) {
if (di.size == UINT32_MAX && si.size == UINT32_MAX) {
if (di.size == INT32_MAX && si.size == INT32_MAX) {
return do_std_memcpy(dst, src, n);
}
if (di.size == UINT32_MAX && si.size == UINT32_MAX &&
n <= max_std_memcpy_sz) {
return do_std_memcpy(dst, src, n);
}

std::unique_ptr<void, void (*)(void*)> lck0{
nullptr, [](void* p) { hsa_amd_memory_unlock(p); }};
std::unique_ptr<void, void (*)(void*)> lck1{nullptr, lck0.get_deleter()};

switch (si.type) {
case HSA_EXT_POINTER_TYPE_HSA:
if (di.type == HSA_EXT_POINTER_TYPE_HSA) {
hsa_memory_copy(dst, src, n);
return; // TODO: do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}

if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN ||
di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
return d2h_copy(dst, src, n, si);
}
break;
case HSA_EXT_POINTER_TYPE_LOCKED:
if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN) {
std::memcpy(dst, si.hostBaseAddress, n);

return;
}
if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
std::memcpy(di.hostBaseAddress, si.hostBaseAddress, n);

return;
switch (type(si.agentOwner)) {
case HSA_DEVICE_TYPE_GPU:
if (type(di.agentOwner) == HSA_DEVICE_TYPE_GPU) {
throwing_result_check(
hsa_amd_agents_allow_access(
1u, &si.agentOwner, nullptr, di.agentBaseAddress),
__FILE__, __func__, __LINE__);
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}
src = si.agentBaseAddress;
si.agentOwner = di.agentOwner;
break;
case HSA_EXT_POINTER_TYPE_UNKNOWN:
if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN) {
std::memcpy(dst, src, n);

return;
}
if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
std::memcpy(di.hostBaseAddress, src, n);

return;
return d2h_copy(dst, src, n, si);
case HSA_DEVICE_TYPE_CPU:
if (type(di.agentOwner) == HSA_DEVICE_TYPE_CPU) {
return do_std_memcpy(dst, src, n);
}
return h2d_copy(dst, src, n, di);
default: do_copy(dst, src, n, di.agentOwner, si.agentOwner); break;
default: throw std::runtime_error{"Unsupported copy type."};
}
}

Expand All @@ -343,14 +321,17 @@ void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n,
hipMemcpyKind k) noexcept {
switch (k) {
case hipMemcpyHostToHost: std::memcpy(dst, src, n); break;
case hipMemcpyHostToDevice:
return is_large_BAR ? do_std_memcpy(dst, src, n)
: h2d_copy(dst, src, n, info(dst));
case hipMemcpyDeviceToHost:
// TODO: characterise direct largeBAR reads from agent-allocated memory.
return /*is_large_BAR ? do_std_memcpy(dst, src, n)
: */d2h_copy(dst, src, n, info(src));
case hipMemcpyDeviceToDevice: hsa_memory_copy(dst, src, n); break;
case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, info(dst));
case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, info(src));
case hipMemcpyDeviceToDevice: {
const auto di{info(dst)};
const auto si{info(src)};
throwing_result_check(
hsa_amd_agents_allow_access(
1u, &si.agentOwner, nullptr, di.agentBaseAddress),
__FILE__, __func__, __LINE__);
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}
default: return generic_copy(dst, src, n, info(dst), info(src));
}
}
Expand Down

0 comments on commit 9b4f39e

Please sign in to comment.