Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tweak synchronous memcpy implementation #1809

Merged
merged 29 commits into from
Feb 18, 2020
Merged
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
6848b7d
Fix late-coming issues.
AlexVlx Dec 7, 2019
86936d1
Update hip_runtime_api.h
AlexVlx Dec 7, 2019
002f2d2
use launch_fence instead of barrier_bit
jeffdaily Dec 9, 2019
6bf31e9
missing HipTest::freeArrays in hipMemcpyWithStream.cpp
jeffdaily Dec 9, 2019
86f627a
Merge pull request #1727 from jeffdaily/feature_memcpy_the_2nd
AlexVlx Dec 9, 2019
c2780ca
Merge branch 'master' of https://github.com/ROCm-Developer-Tools/HIP …
AlexVlx Dec 15, 2019
17e7007
Merge branch 'master' of https://github.com/ROCm-Developer-Tools/HIP …
AlexVlx Jan 23, 2020
ae950bd
Clean up noise.
AlexVlx Jan 23, 2020
125960b
Clean up more noise.
AlexVlx Jan 23, 2020
2ec8e34
Clean up even more noise.
AlexVlx Jan 23, 2020
c5a2fda
Final clean up.
AlexVlx Jan 23, 2020
9e2326b
Tweak synchronous memcpy.
AlexVlx Jan 23, 2020
a759682
Correct failed appy.
AlexVlx Jan 23, 2020
1a55cb4
Try to improve D2D.
AlexVlx Jan 23, 2020
187815d
No need for `hsa_memory_copy`, it's slow.
AlexVlx Jan 23, 2020
f7f3caf
Correct typo
AlexVlx Jan 23, 2020
a5b3e39
Correctly handle locked pointers.
AlexVlx Jan 24, 2020
638a09f
Remove dead code
AlexVlx Jan 29, 2020
aaa19e5
Guard one remaining danger-zone.
AlexVlx Jan 29, 2020
969e3bc
Merge branch 'master' of https://github.com/ROCm-Developer-Tools/HIP …
AlexVlx Jan 30, 2020
491d4f7
Use owner type as opposed to pointer type do decide copy direction.
AlexVlx Jan 30, 2020
a7de6b7
Merge branch 'master' of https://github.com/ROCm-Developer-Tools/HIP …
AlexVlx Jan 30, 2020
e1988a6
Handle degenerate case where an IPC pointer may be hoovered in.
AlexVlx Feb 3, 2020
9bfc9a4
Degenerate case should not be handled.
AlexVlx Feb 5, 2020
56af960
Tweak D2H for cases where LargeBAR is available.
AlexVlx Feb 5, 2020
b07b411
Fix typo.
AlexVlx Feb 5, 2020
08ed04d
Fix typo 2.
AlexVlx Feb 5, 2020
cc19247
Merge branch 'feature_memcpy_the_2nd' of https://github.com/ROCm-Deve…
AlexVlx Feb 16, 2020
5110eb3
Merge branch 'master' of https://github.com/ROCm-Developer-Tools/HIP …
AlexVlx Feb 16, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 43 additions & 62 deletions src/hip_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ __device__ uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];
namespace hip_internal {

namespace {

inline
const char* hsa_to_string(hsa_status_t err) noexcept
{
Expand Down Expand Up @@ -149,13 +148,14 @@ namespace {
const_cast<void*>(p), &r, nullptr, nullptr, nullptr),
__FILE__, __func__, __LINE__);

r.size = is_large_BAR || (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) ?
UINT32_MAX : sizeof(hsa_amd_pointer_info_t);
if (is_large_BAR) r.size = UINT32_MAX;
else if (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) r.size = INT32_MAX;

return r;
}

constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages.
constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages.
constexpr size_t max_std_memcpy_sz{8 * 1024}; // 8 KiB.

thread_local const std::unique_ptr<void, void (*)(void *)> staging_buffer{
[]() {
Expand Down Expand Up @@ -202,8 +202,8 @@ namespace {
} // Unnamed namespace.

inline
void do_copy(void* __restrict dst, const void* __restrict src, std::size_t n,
hsa_agent_t da, hsa_agent_t sa) {
void do_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_agent_t da, hsa_agent_t sa) {
hsa_signal_silent_store_relaxed(copy_signal, 1);
throwing_result_check(
hsa_amd_memory_async_copy(dst, da, src, sa, n, 0, nullptr, copy_signal),
Expand All @@ -224,10 +224,10 @@ void do_std_memcpy(
inline
void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_amd_pointer_info_t si) {
// TODO: characterise direct largeBAR reads from agent-allocated memory.
// if (si.size == UINT32_MAX) {
// return do_std_memcpy(dst, src, n);
// }
if (si.size == INT32_MAX) return do_std_memcpy(dst, src, n);
if (si.size == UINT32_MAX && n <= max_std_memcpy_sz) {
return do_std_memcpy(dst, src, n);
}

const auto di{info(dst)};

Expand Down Expand Up @@ -256,16 +256,17 @@ void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n,
inline
void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_amd_pointer_info_t di) {
if (di.size == UINT32_MAX) {
if (di.size == INT32_MAX) return do_std_memcpy(dst, src, n);
if (di.size == UINT32_MAX && n <= max_std_memcpy_sz) {
return do_std_memcpy(dst, src, n);
}

const auto si{info(const_cast<void*>(src))};

if (si.type == HSA_EXT_POINTER_TYPE_LOCKED) {
src = static_cast<char*>(si.agentBaseAddress) +
(static_cast<const char*>(src) -
static_cast<char*>(si.hostBaseAddress));
(static_cast<const char*>(src) -
static_cast<char*>(si.hostBaseAddress));
do_copy(dst, src, n, di.agentOwner, di.agentOwner);
}
else if (n <= staging_sz) {
Expand All @@ -288,53 +289,30 @@ void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n,
inline
void generic_copy(void* __restrict dst, const void* __restrict src, size_t n,
hsa_amd_pointer_info_t di, hsa_amd_pointer_info_t si) {
if (di.size == UINT32_MAX && si.size == UINT32_MAX) {
if (di.size == INT32_MAX && si.size == INT32_MAX) {
return do_std_memcpy(dst, src, n);
}
if (di.size == UINT32_MAX && si.size == UINT32_MAX &&
n <= max_std_memcpy_sz) {
return do_std_memcpy(dst, src, n);
}

std::unique_ptr<void, void (*)(void*)> lck0{
nullptr, [](void* p) { hsa_amd_memory_unlock(p); }};
std::unique_ptr<void, void (*)(void*)> lck1{nullptr, lck0.get_deleter()};

switch (si.type) {
case HSA_EXT_POINTER_TYPE_HSA:
if (di.type == HSA_EXT_POINTER_TYPE_HSA) {
hsa_memory_copy(dst, src, n);
return; // TODO: do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}

if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN ||
di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
return d2h_copy(dst, src, n, si);
}
break;
case HSA_EXT_POINTER_TYPE_LOCKED:
if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN) {
std::memcpy(dst, si.hostBaseAddress, n);

return;
}
if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
std::memcpy(di.hostBaseAddress, si.hostBaseAddress, n);

return;
switch (type(si.agentOwner)) {
case HSA_DEVICE_TYPE_GPU:
if (type(di.agentOwner) == HSA_DEVICE_TYPE_GPU) {
throwing_result_check(
hsa_amd_agents_allow_access(
1u, &si.agentOwner, nullptr, di.agentBaseAddress),
__FILE__, __func__, __LINE__);
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}
src = si.agentBaseAddress;
si.agentOwner = di.agentOwner;
break;
case HSA_EXT_POINTER_TYPE_UNKNOWN:
if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN) {
std::memcpy(dst, src, n);

return;
}
if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) {
std::memcpy(di.hostBaseAddress, src, n);

return;
return d2h_copy(dst, src, n, si);
case HSA_DEVICE_TYPE_CPU:
if (type(di.agentOwner) == HSA_DEVICE_TYPE_CPU) {
return do_std_memcpy(dst, src, n);
}
return h2d_copy(dst, src, n, di);
default: do_copy(dst, src, n, di.agentOwner, si.agentOwner); break;
default: throw std::runtime_error{"Unsupported copy type."};
}
}

Expand All @@ -343,14 +321,17 @@ void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n,
hipMemcpyKind k) noexcept {
switch (k) {
case hipMemcpyHostToHost: std::memcpy(dst, src, n); break;
case hipMemcpyHostToDevice:
return is_large_BAR ? do_std_memcpy(dst, src, n)
: h2d_copy(dst, src, n, info(dst));
case hipMemcpyDeviceToHost:
// TODO: characterise direct largeBAR reads from agent-allocated memory.
return /*is_large_BAR ? do_std_memcpy(dst, src, n)
: */d2h_copy(dst, src, n, info(src));
case hipMemcpyDeviceToDevice: hsa_memory_copy(dst, src, n); break;
case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, info(dst));
case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, info(src));
case hipMemcpyDeviceToDevice: {
const auto di{info(dst)};
const auto si{info(src)};
throwing_result_check(
hsa_amd_agents_allow_access(
1u, &si.agentOwner, nullptr, di.agentBaseAddress),
__FILE__, __func__, __LINE__);
return do_copy(dst, src, n, di.agentOwner, si.agentOwner);
}
default: return generic_copy(dst, src, n, info(dst), info(src));
}
}
Expand Down