Skip to content

Commit

Permalink
Refactor gpu data up-/download; improve performance by using CPU-cached buffers for download
Browse files Browse the repository at this point in the history
  • Loading branch information
FabianWildgrube committed Jan 19, 2024
1 parent 4ce19d9 commit 9872498
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 43 deletions.
11 changes: 11 additions & 0 deletions src/pal/pal_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,17 @@ Pal::GpuHeap find_gpu_local_heap(const Pal::IDevice* device, Pal::gpusize memory
return Pal::GpuHeap::GpuHeapCount;
}

// Reports whether the host (CPU) can access `gpu_allocation` directly.
//
// The allocation is treated as host visible unless one of the heaps listed in its
// memory descriptor is Pal::GpuHeap::GpuHeapInvisible. Callers use a `false` result
// to route transfers through a temporary CPU-visible staging buffer.
//
// Fix: the predicate previously returned true when it found GpuHeapInvisible, which is
// the opposite of what the name states and of what the `!allocation_is_host_visible(...)`
// call sites expect.
bool allocation_is_host_visible(Pal::IGpuMemory* gpu_allocation) {
    const Pal::GpuMemoryDesc& memory_desc = gpu_allocation->Desc();
    for (Pal::uint32 i = 0; i < memory_desc.heapCount; ++i) {
        if (memory_desc.heaps[i] == Pal::GpuHeap::GpuHeapInvisible) {
            // A single invisible heap is enough to rule out direct host access.
            return false;
        }
    }

    return true;
}

ShaderSrc::ShaderSrc(const std::string& filename, const std::string& src_code, const std::string& kernelname)
: kernelname(kernelname)
, src_code(src_code)
Expand Down
2 changes: 2 additions & 0 deletions src/pal/pal_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ void read_from_memory(void* dst_buffer, Pal::IGpuMemory* src_memory, int64_t src
// Returns Pal::GpuHeap::GpuHeapCount if no appropriate heap can be found.
Pal::GpuHeap find_gpu_local_heap(const Pal::IDevice* device, Pal::gpusize memory_size);

// Queries whether the host can access `gpu_allocation` directly.
// PALPlatform::copy_from_host / copy_to_host go through a temporary staging buffer when
// this returns false, and otherwise hand the allocation straight to
// pal_utils::write_to_memory / pal_utils::read_from_memory.
bool allocation_is_host_visible(Pal::IGpuMemory* gpu_allocation);

llvm::MDNode* get_metadata_mdnode(const llvm::Function* func, const char* key, int index = 0);
llvm::StringRef get_metadata_string(const llvm::Function* func, const char* key);
uint64_t get_metadata_uint(const llvm::Function* func, const char* key, int index = 0);
Expand Down
74 changes: 32 additions & 42 deletions src/pal_platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,11 +182,6 @@ void* PALPlatform::alloc_unified(DeviceId dev, int64_t size) {
return reinterpret_cast<void*>(device.allocate_shared_virtual_memory(size));
}

// Allocates `size` bytes on device `dev` from Pal::GpuHeap::GpuHeapLocal and hands the
// resulting address back as an opaque pointer.
void* PALPlatform::alloc_upload(DeviceId dev, int64_t size) {
    const auto gpu_address = devices_[dev].allocate_gpu_memory(size, Pal::GpuHeap::GpuHeapLocal);
    return reinterpret_cast<void*>(gpu_address);
}

void PALPlatform::release(DeviceId dev, void* ptr) {
auto& device = devices_[dev];
device.release_gpu_memory(ptr);
Expand Down Expand Up @@ -248,49 +243,44 @@ void PALPlatform::copy(DeviceId dev_src, const void* src, int64_t offset_src, De

// Copies `size` bytes from host memory (`src` + `offset_src`) into the GPU allocation
// `dst` at `offset_dst` on device `dev_dst`.
// NOTE(review): this span is taken from a rendered diff and interleaves the pre-change
// implementation (inline heap loop + alloc_upload) with the post-change implementation
// (pal_utils::allocation_is_host_visible); it is not one compilable function body.
void PALPlatform::copy_from_host(
const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size) {
Pal::IGpuMemory* dst_memory = devices_[dev_dst].get_memory_object(dst);

// Query if GPU memory is host visible.
const Pal::GpuMemoryDesc& dst_memory_desc = dst_memory->Desc();
for (Pal::uint32 i = 0; i < dst_memory_desc.heapCount; ++i) {
if (dst_memory_desc.heaps[i] == Pal::GpuHeap::GpuHeapInvisible) {
// Create a temporary CPU-visible buffer on the GPU.
// Upload the data from the host and then copy into the given GPU destination buffer.
void* virtual_gpu_address = alloc_upload(dev_dst, size);
Pal::IGpuMemory* intermediate_mem = devices_[dev_dst].get_memory_object(virtual_gpu_address);
pal_utils::write_to_memory(
intermediate_mem, 0, static_cast<const uint8_t*>(src) + offset_src, size);

copy(dev_dst, virtual_gpu_address, 0, dev_dst, dst, offset_dst, size);
release(dev_dst, virtual_gpu_address);
return;
}
auto& device = devices_[dev_dst];
Pal::IGpuMemory* dst_memory = device.get_memory_object(dst);

// NOTE(review): the staging path below runs whenever the helper returns false;
// confirm the helper returns true only for allocations the CPU can really map.
if (!pal_utils::allocation_is_host_visible(dst_memory)) {
// Create a temporary CPU-visible buffer on the GPU.
// memcpy the data from the host space to the CPU-visible buffer
// and then copy into the given GPU destination buffer.
void* virtual_gpu_address = reinterpret_cast<void*>(device.allocate_gpu_memory(size, Pal::GpuHeap::GpuHeapLocal));
Pal::IGpuMemory* intermediate_mem = device.get_memory_object(virtual_gpu_address);
// The staging buffer is filled from its start (offset 0); offset_dst is applied
// only by the device-side copy that follows.
pal_utils::write_to_memory(
intermediate_mem, 0, static_cast<const uint8_t*>(src) + offset_src, size);

copy(dev_dst, virtual_gpu_address, 0, dev_dst, dst, offset_dst, size);
// The temporary buffer is released right after the copy() call returns.
release(dev_dst, virtual_gpu_address);
} else {
// Map and memcpy directly to CPU-visible GPU allocation.
pal_utils::write_to_memory(dst_memory, offset_dst, static_cast<const uint8_t*>(src) + offset_src, size);
}

pal_utils::write_to_memory(dst_memory, offset_dst, static_cast<const uint8_t*>(src) + offset_src, size);
}

// Copies `size` bytes from the GPU allocation `src` (+ `offset_src`) on device `dev_src`
// into host memory at `dst` + `offset_dst`.
// NOTE(review): this span is taken from a rendered diff and interleaves the pre-change
// implementation (inline heap loop + alloc_upload) with the post-change implementation
// (pal_utils::allocation_is_host_visible); it is not one compilable function body.
void PALPlatform::copy_to_host(
DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size) {
Pal::IGpuMemory* src_memory = devices_[dev_src].get_memory_object(src);

// Query if GPU memory is host visible.
const Pal::GpuMemoryDesc& src_memory_desc = src_memory->Desc();
for (Pal::uint32 i = 0; i < src_memory_desc.heapCount; ++i) {
if (src_memory_desc.heaps[i] == Pal::GpuHeap::GpuHeapInvisible) {
// Create a temporary CPU-visible buffer on the GPU and copy the GPU only data into the
// CPU-visible buffer, then memcpy the data to the host destination space.
void* virtual_gpu_address = alloc_upload(dev_src, size);
copy(dev_src, src, offset_src, dev_src, virtual_gpu_address, 0, size);

Pal::IGpuMemory* intermediate_mem = devices_[dev_src].get_memory_object(virtual_gpu_address);
pal_utils::read_from_memory(static_cast<uint8_t*>(dst) + offset_dst, intermediate_mem, 0, size);
release(dev_src, virtual_gpu_address);
return;
}
auto& device = devices_[dev_src];
Pal::IGpuMemory* src_memory = device.get_memory_object(src);

// NOTE(review): the staging path below runs whenever the helper returns false;
// confirm the helper returns true only for allocations the CPU can really map.
if (!pal_utils::allocation_is_host_visible(src_memory)) {
// Create a temporary GPU-visible cached buffer on the CPU and copy into that
// CPU-visible buffer, then memcpy the data to the host destination space.
// Using a cached buffer seems to be crucial for performance!
void* virtual_gpu_address = reinterpret_cast<void*>(device.allocate_gpu_memory(size, Pal::GpuHeap::GpuHeapGartCacheable));
// offset_src is consumed by this device-side copy; the staging buffer is then
// read from its start (offset 0).
copy(dev_src, src, offset_src, dev_src, virtual_gpu_address, 0, size);

Pal::IGpuMemory* intermediate_mem = device.get_memory_object(virtual_gpu_address);
pal_utils::read_from_memory(static_cast<uint8_t*>(dst) + offset_dst, intermediate_mem, 0, size);
// The temporary buffer is released once its contents have been read back.
release(dev_src, virtual_gpu_address);
} else {
// Read straight from the source allocation, honouring offset_src.
pal_utils::read_from_memory(static_cast<uint8_t*>(dst) + offset_dst, src_memory, offset_src, size);
}

pal_utils::read_from_memory(static_cast<uint8_t*>(dst) + offset_dst, src_memory, offset_src, size);
}

Pal::IPipeline* PALPlatform::load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname) {
Expand Down
1 change: 0 additions & 1 deletion src/pal_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ class PALPlatform : public Platform {
void* alloc(DeviceId dev, int64_t size) override;
void* alloc_host(DeviceId dev, int64_t size) override;
void* alloc_unified(DeviceId dev, int64_t size) override;
void* alloc_upload(DeviceId dev, int64_t size);
// get_device_ptr is not offered by this platform: the call is forwarded to
// command_unavailable("get_device_ptr").
// NOTE(review): there is no return statement although the return type is void* —
// presumably command_unavailable never returns; confirm.
void* get_device_ptr(DeviceId, void*) override { command_unavailable("get_device_ptr"); }
void release(DeviceId dev, void* ptr) override;
void release_host(DeviceId dev, void* ptr) override;
Expand Down

0 comments on commit 9872498

Please sign in to comment.