Fixes for remote-mqpu platform (NVIDIA#1158)
(1) Clean shutdown behavior

- Client side: enforce the cleanup order: QPUs first, then any auto-launched
  server instances (sketched below). This prevents stray errors during
  shutdown where the server is torn down before the client, causing the
  client to report an error just a moment before it is destructed anyway.

- Server app: make sure that we always have a loaded simulator in the
  runtime to prevent a segfault during shutdown when the runtime tries
  to find one.
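
For context on the client-side ordering: C++ destroys data members in reverse declaration order (and derived-class members before anything owned by a base class), so with a defaulted destructor the auto-launched server handles can be released while the QPUs that still talk to them are alive. Below is a minimal sketch of the ordering fix, using illustrative stand-in types rather than the actual platform classes:

```cpp
#include <memory>
#include <vector>

// Illustrative stand-ins (not the actual CUDA-Q types).
struct RemoteQpu {
  ~RemoteQpu() { /* sends a final request to its server on teardown */ }
};
struct AutoLaunchedServer {
  ~AutoLaunchedServer() { /* terminates the auto-launched server process */ }
};

class MultiQpuPlatformSketch {
  std::vector<std::unique_ptr<RemoteQpu>> platformQPUs;
  std::vector<std::unique_ptr<AutoLaunchedServer>> m_remoteServers;

public:
  ~MultiQpuPlatformSketch() {
    // A defaulted destructor would destroy members in reverse declaration
    // order, i.e. m_remoteServers before platformQPUs, so a QPU could try to
    // reach a server that is already gone. Clearing explicitly pins the
    // order: QPUs first, then the servers they depend on.
    platformQPUs.clear();
    m_remoteServers.clear();
  }
};
```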

(2) Harden the tensornet scratch memory allocation against a race condition.
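
Point (2) addresses a time-of-check/time-of-use gap: another context can allocate GPU memory between the cudaMemGetInfo query and the cudaMalloc call, so the first allocation attempt may fail with cudaErrorMemoryAllocation even though the queried free size looked sufficient. A condensed sketch of that query/allocate/retry pattern follows (plain CUDA runtime calls; error handling is simplified relative to the actual change):

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// Pick a scratch size from the currently reported free memory:
// half of it, rounded down to a 4096-byte multiple.
static std::size_t queryScratchSize() {
  std::size_t freeSize = 0, totalSize = 0;
  cudaMemGetInfo(&freeSize, &totalSize);
  return (freeSize - (freeSize % 4096)) / 2;
}

// Allocate scratch memory, retrying once if another allocation raced us
// between the query and the cudaMalloc call.
static void *allocateScratch(std::size_t &size) {
  size = queryScratchSize();
  void *ptr = nullptr;
  cudaError_t err = cudaMalloc(&ptr, size);
  if (err == cudaErrorMemoryAllocation) {
    // Free memory shrank after the query; re-query and try again.
    size = queryScratchSize();
    err = cudaMalloc(&ptr, size);
  }
  return (err == cudaSuccess) ? ptr : nullptr;
}
```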
1tnguyen authored Feb 4, 2024
1 parent ad3eac4 commit ac8ce9e
Showing 3 changed files with 49 additions and 5 deletions.
@@ -54,6 +54,11 @@ void __nvqir__setCircuitSimulator(nvqir::CircuitSimulator *);

namespace {
using namespace mlir;
// Encapsulates a dynamically-loaded NVQIR simulator library
struct SimulatorHandle {
std::string name;
void *libHandle;
};

class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
int m_port = -1;
@@ -65,8 +70,18 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
std::vector<std::string> passes;
};
std::unordered_map<std::size_t, CodeTransformInfo> m_codeTransform;
// Currently-loaded NVQIR simulator.
SimulatorHandle m_simHandle;
// Default backend for initialization.
// Note: we always need to preload a default backend into the server runtime
// since the cudaq runtime relies on one being present.
static constexpr const char *DEFAULT_NVQIR_SIMULATION_BACKEND = "qpp";

public:
RemoteRestRuntimeServer()
: cudaq::RemoteRuntimeServer(),
m_simHandle(DEFAULT_NVQIR_SIMULATION_BACKEND,
loadNvqirSimLib(DEFAULT_NVQIR_SIMULATION_BACKEND)) {}
virtual void
init(const std::unordered_map<std::string, std::string> &configs) override {
const auto portIter = configs.find("port");
@@ -124,7 +139,15 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
std::string_view ir, std::string_view kernelName,
void *kernelArgs, std::uint64_t argsSize,
std::size_t seed) override {
void *handle = loadNvqirSimLib(backendSimName);

// If we're changing the backend, load the new simulator library from file.
if (m_simHandle.name != backendSimName) {
if (m_simHandle.libHandle)
dlclose(m_simHandle.libHandle);

m_simHandle =
SimulatorHandle(backendSimName, loadNvqirSimLib(backendSimName));
}
if (seed != 0)
cudaq::set_random_seed(seed);
auto &platform = cudaq::get_platform();
@@ -139,7 +162,6 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
invokeMlirKernel(m_mlirContext, ir, requestInfo.passes,
std::string(kernelName));
platform.reset_exec_ctx();
dlclose(handle);
}

private:
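For orientation, the hunks above replace the previous per-request dlopen/dlclose of the simulator library with a cached SimulatorHandle: the library stays loaded across requests, is swapped only when a different backend is requested, and a default (qpp) backend is preloaded so the runtime never finds itself without a simulator at shutdown. A rough, self-contained sketch of that swap logic is below; the body of loadNvqirSimLib (library naming, dlopen flags) is an assumption standing in for the server's real loader:

```cpp
#include <dlfcn.h>
#include <stdexcept>
#include <string>

struct SimulatorHandle {
  std::string name;
  void *libHandle;
};

// Assumed loader: dlopen the backend's NVQIR simulator library by name.
// The real server-side loader also resolves and registers the simulator
// entry point; the library naming here is illustrative only.
static void *loadNvqirSimLib(const std::string &backendSimName) {
  const std::string libName = "libnvqir-" + backendSimName + ".so";
  void *handle = dlopen(libName.c_str(), RTLD_NOW | RTLD_GLOBAL);
  if (!handle)
    throw std::runtime_error("Unable to load simulator library " + libName);
  return handle;
}

// Keep the current simulator loaded across requests; swap only on change.
void ensureBackend(SimulatorHandle &current, const std::string &requested) {
  if (current.name == requested)
    return;                      // same backend: reuse the loaded library
  if (current.libHandle)
    dlclose(current.libHandle);  // unload the previously loaded backend
  current.name = requested;
  current.libHandle = loadNvqirSimLib(requested);
}
```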
9 changes: 8 additions & 1 deletion runtime/cudaq/platform/mqpu/MultiQPUPlatform.cpp
@@ -25,7 +25,14 @@ class MultiQPUQuantumPlatform : public cudaq::quantum_platform {
m_remoteServers;

public:
~MultiQPUQuantumPlatform() = default;
~MultiQPUQuantumPlatform() {
// Make sure that we clean up the client QPUs first before cleaning up the
// remote servers.
platformQPUs.clear();
platformNumQPUs = 0;
m_remoteServers.clear();
}

MultiQPUQuantumPlatform() {
if (cudaq::registry::isRegistered<cudaq::QPU>("GPUEmulatedQPU")) {
int nDevices = cudaq::getCudaGetDeviceCount();
19 changes: 17 additions & 2 deletions runtime/nvqir/cutensornet/tensornet_utils.h
@@ -64,13 +64,28 @@ randomValues(uint64_t num_samples, double max_value,
struct ScratchDeviceMem {
void *d_scratch = nullptr;
std::size_t scratchSize = 0;
ScratchDeviceMem() {
// Compute the scratch size to allocate.
void computeScratchSize() {
// Query the free memory on Device
std::size_t freeSize{0}, totalSize{0};
HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
scratchSize = (freeSize - (freeSize % 4096)) /
2; // use half of available memory with alignment
HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
}

ScratchDeviceMem() {
computeScratchSize();
// Try to allocate device memory
auto errCode = cudaMalloc(&d_scratch, scratchSize);
if (errCode == cudaErrorMemoryAllocation) {
// This indicates a race condition whereby other GPU code allocated memory
// while we were calling cudaMemGetInfo.
// Attempt to redo the allocation with updated cudaMemGetInfo data.
computeScratchSize();
HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
} else {
HANDLE_CUDA_ERROR(errCode);
}
}
~ScratchDeviceMem() { HANDLE_CUDA_ERROR(cudaFree(d_scratch)); }
};

