Fixes for remote-mqpu platform (NVIDIA#1158)
(1) Clean shutdown behavior

- Client side: enforce the cleanup order: QPUs first, then any auto-launched
  server instances (sketched below). This prevents stray errors during
  shutdown where the server is torn down before the client, causing the
  client to report an error just a moment before it is destructed anyway.

- Server app: make sure that we always have a loaded simulator in the
  runtime to prevent a segfault during shutdown when the runtime tries
  to find one.
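
For context on the client-side ordering: C++ destroys data members in reverse declaration order (and derived-class members before anything owned by a base class), so with a defaulted destructor the auto-launched server handles can be released while the QPUs that still talk to them are alive. Below is a minimal sketch of the ordering fix, using illustrative stand-in types rather than the actual platform classes:

```cpp
#include <memory>
#include <vector>

// Illustrative stand-ins (not the actual CUDA-Q types).
struct RemoteQpu {
  ~RemoteQpu() { /* sends a final request to its server on teardown */ }
};
struct AutoLaunchedServer {
  ~AutoLaunchedServer() { /* terminates the auto-launched server process */ }
};

class MultiQpuPlatformSketch {
  std::vector<std::unique_ptr<RemoteQpu>> platformQPUs;
  std::vector<std::unique_ptr<AutoLaunchedServer>> m_remoteServers;

public:
  ~MultiQpuPlatformSketch() {
    // A defaulted destructor would destroy members in reverse declaration
    // order, i.e. m_remoteServers before platformQPUs, so a QPU could try to
    // reach a server that is already gone. Clearing explicitly pins the
    // order: QPUs first, then the servers they depend on.
    platformQPUs.clear();
    m_remoteServers.clear();
  }
};
```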

(2) Harden the tensornet scratch memory allocation against a race condition.
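
Point (2) addresses a time-of-check/time-of-use gap: another context can allocate GPU memory between the cudaMemGetInfo query and the cudaMalloc call, so the first allocation attempt may fail with cudaErrorMemoryAllocation even though the queried free size looked sufficient. A condensed sketch of that query/allocate/retry pattern follows (plain CUDA runtime calls; error handling is simplified relative to the actual change):

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// Pick a scratch size from the currently reported free memory:
// half of it, rounded down to a 4096-byte multiple.
static std::size_t queryScratchSize() {
  std::size_t freeSize = 0, totalSize = 0;
  cudaMemGetInfo(&freeSize, &totalSize);
  return (freeSize - (freeSize % 4096)) / 2;
}

// Allocate scratch memory, retrying once if another allocation raced us
// between the query and the cudaMalloc call.
static void *allocateScratch(std::size_t &size) {
  size = queryScratchSize();
  void *ptr = nullptr;
  cudaError_t err = cudaMalloc(&ptr, size);
  if (err == cudaErrorMemoryAllocation) {
    // Free memory shrank after the query; re-query and try again.
    size = queryScratchSize();
    err = cudaMalloc(&ptr, size);
  }
  return (err == cudaSuccess) ? ptr : nullptr;
}
```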
1tnguyen authored Feb 4, 2024
1 parent ad3eac4 commit ac8ce9e
Showing 3 changed files with 49 additions and 5 deletions.
@@ -54,6 +54,11 @@ void __nvqir__setCircuitSimulator(nvqir::CircuitSimulator *);

namespace {
using namespace mlir;
// Encapsulates a dynamically-loaded NVQIR simulator library
struct SimulatorHandle {
std::string name;
void *libHandle;
};

class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
int m_port = -1;
@@ -65,8 +70,18 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
std::vector<std::string> passes;
};
std::unordered_map<std::size_t, CodeTransformInfo> m_codeTransform;
// Currently-loaded NVQIR simulator.
SimulatorHandle m_simHandle;
// Default backend for initialization.
// Note: we always need to preload a default backend into the server runtime
// since the cudaq runtime relies on one being present.
static constexpr const char *DEFAULT_NVQIR_SIMULATION_BACKEND = "qpp";

public:
RemoteRestRuntimeServer()
: cudaq::RemoteRuntimeServer(),
m_simHandle(DEFAULT_NVQIR_SIMULATION_BACKEND,
loadNvqirSimLib(DEFAULT_NVQIR_SIMULATION_BACKEND)) {}
virtual void
init(const std::unordered_map<std::string, std::string> &configs) override {
const auto portIter = configs.find("port");
@@ -124,7 +139,15 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
std::string_view ir, std::string_view kernelName,
void *kernelArgs, std::uint64_t argsSize,
std::size_t seed) override {
void *handle = loadNvqirSimLib(backendSimName);

// If we're changing the backend, load the new simulator library from file.
if (m_simHandle.name != backendSimName) {
if (m_simHandle.libHandle)
dlclose(m_simHandle.libHandle);

m_simHandle =
SimulatorHandle(backendSimName, loadNvqirSimLib(backendSimName));
}
if (seed != 0)
cudaq::set_random_seed(seed);
auto &platform = cudaq::get_platform();
@@ -139,7 +162,6 @@ class RemoteRestRuntimeServer : public cudaq::RemoteRuntimeServer {
invokeMlirKernel(m_mlirContext, ir, requestInfo.passes,
std::string(kernelName));
platform.reset_exec_ctx();
dlclose(handle);
}

private:
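For orientation, the hunks above replace the previous per-request dlopen/dlclose of the simulator library with a cached SimulatorHandle: the library stays loaded across requests, is swapped only when a different backend is requested, and a default (qpp) backend is preloaded so the runtime never finds itself without a simulator at shutdown. A rough, self-contained sketch of that swap logic is below; the body of loadNvqirSimLib (library naming, dlopen flags) is an assumption standing in for the server's real loader:

```cpp
#include <dlfcn.h>
#include <stdexcept>
#include <string>

struct SimulatorHandle {
  std::string name;
  void *libHandle;
};

// Assumed loader: dlopen the backend's NVQIR simulator library by name.
// The real server-side loader also resolves and registers the simulator
// entry point; the library naming here is illustrative only.
static void *loadNvqirSimLib(const std::string &backendSimName) {
  const std::string libName = "libnvqir-" + backendSimName + ".so";
  void *handle = dlopen(libName.c_str(), RTLD_NOW | RTLD_GLOBAL);
  if (!handle)
    throw std::runtime_error("Unable to load simulator library " + libName);
  return handle;
}

// Keep the current simulator loaded across requests; swap only on change.
void ensureBackend(SimulatorHandle &current, const std::string &requested) {
  if (current.name == requested)
    return;                      // same backend: reuse the loaded library
  if (current.libHandle)
    dlclose(current.libHandle);  // unload the previously loaded backend
  current.name = requested;
  current.libHandle = loadNvqirSimLib(requested);
}
```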
9 changes: 8 additions & 1 deletion runtime/cudaq/platform/mqpu/MultiQPUPlatform.cpp
@@ -25,7 +25,14 @@ class MultiQPUQuantumPlatform : public cudaq::quantum_platform {
m_remoteServers;

public:
~MultiQPUQuantumPlatform() = default;
~MultiQPUQuantumPlatform() {
// Make sure that we clean up the client QPUs first before cleaning up the
// remote servers.
platformQPUs.clear();
platformNumQPUs = 0;
m_remoteServers.clear();
}

MultiQPUQuantumPlatform() {
if (cudaq::registry::isRegistered<cudaq::QPU>("GPUEmulatedQPU")) {
int nDevices = cudaq::getCudaGetDeviceCount();
19 changes: 17 additions & 2 deletions runtime/nvqir/cutensornet/tensornet_utils.h
@@ -64,13 +64,28 @@ randomValues(uint64_t num_samples, double max_value,
struct ScratchDeviceMem {
void *d_scratch = nullptr;
std::size_t scratchSize = 0;
ScratchDeviceMem() {
// Compute the scratch size to allocate.
void computeScratchSize() {
// Query the free memory on Device
std::size_t freeSize{0}, totalSize{0};
HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
scratchSize = (freeSize - (freeSize % 4096)) /
2; // use half of available memory with alignment
HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
}

ScratchDeviceMem() {
computeScratchSize();
// Try to allocate device memory
auto errCode = cudaMalloc(&d_scratch, scratchSize);
if (errCode == cudaErrorMemoryAllocation) {
// This indicates a race condition whereby other GPU code allocated memory
// while we were calling cudaMemGetInfo.
// Attempt to redo the allocation with updated cudaMemGetInfo data.
computeScratchSize();
HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
} else {
HANDLE_CUDA_ERROR(errCode);
}
}
~ScratchDeviceMem() { HANDLE_CUDA_ERROR(cudaFree(d_scratch)); }
};

