Refactored cp

srg-imperial · Mar 17, 2011 · cddaaba · cddaaba
1 parent 0d20c68
commit cddaaba
Show file tree

Hide file tree

Showing 4 changed files with 165 additions and 132 deletions.
diff --git a/benchmarks/cp/src/opencl/Makefile b/benchmarks/cp/src/opencl/Makefile
@@ -6,18 +6,21 @@ VERSION=opencl
 COMMON_DIR=../../../../common
 CLFAGS=
 
-all: obj/main.o obj/common.o obj/ocl-wrapper.o
-	g++ -lOpenCL obj/common.o obj/ocl-wrapper.o obj/main.o -o ../../bin/$(BENCHMARK)-$(VERSION)
+all: obj/main.o obj/cuenergy.o obj/common.o obj/ocl-wrapper.o
+	$(CXXLD) -lOpenCL $^ -o ../../bin/$(BENCHMARK)-$(VERSION)
 	cp cuenergy_pre8_coalesce.cl ../../bin/
 
 obj/main.o: main.cpp cuenergy.h
-	g++ $(CFLAGS) -I$(COMMON_DIR) -D OPENCL_PLATFORM="\"$(OPENCL_PLATFORM)\"" -D OPENCL_DEVICE_ID=$(OPENCL_DEVICE_ID) -c main.cpp -o obj/main.o
+	$(CXX) $(CXXFLAGS) -I$(COMMON_DIR) -D OPENCL_PLATFORM="\"$(OPENCL_PLATFORM)\"" -D OPENCL_DEVICE_ID=$(OPENCL_DEVICE_ID) -c main.cpp -o obj/main.o
+
+obj/cuenergy.o: cuenergy.cpp cuenergy.h
+	$(CXX) $(CXXFLAGS) -I$(COMMON_DIR) -D OPENCL_PLATFORM="\"$(OPENCL_PLATFORM)\"" -D OPENCL_DEVICE_ID=$(OPENCL_DEVICE_ID) -c cuenergy.cpp -o obj/cuenergy.o
 
 obj/common.o: $(COMMON_DIR)/common.cpp $(COMMON_DIR)/common.h
-	g++ $(CFLAGS) -c $(COMMON_DIR)/common.cpp -o obj/common.o
+	$(CXX) $(CXXFLAGS) -c $(COMMON_DIR)/common.cpp -o obj/common.o
 
 obj/ocl-wrapper.o: $(COMMON_DIR)/ocl-wrapper.cpp $(COMMON_DIR)/ocl-wrapper.h
-	g++ $(CFLAGS) -c $(COMMON_DIR)/ocl-wrapper.cpp -o obj/ocl-wrapper.o
+	$(CXX) $(CXXFLAGS) -c $(COMMON_DIR)/ocl-wrapper.cpp -o obj/ocl-wrapper.o
 
 clean:
 	rm -f obj/*

diff --git a/benchmarks/cp/src/opencl/cuenergy.cpp b/benchmarks/cp/src/opencl/cuenergy.cpp
@@ -0,0 +1,151 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2007 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+/*
+ * CUDA accelerated coulombic potential grid test code
+ *   John E. Stone <[email protected]>
+ *   http://www.ks.uiuc.edu/~johns/
+ *
+ * Coulombic potential grid calculation microbenchmark based on the time
+ * consuming portions of the 'cionize' ion placement tool.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include<algorithm>
+
+#include <CL/cl.h>
+
+#include "common.h"
+#include "ocl-wrapper.h"
+
+#include "cuenergy.h"
+
+// This function copies atoms from the CPU to the GPU and
+// precalculates (z^2) for each atom.
+
+int copyatomstoconstbuf(cl_command_queue cmdQueue, cl_mem d_atoms, const float *atoms, int count, float zplane) {
+  if (count > MAXATOMS) {
+    printf("Atom count exceeds constant buffer storage capacity\n");
+    return -1;
+  }
+
+  float atompre[4*MAXATOMS];
+  int i;
+  for (i=0; i<count*4; i+=4) {
+    atompre[i    ] = atoms[i    ];
+    atompre[i + 1] = atoms[i + 1];
+    float dz = zplane - atoms[i + 2];
+    atompre[i + 2]  = dz*dz;
+    atompre[i + 3] = atoms[i + 3];
+  }
+
+  clEnqueueWriteBuffer(cmdQueue, d_atoms, CL_TRUE, 0, count * 4 * sizeof(float), atompre, 0, NULL, NULL);
+
+  return 0;
+}
+
+
+int gpuenergy(size_t volsize[3], size_t globalWorkSize[3], size_t localWorkSize[3],
+              int atomcount, float gridspacing, const float *atoms, float *&energy) {
+  // Size of buffer on GPU
+  int volmemsz;
+
+  // initialize OpenCL context and create command queue
+  cl_int ciErr;
+  OclWrapper *ocl = new OclWrapper;
+  if (!ocl->init (OPENCL_PLATFORM, OPENCL_DEVICE_ID))
+    return 1;
+
+  // load and build kernel
+  cl_program program;
+  if (!ocl->buildProgram ("cuenergy_pre8_coalesce.cl", &program))
+    return 1;
+  cl_kernel kernel = clCreateKernel(program, "cenergy", &ciErr);
+  if (!OclWrapper::checkErr(ciErr, "create kernel")) return 1;
+
+  // allocate and initialize the GPU output array
+  volmemsz = sizeof(float) * volsize[0] * volsize[1] * volsize[2];
+
+  // profile kernel executions
+  int numIterations = (atomcount%MAXATOMS) ? (atomcount/MAXATOMS + 1) : (atomcount/MAXATOMS);
+  cl_event *evKernel = (cl_event*) malloc (sizeof(cl_event) * numIterations);
+
+#ifdef PROFILING
+  inf_timer tMain;
+  startTimer(&tMain);
+#endif
+
+  // Main computation
+  {
+    cl_mem d_output;	// Output on device
+    cl_mem d_atoms;     // memory to store constant atom data
+    int iterations=0;
+    int atomstart;
+    cl_int ciErr;
+
+    // create output array and initialize it with 0s
+    d_output = clCreateBuffer(ocl->getContext(), CL_MEM_READ_WRITE, volmemsz, NULL, &ciErr);
+    OclWrapper::checkErr(ciErr, "create output buffer");
+
+    energy = (float*)malloc(volmemsz);
+    memset(energy, 0, volmemsz);
+    clEnqueueWriteBuffer(ocl->getCmdQueue(), d_output, CL_TRUE, 0, volmemsz, energy, 0, NULL, NULL);
+
+    // create constant memory buffer for atoms
+    d_atoms = clCreateBuffer(ocl->getContext(), CL_MEM_READ_ONLY, MAXATOMS*4*sizeof(float), NULL, &ciErr);
+
+
+    // set kernel arguments
+    float gridspacing = 0.1;
+    ciErr = clSetKernelArg(kernel, 1, sizeof(float), (void*)&gridspacing);
+    ciErr = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_output);
+    ciErr = clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_atoms);
+
+    for (atomstart=0; atomstart<atomcount; atomstart+=MAXATOMS) {   
+      int atomsremaining = atomcount - atomstart;
+      int runatoms = (atomsremaining > MAXATOMS) ? MAXATOMS : atomsremaining;
+      iterations++;
+
+      // copy the atoms to the GPU
+      if (copyatomstoconstbuf(ocl->getCmdQueue(), d_atoms, atoms + 4*atomstart, runatoms, 0*gridspacing)) 
+        return -1;
+
+      //if (parameters->synchronizeGpu) cudaThreadSynchronize();
+
+      // set dynamic kernel arguments
+      ciErr = clSetKernelArg(kernel, 0, sizeof(int), (void*)&runatoms);
+
+      // RUN the kernel...
+      ciErr = clEnqueueNDRangeKernel (ocl->getCmdQueue(), kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &evKernel[iterations-1]);
+
+    }
+#if 0
+    printf("Done\n");
+#endif
+
+    // Copy the GPU output data back to the host
+    ciErr = clEnqueueReadBuffer(ocl->getCmdQueue(), d_output, CL_TRUE, 0, volmemsz, energy, 0, NULL, NULL);
+
+    clReleaseMemObject(d_output);
+    clReleaseMemObject(d_atoms);
+  }
+
+#ifdef PROFILING
+  stopTimer(&tMain);
+  printf ("main computation: %fms\n", elapsedTime(tMain));
+#endif
+
+  clReleaseKernel(kernel);
+  clReleaseProgram(program);
+  delete ocl;
+
+  free(evKernel);
+
+  return 0;
+}
diff --git a/benchmarks/cp/src/opencl/cuenergy.h b/benchmarks/cp/src/opencl/cuenergy.h
@@ -30,6 +30,5 @@
 #define VOLSIZEY 512
 #define ATOMCOUNT 40000
 
-int copyatomstoconstbuf(float *atoms,
-			int count,
-			float zplane);
+int gpuenergy(size_t volsize[3], size_t globalWorkSize[3], size_t localWorkSize[3],
+              int atomcount, float gridspacing, const float *atoms, float *&energy);
diff --git a/benchmarks/cp/src/opencl/main.cpp b/benchmarks/cp/src/opencl/main.cpp
@@ -19,10 +19,7 @@
 #include <string.h>
 #include<algorithm>
 
-#include <CL/cl.h>
-
 #include "common.h"
-#include "ocl-wrapper.h"
 
 #include "cuenergy.h"
 
@@ -101,29 +98,6 @@ writeenergy(char *filename, float *energy, size_t* volsize)
 
   return 0;
 }
-// This function copies atoms from the CPU to the GPU and
-// precalculates (z^2) for each atom.
-
-int copyatomstoconstbuf(cl_command_queue cmdQueue, cl_mem d_atoms, float *atoms, int count, float zplane) {
-  if (count > MAXATOMS) {
-    printf("Atom count exceeds constant buffer storage capacity\n");
-    return -1;
-  }
-
-  float atompre[4*MAXATOMS];
-  int i;
-  for (i=0; i<count*4; i+=4) {
-    atompre[i    ] = atoms[i    ];
-    atompre[i + 1] = atoms[i + 1];
-    float dz = zplane - atoms[i + 2];
-    atompre[i + 2]  = dz*dz;
-    atompre[i + 3] = atoms[i + 3];
-  }
-
-  clEnqueueWriteBuffer(cmdQueue, d_atoms, CL_TRUE, 0, count * 4 * sizeof(float), atompre, 0, NULL, NULL);
-
-  return 0;
-}
 
 
 int main(int argc, char** argv) {
@@ -147,26 +121,10 @@ int main(int argc, char** argv) {
   // voxel spacing
   const float gridspacing = 0.1;
 
-  // Size of buffer on GPU
-  int volmemsz;
-
   //printf("CUDA accelerated coulombic potential microbenchmark\n");
   //printf("Original version by John E. Stone <[email protected]>\n");
   //printf("This version maintained by Chris Rodrigues\n");
 
-  // initialize OpenCL context and create command queue
-  cl_int ciErr;
-  OclWrapper *ocl = new OclWrapper;
-  if (!ocl->init (OPENCL_PLATFORM, OPENCL_DEVICE_ID))
-    return 1;
-
-  // load and build kernel
-  cl_program program;
-  if (!ocl->buildProgram ("cuenergy_pre8_coalesce.cl", &program))
-    return 1;
-  cl_kernel kernel = clCreateKernel(program, "cenergy", &ciErr);
-  if (!OclWrapper::checkErr(ciErr, "create kernel")) return 1;
-
   // setup CUDA grid and block sizes (default)
   localWorkSize[0] = BLOCKSIZEX;		// each thread does multiple Xs
   localWorkSize[1] = BLOCKSIZEY;
@@ -183,84 +141,8 @@ int main(int argc, char** argv) {
   if (initatoms(&atoms, atomcount, volsize, gridspacing))
     return -1;
 
-  // allocate and initialize the GPU output array
-  volmemsz = sizeof(float) * volsize[0] * volsize[1] * volsize[2];
-
-  // profile kernel executions
-  int numIterations = (atomcount%MAXATOMS) ? (atomcount/MAXATOMS + 1) : (atomcount/MAXATOMS);
-  cl_event *evKernel = (cl_event*) malloc (sizeof(cl_event) * numIterations);
-
-#ifdef PROFILING
-  inf_timer tMain;
-  startTimer(&tMain);
-#endif
-
-  // Main computation
-  {
-    cl_mem d_output;	// Output on device
-    cl_mem d_atoms;     // memory to store constant atom data
-    int iterations=0;
-    int atomstart;
-    cl_int ciErr;
-
-    // create output array and initialize it with 0s
-    d_output = clCreateBuffer(ocl->getContext(), CL_MEM_READ_WRITE, volmemsz, NULL, &ciErr);
-    OclWrapper::checkErr(ciErr, "create output buffer");
-
-    energy = (float*)malloc(volmemsz);
-    memset(energy, 0, volmemsz);
-    clEnqueueWriteBuffer(ocl->getCmdQueue(), d_output, CL_TRUE, 0, volmemsz, energy, 0, NULL, NULL);
-
-    // create constant memory buffer for atoms
-    d_atoms = clCreateBuffer(ocl->getContext(), CL_MEM_READ_ONLY, MAXATOMS*4*sizeof(float), NULL, &ciErr);
-
-
-    // set kernel arguments
-    float gridspacing = 0.1;
-    ciErr = clSetKernelArg(kernel, 1, sizeof(float), (void*)&gridspacing);
-    ciErr = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_output);
-    ciErr = clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_atoms);
-
-    for (atomstart=0; atomstart<atomcount; atomstart+=MAXATOMS) {   
-      int atomsremaining = atomcount - atomstart;
-      int runatoms = (atomsremaining > MAXATOMS) ? MAXATOMS : atomsremaining;
-      iterations++;
-
-      // copy the atoms to the GPU
-      if (copyatomstoconstbuf(ocl->getCmdQueue(), d_atoms, atoms + 4*atomstart, runatoms, 0*gridspacing)) 
-        return -1;
-
-      //if (parameters->synchronizeGpu) cudaThreadSynchronize();
-
-      // set dynamic kernel arguments
-      ciErr = clSetKernelArg(kernel, 0, sizeof(int), (void*)&runatoms);
-
-      // RUN the kernel...
-      ciErr = clEnqueueNDRangeKernel (ocl->getCmdQueue(), kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &evKernel[iterations-1]);
-
-    }
-#if 0
-    printf("Done\n");
-#endif
-
-    // Copy the GPU output data back to the host
-    ciErr = clEnqueueReadBuffer(ocl->getCmdQueue(), d_output, CL_TRUE, 0, volmemsz, energy, 0, NULL, NULL);
-
-    clReleaseMemObject(d_output);
-    clReleaseMemObject(d_atoms);
-  }
-
-#ifdef PROFILING
-  stopTimer(&tMain);
-  printf ("main computation: %fms\n", elapsedTime(tMain));
-#endif
-
-  free(atoms);
-  free(energy);
-
-  clReleaseKernel(kernel);
-  clReleaseProgram(program);
-  delete ocl;
+  if (gpuenergy(volsize, globalWorkSize, localWorkSize, atomcount, gridspacing, atoms, energy))
+    return -1;
 
 #ifdef PROFILING
   // output kernel runtimes
@@ -279,10 +161,8 @@ int main(int argc, char** argv) {
   printf ("kernel avg: %fms\n", avg/numIterations);
 #endif
 
-  free(evKernel);
+  free(atoms);
+  free(energy);
 
   return 0;
 }
-
-
-