Skip to content

Commit

Permalink
Refactored cp
Browse files Browse the repository at this point in the history
  • Loading branch information
Peter Collingbourne committed Mar 17, 2011
1 parent 0d20c68 commit cddaaba
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 132 deletions.
13 changes: 8 additions & 5 deletions benchmarks/cp/src/opencl/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,21 @@ VERSION=opencl
COMMON_DIR=../../../../common
CLFAGS=

all: obj/main.o obj/common.o obj/ocl-wrapper.o
g++ -lOpenCL obj/common.o obj/ocl-wrapper.o obj/main.o -o ../../bin/$(BENCHMARK)-$(VERSION)
all: obj/main.o obj/cuenergy.o obj/common.o obj/ocl-wrapper.o
$(CXXLD) -lOpenCL $^ -o ../../bin/$(BENCHMARK)-$(VERSION)
cp cuenergy_pre8_coalesce.cl ../../bin/

obj/main.o: main.cpp cuenergy.h
g++ $(CFLAGS) -I$(COMMON_DIR) -D OPENCL_PLATFORM="\"$(OPENCL_PLATFORM)\"" -D OPENCL_DEVICE_ID=$(OPENCL_DEVICE_ID) -c main.cpp -o obj/main.o
$(CXX) $(CXXFLAGS) -I$(COMMON_DIR) -D OPENCL_PLATFORM="\"$(OPENCL_PLATFORM)\"" -D OPENCL_DEVICE_ID=$(OPENCL_DEVICE_ID) -c main.cpp -o obj/main.o

obj/cuenergy.o: cuenergy.cpp cuenergy.h
$(CXX) $(CXXFLAGS) -I$(COMMON_DIR) -D OPENCL_PLATFORM="\"$(OPENCL_PLATFORM)\"" -D OPENCL_DEVICE_ID=$(OPENCL_DEVICE_ID) -c cuenergy.cpp -o obj/cuenergy.o

obj/common.o: $(COMMON_DIR)/common.cpp $(COMMON_DIR)/common.h
g++ $(CFLAGS) -c $(COMMON_DIR)/common.cpp -o obj/common.o
$(CXX) $(CXXFLAGS) -c $(COMMON_DIR)/common.cpp -o obj/common.o

obj/ocl-wrapper.o: $(COMMON_DIR)/ocl-wrapper.cpp $(COMMON_DIR)/ocl-wrapper.h
g++ $(CFLAGS) -c $(COMMON_DIR)/ocl-wrapper.cpp -o obj/ocl-wrapper.o
$(CXX) $(CXXFLAGS) -c $(COMMON_DIR)/ocl-wrapper.cpp -o obj/ocl-wrapper.o

clean:
rm -f obj/*
Expand Down
151 changes: 151 additions & 0 deletions benchmarks/cp/src/opencl/cuenergy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/***************************************************************************
*cr
*cr (C) Copyright 2007 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* CUDA accelerated coulombic potential grid test code
* John E. Stone <[email protected]>
* http://www.ks.uiuc.edu/~johns/
*
* Coulombic potential grid calculation microbenchmark based on the time
* consuming portions of the 'cionize' ion placement tool.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include<algorithm>

#include <CL/cl.h>

#include "common.h"
#include "ocl-wrapper.h"

#include "cuenergy.h"

// This function copies atoms from the CPU to the GPU and
// precalculates (z^2) for each atom.

int copyatomstoconstbuf(cl_command_queue cmdQueue, cl_mem d_atoms, const float *atoms, int count, float zplane) {
if (count > MAXATOMS) {
printf("Atom count exceeds constant buffer storage capacity\n");
return -1;
}

float atompre[4*MAXATOMS];
int i;
for (i=0; i<count*4; i+=4) {
atompre[i ] = atoms[i ];
atompre[i + 1] = atoms[i + 1];
float dz = zplane - atoms[i + 2];
atompre[i + 2] = dz*dz;
atompre[i + 3] = atoms[i + 3];
}

clEnqueueWriteBuffer(cmdQueue, d_atoms, CL_TRUE, 0, count * 4 * sizeof(float), atompre, 0, NULL, NULL);

return 0;
}


int gpuenergy(size_t volsize[3], size_t globalWorkSize[3], size_t localWorkSize[3],
int atomcount, float gridspacing, const float *atoms, float *&energy) {
// Size of buffer on GPU
int volmemsz;

// initialize OpenCL context and create command queue
cl_int ciErr;
OclWrapper *ocl = new OclWrapper;
if (!ocl->init (OPENCL_PLATFORM, OPENCL_DEVICE_ID))
return 1;

// load and build kernel
cl_program program;
if (!ocl->buildProgram ("cuenergy_pre8_coalesce.cl", &program))
return 1;
cl_kernel kernel = clCreateKernel(program, "cenergy", &ciErr);
if (!OclWrapper::checkErr(ciErr, "create kernel")) return 1;

// allocate and initialize the GPU output array
volmemsz = sizeof(float) * volsize[0] * volsize[1] * volsize[2];

// profile kernel executions
int numIterations = (atomcount%MAXATOMS) ? (atomcount/MAXATOMS + 1) : (atomcount/MAXATOMS);
cl_event *evKernel = (cl_event*) malloc (sizeof(cl_event) * numIterations);

#ifdef PROFILING
inf_timer tMain;
startTimer(&tMain);
#endif

// Main computation
{
cl_mem d_output; // Output on device
cl_mem d_atoms; // memory to store constant atom data
int iterations=0;
int atomstart;
cl_int ciErr;

// create output array and initialize it with 0s
d_output = clCreateBuffer(ocl->getContext(), CL_MEM_READ_WRITE, volmemsz, NULL, &ciErr);
OclWrapper::checkErr(ciErr, "create output buffer");

energy = (float*)malloc(volmemsz);
memset(energy, 0, volmemsz);
clEnqueueWriteBuffer(ocl->getCmdQueue(), d_output, CL_TRUE, 0, volmemsz, energy, 0, NULL, NULL);

// create constant memory buffer for atoms
d_atoms = clCreateBuffer(ocl->getContext(), CL_MEM_READ_ONLY, MAXATOMS*4*sizeof(float), NULL, &ciErr);


// set kernel arguments
float gridspacing = 0.1;
ciErr = clSetKernelArg(kernel, 1, sizeof(float), (void*)&gridspacing);
ciErr = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_output);
ciErr = clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_atoms);

for (atomstart=0; atomstart<atomcount; atomstart+=MAXATOMS) {
int atomsremaining = atomcount - atomstart;
int runatoms = (atomsremaining > MAXATOMS) ? MAXATOMS : atomsremaining;
iterations++;

// copy the atoms to the GPU
if (copyatomstoconstbuf(ocl->getCmdQueue(), d_atoms, atoms + 4*atomstart, runatoms, 0*gridspacing))
return -1;

//if (parameters->synchronizeGpu) cudaThreadSynchronize();

// set dynamic kernel arguments
ciErr = clSetKernelArg(kernel, 0, sizeof(int), (void*)&runatoms);

// RUN the kernel...
ciErr = clEnqueueNDRangeKernel (ocl->getCmdQueue(), kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &evKernel[iterations-1]);

}
#if 0
printf("Done\n");
#endif

// Copy the GPU output data back to the host
ciErr = clEnqueueReadBuffer(ocl->getCmdQueue(), d_output, CL_TRUE, 0, volmemsz, energy, 0, NULL, NULL);

clReleaseMemObject(d_output);
clReleaseMemObject(d_atoms);
}

#ifdef PROFILING
stopTimer(&tMain);
printf ("main computation: %fms\n", elapsedTime(tMain));
#endif

clReleaseKernel(kernel);
clReleaseProgram(program);
delete ocl;

free(evKernel);

return 0;
}
5 changes: 2 additions & 3 deletions benchmarks/cp/src/opencl/cuenergy.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,5 @@
#define VOLSIZEY 512
#define ATOMCOUNT 40000

int copyatomstoconstbuf(float *atoms,
int count,
float zplane);
int gpuenergy(size_t volsize[3], size_t globalWorkSize[3], size_t localWorkSize[3],
int atomcount, float gridspacing, const float *atoms, float *&energy);
128 changes: 4 additions & 124 deletions benchmarks/cp/src/opencl/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,7 @@
#include <string.h>
#include<algorithm>

#include <CL/cl.h>

#include "common.h"
#include "ocl-wrapper.h"

#include "cuenergy.h"

Expand Down Expand Up @@ -101,29 +98,6 @@ writeenergy(char *filename, float *energy, size_t* volsize)

return 0;
}
// This function copies atoms from the CPU to the GPU and
// precalculates (z^2) for each atom.

int copyatomstoconstbuf(cl_command_queue cmdQueue, cl_mem d_atoms, float *atoms, int count, float zplane) {
if (count > MAXATOMS) {
printf("Atom count exceeds constant buffer storage capacity\n");
return -1;
}

float atompre[4*MAXATOMS];
int i;
for (i=0; i<count*4; i+=4) {
atompre[i ] = atoms[i ];
atompre[i + 1] = atoms[i + 1];
float dz = zplane - atoms[i + 2];
atompre[i + 2] = dz*dz;
atompre[i + 3] = atoms[i + 3];
}

clEnqueueWriteBuffer(cmdQueue, d_atoms, CL_TRUE, 0, count * 4 * sizeof(float), atompre, 0, NULL, NULL);

return 0;
}


int main(int argc, char** argv) {
Expand All @@ -147,26 +121,10 @@ int main(int argc, char** argv) {
// voxel spacing
const float gridspacing = 0.1;

// Size of buffer on GPU
int volmemsz;

//printf("CUDA accelerated coulombic potential microbenchmark\n");
//printf("Original version by John E. Stone <[email protected]>\n");
//printf("This version maintained by Chris Rodrigues\n");

// initialize OpenCL context and create command queue
cl_int ciErr;
OclWrapper *ocl = new OclWrapper;
if (!ocl->init (OPENCL_PLATFORM, OPENCL_DEVICE_ID))
return 1;

// load and build kernel
cl_program program;
if (!ocl->buildProgram ("cuenergy_pre8_coalesce.cl", &program))
return 1;
cl_kernel kernel = clCreateKernel(program, "cenergy", &ciErr);
if (!OclWrapper::checkErr(ciErr, "create kernel")) return 1;

// setup CUDA grid and block sizes (default)
localWorkSize[0] = BLOCKSIZEX; // each thread does multiple Xs
localWorkSize[1] = BLOCKSIZEY;
Expand All @@ -183,84 +141,8 @@ int main(int argc, char** argv) {
if (initatoms(&atoms, atomcount, volsize, gridspacing))
return -1;

// allocate and initialize the GPU output array
volmemsz = sizeof(float) * volsize[0] * volsize[1] * volsize[2];

// profile kernel executions
int numIterations = (atomcount%MAXATOMS) ? (atomcount/MAXATOMS + 1) : (atomcount/MAXATOMS);
cl_event *evKernel = (cl_event*) malloc (sizeof(cl_event) * numIterations);

#ifdef PROFILING
inf_timer tMain;
startTimer(&tMain);
#endif

// Main computation
{
cl_mem d_output; // Output on device
cl_mem d_atoms; // memory to store constant atom data
int iterations=0;
int atomstart;
cl_int ciErr;

// create output array and initialize it with 0s
d_output = clCreateBuffer(ocl->getContext(), CL_MEM_READ_WRITE, volmemsz, NULL, &ciErr);
OclWrapper::checkErr(ciErr, "create output buffer");

energy = (float*)malloc(volmemsz);
memset(energy, 0, volmemsz);
clEnqueueWriteBuffer(ocl->getCmdQueue(), d_output, CL_TRUE, 0, volmemsz, energy, 0, NULL, NULL);

// create constant memory buffer for atoms
d_atoms = clCreateBuffer(ocl->getContext(), CL_MEM_READ_ONLY, MAXATOMS*4*sizeof(float), NULL, &ciErr);


// set kernel arguments
float gridspacing = 0.1;
ciErr = clSetKernelArg(kernel, 1, sizeof(float), (void*)&gridspacing);
ciErr = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_output);
ciErr = clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_atoms);

for (atomstart=0; atomstart<atomcount; atomstart+=MAXATOMS) {
int atomsremaining = atomcount - atomstart;
int runatoms = (atomsremaining > MAXATOMS) ? MAXATOMS : atomsremaining;
iterations++;

// copy the atoms to the GPU
if (copyatomstoconstbuf(ocl->getCmdQueue(), d_atoms, atoms + 4*atomstart, runatoms, 0*gridspacing))
return -1;

//if (parameters->synchronizeGpu) cudaThreadSynchronize();

// set dynamic kernel arguments
ciErr = clSetKernelArg(kernel, 0, sizeof(int), (void*)&runatoms);

// RUN the kernel...
ciErr = clEnqueueNDRangeKernel (ocl->getCmdQueue(), kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &evKernel[iterations-1]);

}
#if 0
printf("Done\n");
#endif

// Copy the GPU output data back to the host
ciErr = clEnqueueReadBuffer(ocl->getCmdQueue(), d_output, CL_TRUE, 0, volmemsz, energy, 0, NULL, NULL);

clReleaseMemObject(d_output);
clReleaseMemObject(d_atoms);
}

#ifdef PROFILING
stopTimer(&tMain);
printf ("main computation: %fms\n", elapsedTime(tMain));
#endif

free(atoms);
free(energy);

clReleaseKernel(kernel);
clReleaseProgram(program);
delete ocl;
if (gpuenergy(volsize, globalWorkSize, localWorkSize, atomcount, gridspacing, atoms, energy))
return -1;

#ifdef PROFILING
// output kernel runtimes
Expand All @@ -279,10 +161,8 @@ int main(int argc, char** argv) {
printf ("kernel avg: %fms\n", avg/numIterations);
#endif

free(evKernel);
free(atoms);
free(energy);

return 0;
}



0 comments on commit cddaaba

Please sign in to comment.