Java source code examples: jcuda.driver.CUdeviceptr
Example 1
/**
* Returns whether the given pointers refer to the same memory address.<br>
* <br>
* <b>NOTE:</b><br>
* <br>
* This method does NOT implement a general way for comparing arbitrary
* pointers. The concept of equality of pointers is subtle, and by
* default NOT implemented in the pointer classes. This method is
* SOLELY intended for the test cases in which it is used.
*
* @param p0 The first pointer
* @param p1 The second pointer
* @return Whether the pointers are equal
*/
static boolean equal(CUdeviceptr p0, CUdeviceptr p1)
{
class TestCUdeviceptr extends CUdeviceptr
{
TestCUdeviceptr(CUdeviceptr other)
{
super(other);
}
@Override
public long getNativePointer()
{
return super.getNativePointer();
}
}
TestCUdeviceptr tp0 = new TestCUdeviceptr(p0);
TestCUdeviceptr tp1 = new TestCUdeviceptr(p1);
return tp0.getNativePointer() == tp1.getNativePointer();
}
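A brief usage sketch for this helper (hypothetical test code, not part of the original suite; it assumes a current CUDA context and the usual static imports of JCudaDriver and JUnit): a handle compared with itself is equal, while two separately allocated handles are not.
CUdeviceptr a = new CUdeviceptr();
CUdeviceptr b = new CUdeviceptr();
cuMemAlloc(a, 4 * Sizeof.FLOAT);
cuMemAlloc(b, 4 * Sizeof.FLOAT);
// The same allocation refers to the same native address
assertTrue(equal(a, a));
// Two distinct allocations have different native addresses
assertFalse(equal(a, b));
cuMemFree(a);
cuMemFree(b);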
Example 2
/**
* Create device data containing the given float value, the given number
* of times
*
* @param numElements The number of elements
* @param value The value of the elements
* @return The pointer to the data
*/
private static CUdeviceptr createDeviceData(int numElements, float value)
{
float hostData[] = new float[numElements];
for (int i = 0; i < numElements; i++)
{
hostData[i] = value;
}
CUdeviceptr deviceData = new CUdeviceptr();
cuMemAlloc(deviceData, numElements * Sizeof.FLOAT);
cuMemcpyHtoD(deviceData, Pointer.to(hostData),
numElements * Sizeof.FLOAT);
return deviceData;
}
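A complementary helper, shown only as a minimal sketch (it is not part of the original sample): it copies device data such as that created above back into a Java array and releases the allocation, assuming the usual static imports of JCudaDriver.
private static float[] readAndFreeDeviceData(CUdeviceptr deviceData, int numElements)
{
    // Copy the device data back to the host
    float hostData[] = new float[numElements];
    cuMemcpyDtoH(Pointer.to(hostData), deviceData, numElements * Sizeof.FLOAT);
    // Release the device allocation
    cuMemFree(deviceData);
    return hostData;
}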
Example 3
/**
* Initialize the context, module, function and other elements used
* in this sample
*/
private static void init()
{
// Initialize the driver API and create a context for the first device
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
context = new CUcontext();
cuCtxCreate(context, 0, device);
// Create the PTX file by calling the NVCC
String ptxFileName = JCudaSamplesUtils.preparePtxFile(
"src/main/resources/kernels/JCudaReductionKernel.cu");
// Load the module from the PTX file
module = new CUmodule();
cuModuleLoad(module, ptxFileName);
// Obtain a function pointer to the "reduce" function.
function = new CUfunction();
cuModuleGetFunction(function, module, "reduce");
// Allocate a chunk of temporary memory (must be at least
// numberOfBlocks * Sizeof.FLOAT)
deviceBuffer = new CUdeviceptr();
cuMemAlloc(deviceBuffer, 1024 * Sizeof.FLOAT);
}
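A plausible counterpart to the init() method above, sketched here under the assumption that context, module and deviceBuffer are the fields initialized by init(); the shutdown() of the actual sample may differ.
private static void shutdown()
{
    // Release the temporary device buffer
    cuMemFree(deviceBuffer);
    // Unload the module and destroy the context
    cuModuleUnload(module);
    cuCtxDestroy(context);
}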
Example 4
public static CUdeviceptr allocateInput(float[] input) {
int typeSize = Sizeof.FLOAT;
Pointer ptr = Pointer.to(input);
int size = input.length;
CUdeviceptr dInput = new CUdeviceptr();
cuMemAlloc(dInput, size * Sizeof.FLOAT);
cuMemcpyHtoD(dInput, ptr, size * typeSize);
return dInput;
}
Example 5
public void startup(float[][] whiteTemplates, float[][] blackTemplates, int[] templateNumIndices, int[] templateIndicesOffsets, int minTemplateWidth, int maxTemplateWidth, int maxSequenceLength, int totalTemplateNumIndices) {
this.whiteTemplates = whiteTemplates;
this.blackTemplates = blackTemplates;
this.templateNumIndices = templateNumIndices;
this.templateIndicesOffsets = templateIndicesOffsets;
this.maxTemplateWidth = maxTemplateWidth;
this.minTemplateWidth = minTemplateWidth;
this.totalTemplateNumIndices = totalTemplateNumIndices;
int numTemplateWidths = (maxTemplateWidth-minTemplateWidth)+1;
int extendedMaxSeqLength = (BLOCK_SIZE_X*ROLL_X) * (int) Math.ceil(((double) maxSequenceLength) / (BLOCK_SIZE_X*ROLL_X));
this.d_Ow = new CUdeviceptr();
cuMemAlloc(d_Ow, (extendedMaxSeqLength+maxTemplateWidth-1)*CharacterTemplate.LINE_HEIGHT * Sizeof.FLOAT);
this.d_Ob = new CUdeviceptr();
cuMemAlloc(d_Ob, (extendedMaxSeqLength+maxTemplateWidth-1)*CharacterTemplate.LINE_HEIGHT * Sizeof.FLOAT);
this.d_scores = new CUdeviceptr();
cuMemAlloc(d_scores, maxSequenceLength*totalTemplateNumIndices * Sizeof.FLOAT);
this.d_Tw = new CUdeviceptr[numTemplateWidths];
this.d_Tb = new CUdeviceptr[numTemplateWidths];
for (int tw=minTemplateWidth; tw<=maxTemplateWidth; ++tw) {
if (templateNumIndices[tw-minTemplateWidth] > 0) {
d_Tw[tw-minTemplateWidth] = new CUdeviceptr();
cuMemAlloc(d_Tw[tw-minTemplateWidth], whiteTemplates[tw-minTemplateWidth].length * Sizeof.FLOAT);
cuMemcpyHtoD(d_Tw[tw-minTemplateWidth], Pointer.to(whiteTemplates[tw-minTemplateWidth]), whiteTemplates[tw-minTemplateWidth].length * Sizeof.FLOAT);
d_Tb[tw-minTemplateWidth] = new CUdeviceptr();
cuMemAlloc(d_Tb[tw-minTemplateWidth], blackTemplates[tw-minTemplateWidth].length * Sizeof.FLOAT);
cuMemcpyHtoD(d_Tb[tw-minTemplateWidth], Pointer.to(blackTemplates[tw-minTemplateWidth]), blackTemplates[tw-minTemplateWidth].length * Sizeof.FLOAT);
}
}
}
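A matching cleanup method is not shown in this example; a minimal sketch (hypothetical, using the same fields as the startup method above) would free the buffers and guard the per-width template pointers, which are only allocated when the corresponding template count is positive.
public void shutdown() {
    cuMemFree(d_Ow);
    cuMemFree(d_Ob);
    cuMemFree(d_scores);
    for (int i = 0; i < d_Tw.length; ++i) {
        // Entries are only allocated when templateNumIndices[i] > 0
        if (d_Tw[i] != null) cuMemFree(d_Tw[i]);
        if (d_Tb[i] != null) cuMemFree(d_Tb[i]);
    }
}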
Example 6
public static CUdeviceptr allocateInput(float[] input) {
int typeSize = Sizeof.FLOAT;
Pointer ptr = Pointer.to(input);
int size = input.length;
CUdeviceptr dInput = new CUdeviceptr();
cuMemAlloc(dInput, size * Sizeof.FLOAT);
cuMemcpyHtoD(dInput, ptr, size * typeSize);
return dInput;
}
Example 7
public static void main(String[] args)
{
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Afterwards, initialize the vector library, which will
// attach to the current context
VecFloat.init();
// Allocate and fill the host input data
int n = 50000;
float hostX[] = new float[n];
float hostY[] = new float[n];
for(int i = 0; i < n; i++)
{
hostX[i] = (float)i;
hostY[i] = (float)i;
}
// Allocate the device pointers, and copy the
// host input data to the device
CUdeviceptr deviceX = new CUdeviceptr();
cuMemAlloc(deviceX, n * Sizeof.FLOAT);
cuMemcpyHtoD(deviceX, Pointer.to(hostX), n * Sizeof.FLOAT);
CUdeviceptr deviceY = new CUdeviceptr();
cuMemAlloc(deviceY, n * Sizeof.FLOAT);
cuMemcpyHtoD(deviceY, Pointer.to(hostY), n * Sizeof.FLOAT);
CUdeviceptr deviceResult = new CUdeviceptr();
cuMemAlloc(deviceResult, n * Sizeof.FLOAT);
// Perform the vector operations
VecFloat.cos(n, deviceX, deviceX); // x = cos(x)
VecFloat.mul(n, deviceX, deviceX, deviceX); // x = x*x
VecFloat.sin(n, deviceY, deviceY); // y = sin(y)
VecFloat.mul(n, deviceY, deviceY, deviceY); // y = y*y
VecFloat.add(n, deviceResult, deviceX, deviceY); // result = x+y
// Allocate host output memory and copy the device output
// to the host.
float hostResult[] = new float[n];
cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, n * Sizeof.FLOAT);
// Verify the result
boolean passed = true;
for(int i = 0; i < n; i++)
{
float expected = (float)(
Math.cos(hostX[i])*Math.cos(hostX[i])+
Math.sin(hostY[i])*Math.sin(hostY[i]));
if (Math.abs(hostResult[i] - expected) > 1e-5)
{
System.out.println(
"At index "+i+ " found "+hostResult[i]+
" but expected "+expected);
passed = false;
break;
}
}
System.out.println("Test "+(passed?"PASSED":"FAILED"));
// Clean up.
cuMemFree(deviceX);
cuMemFree(deviceY);
cuMemFree(deviceResult);
VecFloat.shutdown();
}
Example 8
public static void main(String[] args)
{
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Afterwards, initialize the vector library, which will
// attach to the current context
VecDouble.init();
// Allocate and fill the host input data
int n = 50000;
double hostX[] = new double[n];
double hostY[] = new double[n];
for(int i = 0; i < n; i++)
{
hostX[i] = (double)i;
hostY[i] = (double)i;
}
// Allocate the device pointers, and copy the
// host input data to the device
CUdeviceptr deviceX = new CUdeviceptr();
cuMemAlloc(deviceX, n * Sizeof.DOUBLE);
cuMemcpyHtoD(deviceX, Pointer.to(hostX), n * Sizeof.DOUBLE);
CUdeviceptr deviceY = new CUdeviceptr();
cuMemAlloc(deviceY, n * Sizeof.DOUBLE);
cuMemcpyHtoD(deviceY, Pointer.to(hostY), n * Sizeof.DOUBLE);
CUdeviceptr deviceResult = new CUdeviceptr();
cuMemAlloc(deviceResult, n * Sizeof.DOUBLE);
// Perform the vector operations
VecDouble.cos(n, deviceX, deviceX); // x = cos(x)
VecDouble.mul(n, deviceX, deviceX, deviceX); // x = x*x
VecDouble.sin(n, deviceY, deviceY); // y = sin(y)
VecDouble.mul(n, deviceY, deviceY, deviceY); // y = y*y
VecDouble.add(n, deviceResult, deviceX, deviceY); // result = x+y
// Allocate host output memory and copy the device output
// to the host.
double hostResult[] = new double[n];
cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, n * Sizeof.DOUBLE);
// Verify the result
boolean passed = true;
for(int i = 0; i < n; i++)
{
double expected =
Math.cos(hostX[i])*Math.cos(hostX[i])+
Math.sin(hostY[i])*Math.sin(hostY[i]);
if (Math.abs(hostResult[i] - expected) > 1e-14)
{
System.out.println(
"At index "+i+ " found "+hostResult[i]+
" but expected "+expected);
passed = false;
break;
}
}
System.out.println("Test "+(passed?"PASSED":"FAILED"));
// Clean up.
cuMemFree(deviceX);
cuMemFree(deviceY);
cuMemFree(deviceResult);
VecDouble.shutdown();
}
Example 9
/**
* Call the kernel function, rendering the 3D volume data image
* into the PBO
*/
private void render()
{
// Map the PBO to get a CUDA device pointer
CUdeviceptr d_output = new CUdeviceptr();
cuGraphicsMapResources(
1, new CUgraphicsResource[]{ pboGraphicsResource }, null);
cuGraphicsResourceGetMappedPointer(
d_output, new long[1], pboGraphicsResource);
cuMemsetD32(d_output, 0, width * height);
// Set up the execution parameters for the kernel:
// - One pointer for the output that is mapped to the PBO
// - Two ints for the width and height of the image to render
// - Four floats for the visualization parameters of the renderer
Pointer dOut = Pointer.to(d_output);
Pointer pWidth = Pointer.to(new int[]{width});
Pointer pHeight = Pointer.to(new int[]{height});
Pointer pDensity = Pointer.to(new float[]{density});
Pointer pBrightness = Pointer.to(new float[]{brightness});
Pointer pTransferOffset = Pointer.to(new float[]{transferOffset});
Pointer pTransferScale = Pointer.to(new float[]{transferScale});
Pointer kernelParameters = Pointer.to(
dOut,
pWidth,
pHeight,
pDensity,
pBrightness,
pTransferOffset,
pTransferScale
);
// Call the kernel function.
cuLaunchKernel(function,
gridSize.x, gridSize.y, gridSize.z,
blockSize.x, blockSize.y, blockSize.z,
0, null, kernelParameters, null);
cuCtxSynchronize();
// Unmap buffer object
cuGraphicsUnmapResources(
1, new CUgraphicsResource[]{pboGraphicsResource}, null);
}
Example 10
/**
* Run the CUDA computation to create new vertex positions
* inside the vertexBufferObject.
*/
private void runCuda()
{
// Map the vertexBufferObject for writing from CUDA.
// The basePointer will afterwards point to the
// beginning of the memory area of the VBO.
CUdeviceptr basePointer = new CUdeviceptr();
cuGraphicsMapResources(
1, new CUgraphicsResource[]{vboGraphicsResource}, null);
cuGraphicsResourceGetMappedPointer(
basePointer, new long[1], vboGraphicsResource);
// Set up the kernel parameters: A pointer to an array
// of pointers which point to the actual values. One
// pointer to the base pointer of the geometry data,
// one int for the mesh width, one int for the mesh
// height, and one float for the current animation state.
Pointer kernelParameters = Pointer.to(
Pointer.to(basePointer),
Pointer.to(new int[]{meshWidth}),
Pointer.to(new int[]{meshHeight}),
Pointer.to(new float[]{animationState})
);
// Call the kernel function.
int blockX = 8;
int blockY = 8;
int gridX = meshWidth / blockX;
int gridY = meshHeight / blockY;
cuLaunchKernel(function,
gridX, gridY, 1, // Grid dimension
blockX, blockY, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null // Kernel- and extra parameters
);
cuCtxSynchronize();
// Unmap buffer object
cuGraphicsUnmapResources(
1, new CUgraphicsResource[]{vboGraphicsResource}, null);
}
Example 11
/**
* Run the CUDA computation to create new vertex positions
* inside the vertexBufferObject.
*
* @param gl The current GL.
*/
private void runCuda(GL gl)
{
// Map the vertexBufferObject for writing from CUDA.
// The basePointer will afterwards point to the
// beginning of the memory area of the VBO.
CUdeviceptr basePointer = new CUdeviceptr();
cuGraphicsMapResources(
1, new CUgraphicsResource[]{vboGraphicsResource}, null);
cuGraphicsResourceGetMappedPointer(
basePointer, new long[1], vboGraphicsResource);
// Set up the kernel parameters: A pointer to an array
// of pointers which point to the actual values. One
// pointer to the base pointer of the geometry data,
// one int for the mesh width, one int for the mesh
// height, and one float for the current animation state.
Pointer kernelParameters = Pointer.to(
Pointer.to(basePointer),
Pointer.to(new int[]{meshWidth}),
Pointer.to(new int[]{meshHeight}),
Pointer.to(new float[]{animationState})
);
// Call the kernel function.
int blockX = 8;
int blockY = 8;
int gridX = meshWidth / blockX;
int gridY = meshHeight / blockY;
cuLaunchKernel(function,
gridX, gridY, 1, // Grid dimension
blockX, blockY, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null // Kernel- and extra parameters
);
cuCtxSynchronize();
// Unmap buffer object
cuGraphicsUnmapResources(
1, new CUgraphicsResource[]{vboGraphicsResource}, null);
}
Example 12
/**
* Entry point of this sample
*
* @param args Not used
*/
public static void main(String args[])
{
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
init();
boolean passed = true;
for (int n = 100000; n <= 26500000; n *= 2)
{
float hostInput[] = createRandomArray(n);
long timeNs0 = 0;
long timeNs1 = 0;
// Copy the input data to the device
timeNs0 = System.nanoTime();
CUdeviceptr deviceInput = new CUdeviceptr();
cuMemAlloc(deviceInput, hostInput.length * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInput, Pointer.to(hostInput),
hostInput.length * Sizeof.FLOAT);
timeNs1 = System.nanoTime();
long durationCopyNs = timeNs1 - timeNs0;
// Execute the reduction with CUDA
timeNs0 = System.nanoTime();
float resultJCuda = reduce(deviceInput, hostInput.length);
timeNs1 = System.nanoTime();
long durationCompNs = timeNs1 - timeNs0;
cuMemFree(deviceInput);
// Execute the reduction with Java
timeNs0 = System.nanoTime();
float resultJava = reduceHost(hostInput);
timeNs1 = System.nanoTime();
long durationJavaNs = timeNs1 - timeNs0;
System.out.println("Reduction of " + n + " elements");
System.out.printf(Locale.ENGLISH,
" JCuda: %7.3f ms, result: %f " +
"(copy: %7.3f ms, comp: %7.3f ms)\n",
(durationCopyNs + durationCompNs) / 1e6, resultJCuda,
durationCopyNs / 1e6, durationCompNs / 1e6);
System.out.printf(Locale.ENGLISH,
" Java : %7.3f ms, result: %f\n",
durationJavaNs / 1e6, resultJava);
passed &=
Math.abs(resultJCuda - resultJava) < resultJava * 1e-5;
}
System.out.println("Test " + (passed ? "PASSED" : "FAILED"));
shutdown();
}
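The reduceHost reference used above is not part of this excerpt; presumably it is a plain sequential sum over the input array, as in this minimal sketch:
private static float reduceHost(float data[])
{
    // Sequential sum reduction, used as the reference result
    float sum = 0;
    for (int i = 0; i < data.length; i++)
    {
        sum += data[i];
    }
    return sum;
}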
Example 13
/**
* Create a Workload instance. This method is called by multiple host
* threads, to create the individual workloads, and to send the
* commands for processing the workloads to CUDA
*
* @param index The index of the workload
* @param executor The executor service
*/
private static void createWorkloadOnHost(
final int index, final ExecutorService executor)
{
// Make sure that the CUDA context is current for the calling thread
cuCtxSetCurrent(context);
// Initialize the workload, and create the CUDA stream
System.out.println(index + ": Initializing workload");
final Workload workload = new Workload();
workload.index = index;
workload.stream = new CUstream();
cuStreamCreate(workload.stream, 0);
// Create the host data of the workload
System.out.println(index + ": Create host data");
workload.hostData = new Pointer();
cuMemHostAlloc(workload.hostData, WORKLOAD_SIZE * Sizeof.INT, 0);
ByteBuffer hostByteBuffer =
workload.hostData.getByteBuffer(0, WORKLOAD_SIZE * Sizeof.INT);
IntBuffer hostIntBuffer =
hostByteBuffer.order(ByteOrder.nativeOrder()).asIntBuffer();
for (int i = 0; i < WORKLOAD_SIZE; i++)
{
hostIntBuffer.put(i, i);
}
workload.deviceData = new CUdeviceptr();
cuMemAlloc(workload.deviceData, WORKLOAD_SIZE * Sizeof.INT);
// Execute the CUDA commands:
// - Copy the host data to the device
// - Execute the kernel
// - Copy the modified device data back to the host
// All this is done asynchronously
System.out.println(index + ": Execute CUDA commands");
cuMemcpyHtoDAsync(workload.deviceData, workload.hostData,
WORKLOAD_SIZE * Sizeof.INT, workload.stream);
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{WORKLOAD_SIZE}),
Pointer.to(workload.deviceData)
);
int blockSizeX = 256;
int gridSizeX = (WORKLOAD_SIZE + blockSizeX - 1) / blockSizeX;
cuLaunchKernel(function, gridSizeX, 1, 1, blockSizeX, 1, 1,
0, workload.stream, kernelParameters, null);
cuMemcpyDtoHAsync(workload.hostData, workload.deviceData,
WORKLOAD_SIZE * Sizeof.INT, workload.stream);
// Define the callback that will be called when all CUDA commands
// on the stream have finished. This callback will forward the
// workload to the "finishWorkloadOnHost" method.
CUstreamCallback callback = new CUstreamCallback()
{
@Override
public void call(
CUstream hStream, int status, final Object userData)
{
System.out.println(index + ": Callback was called");
Runnable runnable = new Runnable()
{
@Override
public void run()
{
finishWorkloadOnHost(userData);
}
};
executor.submit(runnable);
}
};
cuStreamAddCallback(workload.stream, callback, workload, 0);
}
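The finishWorkloadOnHost method invoked by the callback is also not part of this excerpt. One plausible shape for it, sketched here under the assumption that it only reads back the pinned host buffer and releases the workload's resources (the actual sample may verify the kernel results in more detail):
private static void finishWorkloadOnHost(Object userData)
{
    // Make sure that the CUDA context is current for the executor thread
    cuCtxSetCurrent(context);
    Workload workload = (Workload) userData;
    // The asynchronous device-to-host copy has completed once the stream
    // callback fires, so the pinned host memory holds the kernel results
    ByteBuffer hostByteBuffer =
        workload.hostData.getByteBuffer(0, WORKLOAD_SIZE * Sizeof.INT);
    IntBuffer hostIntBuffer =
        hostByteBuffer.order(ByteOrder.nativeOrder()).asIntBuffer();
    System.out.println(
        workload.index + ": First result element is " + hostIntBuffer.get(0));
    // Release the workload resources
    cuMemFree(workload.deviceData);
    cuMemFreeHost(workload.hostData);
    cuStreamDestroy(workload.stream);
}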
Example 14
public static void main(String[] args)
{
JCudaDriver.setExceptionsEnabled(true);
// Initialize a context for the first device
cuInit(0);
CUcontext context = new CUcontext();
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
cuCtxCreate(context, 0, device);
// Create the CUBIN file by calling the NVCC.
// See the prepareDefaultCubinFile method for the details about
// the NVCC parameters that are used here.
String cubinFileName = JCudaSamplesUtils.prepareDefaultCubinFile(
"src/main/resources/kernels/JCudaDynamicParallelismKernel.cu");
// Load the CUBIN file
CUmodule module = new CUmodule();
cuModuleLoad(module, cubinFileName);
// Obtain a function pointer to the "parentKernel" function.
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "parentKernel");
// Define the nesting structure.
//
// NOTE: The number of child threads MUST match the value that
// is used in the kernel, for the childKernel<<<1, 8>>> call!
//
int numParentThreads = 8;
int numChildThreads = 8;
// Allocate the device data that will be filled by the kernel
int numElements = numParentThreads * numChildThreads;
CUdeviceptr deviceData = new CUdeviceptr();
cuMemAlloc(deviceData, numElements * Sizeof.FLOAT);
// Set up the kernel parameters: A pointer to an array
// of pointers which point to the actual values.
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[] { numElements }),
Pointer.to(deviceData)
);
// Call the kernel function.
int blockSizeX = numParentThreads;
int gridSizeX = (numElements + blockSizeX - 1) / blockSizeX;
cuLaunchKernel(function,
gridSizeX, 1, 1, // Grid dimension
blockSizeX, 1, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null // Kernel- and extra parameters
);
cuCtxSynchronize();
// Copy the device data to the host
float hostData[] = new float[numElements];
for(int i = 0; i < numElements; i++)
{
hostData[i] = i;
}
cuMemcpyDtoH(Pointer.to(hostData),
deviceData, numElements * Sizeof.FLOAT);
// Compare the host data with the expected values
float hostDataRef[] = new float[numElements];
for(int i = 0; i < numParentThreads; i++)
{
for (int j=0; j < numChildThreads; j++)
{
hostDataRef[i * numChildThreads + j] = i + 0.1f * j;
}
}
System.out.println("Result: "+Arrays.toString(hostData));
boolean passed = Arrays.equals(hostData, hostDataRef);
System.out.println(passed ? "PASSED" : "FAILED");
// Clean up.
cuMemFree(deviceData);
}
Example 15
public static void main(String[] args)
{
JCudaDriver.setExceptionsEnabled(true);
JCublas.setExceptionsEnabled(true);
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Check if the device supports managed memory
int supported[] = { 0 };
cuDeviceGetAttribute(supported,
CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, device);
if (supported[0] == 0)
{
System.err.println("Device does not support managed memory");
return;
}
// Allocate managed memory that is accessible to the host
int n = 10;
long size = n * Sizeof.FLOAT;
CUdeviceptr p = new CUdeviceptr();
cuMemAllocManaged(p, size, CU_MEM_ATTACH_HOST);
// Obtain the byte buffer from the pointer. This is supported only
// for memory that was allocated to be accessible on the host:
ByteBuffer bb = p.getByteBuffer(0, size);
System.out.println("Buffer on host side: " + bb);
// Fill the buffer with sample data
FloatBuffer fb = bb.order(ByteOrder.nativeOrder()).asFloatBuffer();
for (int i = 0; i < n; i++)
{
fb.put(i, i);
}
// Make the buffer accessible to all devices
cuStreamAttachMemAsync(null, p, 0, CU_MEM_ATTACH_GLOBAL);
cuStreamSynchronize(null);
// Use the pointer in a device operation (here, a dot product with
// JCublas, for example). The data that was filled in by the host
// will now be used by the device.
cublasHandle handle = new cublasHandle();
cublasCreate(handle);
float result[] = { -1.0f };
cublasSdot(handle, n, p, 1, p, 1, Pointer.to(result));
System.out.println("Result: " + result[0]);
}
Example 16
public static void main(String[] args) throws IOException
{
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Create the PTX file by calling the NVCC
String ptxFileName = JCudaSamplesUtils.preparePtxFile(
"src/main/resources/kernels/JCudaConstantMemoryKernel.cu");
// Load the PTX file.
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFileName);
// Obtain the pointer to the constant memory, and print some info
CUdeviceptr constantMemoryPointer = new CUdeviceptr();
long constantMemorySizeArray[] = { 0 };
cuModuleGetGlobal(constantMemoryPointer, constantMemorySizeArray,
module, "constantMemoryData");
int constantMemorySize = (int)constantMemorySizeArray[0];
System.out.println("constantMemoryPointer: " + constantMemoryPointer);
System.out.println("constantMemorySize: " + constantMemorySize);
// Copy some host data to the constant memory
int numElements = constantMemorySize / Sizeof.FLOAT;
float hostData[] = new float[numElements];
for (int i = 0; i < numElements; i++)
{
hostData[i] = i;
}
cuMemcpyHtoD(constantMemoryPointer,
Pointer.to(hostData), constantMemorySize);
// Now use the constant memory in the kernel call:
// Obtain a function pointer to the "constantMemoryKernel" function.
CUfunction kernel = new CUfunction();
cuModuleGetFunction(kernel, module, "constantMemoryKernel");
// Allocate some device memory
CUdeviceptr deviceData = new CUdeviceptr();
cuMemAlloc(deviceData, constantMemorySize);
// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(
Pointer.to(deviceData),
Pointer.to(new int[]{numElements})
);
// Launch the kernel
int blockSizeX = numElements;
int gridSizeX = 1;
cuLaunchKernel(kernel,
gridSizeX, 1, 1,
blockSizeX, 1, 1,
0, null,
kernelParameters, null
);
cuCtxSynchronize();
// Copy the result back to the host, and verify that it is
// the same that was copied to the constant memory
float hostResult[] = new float[numElements];
cuMemcpyDtoH(Pointer.to(hostResult), deviceData, constantMemorySize);
boolean passed = Arrays.equals(hostData, hostResult);
System.out.println("Test " + (passed ? "PASSED" : "FAILED"));
}
Example 17
/**
* Test the 1D float texture access
*/
private boolean test_float_1D()
{
// Create the array on the device
CUarray array = new CUarray();
CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR();
ad.Format = CU_AD_FORMAT_FLOAT;
ad.Width = sizeX;
ad.Height = 1;
ad.NumChannels = 1;
cuArrayCreate(array, ad);
// Copy the host input to the array
Pointer pInput = Pointer.to(input_float_1D);
cuMemcpyHtoA(array, 0, pInput, sizeX * Sizeof.FLOAT);
// Set up the texture reference
CUtexref texref = new CUtexref();
cuModuleGetTexRef(texref, module, "texture_float_1D");
cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);
// Prepare the output device memory
CUdeviceptr dOutput = new CUdeviceptr();
cuMemAlloc(dOutput, Sizeof.FLOAT * 1);
// Obtain the test function
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "test_float_1D");
// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(
Pointer.to(dOutput),
Pointer.to(new float[]{ posX })
);
// Call the kernel function.
cuLaunchKernel(function, 1, 1, 1,
1, 1, 1, 0, null, kernelParameters, null);
cuCtxSynchronize();
// Obtain the output on the host
float hOutput[] = new float[1];
cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 1);
// Print the results
log("Result float 1D " + Arrays.toString(hOutput));
float expected[] = new float[]{ 0.5f };
boolean passed = Arrays.equals(hOutput, expected);
log("Test float 1D " + (passed ? "PASSED" : "FAILED"));
// Clean up
cuArrayDestroy(array);
cuMemFree(dOutput);
return passed;
}
Example 18
/**
* Test the 2D float texture access
*/
private boolean test_float_2D()
{
// Create the array on the device
CUarray array = new CUarray();
CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR();
ad.Format = CU_AD_FORMAT_FLOAT;
ad.Width = sizeX;
ad.Height = sizeY;
ad.NumChannels = 1;
cuArrayCreate(array, ad);
// Copy the host input to the array
CUDA_MEMCPY2D copyHD = new CUDA_MEMCPY2D();
copyHD.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
copyHD.srcHost = Pointer.to(input_float_2D);
copyHD.srcPitch = sizeX * Sizeof.FLOAT;
copyHD.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
copyHD.dstArray = array;
copyHD.WidthInBytes = sizeX * Sizeof.FLOAT;
copyHD.Height = sizeY;
cuMemcpy2D(copyHD);
// Set up the texture reference
CUtexref texref = new CUtexref();
cuModuleGetTexRef(texref, module, "texture_float_2D");
cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);
// Prepare the output device memory
CUdeviceptr dOutput = new CUdeviceptr();
cuMemAlloc(dOutput, Sizeof.FLOAT * 1);
// Obtain the test function
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "test_float_2D");
// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(
Pointer.to(dOutput),
Pointer.to(new float[]{ posX }),
Pointer.to(new float[]{ posY })
);
// Call the kernel function.
cuLaunchKernel(function, 1, 1, 1,
1, 1, 1, 0, null, kernelParameters, null);
cuCtxSynchronize();
// Obtain the output on the host
float hOutput[] = new float[1];
cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 1);
// Print the results
log("Result float 2D " + Arrays.toString(hOutput));
float expected[] = new float[]{ 1.5f };
boolean passed = Arrays.equals(hOutput, expected);
log("Test float 2D " + (passed ? "PASSED" : "FAILED"));
// Clean up
cuArrayDestroy(array);
cuMemFree(dOutput);
return passed;
}
Example 19
/**
* Test the 3D float texture access
*/
private boolean test_float_3D()
{
// Create the array on the device
CUarray array = new CUarray();
CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
ad.Format = CU_AD_FORMAT_FLOAT;
ad.Width = sizeX;
ad.Height = sizeY;
ad.Depth = sizeZ;
ad.NumChannels = 1;
cuArray3DCreate(array, ad);
// Copy the host input to the array
CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
copy.srcHost = Pointer.to(input_float_3D);
copy.srcPitch = sizeX * Sizeof.FLOAT;
copy.srcHeight = sizeY;
copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
copy.dstArray = array;
copy.dstHeight = sizeX;
copy.WidthInBytes = sizeX * Sizeof.FLOAT;
copy.Height = sizeY;
copy.Depth = sizeZ;
cuMemcpy3D(copy);
// Set up the texture reference
CUtexref texref = new CUtexref();
cuModuleGetTexRef(texref, module, "texture_float_3D");
cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);
// Prepare the output device memory
CUdeviceptr dOutput = new CUdeviceptr();
cuMemAlloc(dOutput, Sizeof.FLOAT * 1);
// Obtain the test function
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "test_float_3D");
// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(
Pointer.to(dOutput),
Pointer.to(new float[]{ posX }),
Pointer.to(new float[]{ posY }),
Pointer.to(new float[]{ posZ })
);
// Call the kernel function.
cuLaunchKernel(function, 1, 1, 1,
1, 1, 1, 0, null, kernelParameters, null);
cuCtxSynchronize();
// Obtain the output on the host
float hOutput[] = new float[1];
cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 1);
// Print the results
log("Result float 3D " + Arrays.toString(hOutput));
float expected[] = new float[]{ 3.5f };
boolean passed = Arrays.equals(hOutput, expected);
log("Test float 3D " + (passed ? "PASSED" : "FAILED"));
// Clean up
cuArrayDestroy(array);
cuMemFree(dOutput);
return passed;
}
Example 20
/**
* Test the 1D float4 texture access
*/
private boolean test_float4_1D()
{
// Create the array on the device
CUarray array = new CUarray();
CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR();
ad.Format = CU_AD_FORMAT_FLOAT;
ad.Width = sizeX;
ad.Height = 1;
ad.NumChannels = 4;
cuArrayCreate(array, ad);
// Copy the host input to the array
Pointer pInput = Pointer.to(input_float4_1D);
cuMemcpyHtoA(array, 0, pInput, sizeX * Sizeof.FLOAT * 4);
// Set up the texture reference
CUtexref texref = new CUtexref();
cuModuleGetTexRef(texref, module, "texture_float4_1D");
cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4);
cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);
// Prepare the output device memory
CUdeviceptr dOutput = new CUdeviceptr();
cuMemAlloc(dOutput, Sizeof.FLOAT * 4);
// Obtain the test function
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "test_float4_1D");
// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(
Pointer.to(dOutput),
Pointer.to(new float[]{ posX })
);
// Call the kernel function.
cuLaunchKernel(function, 1, 1, 1,
1, 1, 1, 0, null, kernelParameters, null);
cuCtxSynchronize();
// Obtain the output on the host
float hOutput[] = new float[4];
cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4);
// Print the results
log("Result float4 1D " + Arrays.toString(hOutput));
float expected[] = new float[]{ 0.5f, 0.5f, 0.5f, 0.5f };
boolean passed = Arrays.equals(hOutput, expected);
log("Test float4 1D " + (passed ? "PASSED" : "FAILED"));
// Clean up
cuArrayDestroy(array);
cuMemFree(dOutput);
return passed;
}
Example 21
/**
* Test the 2D float4 texture access
*/
private boolean test_float4_2D()
{
// Create the array on the device
CUarray array = new CUarray();
CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR();
ad.Format = CU_AD_FORMAT_FLOAT;
ad.Width = sizeX;
ad.Height = sizeY;
ad.NumChannels = 4;
cuArrayCreate(array, ad);
// Copy the host input to the array
CUDA_MEMCPY2D copyHD = new CUDA_MEMCPY2D();
copyHD.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
copyHD.srcHost = Pointer.to(input_float4_2D);
copyHD.srcPitch = sizeX * Sizeof.FLOAT * 4;
copyHD.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
copyHD.dstArray = array;
copyHD.WidthInBytes = sizeX * Sizeof.FLOAT * 4;
copyHD.Height = sizeY;
cuMemcpy2D(copyHD);
// Set up the texture reference
CUtexref texref = new CUtexref();
cuModuleGetTexRef(texref, module, "texture_float4_2D");
cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4);
cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);
// Prepare the output device memory
CUdeviceptr dOutput = new CUdeviceptr();
cuMemAlloc(dOutput, Sizeof.FLOAT * 4);
// Obtain the test function
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "test_float4_2D");
// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(
Pointer.to(dOutput),
Pointer.to(new float[]{ posX }),
Pointer.to(new float[]{ posY })
);
// Call the kernel function.
cuLaunchKernel(function, 1, 1, 1,
1, 1, 1, 0, null, kernelParameters, null);
cuCtxSynchronize();
// Obtain the output on the host
float hOutput[] = new float[4];
cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4);
// Print the results
log("Result float4 2D " + Arrays.toString(hOutput));
float expected[] = new float[]{ 1.5f, 1.5f, 1.5f, 1.5f };
boolean passed = Arrays.equals(hOutput, expected);
log("Test float4 2D " + (passed ? "PASSED" : "FAILED"));
// Clean up
cuArrayDestroy(array);
cuMemFree(dOutput);
return passed;
}
Example 22
/**
* Test the 3D float4 texture access
*/
private boolean test_float4_3D()
{
// Create the array on the device
CUarray array = new CUarray();
CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
ad.Format = CU_AD_FORMAT_FLOAT;
ad.Width = sizeX;
ad.Height = sizeY;
ad.Depth = sizeZ;
ad.NumChannels = 4;
cuArray3DCreate(array, ad);
// Copy the host input to the array
CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
copy.srcHost = Pointer.to(input_float4_3D);
copy.srcPitch = sizeX * Sizeof.FLOAT * 4;
copy.srcHeight = sizeY;
copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
copy.dstArray = array;
copy.dstHeight = sizeX;
copy.WidthInBytes = sizeX * Sizeof.FLOAT * 4;
copy.Height = sizeY;
copy.Depth = sizeZ;
cuMemcpy3D(copy);
// Set up the texture reference
CUtexref texref = new CUtexref();
cuModuleGetTexRef(texref, module, "texture_float4_3D");
cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4);
cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);
// Prepare the output device memory
CUdeviceptr dOutput = new CUdeviceptr();
cuMemAlloc(dOutput, Sizeof.FLOAT * 4);
// Obtain the test function
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "test_float4_3D");
// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(
Pointer.to(dOutput),
Pointer.to(new float[]{ posX }),
Pointer.to(new float[]{ posY }),
Pointer.to(new float[]{ posZ })
);
// Call the kernel function.
cuLaunchKernel(function, 1, 1, 1,
1, 1, 1, 0, null, kernelParameters, null);
cuCtxSynchronize();
// Obtain the output on the host
float hOutput[] = new float[4];
cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4);
// Print the results
log("Result float4 3D " + Arrays.toString(hOutput));
float expected[] = new float[]{ 3.5f, 3.5f, 3.5f, 3.5f };
boolean passed = Arrays.equals(hOutput, expected);
log("Test float4 3D " + (passed ? "PASSED" : "FAILED"));
// Clean up
cuArrayDestroy(array);
cuMemFree(dOutput);
return passed;
}
Example 23
@Test
public void testMemRangeAttribute()
{
JCudaDriver.setExceptionsEnabled(true);
cuInit(0);
CUcontext context = new CUcontext();
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
cuCtxCreate(context, 0, device);
int size = 64;
CUdeviceptr deviceData = new CUdeviceptr();
cuMemAllocManaged(deviceData, size, CU_MEM_ATTACH_HOST);
int readMostly[] = { 12345 };
int lastPrefetchLocation[] = { 12345 };
int preferredLocation[] = { 12345 };
int accessedBy[] = { 12345, 12345, 12345 };
cuMemRangeGetAttribute(Pointer.to(readMostly), Sizeof.INT,
CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, deviceData, size);
cuMemRangeGetAttribute(Pointer.to(lastPrefetchLocation), Sizeof.INT,
CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, deviceData, size);
cuMemRangeGetAttribute(Pointer.to(preferredLocation), Sizeof.INT,
CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, deviceData, size);
cuMemRangeGetAttribute(
Pointer.to(accessedBy), Sizeof.INT * accessedBy.length,
CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, deviceData, size);
boolean printResults = false;
//printResults = true;
if (printResults)
{
System.out.println("readMostly : " +
Arrays.toString(lastPrefetchLocation));
System.out.println("lastPrefetchLocation: " +
Arrays.toString(lastPrefetchLocation));
System.out.println("preferredLocation : " +
Arrays.toString(preferredLocation));
System.out.println("accessedBy : " +
Arrays.toString(accessedBy));
}
}
Example 24
@Test
public void testMemRangeAttributes()
{
JCudaDriver.setExceptionsEnabled(true);
cuInit(0);
CUcontext context = new CUcontext();
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
cuCtxCreate(context, 0, device);
int size = 64;
CUdeviceptr deviceData = new CUdeviceptr();
cuMemAllocManaged(deviceData, size, CU_MEM_ATTACH_HOST);
int readMostly[] = { 12345 };
int lastPrefetchLocation[] = { 12345 };
int preferredLocation[] = { 12345 };
int accessedBy[] = { 12345, 12345, 12345 };
Pointer data[] =
{
Pointer.to(readMostly),
Pointer.to(lastPrefetchLocation),
Pointer.to(preferredLocation),
Pointer.to(accessedBy)
};
long dataSizes[] =
{
Sizeof.INT,
Sizeof.INT,
Sizeof.INT,
Sizeof.INT * accessedBy.length
};
int attributes[] =
{
CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY,
};
cuMemRangeGetAttributes(data, dataSizes,
attributes, attributes.length, deviceData, size);
boolean printResults = false;
//printResults = true;
if (printResults)
{
System.out.println("readMostly : " +
Arrays.toString(lastPrefetchLocation));
System.out.println("lastPrefetchLocation: " +
Arrays.toString(lastPrefetchLocation));
System.out.println("preferredLocation : " +
Arrays.toString(preferredLocation));
System.out.println("accessedBy : " +
Arrays.toString(accessedBy));
}
}
Example 25
public static void test() {
JCudaDriver.setExceptionsEnabled(true);
String sourceCode = "extern \"C\"" + "\n" +
"__global__ void add(float *result, float *a, float *b)" +
"\n" +
"{" + "\n" +
" int i = threadIdx.x;" + "\n" +
" result[i] = a[i] + b[i];" + "\n" +
"}";
// Prepare the kernel
System.out.println("Preparing the KernelLauncher...");
KernelLauncher kernelLauncher =
KernelLauncher.compile(sourceCode, "add");
// Create the input data
System.out.println("Creating input data...");
int size = 10;
float result[] = new float[size];
float a[] = new float[size];
float b[] = new float[size];
for (int i = 0; i < size; i++) {
a[i] = i;
b[i] = i;
}
// Allocate the device memory and copy the input
// data to the device
System.out.println("Initializing device memory...");
CUdeviceptr dResult = GPUHelper.allocateOutput(size, Sizeof.FLOAT);
CUdeviceptr dA = GPUHelper.allocateInput(a);
CUdeviceptr dB = GPUHelper.allocateInput(b);
System.out.println("Calling the kernel...");
kernelLauncher.setBlockSize(size, 1, 1);
kernelLauncher.call(dResult, dA, dB);
// Copy the result from the device to the host
System.out.println("Obtaining results...");
cuMemcpyDtoH(Pointer.to(result), dResult, size * Sizeof.FLOAT);
System.out.println("Result: " + Arrays.toString(result));
// Clean up
cuMemFree(dA);
cuMemFree(dB);
cuMemFree(dResult);
}
Example 26
public static CUdeviceptr allocateOutput(int size, int typeSize) {
CUdeviceptr dOutput = new CUdeviceptr();
cuMemAlloc(dOutput, size * typeSize);
return dOutput;
}
Example 27
public static void freeUpMemory(CUdeviceptr[] pointers) {
for (CUdeviceptr pointer : pointers) {
cuMemFree(pointer);
}
}
Example 28
public CUdeviceptr malloc(long numBytes) {
CUdeviceptr pdBuf = new CUdeviceptr();
JCudaDriver.cuMemAlloc(pdBuf, numBytes);
return pdBuf;
}
Example 29
public void free(CUdeviceptr pdBuf) {
JCudaDriver.cuMemFree(pdBuf);
}
Example 30
public void uploadAsync(CUdeviceptr pdBuf, Pointer phBuf, long numBytes, GpuStream stream) {
JCudaDriver.cuMemcpyHtoDAsync(pdBuf, phBuf, numBytes, stream.getStream());
}
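For completeness, the inverse operation with the same (project-specific) GpuStream wrapper would be a minimal sketch along these lines:
public void downloadAsync(Pointer phBuf, CUdeviceptr pdBuf, long numBytes, GpuStream stream) {
    // Asynchronous device-to-host copy on the wrapped CUDA stream
    JCudaDriver.cuMemcpyDtoHAsync(phBuf, pdBuf, numBytes, stream.getStream());
}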