CUDA: Does CUDA memory access differ from OpenCL? What causes this illegal memory access?

I wrote a CUDA version of an OpenCL program I had written earlier. The OpenCL version works, but the CUDA version does not. The conversion from OpenCL to CUDA is not one-to-one, but I don't understand why the CUDA version fails, given that I based the translation on a CUDA sample.

I get an illegal memory access was encountered (error code # = 77) during cudaMemcpy(… cudaMemcpyDeviceToHost); (line 227). Although the error surfaces during the memcpy, the underlying problem appears to be an illegal memory access during kernel execution. Here is a sample of what I get when checking the program with cuda-memcheck:

========= Invalid __global__ read of size 4
=========     at 0x000002b8 in MoveoutAndStackCuda(float*, float*, float*, int*, int*, int*, unsigned int, unsigned int, unsigned int)
=========     by thread (53,0,0) in block (130,0,0)
=========     Address 0x130718e590 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x19a11]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x375b3]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x4059]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x3f0a]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x3f85]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x3438]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x36c9]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x3c46]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x3d4b]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x2b69]
=========
========= Invalid __global__ read of size 4
=========     at 0x000002b8 in MoveoutAndStackCuda(float*, float*, float*, int*, int*, int*, unsigned int, unsigned int, unsigned int)
=========     by thread (52,0,0) in block (130,0,0)
=========     Address 0x130718e590 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x19a11]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x375b3]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x4059]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x3f0a]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x3f85]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x3438]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x36c9]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x3c46]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x3d4b]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
=========     Host Frame:./MoveoutAndStackCudaMVC [0x2b69]

I don't understand the differences between CUDA and OpenCL well enough to see what I'm doing wrong. I tried fiddling with MoveoutAndStackCuda<<<grid, threads>>> and changing it to something like MoveoutAndStackCuda<<<grid, threads, (localGroupSize * sizeof(float))>>>, but no luck. I also tried commenting out parts of my kernel; the problem occurs even with most of the kernel commented out.
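As I understand it, kernel launches are asynchronous, so an illegal access inside the kernel is only reported by the next synchronizing runtime call, which is why it surfaces at the cudaMemcpy. To confirm the fault comes from the kernel itself, the error can be forced to surface at the launch site (a minimal sketch using the standard runtime API and the Fatal helper from the example below):

// Sketch: make the kernel's own errors surface here instead of at the next cudaMemcpy.
MoveoutAndStackCuda<<<grid, threads>>>(/* ...same arguments as in the example... */);
cudaError_t launchErr = cudaGetLastError();    // errors in the launch configuration
cudaError_t syncErr = cudaDeviceSynchronize(); // faults raised while the kernel ran
if (launchErr != cudaSuccess) Fatal("launch: %s\n", cudaGetErrorString(launchErr));
if (syncErr != cudaSuccess) Fatal("execution: %s\n", cudaGetErrorString(syncErr));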

Hopefully this is verifiable, but there is a chance it isn't, since it may depend on my hardware. I'm running a Quadro M5000 on CentOS 6.8 (Final).

I've tried to cut out as much as possible that is irrelevant to this problem. I would also provide the working OpenCL version of this MCV example, but I don't have it on hand. For now I recommend debugging with the arguments 100 50 40, because I also have the problem of spawning too many global threads, which I will address after this one is solved.

Here is a Minimal, Complete, and Verifiable example:

#include <math.h>
#include <sstream>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
#include <cuda.h>
#include <assert.h>
#include <unistd.h>

const bool _VERBOSE = true;
const bool _PRINT_ALLOC_SIZE = true;
const bool _PRINT_RUN_TIME = true;
const int MIN_LOCAL_SIZE = 8;

__global__ void MoveoutAndStackCuda(float prestackTraces[], float stackTracesOut[],
    float powerTracesOut[], int startIndices[], int exitIndices[],
    int sampleShift[], const unsigned int samplesPerT, const unsigned int readIns,
    const unsigned int nOuts) {

  unsigned int globalId = (blockIdx.x * blockDim.x) + threadIdx.x;

  float stackF = 0.0f;
  float powerF = 0.0f;

  unsigned int readIndex = (globalId % samplesPerT);
  unsigned int jobNum = (globalId / samplesPerT);

  for (unsigned int x = 0; x < readIns; x++) {
    unsigned int offsetIndex = x + (jobNum * readIns);
    unsigned int startInd = startIndices[offsetIndex];

    if ((readIndex >= startInd) && (readIndex < (exitIndices[offsetIndex] + startInd))) {
      float value = prestackTraces[readIndex + (x * samplesPerT) + sampleShift[offsetIndex]];

      stackF += value;
      powerF += (value * value);
    }
  }

  stackTracesOut[globalId] = stackF;
  powerTracesOut[globalId] = powerF;
}

/*
 *  Single-threaded version that mimics what is executed in the OpenCL code as closely as possible.
 */
void MoveoutAndStackSingleThread(const float prestackTraces[], float stackTracesOut[],
    float powerTracesOut[], const int startIndices[], const int exitIndices[], const int shift[],
    const unsigned int samplesPerT, const unsigned int readIns, const unsigned int nOuts,
    const unsigned int jobNum, const unsigned int readIndex) {

  float stackF = 0.0f;
  float powerF = 0.0f;

  int outputIndex = readIndex + (jobNum * samplesPerT);

  for (unsigned int x = 0; x < readIns; x++) {
    unsigned int offsetIndex = x + (jobNum * readIns);
    unsigned int startInd = startIndices[offsetIndex];

    bool shouldRead = ((readIndex >= startInd) && (readIndex < (exitIndices[offsetIndex] + startInd)));
    if (shouldRead) {
      float value = prestackTraces[readIndex + (x * samplesPerT) + shift[offsetIndex]];
      stackF += value;
      powerF += (value * value);
    }
  }

  stackTracesOut[outputIndex] = stackF;
  powerTracesOut[outputIndex] = powerF;
}

/**
 *  Used to keep track of how long it takes to execute this.
 */
double GetTime() {

  struct timeval tv;
  gettimeofday(&tv, NULL);
  return tv.tv_sec + (1e-6 * tv.tv_usec);
}

/*
 *  Print message to stderr and exit.
 */
void Fatal(const char* format, ...) {

  va_list args;
  va_start(args, format);
  vfprintf(stderr, format, args);
  va_end(args);
  exit(1);
}

/*
 *  We have an error; which one? Also print out where this occurred.
 */
void CudaWhichError(cudaError_t errorCode, char* location) {

  if (errorCode == cudaSuccess) {
    // This shouldn't happen. Callers should make sure errorCode != cudaSuccess before calling this function.
    printf("Reported error not actually an error... (cudaSuccess) %s\n", location);
    return;
  }

  Fatal("%s %s (error code # = %d)\n", location, cudaGetErrorString(errorCode), errorCode);
}

/*
 *  Check for errors.
 */
void CheckForErrors(char* location) {

  cudaError_t errorCode = cudaGetLastError();
  if (errorCode != cudaSuccess) {
    CudaWhichError(errorCode, location);
  }
}

/*
 *  Finds and initializes the fastest graphics card for CUDA use.
 *
 *  Returns the max number of threads per block for the selected device.
 */
int GetFastestDevice() {

  // Get the number of CUDA devices
  int num;
  if (cudaGetDeviceCount(&num)) Fatal("Cannot get number of CUDA devices\n");
  if (num < 1) Fatal("No CUDA devices found\n");

  // Props
  cudaDeviceProp currentDevice;
  int fastestGflops = -1;
  cudaDeviceProp bestDevice;
  int fastestDeviceID = -1;

  // Get fastest device
  for (int dev = 0; dev < num; dev++) {
    if (cudaGetDeviceProperties(&currentDevice, dev)) {
      Fatal("Error getting device %d properties\n", dev);
    }

    int Gflops = currentDevice.multiProcessorCount * currentDevice.clockRate;

    if (_VERBOSE) {
      printf("CUDA Device %d: %s Gflops %f Processors %d Threads/Block %d\n",
          dev,
          currentDevice.name,
          (1e-6 * Gflops),
          currentDevice.multiProcessorCount,
          currentDevice.maxThreadsPerBlock);
    }

    if (Gflops > fastestGflops) {
      fastestGflops = Gflops;
      fastestDeviceID = dev;
      bestDevice = currentDevice;
    }
  }

  // Check to see if we got a device
  if (fastestDeviceID == -1) {
    Fatal("bestDevice == NULL");
  }

  // Print and set device
  if (cudaGetDeviceProperties(&bestDevice, fastestDeviceID)) {
    Fatal("Error getting device %d properties\n", fastestDeviceID);
  }

  cudaSetDevice(fastestDeviceID);

  if (_VERBOSE) {
    printf("Fastest CUDA Device %d: %s\n", fastestDeviceID, bestDevice.name);
    printf("bestDevice.maxThreadsPerBlock: %d\n", bestDevice.maxThreadsPerBlock);
  }

  CheckForErrors((char*)("GetFastestDevice()"));

  // Return max thread count
  return bestDevice.maxThreadsPerBlock;
}

/*
 *  Allocate memory on the GPU and copy the data over.
 *
 *  CudaPtr variables point to the arrays on the GPU side.
 *  Host variables point to the arrays on the CPU side.
 *  Sizes variables determine sizes of the arrays.
 */
void AllocateAndCopyCudaDeviceMemory(float** prestackCudaPtr, float** stackOutCudaPtr, float** powerOutCudaPtr,
    int** startIndicesCudaPtr, int** endIndicesCudaPtr, int** sampleShiftCudaPtr,
    float* prestackHost, int* startIndicesHost, int* endIndicesHost, int* sampleShiftHost,
    size_t prestackSizes, size_t outputSizes, size_t inputSizes) {

  if (_PRINT_ALLOC_SIZE) {
    size_t totalMemoryAllocated = (prestackSizes + (outputSizes * 2) + (inputSizes * 3));
    printf(" Total memory allocated for run:                        %zu\n", totalMemoryAllocated);
    printf(" Prestack array size:                                   %zu\n", prestackSizes);
    printf(" Output array sizes:                                    %zu\n", outputSizes);
    printf(" StartIndices, EndIndices, & SampleShift array size:    %zu\n", inputSizes);
  }

  cudaError_t cudaCode;

  // Allocate memory on the graphics card
  cudaCode = cudaMalloc((void**)prestackCudaPtr, prestackSizes);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for prestack array\n"));
  cudaCode = cudaMalloc((void**)stackOutCudaPtr, outputSizes);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for stackOut array\n"));
  cudaCode = cudaMalloc((void**)powerOutCudaPtr, outputSizes);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for powerOut array\n"));
  cudaCode = cudaMalloc((void**)startIndicesCudaPtr, inputSizes);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for startIndices array\n"));
  cudaCode = cudaMalloc((void**)endIndicesCudaPtr, inputSizes);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for endIndices array\n"));
  cudaCode = cudaMalloc((void**)sampleShiftCudaPtr, inputSizes);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for sampleShift array\n"));

  // Copy data over (for the arrays that need it)
  cudaCode = cudaMemcpy(*prestackCudaPtr, prestackHost, prestackSizes, cudaMemcpyHostToDevice);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy prestack data over to device.\n"));
  cudaCode = cudaMemcpy(*startIndicesCudaPtr, startIndicesHost, inputSizes, cudaMemcpyHostToDevice);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy startIndices data over to device.\n"));
  cudaCode = cudaMemcpy(*endIndicesCudaPtr, endIndicesHost, inputSizes, cudaMemcpyHostToDevice);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy endIndices data over to device.\n"));
  cudaCode = cudaMemcpy(*sampleShiftCudaPtr, sampleShiftHost, inputSizes, cudaMemcpyHostToDevice);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy sampleShift data over to device.\n"));
}

/*
 *  Enqueue the kernel to be run on the GPU. Pointers that are passed in point to
 *  device-side memory.
 */
void RunCudaMoveAndStackJobs(float** prestackTracesCudaPtr, float** stackTracesOutCudaPtr,
    float** powerTracesOutCudaPtr, int** startIndicesCudaPtr, int** exitIndicesCudaPtr,
    int** sampleShiftCudaPtr, unsigned int samplesPerT, unsigned int readIns,
    unsigned int nOuts, size_t localGroupSize) {

  // Set the size
  dim3 threads(localGroupSize);
  dim3 grid(samplesPerT * nOuts);

  if (*prestackTracesCudaPtr == NULL) printf("*prestackTracesCudaPtr == NULL\n");

  // Execute the kernel
  MoveoutAndStackCuda<<<grid, threads>>>(*prestackTracesCudaPtr,
      *stackTracesOutCudaPtr, *powerTracesOutCudaPtr, *startIndicesCudaPtr, *exitIndicesCudaPtr,
      *sampleShiftCudaPtr, samplesPerT, readIns, nOuts);

  CheckForErrors((char*)("RunCudaMoveAndStackJobs()"));
}

/*
 *  Copy the results back to the host, then free the memory on the GPU device.
 */
void RetrieveAndCleanupCudaDeviceMemory(float** prestackCudaPtr, float** stackOutCudaPtr,
    float** powerOutCudaPtr, int** startIndicesCudaPtr, int** endIndicesCudaPtr, int** sampleShiftCudaPtr,
    float* stackOutHost, float* powerOutHost, size_t outputSizes) {

  // Copy the output arrays from device to host
  cudaError_t cudaCode;
  cudaCode = cudaMemcpy(stackOutHost, *stackOutCudaPtr, outputSizes, cudaMemcpyDeviceToHost);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("RetrieveAndCleanupCudaDeviceMemory ERROR: Cannot copy stackOut data back to host.\n"));
  cudaCode = cudaMemcpy(powerOutHost, *powerOutCudaPtr, outputSizes, cudaMemcpyDeviceToHost);
  if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
      (char*)("RetrieveAndCleanupCudaDeviceMemory ERROR: Cannot copy powerOut data back to host.\n"));

  // Free device memory (TODO: reverse order)
  cudaFree(*prestackCudaPtr);
  cudaFree(*stackOutCudaPtr);
  cudaFree(*powerOutCudaPtr);
  cudaFree(*startIndicesCudaPtr);
  cudaFree(*endIndicesCudaPtr);
  cudaFree(*sampleShiftCudaPtr);
}

/*
 *  Runs the program given the arrays passed in the parameters.
 *
 *  Returns the time it took to run the program, if desired.
 */
double CommenceCUDAMoveoutAndStack(float* prestackTraces, float* stackOut, float* powerOut,
    int* startIndices, int* endIndices, int* sampleShift,
    unsigned int samplesPerTrace, unsigned int nTracesIn, unsigned int nTracesOut,
    size_t localGroupSize, size_t prestackSizes, size_t outputSizes, size_t inputSizes) {

  double returnVal = 0.0;
  if (_PRINT_RUN_TIME) {
    printf("CommenceCUDAMoveoutAndStack:\n   samplesPerTrace=%u nTracesIn=%u nTracesOut=%u\n"
        "   localGroupSize=%zu\n",
        samplesPerTrace, nTracesIn, nTracesOut, localGroupSize);
  }

  // Init CUDA
  int maxThreadsPerBlock = GetFastestDevice();

  // Check the desired local size
  if (((int)localGroupSize) > maxThreadsPerBlock) {
    Fatal("Error: local group size (%zu) exceeds the max local group size of the selected graphics card (%d).\n",
        localGroupSize, maxThreadsPerBlock);
  } else if (((int)localGroupSize) < MIN_LOCAL_SIZE) {
    Fatal("Error: local group size (%zu) is less than MIN_LOCAL_SIZE (%d).\n",
        localGroupSize, MIN_LOCAL_SIZE);
  }

  // Allocate memory on the device. These pointers will point to memory on the GPU.
  double preInitTime = GetTime();
  float* prestackCudaPtr = NULL;
  float* stackOutCudaPtr = NULL;
  float* powerOutCudaPtr = NULL;
  int* startIndicesCudaPtr = NULL;
  int* endIndicesCudaPtr = NULL;
  int* sampleShiftCudaPtr = NULL;
  AllocateAndCopyCudaDeviceMemory(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
      &startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
      prestackTraces, startIndices, endIndices, sampleShift,
      prestackSizes, outputSizes, inputSizes);

  // Run the program
  RunCudaMoveAndStackJobs(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
      &startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
      samplesPerTrace, nTracesIn, nTracesOut, localGroupSize);

  // Retrieve the data and clean up graphics card memory
  RetrieveAndCleanupCudaDeviceMemory(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
      &startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
      stackOut, powerOut,
      (size_t)(nTracesOut * samplesPerTrace * sizeof(float)));

  // Print the run time (if requested)
  if (_PRINT_RUN_TIME) {
    returnVal = (GetTime() - preInitTime);
    printf("       Run Time:   %f secs\n", returnVal);
  }

  return returnVal;
}

// Returns a float 0.0 - 1.0, inclusive
float RandomFloat() {
  return static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
}

// Fill in the prestack traces array
void FillFloatArrayRandomly(float* fillArray, unsigned int length) {

  for (unsigned int r = 0; r < length; r++) {
    fillArray[r] = RandomFloat() * 1000.0f;
  }
}

// Fill the start, end, and shift arrays randomly
void FillStartEndShiftArraysRandomly(int* startArray, int* nSampsArray, int* shiftArray,
    int arrayLength, int rangeOfStartEndMax, int samplesPerT) {

  for (int r = 0; r < arrayLength; r++) {
    startArray[r] = (rand() % rangeOfStartEndMax);
    int endIndex = samplesPerT - (rand() % rangeOfStartEndMax);
    nSampsArray[r] = endIndex - startArray[r];

    int range = startArray[r] + (samplesPerT - endIndex);
    int ra = rand();

    if (range != 0) shiftArray[r] = (ra % range) - startArray[r];
    else shiftArray[r] = 0;

    // Check to make sure we won't go out of bounds
    assert((startArray[r] + nSampsArray[r]) <= samplesPerT);
    assert(endIndex > startArray[r]);
    assert(startArray[r] >= 0);
    assert(nSampsArray[r] >= 0);
    assert((startArray[r] + shiftArray[r]) >= 0);
    assert((nSampsArray[r] + shiftArray[r]) <= samplesPerT);
  }
}

// Create arrays for the CUDA program to use
double GenerateArraysAndRun(unsigned int samplesPerTrace,
    unsigned int nTracesIn, unsigned int nTracesOut, size_t localGroupS) {

  srand(time(NULL)); // Seed the random generator with the current time
  double returnVal;

  // Create the arrays to be used in the program
  float* prestackTraces1D;
  float* stackOut1D;
  float* powerOut1D;
  int* startIndices1D;
  int* endIndices1D;
  int* shift1D;

  // Get sizes of arrays
  size_t prestackSizes = samplesPerTrace * nTracesIn * sizeof(float);
  size_t outputSizes = nTracesOut * samplesPerTrace * sizeof(float);
  size_t inputSizes = nTracesOut * nTracesIn * sizeof(int);

  // Allocate the arrays
  prestackTraces1D = (float*)malloc(prestackSizes);
  stackOut1D = (float*)malloc(outputSizes);
  powerOut1D = (float*)malloc(outputSizes);
  startIndices1D = (int*)malloc(inputSizes);
  endIndices1D = (int*)malloc(inputSizes);
  shift1D = (int*)malloc(inputSizes);

  // Check that the arrays were created (before writing to them)
  if (prestackTraces1D == NULL) Fatal("GenerateArraysAndRun(): prestackTraces1D == NULL\n");
  if (stackOut1D == NULL) Fatal("GenerateArraysAndRun(): stackOut1D == NULL\n");
  if (powerOut1D == NULL) Fatal("GenerateArraysAndRun(): powerOut1D == NULL\n");
  if (startIndices1D == NULL) Fatal("GenerateArraysAndRun(): startIndices1D == NULL\n");
  if (endIndices1D == NULL) Fatal("GenerateArraysAndRun(): endIndices1D == NULL\n");
  if (shift1D == NULL) Fatal("GenerateArraysAndRun(): shift1D == NULL\n");

  // Fill in the arrays
  FillFloatArrayRandomly(prestackTraces1D, samplesPerTrace * nTracesIn);
  FillStartEndShiftArraysRandomly(startIndices1D, endIndices1D, shift1D,
      (int)(nTracesOut * nTracesIn), (int)(((float)samplesPerTrace) * 0.1), (int)samplesPerTrace);

  // Run the program
  returnVal = CommenceCUDAMoveoutAndStack(prestackTraces1D, stackOut1D, powerOut1D, startIndices1D,
      endIndices1D, shift1D, samplesPerTrace, nTracesIn, nTracesOut,
      localGroupS, prestackSizes, outputSizes, inputSizes);

  // Finished: free the memory on the CPU side in reverse order
  free(shift1D);
  free(endIndices1D);
  free(startIndices1D);
  free(powerOut1D);
  free(stackOut1D);
  free(prestackTraces1D);

  // Return the time that the program gave us
  return returnVal;
}

// Main
int main(int argc, char* argv[]) {

  // TODO: Errors here
  if (argc != 5)
    Fatal("Incorrect # of arguments (4 needed): <samplesPerTrace> <nTracesIn> <nTracesOut> <localGroupSize>\n"
        "   argc = %d\n", argc);

  unsigned int samplesPerTrace = atoi(argv[1]);
  unsigned int nTracesIn = atoi(argv[2]);
  unsigned int nTracesOut = atoi(argv[3]);

  size_t localGroupS = atoi(argv[4]);

  GenerateArraysAndRun(samplesPerTrace, nTracesIn, nTracesOut, localGroupS);

  return 0;
}
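For reference, a build-and-run sketch (the file name and the local group size of 64 are assumptions; the last argument must lie between MIN_LOCAL_SIZE and the device's max threads per block):

nvcc -o MoveoutAndStackCudaMVC MoveoutAndStackCudaMVC.cu
./MoveoutAndStackCudaMVC 100 50 40 64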


Solution

The problem was that I was creating too many blocks. In OpenCL you give the kernel the total number of threads and the number of threads per block, and the total number of blocks is derived from those. In CUDA you tell the kernel how many blocks there are and how many threads per block, and the total number of threads is derived from those. So:

dim3 threads(localGroupSize);
dim3 grid(samplesPerT * nOuts);

should be:

dim3 threads(localGroupSize);
dim3 grid((samplesPerT * nOuts) / localGroupSize);
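Note that the integer division truncates, so this launches exactly the right number of threads only when samplesPerT * nOuts is a multiple of localGroupSize. For the general case, the usual pattern (a sketch, assuming surplus threads may simply do nothing) is to round the block count up and bounds-check inside the kernel:

// Round the block count up so the grid covers all samplesPerT * nOuts threads
dim3 threads(localGroupSize);
dim3 grid((samplesPerT * nOuts + localGroupSize - 1) / localGroupSize);

// ...and at the top of the kernel:
unsigned int globalId = (blockIdx.x * blockDim.x) + threadIdx.x;
if (globalId >= (samplesPerT * nOuts)) return; // surplus threads in the last block exit early

This also covers the "spawning too many global threads" issue mentioned in the question.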

Other solutions

There are no other solutions yet…
