Квалификатор __launch_bounds__ не выполняет свою работу?

Это результат из Visual Profiler для ядра, которое я привожу ниже. Обратите внимание на размер сетки (1) и блока (1024), а также на то, что ядро использует только 43 регистра, хотя должно использовать 64. Я использую Tesla K40c.

[Изображение: снимок экрана Visual Profiler с параметрами запуска ядра]

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>
#include <algorithm>
#include <iostream>

// Compile-time simulation parameters shared by the kernel and main().
#define iterations 159744 // Trip count of the kernel's outer time-step loop; also the length of each output array.
#define transMatrixSize 2592 // Number of floats in the transition table (6*6*6*2*6, matching the kernel's index computation). Do not change.
#define reps 1024 // Length of the kernel's shared buffers and of the host seed array; equal to the block size. Do not change.
#define integralStep 13125  // Number of time steps to be averaged at the tail (not referenced by the code in this file).

// Returns the state increment selected by the uniform sample `r` from one
// six-float transition record {p0, d0, p1, d1, p2, d2} starting at `entry`.
//
// The result is the sum of three branch-free terms, exactly as in the original
// inline expression:
//   (r < p0)                 * d0
//   (!(r < p0)) * (r < p1)   * d1
//   (!(r < p1)) * (r < p2)   * d2
static __device__ __forceinline__ float transitionDelta(const float * __restrict__ entry, const float r)
{
    const float p0 = entry[0];
    const float p1 = entry[2];
    const float p2 = entry[4];

    return (r < p0) * entry[1] +
           (!(r < p0)) * (r < p1) * entry[3] +
           (!(r < p1)) * (r < p2) * entry[5];
}

// Monte-Carlo kernel: every thread evolves its own chain of 24 sites for
// `iterations` time steps and, after each step, the block sums per-thread
// counts of sites in state 4 or 5 and stores the normalised totals.
//
// Launch layout: intended for a single block of `reps` (1024) threads, as in
// the launch in main(). Thread 0 of every block writes the same output slots,
// so a multi-block launch would race on the outputs.
// Shared memory: 2 * reps floats, statically allocated.
//
// Parameters:
//   masterForces    - output, `iterations` floats: per-step total of the
//                     hundreds-part state counts divided by 24576 (= 24 * 1024).
//   masterForces50  - output, `iterations` floats: same for the ones-digit state.
//   transMatrix     - transition table read as six-float records
//                     (see transitionDelta); indexed up to transMatrixSize - 1.
//   rands           - one RNG seed per thread, indexed by the global thread index.
//   r_max           - not used by the kernel; kept for interface compatibility.
//
// Each RU[] entry packs three decimal fields (original note:
// "XYZ: [100% state][50%Binary][50% state]"): RU / 100, (RU % 100) / 10 and
// RU % 10.
__global__ void
__launch_bounds__(1024,1)
bufferleech(float *masterForces, float *masterForces50, const float * __restrict__ transMatrix, const float *rands, const int r_max)
{
    // Global flattened index: selects this thread's RNG seed.
    const int globalIdx = ((blockIdx.x + (blockIdx.y * gridDim.x)) * (blockDim.x * blockDim.y)) + (threadIdx.x + (threadIdx.y * blockDim.x));
    // Block-local flattened index: selects this thread's shared-memory slot.
    // (The original indexed shared memory with globalIdx, which is out of
    // bounds for any block other than block 0.)
    const int tid = threadIdx.x + (threadIdx.y * blockDim.x);
    const int blockThreads = blockDim.x * blockDim.y;

    curandState s;
    curand_init(rands[globalIdx], 0, 0, &s);

    // Visibility between threads is provided by __syncthreads(), so the
    // buffers do not need to be volatile.
    __shared__ float buffer[reps];
    __shared__ float buffer50[reps];

    // Sites 1..24 are simulated; slots 0 and 25 stay zero and serve as the
    // right-hand neighbour of site 24 (slot 25) / padding.
    int RU[26] = {0};

    // Randomly set the tens digit of about half of the sites.
    for (int e = 1; e < 25; e++)
    {
        const float r = curand_uniform(&s);
        if (r < 0.5f)
        {
            RU[e] += 10;
        }
    }

    for (int i = 0; i < iterations; i++)
    {
        // State of the left neighbour as it was BEFORE this step's update.
        int leftsnap = 0;
        int leftsnap50 = 0;

        for (int j = 1; j < 25; j++)
        {
            const float r = curand_uniform(&s);

            // ---- hundreds-part state ------------------------------------
            const int stateBefore = RU[j] / 100;
            int index = ((((leftsnap * 6 + RU[j + 1] / 100) * 6 + stateBefore) * 2) * 6);
            // int += float: the sum is formed in float and truncated back to int.
            RU[j] += 100 * transitionDelta(&transMatrix[index], r);
            leftsnap = stateBefore;

            // ---- ones-digit state (uses the tens digit as a selector) ----
            const int stateBefore50 = RU[j] % 10;
            index = ((((leftsnap50 * 6 + (RU[j + 1] % 10)) * 6 + stateBefore50) * 2 + (RU[j] % 100) / 10) * 6);
            RU[j] += transitionDelta(&transMatrix[index], r);
            leftsnap50 = stateBefore50;
        }

        // Count this thread's sites that are in state 4 or 5.
        int count = 0;
        int count50 = 0;
        for (int z = 1; z < 25; z++)
        {
            const int state = RU[z] / 100;
            const int state50 = RU[z] % 10;
            count += (state == 4) + (state == 5);
            count50 += (state50 == 4) + (state50 == 5);
        }

        buffer[tid] = count;
        buffer50[tid] = count50;
        __syncthreads();

        // Block-wide tree reduction into slot 0. The barrier after every step
        // is required: partner slots are produced by threads of other warps.
        // It sits outside the `if` so that every thread reaches it.
        for (int stride = 1; stride < blockThreads; stride <<= 1)
        {
            if ((tid % (2 * stride)) == 0 && (tid + stride) < blockThreads)
            {
                buffer[tid] += buffer[tid + stride];
                buffer50[tid] += buffer50[tid + stride];
            }
            __syncthreads();
        }

        // Only thread 0 reads slot 0 here and only thread 0 rewrites it in the
        // next iteration, so no further barrier is needed before looping.
        if (tid == 0)
        {
            // 24576 = 24 sites * 1024 threads.
            masterForces[i] = buffer[0] / 24576.0f;
            masterForces50[i] = buffer50[0] / 24576.0f;
        }
    }
}

Как я могу заставить это ядро использовать 64 регистра?
Определённо, есть возможность задействовать больше регистров, так как похожие ядра, которые я написал, без проблем получают до 116 регистров с помощью квалификатора __launch_bounds__.

Спасибо

Вот основная функция, если вы хотите запустить ее самостоятельно:

    // Aborts the program with a diagnostic if a CUDA runtime call failed.
    // `what` names the call for the error message.
    static void checkCuda(cudaError_t status, const char *what)
    {
        if (status != cudaSuccess)
        {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    }

    // Host driver: uploads the transition table and one random seed per thread,
    // runs bufferleech as a single block of `reps` threads on stream s6, and
    // copies both result arrays back to the host. The results are not used
    // further here.
    int main()
    {
        srand((unsigned)time(NULL));
        checkCuda(cudaSetDevice(0), "cudaSetDevice");

        cudaStream_t s6;
        checkCuda(cudaStreamCreate(&s6), "cudaStreamCreate");

        // Transition table, transMatrixSize floats, read by the kernel as six-float records.
        float tm[transMatrixSize] = {0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000,
        0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701,
        2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000,
        0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205,
        2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0,
        0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315,
        1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000,
        0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649,
        -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000,
        0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571,
        1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0,
        0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0,
        0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1,
        0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0,
        0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1,
        0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0,
        0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0,
        0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0,
        0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0,
        0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513,
        -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000,
        0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043,
        2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1,
        0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0,
        0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000,
        0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660,
        -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000,
        0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088,
        -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0,
        0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0,
        0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000,
        0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861,
        -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000,
        0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0};

        // Host result buffers, one float per time step.
        float *h_F6 = new float[(iterations)];
        float *h_F50_6 = new float[(iterations)];

        // One RNG seed per thread.
        float h_S6[reps];
        for (int i = 0; i < reps; i++)
        {
            h_S6[i] = float(rand());
        }

        // Device buffers.
        float *d_F6 = NULL;
        float *d_F50_6 = NULL;
        float *d_S6 = NULL;
        float *d_TM6 = NULL;
        checkCuda(cudaMalloc((void**)&d_F6, iterations * sizeof(float)), "cudaMalloc d_F6");
        checkCuda(cudaMalloc((void**)&d_F50_6, iterations * sizeof(float)), "cudaMalloc d_F50_6");
        checkCuda(cudaMalloc((void**)&d_S6, reps * sizeof(float)), "cudaMalloc d_S6");
        checkCuda(cudaMalloc((void**)&d_TM6, transMatrixSize * sizeof(float)), "cudaMalloc d_TM6");

        // Upload the seeds and the transition table.
        checkCuda(cudaMemcpyAsync(d_S6, h_S6, reps * sizeof(float), cudaMemcpyHostToDevice, s6), "copy seeds to device");
        checkCuda(cudaMemcpyAsync(d_TM6, tm, transMatrixSize * sizeof(float), cudaMemcpyHostToDevice, s6), "copy transition table to device");

        // Single block of `reps` threads, as the kernel's reduction expects.
        bufferleech<<<1, reps, 0, s6>>>(d_F6, d_F50_6, d_TM6, d_S6, reps);
        // A kernel launch returns no status; launch errors surface here.
        checkCuda(cudaGetLastError(), "bufferleech launch");

        checkCuda(cudaMemcpyAsync(h_F6, d_F6, iterations * sizeof(float), cudaMemcpyDeviceToHost, s6), "copy d_F6 to host");
        checkCuda(cudaMemcpyAsync(h_F50_6, d_F50_6, iterations * sizeof(float), cudaMemcpyDeviceToHost, s6), "copy d_F50_6 to host");

        // Wait for all work queued on the stream (and report any kernel fault)
        // before the host buffers are released.
        checkCuda(cudaStreamSynchronize(s6), "cudaStreamSynchronize");

        checkCuda(cudaFree(d_F6), "cudaFree d_F6");
        checkCuda(cudaFree(d_F50_6), "cudaFree d_F50_6");
        checkCuda(cudaFree(d_S6), "cudaFree d_S6");
        checkCuda(cudaFree(d_TM6), "cudaFree d_TM6");
        checkCuda(cudaStreamDestroy(s6), "cudaStreamDestroy");

        // Allocated with new[], so they must be released with delete[].
        delete[] h_F6;
        delete[] h_F50_6;

        checkCuda(cudaDeviceReset(), "cudaDeviceReset");
        return 0;
    }

0

Решение

Квалификатор __launch_bounds__ (косвенно) задаёт верхний предел числа регистров на поток. Он не заставляет компилятор использовать определённое количество регистров на поток.

Если компилятору нужно только 46 регистров на поток, __launch_bounds__ не заставит его использовать больше.

Тот факт, что вы написали «похожие» ядра, которые используют больше регистров, ни о чём не говорит. Безобидные на вид изменения в коде ядра могут привести к существенно другому использованию регистров.

Если вы хотите, чтобы ваша программа работала быстрее, используйте больше одного блока потоков. Это значительно более важная задача оптимизации, чем стремление увеличить использование регистров каждым потоком.

Существуют бинарные утилиты CUDA (например, cuobjdump и nvdisasm), которые позволяют изучить сгенерированный машинный код, если хотите. Однако для понимания использования регистров на поток это мало что даст, кроме подтверждения самого числа.

1

Другие решения


По вопросам рекламы [email protected]