Ошибка сегментации с использованием следующего кода на нескольких графических процессорах Nvidia

Question

Ошибка сегментации с использованием следующего кода на нескольких графических процессорах Nvidia

Следующий код надежно приводит к ошибке сегментации через некоторое время, когда настроен для работы более чем на одной карте (у меня система с 4 GPU (NVIDIA Titan’s)). Это минимальный пример проблемы, которую я пытался решить ранее (см. Здесь: Ошибка сегментации в __pthread_getspecific, вызванная из libcuda.so.1).

Описание того, что делает код:

Сначала он выделяет огромное количество памяти на каждой карте (на первых N картах, где N — 1-4), идея состоит в том, чтобы максимально увеличить нагрузку на карту. Затем он запускает 16 потоков, каждый из которых запускает пару ядер на одной случайной карте. Он повторяет это пару раз, каждый раз выбирая карту случайным образом, а затем выходит. Главный поток ожидает завершения всех 16 потоков, а затем повторяет процесс бесконечно или до тех пор, пока по какой-либо причине не произойдет сбой одного из потоков.

После пары (обычно около 20, но может быть больше или меньше) итераций главного потока это приводит к ошибке сегментации. Стек обычно выглядит так:

#0  0x00007f164a71f43c in ?? () from /usr/lib/libcuda.so
#1  0x00007f164a6bd1b5 in ?? () from /usr/lib/libcuda.so
#2  0x00007f164a5dff1a in ?? () from /usr/lib/libcuda.so
#3  0x00007f164a6c0b34 in ?? () from /usr/lib/libcuda.so
#4  0x00007f164a6c0c92 in ?? () from /usr/lib/libcuda.so
#5  0x00007f164a5e009f in ?? () from /usr/lib/libcuda.so
#6  0x00007f164a5d03c0 in ?? () from /usr/lib/libcuda.so
#7  0x00007f164a5c43bf in ?? () from /usr/lib/libcuda.so
#8  0x00007f164c131c39 in ?? () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#9  0x00007f164c152879 in cudaDeviceSynchronize () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#10 0x0000000000401911 in TestCUDA(int, unsigned int, unsigned int, unsigned int const*, unsigned short*) ()
#11 0x00000000004012b0 in main (argc=0, argv=0x100000200) at main.cpp:208

Полный исходный код:

main.cpp:

#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <pthread.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include <cuda_runtime.h>

class CriticalSection
{
pthread_mutex_t cs;

public:

CriticalSection();
~CriticalSection();

void Lock( void );
void Unlock( void );
};

CriticalSection::CriticalSection()
{
assert( pthread_mutex_init( &cs, NULL ) == 0 );
}

CriticalSection::~CriticalSection()
{
assert( pthread_mutex_destroy( &cs ) == 0 );
}

void CriticalSection::Lock( void )
{
assert( pthread_mutex_lock( &cs ) == 0 );
}

void CriticalSection::Unlock( void )
{
assert( pthread_mutex_unlock( &cs ) == 0 );
}

class DeviceWrapper
{
protected:

CriticalSection m_cs;

public:

int32_t     m_i32DeviceId;

uint32_t*   m_pdu32Data;
uint16_t*   m_pdu16Res;
uint32_t    m_u32Count;

DeviceWrapper();
~DeviceWrapper();

void Lock( void );
void Unlock( void );

bool Init( const int32_t i32DevId, const uint32_t u32Count );
bool Free();
};

DeviceWrapper::DeviceWrapper()
{
m_i32DeviceId = 0;
m_pdu32Data = NULL;
m_pdu16Res = NULL;
m_u32Count = 0;
}

DeviceWrapper::~DeviceWrapper()
{

}

void DeviceWrapper::Lock( void )
{
m_cs.Lock();
}

void DeviceWrapper::Unlock( void )
{
m_cs.Unlock();
}

bool DeviceWrapper::Init( const int32_t i32DevId, const uint32_t u32Count )
{
if ( cudaSetDevice( i32DevId ) != cudaSuccess )
{
printf( "DeviceWrapper::Init: Failed to set device %d\n", i32DevId );
return false;
}

if ( cudaMalloc( &m_pdu32Data, sizeof( uint32_t ) * u32Count ) != cudaSuccess )
{
printf( "DeviceWrapper::Init: Failed to allocate %u unsigned int's on device %d\n", u32Count, i32DevId );
return false;
}

if ( cudaMalloc( &m_pdu16Res, sizeof( uint16_t ) * u32Count ) != cudaSuccess )
{
printf( "DeviceWrapper::Init: Failed to allocate %u unsigned short's on device %d\n", u32Count, i32DevId );
return false;
}

m_u32Count = u32Count;
m_i32DeviceId = i32DevId;
return true;
}

bool DeviceWrapper::Free()
{
if ( cudaSetDevice( m_i32DeviceId ) != cudaSuccess )
{
printf( "DeviceWrapper::Free: Failed to set device %d\n", m_i32DeviceId );
return false;
}

if ( cudaFree( m_pdu32Data ) != cudaSuccess )
{
printf( "DeviceWrapper::Free: Failed to free pdu32Mem on device %d\n", m_i32DeviceId );
return false;
}

if ( cudaFree( m_pdu16Res ) != cudaSuccess )
{
printf( "DeviceWrapper::Free: Failed to free pdu16Mem on device %d\n", m_i32DeviceId );
return false;
}

m_pdu32Data = NULL;
m_pdu16Res = NULL;
m_u32Count = 0;
m_i32DeviceId = 0;
return true;
}

bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res );

void* DoWork( void* pArg );

static bool bRun = true;

static DeviceWrapper devices[4];

int main( int argc, char* argv[] )
{
if ( argc != 2 )
{
printf( "Usage: %s <number of cards to use>\n", argv[0] );
return 1;
}

uint32_t u32CardsToUse = strtoul( argv[1], NULL, 0 );

if ( !u32CardsToUse || u32CardsToUse > 4 )
{
printf( "Invalid argument, must be in range 1-4\n" );
return 2;
}

for ( int32_t i = 0; i < u32CardsToUse; i++ )
{
if ( !devices[i].Init( i, 0x20000000 ) )
{
for ( uint32_t j = 0; j < i; j++ )
{
devices[j].Free();
}

printf( "Failed to init device %d\n", i );
return 3;
}
}

uint32_t u32IterationsCompleted = 0;

while ( bRun )
{
pthread_t pWorkers[ 16 ];
memset( pWorkers, 0, 16 * sizeof( pthread_t ) );

for ( uint32_t i = 0; i < 16; i++ )
{
int iReturnValue = pthread_create( &pWorkers[i], NULL, &DoWork, (void*)u32CardsToUse );

if ( iReturnValue != 0 )
{
printf( "Error calling pthread_create: %d\n", iReturnValue );
return 4;
}
}

for ( uint32_t i = 0; i < 16; i++ )
{
pthread_join( pWorkers[i], NULL );
}

printf( "Iterations completed: %u\n", ++u32IterationsCompleted );
}

printf( "Finished\n" );
fflush( stdout );
return 0;
}

void* DoWork( void* pArg )
{
uint32_t u32CardsToUse = uint32_t( pArg );

uint32_t u32TestCount = (rand() % 4) + 4;

for ( uint32_t i = 0; i < u32TestCount; i++ )
{
int32_t i32DeviceId = int32_t( rand() % u32CardsToUse );

devices[ i32DeviceId ].Lock();

if ( !TestCUDA( i32DeviceId, 1, devices[i32DeviceId].m_u32Count, devices[i32DeviceId].m_pdu32Data, devices[i32DeviceId].m_pdu16Res ) )
{
printf( "DoWork: Failure in executing TestCUDA for device %d (test number %u)\n", i32DeviceId, i );
bRun = false;
devices[ i32DeviceId ].Unlock();
return NULL;
}

devices[ i32DeviceId ].Unlock();
}

return NULL;
}

cuda_test.cu:

#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void HammingU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

while ( gidx < u32Count )
{
pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx );
gidx += blockDim.x * gridDim.x;
}
}

__global__ void EqualU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

while ( gidx < u32Count )
{
if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ]++;

gidx += blockDim.x * gridDim.x;
}
}

__global__ void EqualByteU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

while ( gidx < u32Count )
{
if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 4;

gidx += blockDim.x * gridDim.x;
}
}

__global__ void EqualBitU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

while ( gidx < u32Count )
{
if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;

gidx += blockDim.x * gridDim.x;
}
}

__global__ void OrderU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

while ( gidx < u32Count )
{
uint32_t u32File = pu32Data[gidx];                  // 32-bit value to find the log2 of
uint32_t u32FileLog = 0;                            // result of log2 will go here
uint32_t u32Shift = 0;

u32FileLog = (u32File > 0xFFFF) << 4;
u32File >>= u32FileLog;
u32Shift = (u32File > 0xFF) << 3;
u32File >>= u32Shift;
u32FileLog |= u32Shift;
u32Shift = (u32File > 0xF) << 2;
u32File >>= u32Shift;
u32FileLog |= u32Shift;
u32Shift = (u32File > 0x3) << 1;
u32File >>= u32Shift;
u32FileLog |= u32Shift;
u32FileLog |= (u32File >> 1);

uint32_t u32Other = gidx;           // 32-bit value to find the log2 of
uint32_t u32OtherLog = 0;                           // result of log2 will go here
u32Shift = 0;

u32OtherLog = (u32Other > 0xFFFF) << 4;
u32Other >>= u32OtherLog;
u32Shift = (u32Other > 0xFF) << 3;
u32Other >>= u32Shift;
u32OtherLog |= u32Shift;
u32Shift = (u32Other > 0xF) << 2;
u32Other >>= u32Shift;
u32OtherLog |= u32Shift;
u32Shift = (u32Other > 0x3) << 1;
u32Other >>= u32Shift;
u32OtherLog |= u32Shift;
u32OtherLog |= (u32Other >> 1);

if ( u32FileLog >= u32OtherLog )
{
pu16Results[ gidx ] += uint16_t( u32FileLog - u32OtherLog );
}
else
{
pu16Results[ gidx ] += uint16_t( u32OtherLog - u32FileLog );
}

gidx += blockDim.x * gridDim.x;
}
}

__global__ void LogU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

while ( gidx < u32Count )
{
uint32_t u32Value = 0;

if ( pu32Data[gidx] >= gidx )
{
u32Value = pu32Data[gidx] - gidx;
}
else
{
u32Value = gidx - pu32Data[gidx];
}

uint32_t u32Log = 0;        // result of log2 will go here
uint32_t u32Shift = 0;

u32Log = (u32Value > 0xFFFF) << 4;
u32Value >>= u32Log;
u32Shift = (u32Value > 0xFF) << 3;
u32Value >>= u32Shift;
u32Log |= u32Shift;
u32Shift = (u32Value > 0xF) << 2;
u32Value >>= u32Shift;
u32Log |= u32Shift;
u32Shift = (u32Value > 0x3) << 1;
u32Value >>= u32Shift;
u32Log |= u32Shift;
u32Log |= (u32Value >> 1);

pu16Results[ gidx ] += (uint16_t)u32Log;

gidx += blockDim.x * gridDim.x;
}
}

__global__ void EqualRetU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

while ( gidx < u32Count )
{
if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;

gidx += blockDim.x * gridDim.x;
}
}

__global__ void HammingMulU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

while ( gidx < u32Count )
{
pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx ) << 5;
gidx += blockDim.x * gridDim.x;
}
}

bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res )
{
for ( uint32_t i = 0; i < u32Iterations; i++ )
{
if ( cudaSetDevice( i32DeviceId ) != cudaSuccess )
{
return false;
}

if ( cudaMemset( pdu16Res, 0, u32Count * sizeof( uint16_t ) ) != cudaSuccess )
{
return false;
}

for ( uint32_t j = 0; j < 3; j++ )
{
HammingU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
EqualU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
EqualByteU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
EqualBitU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
OrderU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
LogU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
EqualRetU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
HammingMulU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
}

cudaDeviceSynchronize();
}

return true;
}

Makefile:

IDIR_CUDA = -I. -I/usr/local/cuda-5.5/include
CC_CUDA = g++
CFLAGS_CUDA = -g $(IDIR_CUDA)
LIBS_CUDA = -lz -lpthread -lrt -ldl -L/usr/local/cuda-5.5/lib64 -lcudart -lcuda

all:
nvcc -I/usr/include -arch=compute_35 -code=sm_35 --machine 64 --compile cuda_test.cu
$(CC_CUDA) -fpermissive *.cpp *.o -o test_cuda.out $(CFLAGS_CUDA) $(LIBS_CUDA)
@echo DONE TEST_CUDA BUILD

clean:
rm -f *.o test_cuda.out

Я использую CUDA 5.5 с последними драйверами (319.32), работающими под Ubuntu Linux (64bit).

Мои вопросы:
Есть ли какая-то ошибка в коде, которая может вызвать такое поведение?
Почему я не вижу сбоя при использовании только одной карты? Просто крушение гораздо менее вероятно? (кажется, что при использовании большего количества карт сбой появляется раньше, чем при меньшем количестве карт)
Бонус: кто-нибудь еще видит сбой, используя этот код?

1

c++cuda multi-gpu segmentation-fault

Решение

Другие решения

Других решений пока нет …

Источник

Accepted Answer

Я смог найти Обходной путь, который решает аварии:

Идея в том, что там точно один поток на каждый графический процессор используемые и другие рабочие потоки отправляют работу этим потокам для запуска на графических процессорах. Мне удалось без проблем запустить тестовое приложение на компьютере в течение дня (выполнено более 6 тысяч итераций).

0