Что я сделал не так при преобразовании моих MMX-интринсиков в x64 (SSE)?

Question

Что я сделал не так при преобразовании моих MMX-интринсиков в x64 (SSE)?

Я понимаю, что при переносе 32-битного кода с MMX-интринсиками на x64 тип __m64 больше не поддерживается. Поэтому у меня возникли большие проблемы при обновлении этого кода до SSE. В другом вопросе на Stack Overflow мне посоветовали опубликовать мой код. Возможно, это упражнение поможет и другим.

Я закомментировал вызовы «_mm_empty», полагая, что так и нужно. Для всех остальных операций я нашёл в emmintrin.h аналогичные функции для __m128i, но что-то всё ещё работает неправильно.

Оригинальный 32-битный код функции:

/// Horizontal convolution pass over the "inside" (non-edge) columns of the
/// rows [m_nRowStart, m_nRowEnd) -- 32-bit MMX implementation.
///
/// For every inside column the full kernel is multiplied element-wise with the
/// input pixels starting at column (i - tailLength), the products are summed,
/// and sum / norm_r[kernelLen - 1] is written to interArray at the same
/// row/column. Columns outside [knLeftEdgeMax, knRightEdgeStart) are not
/// written by this function.
///
/// The kernel is consumed four 16-bit taps at a time with _m_pmaddwd; the
/// remaining (kernelLen % 4) taps are handled by a scalar loop.
///
/// NOTE(review): _m_pmaddwd multiplies signed 16-bit lanes, so the vector path
/// presumably relies on kernel and pixel values staying below 0x8000 -- confirm.
///
/// @return always 0.
DWORD CSumInsideHorizontalTask::InternalDoWork()
{
    ////////////////////////////////////////////////////////////
    // get local vars representing parameters from original call
    ushort* arrayIn    = m_taskdata.arrayIn;
    ushort arrayLen1   = m_taskdata.arrayLen1;
    ushort* kernel     = m_taskdata.kernel;
    ushort kernelLen   = m_taskdata.kernelLen;
    uint32_t* norm_r   = m_taskdata.norm_r;
    ushort* interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////////////////

    _ASSERTE(interArray);

    // number of kernel taps to the left of the output column
    const ushort tailLength = (ushort)((kernelLen - 1) / 2);

    // Row cursors are kept as integers so that every offset below is in bytes.
    const INT_PTR rowstride = sizeof(ushort)*arrayLen1;
    INT_PTR lpRow = (INT_PTR)arrayIn;
    INT_PTR lpInterRow = (INT_PTR)interArray;

    // adjust for non-zero start
    lpRow += m_nRowStart*rowstride;
    lpInterRow += m_nRowStart*rowstride;

    // only the columns between the left and right edge regions are processed here
    const int knLeftEdgeMax = kernelLen - 1 - tailLength;
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
    const INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax; // byte offset of the first inside column

    const uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel

    // split of the kernel into 4-tap vector groups and a scalar remainder
    const int fourcount = kernelLen/4;
    const int remainingcount = kernelLen%4;

    _mm_empty();
    const __m64 shifter = _m_from_int(32);

    for (int h = m_nRowStart; h < m_nRowEnd; h++) // for each row
    {
        // skip over left edge
        INT_PTR lpInterRowInside = lpInterRow + cbLeftEdgeStride;

        for (int i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each column inside the edges
        {
            INT_PTR lpKernel = (INT_PTR)kernel;
            INT_PTR lpInnerPixels = lpRow + ((i - tailLength)<<1); // first input pixel under the kernel

            __m64 accu = _mm_setzero_si64(); // zero the accumulator

            // VECTOR: four taps per iteration
            for (int loopcount = fourcount; loopcount != 0; loopcount--)
            {
                // _m_pmaddwd: 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
                const __m64 temp = _m_pmaddwd(*(__m64*)lpKernel, *(__m64*)lpInnerPixels);

                accu = _mm_add_pi32(accu, temp); // each double word has a partial sum

                lpKernel += 8; lpInnerPixels += 8;
            }

            // shift the upper double word down and add it, so that the low
            // double word of accu holds the combined sum of both partial sums
            accu = _mm_add_pi32(accu, _mm_srl_si64(accu, shifter));

            uint sum = (uint)_m_to_int(accu); // move mmx result to sum

            // SCALAR: taps that did not fill a whole vector
            for (int loopcount = remainingcount; loopcount != 0; loopcount--)
            {
                // multiply as unsigned so that a large product cannot overflow a signed int
                sum += (uint)(*(ushort*)lpKernel) * (uint)(*(ushort*)lpInnerPixels);
                lpKernel += 2; lpInnerPixels += 2;
            }

            *(ushort*)lpInterRowInside = (ushort)(sum/norm);
            lpInterRowInside += 2; // move to next column, sizeof(ushort)
        } // for each column

        lpRow += rowstride; // move to next row ( h * arrayLen1 )
        lpInterRow += rowstride;
    } // for each row

    _mm_empty();

    return 0;
}

64-битная попытка:

// Horizontal convolution pass over the inside (non-edge) columns of rows
// [m_nRowStart, m_nRowEnd): first attempt at porting the MMX version to
// 128-bit SSE2 intrinsics. This listing is the non-working attempt the
// question is about; the NOTE(review) comments below mark the places that
// differ from the 32-bit original. Always returns 0.
DWORD CSumInsideHorizontalTask::InternalDoWork()
{
////////////////////////////////////////////////////////////
// get local vars representing parameters from original call
ushort* arrayIn     = m_taskdata.arrayIn;
ushort arrayLen0    = m_taskdata.arrayLen0;
ushort arrayLen1    = m_taskdata.arrayLen1;
ushort* kernel      = m_taskdata.kernel;
ushort kernelLen    = m_taskdata.kernelLen;
uint32_t* norm_r        = m_taskdata.norm_r;
ushort* outputArray = m_taskdata.outputArray;

ushort* interArray = m_taskdata.interArray;
////////////////////////////////////////////////////////////

// number of kernel taps to the left of the output column
ushort tailLength = (ushort)((kernelLen - 1) / 2);

_ASSERTE(interArray);

//ushort* pRow = NULL; // the current row
//ushort* pInterRow = NULL; // the current row in the interarray

INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmetic (byte offsets)
INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmetic (byte offsets)
INT_PTR rowstride = sizeof(ushort)*arrayLen1; // bytes per row
INT_PTR lpKernel;

// adjust for non-zero start
lpRow += m_nRowStart*rowstride;
lpInterRow += m_nRowStart*rowstride;

// want to process only those pixels between the edge regions, where no inner loop condition is needed
const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge
const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
INT_PTR lpInterRowInside; // use this to work inside the edges

int h, i;
uint sum, points;
uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel
INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop
INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax; // byte offset of the first inside column

// use this for the vector (formerly MMX) optimizations
int fourcount = kernelLen/4; // number of 4-tap groups
int remainingcount = kernelLen%4; // taps left over for the scalar loop
int mmxcount = 4*fourcount; // this is where the remainder is handled (not used below)
int loopcount = 0; // use this for fast looping tests

// _mm_empty() is left disabled; only __m128i intrinsics are used in this version
//_mm_empty();
__m128i accu, temp;
__m128i shifter = _mm_cvtsi32_si128(32); // shift count of 32 bits for the horizontal add

for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
{
// skip over left edge
lpInterRowInside = lpInterRow + cbLeftEdgeStride;

for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges
{
sum = 0;
points = 0;
lpKernel = (INT_PTR)kernel;

lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row

// vector optimizations
accu = _mm_setzero_si128(); // zero the accumulator

// VECTOR processing
for (loopcount = fourcount; loopcount != 0; loopcount--) // for each kernel item that can be processed as a vector
{
//sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));

// _m_pmaddwd: : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
// _mm_add_pi32/_m_paddd: 2*32bit add
//temp = _m_pmaddwd(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);
// NOTE(review): dereferencing __m128i* reads 16 bytes (8 ushorts), but the
// loop is counted in groups of 4 and both pointers advance by only 8 bytes,
// so each iteration also multiplies the 4 elements that follow the current
// group. A plain __m128i dereference presumably also requires 16-byte
// alignment -- confirm against the compiler documentation.
temp = _mm_madd_epi16(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);

accu = _mm_add_epi32(accu, temp); // each double word has a partial sum

lpKernel += 8; lpInnerPixels += 8;

} // loop over the kernel

// copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
// and finally store the result into the variable "accu"
// NOTE(review): in this listing the reduction statement is fused into the
// comment line and is therefore inactive as written. It also shifts LEFT
// (_mm_sll_epi64) where the 32-bit original shifts right (_mm_srl_si64):
//   accu = _mm_add_epi32(accu, _mm_sll_epi64(accu, shifter)); // combine results from upper and lower double words

sum = _mm_cvtsi128_si32(accu); // move the low 32 bits of the vector result to sum

// SCALAR
for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector
{
//sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
//points++;
lpKernel += 2; lpInnerPixels += 2;
} // loop over the kernel
//*(interArray + h * arrayLen1 + i) = (ushort)(sum / *(norm_r + points - 1));

*(ushort*)lpInterRowInside = (ushort)(sum/norm);
lpInterRowInside += 2; // move to next column sizeof(ushort)
} // for each column
// NOTE(review): the input-row advance is fused into a comment in this listing,
// so lpRow is never advanced as written:
//   lpRow += rowstride; // move to next row ( h * arrayLen1 )
lpInterRow += rowstride;} // for each row

//_mm_empty();

return 0;

}

1

64bit c++ convolution mmx visual-c++

Решение

Другие решения

Источник

Accepted Answer

Все проблемы, упомянутые выше в комментариях, исправлены.
Вот окончательный рабочий код свёртки для x64 на SSE:

/// Horizontal convolution pass over the "inside" (non-edge) columns of the
/// rows [m_nRowStart, m_nRowEnd) -- SSE2 implementation usable on x64.
///
/// For every inside column the full kernel is multiplied element-wise with the
/// input pixels starting at column (i - tailLength), the products are summed,
/// and sum / norm_r[kernelLen - 1] is written to interArray at the same
/// row/column. Columns outside [knLeftEdgeMax, knRightEdgeStart) are not
/// written by this function.
///
/// The kernel is still consumed four 16-bit taps (8 bytes) at a time, exactly
/// like the MMX version: each group is loaded into the low half of an XMM
/// register, so only the two low double words of the accumulator carry data.
/// The remaining (kernelLen % 4) taps are handled by a scalar loop.
///
/// NOTE(review): _mm_madd_epi16 multiplies signed 16-bit lanes, so the vector
/// path presumably relies on kernel and pixel values staying below 0x8000 --
/// confirm.
///
/// @return always 0.
DWORD CSumInsideHorizontalTask::InternalDoWork()
{
    ////////////////////////////////////////////////////////////
    // get local vars representing parameters from original call
    ushort* arrayIn    = m_taskdata.arrayIn;
    ushort arrayLen1   = m_taskdata.arrayLen1;
    ushort* kernel     = m_taskdata.kernel;
    ushort kernelLen   = m_taskdata.kernelLen;
    uint32_t* norm_r   = m_taskdata.norm_r;
    ushort* interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////////////////

    _ASSERTE(interArray);

    // number of kernel taps to the left of the output column
    const ushort tailLength = (ushort)((kernelLen - 1) / 2);

    // Row cursors are kept as integers so that every offset below is in bytes.
    const INT_PTR rowstride = sizeof(ushort)*arrayLen1;
    INT_PTR lpRow = (INT_PTR)arrayIn;
    INT_PTR lpInterRow = (INT_PTR)interArray;

    // adjust for non-zero start
    lpRow += m_nRowStart*rowstride;
    lpInterRow += m_nRowStart*rowstride;

    // only the columns between the left and right edge regions are processed here
    const int knLeftEdgeMax = kernelLen - 1 - tailLength;
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
    const INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax; // byte offset of the first inside column

    const uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel

    // split of the kernel into 4-tap vector groups and a scalar remainder
    const int fourcount = kernelLen/4;
    const int remainingcount = kernelLen%4;

    // No _mm_empty() is needed: no MMX (__m64) registers are used here.
    const __m128i shifter = _mm_cvtsi32_si128(32);

    for (int h = m_nRowStart; h < m_nRowEnd; h++) // for each row
    {
        // skip over left edge
        INT_PTR lpInterRowInside = lpInterRow + cbLeftEdgeStride;

        for (int i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each column inside the edges
        {
            INT_PTR lpKernel = (INT_PTR)kernel;
            INT_PTR lpInnerPixels = lpRow + ((i - tailLength)<<1); // first input pixel under the kernel

            __m128i accu = _mm_setzero_si128(); // zero the accumulator

            // VECTOR: four taps per iteration
            for (int loopcount = fourcount; loopcount != 0; loopcount--)
            {
                // Load exactly 8 bytes into the low half (upper half zeroed);
                // _mm_loadl_epi64 has no alignment requirement.
                const __m128i mlpkernel = _mm_loadl_epi64((const __m128i*)lpKernel);
                const __m128i mlpInnerPixels = _mm_loadl_epi64((const __m128i*)lpInnerPixels);

                // low half: [a0*b0+a1*b1 ; a2*b2+a3*b3], upper half stays zero
                const __m128i temp = _mm_madd_epi16(mlpkernel, mlpInnerPixels);

                accu = _mm_add_epi32(accu, temp); // each double word has a partial sum

                lpKernel += 8; lpInnerPixels += 8;
            }

            // shift the second double word down and add it, so that the lowest
            // double word of accu holds the combined sum of both partial sums
            accu = _mm_add_epi32(accu, _mm_srl_epi64(accu, shifter));

            uint sum = (uint)_mm_cvtsi128_si32(accu); // move the low 32 bits to sum

            // SCALAR: taps that did not fill a whole vector
            for (int loopcount = remainingcount; loopcount != 0; loopcount--)
            {
                // multiply as unsigned so that a large product cannot overflow a signed int
                sum += (uint)(*(ushort*)lpKernel) * (uint)(*(ushort*)lpInnerPixels);
                lpKernel += 2; lpInnerPixels += 2;
            }

            *(ushort*)lpInterRowInside = (ushort)(sum/norm);
            lpInterRowInside += 2; // move to next column, sizeof(ushort)
        } // for each column

        lpRow += rowstride; // move to next row ( h * arrayLen1 )
        lpInterRow += rowstride;
    } // for each row

    return 0;
}

1