Я работаю над SIMD оптимизацией преобразования BGR в оттенки серого, что эквивалентно OpenCV-х cvtColor()
функция. Существует версия этой функции для Intel SSE, и я имею в виду ее. (То, что я делаю, это в основном преобразование кода SSE в код NEON.)
Я почти закончил писать код и могу скомпилировать его с помощью g ++, но не могу получить правильный вывод. У кого-нибудь есть идеи, в чем может быть ошибка?
Что я получаю (неверно):
Что я должен получить:
Вот мой код:
#include <opencv/cv.hpp>
#include <opencv/highgui.h>
#include <arm_neon.h>
//#include <iostream>
using namespace std;
//using namespace cv;
#define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
void cvtBGR2GrayNEON(cv::Mat& src, cv::Mat& dest)
{
const int size = src.size().area()*src.channels();
uchar* s = src.ptr<uchar>(0);
uchar* d = dest.ptr<uchar>(0);
const int8x16_t mask1 = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
const int8x16_t smask1 = {6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15};
const int8x16_t ssmask1 = {11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10};
const int8x16_t mask2 = {0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13};
const int8x16_t ssmask2 = {0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10};
const int8x16_t bmask1 = {255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0};
const int8x16_t bmask2 = {255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0};
const int8x16_t bmask3 = {255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0};
const int8x16_t bmask4 = {255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0};
const int shift = 8;
const int amp = 1<<shift;
const int16_t _R_ = (int16_t)(amp*0.299);
const int16_t _G_ = (int16_t)(amp*0.587);
const int16_t _B_ = (int16_t)(amp*0.114);
const int16x8_t R = vdupq_n_s16(_R_);
const int16x8_t G = vdupq_n_s16(_G_);
const int16x8_t B = vdupq_n_s16(_B_);
const int8x16_t zero = vdupq_n_s8(0);
for(int i = 0; i < size; i += 48)
{
int8x16_t a = vld1q_s8((int8_t *) s + i);
int8x16_t b = vld1q_s8((int8_t *) s + i + 16);
int8x16_t c = vld1q_s8((int8_t *) s + i + 32);
a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a),vget_low_s8(mask1)),vtbl2_s8(int8x16_to_8x8x2(a),vget_high_s8(mask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(mask2)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(mask2)));
//BBBBBB
const int8x16_t aaaa = vbslq_s8(c, vbslq_s8(b, a, bmask1), bmask2);
a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(smask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(smask1)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(smask1)));
//GGGGGG
const int8x16_t bbbb = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask2);
a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(ssmask1)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(ssmask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(ssmask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(ssmask2)));
//RRRRRR
const int8x16_t cccc = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask4);
/*
int8x8x2_t a1 = vzip_s8(vget_high_s8(aaaa), vget_high_s8(zero));
int8x8x2_t a2 = vzip_s8(vget_low_s8(aaaa), vget_low_s8(zero));
*/
int8x16_t a1 = aaaa;
int8x16_t a2 = zero;
int8x16x2_t temp1 = vzipq_s8(a1, a2);
a1 = temp1.val[0];
a2 = temp1.val[1];
int16x8_t aa1 = vmulq_s16((int16x8_t)a2, B);
int16x8_t aa2 = vmulq_s16((int16x8_t)a1, B);
int8x16_t b1 = bbbb;
int8x16_t b2 = zero;
int8x16x2_t temp2 = vzipq_s8(b1, b2);
b1 = temp2.val[0];
b2 = temp2.val[1];
int16x8_t bb1 = vmulq_s16((int16x8_t)b2, G);
int16x8_t bb2 = vmulq_s16((int16x8_t)b1, G);
int8x16_t c1 = cccc;
int8x16_t c2 = zero;
int8x16x2_t temp3 = vzipq_s8(c1, c2);
c1 = temp3.val[0];
c2 = temp3.val[1];
int16x8_t cc1 = vmulq_s16((int16x8_t)c2, R);
int16x8_t cc2 = vmulq_s16((int16x8_t)c1, R);
aa1 = vaddq_s16(aa1, bb1);
aa1 = vaddq_s16(aa1, cc1);
aa2 = vaddq_s16(aa2, bb2);
aa2 = vaddq_s16(aa2, cc2);
const int shift1 = 8;
aa1 = vshrq_n_s16(aa1, shift1);
aa2 = vshrq_n_s16(aa2, shift1);
uint8x8_t aaa1 = vqmovun_s16(aa1);
uint8x8_t aaa2 = vqmovun_s16(aa2);
uint8x16_t result = vcombine_u8(aaa1, aaa2);
vst1q_u8((uint8_t *)(d), result);
d+=16;
}
}
int main()
{
cv::Mat src = cv::imread("Lenna.bmp");
cv::Mat dest(src.rows, src.cols, CV_8UC1);
cvtBGR2GrayNEON(src, dest);
cv::imwrite("grey.jpg", dest);
return 0;
}
Вот эквивалентный код SSE (от Вот):
void cvtBGR2GraySSEShort(Mat& src, Mat& dest)
{
const int size = src.size().area()*src.channels();
uchar* s = src.ptr<uchar>(0);
uchar* d = dest.ptr<uchar>(0);
//data structure
//BGR BGR BGR BGR BGR B
//GR BGR BGR BGR BGR BG
//R BGR BGR BGR BGR BGR
//shuffle to BBBBBBGGGGGRRRRR
const __m128i mask1 = _mm_setr_epi8(0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14);
const __m128i smask1 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);
const __m128i ssmask1 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);
//shuffle to GGGGGGBBBBBRRRRR
const __m128i mask2 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);
//const __m128i smask2 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);same as smask1
const __m128i ssmask2 = _mm_setr_epi8(0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10);
//shuffle to RRRRRRGGGGGBBBBB
//__m128i mask3 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);//same as mask2
//const __m128i smask3 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,6,7,8,9,10);//same as smask1
//const __m128i ssmask3 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);//same as ssmask1
//blend mask
const __m128i bmask1 = _mm_setr_epi8
(255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0);
const __m128i bmask2 = _mm_setr_epi8
(255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0);
const __m128i bmask3 = _mm_setr_epi8
(255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0);
const __m128i bmask4 = _mm_setr_epi8
(255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0);
const int shift = 8;
const int amp = 1<<shift;
const int _R_=(int)(amp*0.299);
const int _G_=(int)(amp*0.587);
const int _B_=(int)(amp*0.114);
const __m128i R = _mm_set1_epi16(_R_);
const __m128i G = _mm_set1_epi16(_G_);
const __m128i B = _mm_set1_epi16(_B_);
const __m128i zero = _mm_setzero_si128();
for(int i=0;i<size;i+=48)
{
__m128i a = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i)),mask1);
__m128i b = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+16)),mask2);
__m128i c = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+32)),mask2);
const __m128i aaaa = _mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask1),bmask2);
a = _mm_shuffle_epi8(a,smask1);
b = _mm_shuffle_epi8(b,smask1);
c = _mm_shuffle_epi8(c,smask1);
const __m128i bbbb =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask2);
a = _mm_shuffle_epi8(a,ssmask1);
c = _mm_shuffle_epi8(c,ssmask1);
b = _mm_shuffle_epi8(b,ssmask2);
const __m128i cccc =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask4);
__m128i a1 = _mm_unpackhi_epi8(aaaa,zero);
__m128i a2 = _mm_unpacklo_epi8(aaaa,zero);
a1 = _mm_mullo_epi16(a1,B);
a2 = _mm_mullo_epi16(a2,B);
__m128i b1 = _mm_unpackhi_epi8(bbbb,zero);
__m128i b2 = _mm_unpacklo_epi8(bbbb,zero);
b1 = _mm_mullo_epi16(b1,G);
b2 = _mm_mullo_epi16(b2,G);
__m128i c1 = _mm_unpackhi_epi8(cccc,zero);
__m128i c2 = _mm_unpacklo_epi8(cccc,zero);
c1 = _mm_mullo_epi16(c1,R);
c2 = _mm_mullo_epi16(c2,R);
a1 = _mm_add_epi16(a1,b1);
a1 = _mm_add_epi16(a1,c1);
a2 = _mm_add_epi16(a2,b2);
a2 = _mm_add_epi16(a2,c2);
a1 = _mm_srli_epi16(a1,8);
a2 = _mm_srli_epi16(a2,8);
a = _mm_packus_epi16(a1,a2);
_mm_stream_si128((__m128i*)(d),a);
d+=16;
}
}
Хорошо, ниже приведена полностью оптимизированная версия этой функции, которую я только что написал (помните, что эта функция просто возвращает значение, если размер меньше 32).
/*
* Created on: 2014. 7. 27.
* Author: Jake Lee
* Project FANIC - Fastest ARM NEON Implementaion Challenge
*/
// void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
// Y = 0.114*B + 0.587*G + 0.299*R
.text
.arm
.global fanicCvtBGR2GrayNEON
pDst .req r0
pSrc .req r1
size .req r2
.align 5
.func
fanicCvtBGR2GrayNEON:
pld [pSrc]
subs size, size, #32
pld [pSrc, #64]
bxmi lr
pld [pSrc, #64*2]
vmov.i8 d0, #29
vmov.i8 d1, #150
vmov.i8 d2, #77
.align 5
1:
vld3.8 {d20, d21, d22}, [pSrc]!
vld3.8 {d23, d24, d25}, [pSrc]!
vld3.8 {d26, d27, d28}, [pSrc]!
vld3.8 {d29, d30, d31}, [pSrc]!
vmull.u8 q8, d20, d0
vmlal.u8 q8, d21, d1
vmlal.u8 q8, d22, d2
vmull.u8 q9, d23, d0
vmlal.u8 q9, d24, d1
vmlal.u8 q9, d25, d2
vmull.u8 q10, d26, d0
vmlal.u8 q10, d27, d1
vmlal.u8 q10, d28, d2
vmull.u8 q11, d29, d0
vmlal.u8 q11, d30, d1
vmlal.u8 q11, d31, d2
vrshrn.u16 d24, q8, #8
vrshrn.u16 d25, q9, #8
vrshrn.u16 d26, q10, #8
vrshrn.u16 d27, q11, #8
subs size, size, #32
pld [pSrc, #64*3]
pld [pSrc, #64*4]
vst1.8 {q12, q13}, [pDst]!
bpl 1b
cmp size, #-32
add pSrc, pSrc, size
bxle lr
add pSrc, pSrc, size, lsl #1
add pDst, pDst, size
b 1b
.endfunc
.end
Как видите, написание ассемблерных кодов NEON намного проще и короче, чем встроенных, несмотря на интенсивное развертывание.
Повеселись.