色変換のためのSSE2組み込み関数をスピードアップする
質問
YCbCrからBGRAへの画像の色変換を実行しようとしています(なぜこんな頭の痛いことをしているのかは聞かないでください)。
とにかく、これはできるだけ高速に実行する必要があるため、SSE2を利用できるようコンパイラ組み込み関数を使って書きました。SIMDの世界に足を踏み入れるのはこれが初めてで、基本的に初心者なので、改善できる点がたくさんあるはずです。
実際の色変換を行う算術演算のコードが特に遅く、IntelのVTuneでもそこが重大なボトルネックとして表示されています。
そこで、次のコードを高速化するにはどうすればよいでしょうか?現在は32ビット演算で一度に4ピクセルずつ処理しています。当初は(1つ目のループのように)8ビットのまま一度に16ピクセルずつ処理しようとしましたが、計算で整数オーバーフローが発生し、変換結果が壊れてしまいました。Intel JPEGデコードを含む処理全体で、フルHDの1フィールドあたり約14ミリ秒かかっています。これを少なくとも12ms、理想的には10msまで短縮できれば素晴らしいのですが。
助言やヒントをいただければ幸いです。よろしくお願いします!
// Pre-pass over source planes 1 and 2, 32 bytes per iteration:
//   1. copy the current contents of plane 2 (called "K" here) into m_pKBuffer,
//   2. re-bias the bytes of plane 1 by -128,
//   3. duplicate every byte and write the results back over planes 1 and 2.
// NOTE(review): the loop has no tail handling and uses aligned loads/stores, so it
// presumably relies on 16-byte-aligned buffers whose size is padded to a multiple
// of 32 bytes - confirm against the allocation code.

// Byte vector with all 16 lanes set to 128 (0x80), used for the re-bias below.
const __m128i s128_8 = _mm_set1_epi8((char)128);
const int nNumPixels = roi.width * roi.height;
for (int i=0; i<nNumPixels; i+=32)
{
// Prefetch the plane-2 data that is loaded immediately below.
// The original author observed a time saving as long as the load directly follows.
_mm_prefetch((const char*)&pSrc8u[2][i],_MM_HINT_T0);
// Plane 2 is overwritten at the end of this iteration, so its 32 K bytes
// have to be read and saved first.
__m128i sK1 = _mm_load_si128((__m128i*)&pSrc8u[2][i]);
__m128i sK2 = _mm_load_si128((__m128i*)&pSrc8u[2][i+16]);
// Park K in m_pKBuffer. The original note describes this as temporarily reusing the
// destination buffer to avoid an allocation; how m_pKBuffer relates to the
// destination is not visible here - TODO confirm.
_mm_store_si128 ((__m128i*)&m_pKBuffer[i], sK1);
_mm_store_si128 ((__m128i*)&m_pKBuffer[i+16], sK2);
// In theory a prefetch should be issued well ahead of the first read. This one is not,
// yet the original author measured a benefit. Worth investigating.
_mm_prefetch((const char*)&pSrc8u[1][i],_MM_HINT_T0);
__m128i sUVI1 = _mm_load_si128((__m128i*)&pSrc8u[1][i]);
__m128i sUVI2 = _mm_load_si128((__m128i*)&pSrc8u[1][i+16]);
// Byte-wise wrapping subtraction of 128: unsigned 0..255 becomes two's-complement
// -128..127. Doing it here handles 16 bytes per instruction, before the
// YCbCr -> BGRA arithmetic that works on only 4 pixels at a time.
sUVI1 = _mm_sub_epi8(sUVI1, s128_8);
sUVI2 = _mm_sub_epi8(sUVI2, s128_8);
// Unpacking a register with itself duplicates each byte (a0,a0,a1,a1,...):
// the low 8 bytes of each load become 16 "U" bytes, the high 8 bytes become 16 "V" bytes.
// Presumably each 16-byte group in plane 1 holds 8 U samples followed by 8 V samples
// at half horizontal resolution - verify against the decoder's output layout.
__m128i sU1 = _mm_unpacklo_epi8(sUVI1, sUVI1);
__m128i sV1 = _mm_unpackhi_epi8(sUVI1, sUVI1);
__m128i sU2 = _mm_unpacklo_epi8(sUVI2, sUVI2);
__m128i sV2 = _mm_unpackhi_epi8(sUVI2, sUVI2);
// Write back in place: plane 1 now holds the duplicated, signed U bytes and
// plane 2 the duplicated, signed V bytes (replacing the K data saved above).
_mm_store_si128((__m128i*)&pSrc8u[1][i], sU1);
_mm_store_si128((__m128i*)&pSrc8u[1][i+16], sU2);
_mm_store_si128((__m128i*)&pSrc8u[2][i], sV1);
_mm_store_si128((__m128i*)&pSrc8u[2][i+16], sV2);
}
// Colour conversion, 4 pixels per iteration, using 32-bit integer lanes.
// The multipliers are fixed-point coefficients scaled by 256; every result is
// shifted right by 8 afterwards. s16 is the offset subtracted from Y.
// NOTE(review): _mm_mullo_epi32 is an SSE4.1 intrinsic, so this loop needs more
// than plain SSE2.
const __m128i s16 = _mm_set1_epi32(16);
const __m128i s299 = _mm_set1_epi32(299);
const __m128i s410 = _mm_set1_epi32(410);
const __m128i s518 = _mm_set1_epi32(518);
const __m128i s101 = _mm_set1_epi32(101);
const __m128i s209 = _mm_set1_epi32(209);
// Output cursor; advances 16 bytes (4 pixels x 4 bytes) per iteration.
Ipp8u* pDstP = pDst8u;
for (int i=0; i<nNumPixels; i+=4, pDstP+=16)
{
// Gather four bytes per channel into 32-bit lanes. _mm_set_epi32 places its FIRST
// argument in the HIGHEST lane, so pixel i sits in lane 3 and pixel i+3 in lane 0;
// the unpack sequence further down restores memory order.
__m128i sK = _mm_set_epi32(m_pKBuffer[i], m_pKBuffer[i+1], m_pKBuffer[i+2], m_pKBuffer[i+3]);
__m128i sY = _mm_set_epi32(pSrc8u[0][i], pSrc8u[0][i+1], pSrc8u[0][i+2], pSrc8u[0][i+3]);
// The (char) cast is meant to sign-extend the chroma bytes that the pre-pass biased
// by -128. This relies on plain char being signed, which is implementation-defined.
__m128i sU = _mm_set_epi32((char)pSrc8u[1][i], (char)pSrc8u[1][i+1], (char)pSrc8u[1][i+2], (char)pSrc8u[1][i+3]);
__m128i sV = _mm_set_epi32((char)pSrc8u[2][i], (char)pSrc8u[2][i+1], (char)pSrc8u[2][i+2], (char)pSrc8u[2][i+3]);
// N.b. - subtracting 16 from Y in 8 bits (like the 128 for U and V) was tried, but doing it
// here was measured as quicker: the arithmetic saved is less than the cost of the
// additional loads/stores it would need in the swizzle loop.
// sY becomes the shared luma term 299 * (Y - 16).
sY = _mm_mullo_epi32(_mm_sub_epi32(sY, s16), s299);
// R = (luma + 410*V) >> 8, G = (luma - 101*U - 209*V) >> 8, B = (luma + 518*U) >> 8.
// _mm_srli_epi32 is a LOGICAL shift, so a negative sum turns into a large positive
// value rather than staying negative; clamping to 0..255 is left to the saturating
// packs at the end of the loop.
__m128i sR = _mm_srli_epi32(_mm_add_epi32(sY,_mm_mullo_epi32(s410, sV)), 8);
__m128i sG = _mm_srli_epi32(_mm_sub_epi32(_mm_sub_epi32(sY, _mm_mullo_epi32(s101, sU)),_mm_mullo_epi32(s209, sV)), 8);
__m128i sB = _mm_srli_epi32(_mm_add_epi32(sY, _mm_mullo_epi32(s518, sU)), 8);
// Disabled alternative kept for reference: Microsoft's YUV conversion
// (coefficients 298/409/100/208/516 with +128 rounding before the shift).
//Microsoft's YUV Conversion
//__m128i sC = _mm_sub_epi32(sY, s16);
//__m128i sD = _mm_sub_epi32(sU, s128);
//__m128i sE = _mm_sub_epi32(sV, s128);
//
//__m128i sR = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(s298, sC), _mm_mullo_epi32(s409, sE)), s128), 8);
//__m128i sG = _mm_srli_epi32(_mm_add_epi32(_mm_sub_epi32(_mm_mullo_epi32(s298, sC), _mm_sub_epi32(_mm_mullo_epi32(s100, sD), _mm_mullo_epi32(s208, sE))), s128), 8);
//__m128i sB = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(s298, sC), _mm_mullo_epi32(s516, sD)), s128), 8);
// 4x4 transpose from planar (K, R, G, B vectors) to packed pixels.
// First stage interleaves K with G and R with B ...
__m128i sKGl = _mm_unpacklo_epi32(sK, sG);
__m128i sKGh = _mm_unpackhi_epi32(sK, sG);
__m128i sRBl = _mm_unpacklo_epi32(sR, sB);
__m128i sRBh = _mm_unpackhi_epi32(sR, sB);
// ... second stage yields one vector per pixel with lanes (K, R, G, B):
// sKRGB1 = pixel i, sKRGB2 = pixel i+1, sKRGB3 = pixel i+2, sKRGB4 = pixel i+3.
__m128i sKRGB1 = _mm_unpackhi_epi32(sKGh,sRBh);
__m128i sKRGB2 = _mm_unpacklo_epi32(sKGh,sRBh);
__m128i sKRGB3 = _mm_unpackhi_epi32(sKGl,sRBl);
__m128i sKRGB4 = _mm_unpacklo_epi32(sKGl,sRBl);
// Narrow 32-bit lanes to bytes with two rounds of the 16-bit -> 8-bit unsigned-saturating
// pack. The first round treats each 32-bit lane as two 16-bit halves, leaving
// (value, high-half) byte pairs; the second round collapses each pair to one byte.
// NOTE(review): _mm_packs_epi32 followed by _mm_packus_epi16 would be the conventional
// sequence - confirm the current form clamps as intended for all inputs.
__m128i p1 = _mm_packus_epi16(sKRGB1, sKRGB2);
__m128i p2 = _mm_packus_epi16(sKRGB3, sKRGB4);
__m128i po = _mm_packus_epi16(p1, p2);
// Aligned 16-byte store: bytes K,R,G,B for pixels i..i+3 in memory order.
_mm_store_si128((__m128i*)pDstP, po);
}
解決策
You may be bandwidth limited here, as there is very little computation relative to the number of loads and stores.
One suggestion: get rid of the `_mm_prefetch` intrinsics - they are almost certainly not helping and may even hinder operation on more recent CPUs (which already do a pretty good job with automatic prefetching).
Another area to look at:
// Each _mm_set_epi32 builds a vector from four separately indexed scalar bytes
// (first argument goes to the highest lane), i.e. sixteen byte-sized reads per iteration.
__m128i sK = _mm_set_epi32(m_pKBuffer[i], m_pKBuffer[i+1], m_pKBuffer[i+2], m_pKBuffer[i+3]);
__m128i sY = _mm_set_epi32(pSrc8u[0][i], pSrc8u[0][i+1], pSrc8u[0][i+2], pSrc8u[0][i+3]);
__m128i sU = _mm_set_epi32((char)pSrc8u[1][i], (char)pSrc8u[1][i+1], (char)pSrc8u[1][i+2], (char)pSrc8u[1][i+3]);
__m128i sV = _mm_set_epi32((char)pSrc8u[2][i], (char)pSrc8u[2][i+1], (char)pSrc8u[2][i+2], (char)pSrc8u[2][i+3]);
This is generating a lot of unnecessary instructions - you should be using `_mm_load_xxx` and `_mm_unpackxx_xxx` here. It will look like more code, but it will be a lot more efficient. And you should probably be processing 16 pixels per iteration of the loop, rather than 4 - that way you load a vector of 8-bit values once, and unpack to get each set of 4 values as a vector of 32-bit ints as needed.