Yes, you can XOR 16 bytes in one instruction using SSE2, or 32 bytes at a time with AVX2 (Haswell and later).
SSE2:
#include <emmintrin.h> // SSE2 intrinsics
#include <stdint.h>    // uint8_t

// XOR a byte buffer in place with a 16-byte mask, 16 bytes per iteration.
__m128i v, v_mask;     // v_mask holds the 16 mask bytes - set it before the loop
uint8_t *buff;         // buffer - must be 16 byte aligned

for (int i = 0; i < N; i += 16)   // note that N must be multiple of 16
{
    // The intrinsics take __m128i pointers, so the byte pointer must be cast.
    v = _mm_load_si128((const __m128i *)&buff[i]);   // load 16 bytes (aligned)
    v = _mm_xor_si128(v, v_mask);                    // XOR with mask
    _mm_store_si128((__m128i *)&buff[i], v);         // store 16 masked bytes
                                                     // (returns void - nothing to assign)
}
AVX2:
#include <immintrin.h> // AVX2 intrinsics
#include <stdint.h>    // uint8_t

// XOR a byte buffer in place with a 32-byte mask, 32 bytes per iteration.
__m256i w, w_mask;     // w_mask holds the 32 mask bytes - set it before the loop
uint8_t *buff;         // buffer - preferably 32 byte aligned; the unaligned
                       // load/store used below do not require it, whereas
                       // _mm256_load_si256/_mm256_store_si256 would fault on
                       // anything less than 32 byte alignment

for (int i = 0; i < N; i += 32)   // note that N must be multiple of 32
{
    // The intrinsics take __m256i pointers, so the byte pointer must be cast.
    w = _mm256_loadu_si256((const __m256i *)&buff[i]);   // load 32 bytes
    w = _mm256_xor_si256(w, w_mask);                     // XOR with mask
    _mm256_storeu_si256((__m256i *)&buff[i], w);         // store 32 masked bytes
                                                         // (returns void - nothing to assign)
}