/*
 * Sum of absolute differences (SAD) over a 16-row by 16-byte window.
 *
 * For each of 16 rows, 16 bytes are read from im1->pix at offset
 * (y1 + row) * im1->w + x1 and compared byte-wise against 16 bytes read
 * from im2->grid[bx2][by2].pix at offset row * 16.
 *
 * The im1 read uses an unaligned load; the im2 read uses _mm_load_si128,
 * so each 16-byte row of the grid block must be 16-byte aligned.
 *
 * NOTE(review): assumes both pix pointers address 8-bit samples and that
 * im1->w is the row stride in samples -- confirm against the Image/BImage
 * definitions. No bounds checking is done on x1, y1, bx2 or by2.
 *
 * Returns the total of the per-byte absolute differences, converted to
 * uint16_t.
 */
static uint16_t sad16(const Image *im1, const BImage *im2, uint16_t x1, uint16_t y1,
                      uint16_t bx2, uint16_t by2)
{
    /* Running total; _mm_sad_epu8 yields one partial sum per 64-bit lane. */
    __m128i total = _mm_setzero_si128();

    for (int row = 0; row < 16; ++row) {
        const __m128i window_row = _mm_loadu_si128(
            (__m128i const *)(im1->pix + (y1 + row) * im1->w + x1));
        const __m128i block_row = _mm_load_si128(
            (__m128i const *)(im2->grid[bx2][by2].pix + row * 16));

        total = _mm_add_epi64(total, _mm_sad_epu8(window_row, block_row));
    }

    /* 16-bit words 0 and 4 are the low words of the two 64-bit lanes. */
    const int low_lane = _mm_extract_epi16(total, 0);
    const int high_lane = _mm_extract_epi16(total, 4);

    return low_lane + high_lane;
}