_mm_mpsadbw_epu8

Microsoft Specific

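Emits the Streaming SIMD Extensions 4 (SSE4) instruction mpsadbw. This instruction computes eight sums of absolute differences: a group of four consecutive unsigned bytes selected from the second parameter is compared against each of eight overlapping four-byte windows of the first parameter, and each comparison produces one 16-bit sum.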

__m128i _mm_mpsadbw_epu8( 
   __m128i a,
   __m128i b,
   const int mask 
);

Parameters

  • [in] a
    A 128-bit parameter that contains sixteen 8-bit unsigned integers.

  • [in] b
    A 128-bit parameter that contains sixteen 8-bit unsigned integers.

  • [in] mask
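    A constant whose three least significant bits select which integers to use in the calculation: bit 2 selects the starting byte offset in a (0 or 4), and bits 1 and 0 together select the four-byte group in b (byte offset 0, 4, 8, or 12).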

Result value

A 128-bit result that contains eight 16-bit unsigned integers. The values of these integers can be computed as follows:

i = mask[2] * 4      // bit 2 of mask: i is 0 or 4
j = mask[1:0] * 4    // bits 1 and 0 of mask as a two-bit value: j is 0, 4, 8, or 12
for (k = 0; k < 8; k = k + 1) {
        t0 = abs(a[i + k + 0] - b[j + 0])
        t1 = abs(a[i + k + 1] - b[j + 1])
        t2 = abs(a[i + k + 2] - b[j + 2])
        t3 = abs(a[i + k + 3] - b[j + 3])
        r[k] = t0 + t1 + t2 + t3
}

Requirements

Intrinsic           Architecture

_mm_mpsadbw_epu8    x86, x64

Header file <smmintrin.h>

Remarks

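a[n] and b[n] indicate the nth ordered unsigned 8-bit integer of parameters a and b, where a[0] and b[0] are the lowest 8 bits. r[n] is the nth ordered unsigned 16-bit element of result r, where r[0] refers to the lowest 16 bits. mask0, mask1, and mask2 are bits 0, 1, and 2 of parameter mask (its three least significant bits); in the pseudocode above, the offset i is taken from bit 2 alone, and the offset j is taken from the two-bit value formed by bits 1 and 0.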

Before this intrinsic is used, the software must verify that the processor supports the instruction (mpsadbw is part of the SSE4.1 instruction set).

Example

#include <stdio.h>
#include <stdlib.h>
#include <smmintrin.h>

// Demonstrates _mm_mpsadbw_epu8: runs the intrinsic on two sample vectors,
// recomputes the eight sums of absolute differences with scalar code, and
// prints each scalar value next to the corresponding intrinsic result.
// Always returns 0.
int main(void)
{
    // Sample input bytes, lowest byte first.
    static const unsigned char a_bytes[16] = {
        15, 60, 55, 31, 0, 1, 2, 4, 8, 16, 32, 64, 128, 255, 1, 17
    };
    static const unsigned char b_bytes[16] = {
        2, 4, 8, 64, 255, 0, 1, 16, 32, 64, 128, 255, 75, 31, 42, 11
    };

    __m128i a, b;
    // A mask value of 0101 (5): bit 2 is set, so the windows of a start at
    // byte 4; bits 1:0 are 01, so the four-byte group of b starts at byte 4.
    const int mask = 5;

    // Byte offsets implied by the mask (i and j in the documented formula).
    // Deriving them here keeps the scalar check valid if mask is changed,
    // instead of being tied to the offsets that only mask == 5 produces.
    const int a_offset = ((mask >> 2) & 1) * 4;
    const int b_offset = (mask & 3) * 4;

    int k, n;

    for (k = 0; k < 16; k++)
    {
        a.m128i_u8[k] = a_bytes[k];
        b.m128i_u8[k] = b_bytes[k];
    }

    __m128i res = _mm_mpsadbw_epu8(a, b, mask);

    // Scalar reference: for each of the eight windows of a, sum the absolute
    // differences against the selected four bytes of b. The highest index
    // read from a is a_offset + 7 + 3 <= 14, so all accesses stay in range.
    __m128i expected;

    for (k = 0; k < 8; k++)
    {
        int sum = 0;

        for (n = 0; n < 4; n++)
        {
            sum += abs(a.m128i_u8[a_offset + k + n] - b.m128i_u8[b_offset + n]);
        }
        expected.m128i_u16[k] = (unsigned short)sum;
    }

    // One line per result element: the scalar value, then the intrinsic's.
    for (k = 0; k < 8; k++)
    {
        printf_s("Res%d should be %d: %d\n",
                    k, expected.m128i_u16[k], res.m128i_u16[k]);
    }

    return 0;
}
Res0 should be 269: 269
Res1 should be 267: 267
Res2 should be 264: 264
Res3 should be 290: 290
Res4 should be 342: 342
Res5 should be 446: 446
Res6 should be 653: 653
Res7 should be 588: 588

See Also

Reference

Compiler Intrinsics