© Daniel Kusswurm 2018
Daniel KusswurmModern X86 Assembly Language Programminghttps://doi.org/10.1007/978-1-4842-4063-2_7

7. AVX Programming – Packed Integers

Daniel Kusswurm1 
(1)
Geneva, IL, USA
 

In the previous chapter, you learned how to use the AVX instruction set to perform calculations using packed floating-point operands. In this chapter, you learn how to carry out computations using packed integer operands. Similar to the previous chapter, the first few source code examples in this chapter demonstrate basic arithmetic operations using packed integers. The remaining source code examples illustrate how to use the computational resources of AVX to perform common image processing operations, including histogram creation and thresholding.

AVX supports packed integer operations using 128-bit wide operands, and that is the focus of the source code examples in this chapter. Performing packed integer operations using 256-bit operands requires a processor that supports AVX2. You learn about AVX2 programming with packed integers in Chapter 10.

Packed Integer Addition and Subtraction

Listing 7-1 shows the C++ and assembly language source code for example Ch07_01. This example demonstrates how to perform packed integer addition and subtraction using signed and unsigned 16-bit integers. It also illustrates both wraparound and saturated arithmetic.
//------------------------------------------------
//        Ch07_01.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <string>
#include "XmmVal.h"
using namespace std;
extern "C" void AvxPackedAddI16_(const XmmVal& a, const XmmVal& b, XmmVal c[2]);
extern "C" void AvxPackedSubI16_(const XmmVal& a, const XmmVal& b, XmmVal c[2]);
extern "C" void AvxPackedAddU16_(const XmmVal& a, const XmmVal& b, XmmVal c[2]);
extern "C" void AvxPackedSubU16_(const XmmVal& a, const XmmVal& b, XmmVal c[2]);
//
// Signed packed addition and subtraction
//
void AvxPackedAddI16(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  alignas(16) XmmVal c[2];
  a.m_I16[0] = 10;     b.m_I16[0] = 100;
  a.m_I16[1] = 200;     b.m_I16[1] = -200;
  a.m_I16[2] = 30;     b.m_I16[2] = 32760;
  a.m_I16[3] = -32766;   b.m_I16[3] = -400;
  a.m_I16[4] = 50;     b.m_I16[4] = 500;
  a.m_I16[5] = 60;     b.m_I16[5] = -600;
  a.m_I16[6] = 32000;    b.m_I16[6] = 1200;
  a.m_I16[7] = -32000;   b.m_I16[7] = -950;
  AvxPackedAddI16_(a, b, c);
  cout << "\nResults for AxvPackedAddI16 - Wraparound Addition\n";
  cout << "a:   " << a.ToStringI16() << '\n';
  cout << "b:   " << b.ToStringI16() << '\n';
  cout << "c[0]: " << c[0].ToStringI16() << '\n';
  cout << "\nResults for AxvPackedAddI16 - Saturated Addition\n";
  cout << "a:   " << a.ToStringI16() << '\n';
  cout << "b:   " << b.ToStringI16() << '\n';
  cout << "c[1]: " << c[1].ToStringI16() << '\n';
}
void AvxPackedSubI16(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  alignas(16) XmmVal c[2];
  a.m_I16[0] = 10;     b.m_I16[0] = 100;
  a.m_I16[1] = 200;     b.m_I16[1] = -200;
  a.m_I16[2] = -30;     b.m_I16[2] = 32760;
  a.m_I16[3] = -32766;   b.m_I16[3] = 400;
  a.m_I16[4] = 50;     b.m_I16[4] = 500;
  a.m_I16[5] = 60;     b.m_I16[5] = -600;
  a.m_I16[6] = 32000;    b.m_I16[6] = 1200;
  a.m_I16[7] = -32000;   b.m_I16[7] = 950;
  AvxPackedSubI16_(a, b, c);
  cout << "\nResults for AxvPackedSubI16 - Wraparound Subtraction\n";
  cout << "a:   " << a.ToStringI16() << '\n';
  cout << "b:   " << b.ToStringI16() << '\n';
  cout << "c[0]: " << c[0].ToStringI16() << '\n';
  cout << "\nResults for AxvPackedSubI16 - Saturated Subtraction\n";
  cout << "a:   " << a.ToStringI16() << '\n';
  cout << "b:   " << b.ToStringI16() << '\n';
  cout << "c[1]: " << c[1].ToStringI16() << '\n';
}
//
// Unsigned packed addition and subtraction
//
void AvxPackedAddU16(void)
{
  XmmVal a;
  XmmVal b;
  XmmVal c[2];
  a.m_U16[0] = 10;     b.m_U16[0] = 100;
  a.m_U16[1] = 200;     b.m_U16[1] = 200;
  a.m_U16[2] = 300;     b.m_U16[2] = 65530;
  a.m_U16[3] = 32766;    b.m_U16[3] = 40000;
  a.m_U16[4] = 50;     b.m_U16[4] = 500;
  a.m_U16[5] = 20000;    b.m_U16[5] = 25000;
  a.m_U16[6] = 32000;    b.m_U16[6] = 1200;
  a.m_U16[7] = 32000;    b.m_U16[7] = 50000;
  AvxPackedAddU16_(a, b, c);
  cout << "\nResults for AxvPackedAddU16 - Wraparound Addition\n";
  cout << "a:   " << a.ToStringU16() << '\n';
  cout << "b:   " << b.ToStringU16() << '\n';
  cout << "c[0]: " << c[0].ToStringU16() << '\n';
  cout << "\nResults for AxvPackedAddU16 - Saturated Addition\n";
  cout << "a:   " << a.ToStringU16() << '\n';
  cout << "b:   " << b.ToStringU16() << '\n';
  cout << "c[1]: " << c[1].ToStringU16() << '\n';
}
void AvxPackedSubU16(void)
{
  XmmVal a;
  XmmVal b;
  XmmVal c[2];
  a.m_U16[0] = 10;     b.m_U16[0] = 100;
  a.m_U16[1] = 200;     b.m_U16[1] = 200;
  a.m_U16[2] = 30;     b.m_U16[2] = 7;
  a.m_U16[3] = 65000;    b.m_U16[3] = 5000;
  a.m_U16[4] = 60;     b.m_U16[4] = 500;
  a.m_U16[5] = 25000;    b.m_U16[5] = 28000;
  a.m_U16[6] = 32000;    b.m_U16[6] = 1200;
  a.m_U16[7] = 1200;    b.m_U16[7] = 950;
  AvxPackedSubU16_(a, b, c);
  cout << "\nResults for AxvPackedSubU16 - Wraparound Subtraction\n";
  cout << "a:   " << a.ToStringU16() << '\n';
  cout << "b:   " << b.ToStringU16() << '\n';
  cout << "c[0]: " << c[0].ToStringU16() << '\n';
  cout << "\nResults for AxvPackedSubI16 - Saturated Subtraction\n";
  cout << "a:   " << a.ToStringU16() << '\n';
  cout << "b:   " << b.ToStringU16() << '\n';
  cout << "c[1]: " << c[1].ToStringU16() << '\n';
}
int main()
{
  string sep(75, '-');
  AvxPackedAddI16();
  AvxPackedSubI16();
  cout << '\n' << sep << '\n';
  AvxPackedAddU16();
  AvxPackedSubU16();
  return 0;
}
;-------------------------------------------------
;        Ch07_01.asm
;-------------------------------------------------
; extern "C" void AvxPackedAddI16_(const XmmVal& a, const XmmVal& b, XmmVal c[2])
    .code
AvxPackedAddI16_ proc
; Packed signed word addition
    vmovdqa xmm0,xmmword ptr [rcx]   ;xmm0 = a
    vmovdqa xmm1,xmmword ptr [rdx]   ;xmm1 = b
    vpaddw xmm2,xmm0,xmm1        ;packed add - wraparound
    vpaddsw xmm3,xmm0,xmm1       ;packed add - saturated
    vmovdqa xmmword ptr [r8],xmm2    ;save c[0]
    vmovdqa xmmword ptr [r8+16],xmm3  ;save c[1]
    ret
AvxPackedAddI16_ endp
; extern "C" void AvxPackedSubI16_(const XmmVal& a, const XmmVal& b, XmmVal c[2])
AvxPackedSubI16_ proc
; Packed signed word subtraction
    vmovdqa xmm0,xmmword ptr [rcx]   ;xmm0 = a
    vmovdqa xmm1,xmmword ptr [rdx]   ;xmm1 = b
    vpsubw xmm2,xmm0,xmm1        ;packed sub - wraparound
    vpsubsw xmm3,xmm0,xmm1       ;packed sub - saturated
    vmovdqa xmmword ptr [r8],xmm2    ;save c[0]
    vmovdqa xmmword ptr [r8+16],xmm3  ;save c[1]
    ret
AvxPackedSubI16_ endp
; extern "C" void AvxPackedAddU16_(const XmmVal& a, const XmmVal& b, XmmVal c[2])
AvxPackedAddU16_ proc
; Packed unsigned word addition
    vmovdqu xmm0,xmmword ptr [rcx]   ;xmm0 = a
    vmovdqu xmm1,xmmword ptr [rdx]   ;xmm1 = b
    vpaddw xmm2,xmm0,xmm1        ;packed add - wraparound
    vpaddusw xmm3,xmm0,xmm1       ;packed add - saturated
    vmovdqu xmmword ptr [r8],xmm2    ;save c[0]
    vmovdqu xmmword ptr [r8+16],xmm3  ;save c[1]
    ret
AvxPackedAddU16_ endp
; extern "C" void AvxPackedSubU16_(const XmmVal& a, const XmmVal& b, XmmVal c[2])
AvxPackedSubU16_ proc
; Packed unsigned word subtraction
    vmovdqu xmm0,xmmword ptr [rcx]   ;xmm0 = a
    vmovdqu xmm1,xmmword ptr [rdx]   ;xmm1 = b
    vpsubw xmm2,xmm0,xmm1        ;packed sub - wraparound
    vpsubusw xmm3,xmm0,xmm1       ;packed sub - saturated
    vmovdqu xmmword ptr [r8],xmm2    ;save c[0]
    vmovdqu xmmword ptr [r8+16],xmm3  ;save c[1]
    ret
AvxPackedSubU16_ endp
    end
Listing 7-1.

Example Ch07_01

Toward the top of the C++ code are the declarations for the assembly language functions that perform packed integer addition and subtraction. Each function takes two XmmVal arguments and saves its results to an XmmVal array. The structure XmmVal, which you learned about in Chapter 6 (see Listing 6-1), contains a publicly-accessible anonymous union with members that correspond to the packed data types that can be used with an XMM register. The XmmVal structure also defines several member functions that format the contents of an XmmVal for display.

The C++ function AvxPackedAddI16 contains test code that exercises the assembly language function AvxPackedAddI16_. This function performs packed signed 16-bit integer (word) addition using both wraparound and saturated arithmetic. Note that the XmmVal variables a, b, and c are all defined using the C++ specifier alignas(16), which aligns each XmmVal to a 16-byte boundary. Following the execution of the function AvxPackedaddI16_, the results are displayed using a series of stream writes to cout. The C++ function AvxPackedSubI16, which is similar to AvxPackedAddI16, uses the assembly language function AvxPackedSubI16_.

A parallel set of C++ functions, AvxPackedAddU16 and AvxPackedSubU16, contain code that exercise the assembly language functions AvxPackedAddU16_ and AvxPackedSubU16_. These functions perform packed unsigned 16-bit integer addition and subtraction, respectively. Note that the XmmVal variables in AvxPackedAddU16 and AvxPackedSubU16 do not use the alignas(16) specifier, which means that these values are not guaranteed to be aligned on a 16-byte boundary. The reason for doing this is to demonstrate the use of the AVX instruction vmovdqu (Move Unaligned Packed Integer Values), as you’ll soon see.

The assembly language function AvxPackedAddI6_ starts with a vmovdqa xmm0,xmmword ptr [rcx] instruction that loads argument value a into register XMM0. The ensuing vmovdqa xmm1,xmmword ptr [rdx] instruction copies b into register XMM1. The next two instructions, vpaddw xmm2,xmm0,xmm1 and vpaddsw xmm3,xmm0,xmm1, carry out packed signed 16-bit integer addition using wrapround and saturated arithmetic, respectively. The final two vmovdqa instructions save the calculated results to XmmVal array c. Assembly language function AvxPackedSubI16_ is similar to AvxPackedAddI16_ and uses the instructions vpsubw and vpsubsw to carry out packed signed 16-bit integer subtraction.

The assembly language function AvxPackedAddU16_ begins with a vmovdqu xmm0,xmmword ptr [rcx] instruction that loads a into register XMM0. A vmovdqu instruction is used here since XmmVal a was defined in the C++ code without the alignas(16) specifier. Note that function AvxPackedAddU16_ uses vmovdqu for demonstration purposes only; a properly aligned XmmVal and a vmovdqa instruction should have been used instead. It’s already been mentioned a number of times in this book but warrants repeating due to its importance: SIMD operands in memory should be properly aligned whenever possible in order to avoid potential performance penalties that can occur whenever the processor accesses an unaligned operand in memory.

Following the loading of argument values a and b into register XMM0 and XMM1, function AvxPackedAddU16_ performs packed unsigned 16-bit integer addition using the instructions vpaddw xmm2,xmm0,xmm1 (wraparound arithmetic) and vpaddusw xmm3,xmm0,xmm1 (saturated arithmetic). Two vmovdqu instructions save the results to array c. The function AvxPackedSubU16_ implements packed unsigned 16-bit integer subtraction using the vpsubw and vpsubusw instructions. This function also uses the vmovdqu instruction to load argument values and save results. Here are the results for source code example Ch07_01:
Results for AxvPackedAddI16 - Wraparound Addition
a:      10   200   30 -32766  |   50   60  32000 -32000
b:     100  -200  32760  -400  |   500  -600  1200  -950
c[0]:    110    0 -32746  32370  |   550  -540 -32336  32586
Results for AxvPackedAddI16 - Saturated Addition
a:      10   200   30 -32766  |   50   60  32000 -32000
b:     100  -200  32760  -400  |   500  -600  1200  -950
c[1]:    110    0  32767 -32768  |   550  -540  32767 -32768
Results for AxvPackedSubI16 - Wraparound Subtraction
a:      10   200   -30 -32766  |   50   60  32000 -32000
b:     100  -200  32760   400  |   500  -600  1200   950
c[0]:    -90   400  32746  32370  |  -450   660  30800  32586
Results for AxvPackedSubI16 - Saturated Subtraction
a:      10   200   -30 -32766  |   50   60  32000 -32000
b:     100  -200  32760   400  |   500  -600  1200   950
c[1]:    -90   400 -32768 -32768  |  -450   660  30800 -32768
---------------------------------------------------------------------------
Results for AxvPackedAddU16 - Wraparound Addition
a:      10   200   300  32766  |   50  20000  32000  32000
b:     100   200  65530  40000  |   500  25000  1200  50000
c[0]:    110   400   294  7230  |   550  45000  33200  16464
Results for AxvPackedAddU16 - Saturated Addition
a:      10   200   300  32766  |   50  20000  32000  32000
b:     100   200  65530  40000  |   500  25000  1200  50000
c[1]:    110   400  65535  65535  |   550  45000  33200  65535
Results for AxvPackedSubU16 - Wraparound Subtraction
a:      10   200   30  65000  |   60  25000  32000  1200
b:     100   200    7  5000  |   500  28000  1200   950
c[0]:   65446    0   23  60000  |  65096  62536  30800   250
Results for AxvPackedSubI16 - Saturated Subtraction
a:      10   200   30  65000  |   60  25000  32000  1200
b:     100   200    7  5000  |   500  28000  1200   950
c[1]:     0    0   23  60000  |    0    0  30800   250

AVX also supports packed integer addition and subtraction using 8-, 32-, and 64-bit integers. The vpaddb, vpaddsb, vpaddusb, vpsubb, vpsubsb, and vpsubusb instructions are the 8-bit (byte) versions of the packed 16-bit instructions that were demonstrated in this example. The vpadd[d|q] and vpsub[d|q] instructions can be employed to perform packed 32-bit (doubleword) or 64-bit (quadword) addition and subtraction using wraparound arithmetic. AVX does not support saturated addition and subtraction using packed doubleword or quadword integers.

Packed Integer Shifts

The next source code example is named Ch07_02. This example illustrates how to perform logical and arithmetic shift operations using packed integer operands. Listing 7-2 shows the C++ and assembly language source code for example Ch07_02.
//------------------------------------------------
//        Ch07_02.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include "XmmVal.h"
using namespace std;
// The order of the name constants in the following enum must
// correspond to the table values defined in .asm file.
enum ShiftOp : unsigned int
{
  U16_LL,   // shift left logical - word
  U16_RL,   // shift right logical - word
  U16_RA,   // shift right arithmetic - word
  U32_LL,   // shift left logical - doubleword
  U32_RL,   // shift right logical - doubleword
  U32_RA,   // shift right arithmetic - doubleword
};
extern "C" bool AvxPackedIntegerShift_(XmmVal& b, const XmmVal& a, ShiftOp shift_op, unsigned int count);
void AvxPackedIntegerShiftU16(void)
{
  unsigned int count = 2;
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  a.m_U16[0] = 0x1234;
  a.m_U16[1] = 0xFF00;
  a.m_U16[2] = 0x00CC;
  a.m_U16[3] = 0x8080;
  a.m_U16[4] = 0x00FF;
  a.m_U16[5] = 0xAAAA;
  a.m_U16[6] = 0x0F0F;
  a.m_U16[7] = 0x0101;
  AvxPackedIntegerShift_(b, a, U16_LL, count);
  cout << "\nResults for ShiftOp::U16_LL (count = " << count << ")\n";
  cout << "a: " << a.ToStringX16() << '\n';
  cout << "b: " << b.ToStringX16() << '\n';
  AvxPackedIntegerShift_(b, a, U16_RL, count);
  cout << "\nResults for ShiftOp::U16_RL (count = " << count << ")\n";
  cout << "a: " << a.ToStringX16() << '\n';
  cout << "b: " << b.ToStringX16() << '\n';
  AvxPackedIntegerShift_(b, a, U16_RA, count);
  cout << "\nResults for ShiftOp::U16_RA (count = " << count << ")\n";
  cout << "a: " << a.ToStringX16() << '\n';
  cout << "b: " << b.ToStringX16() << '\n';
}
void AvxPackedIntegerShiftU32(void)
{
  unsigned int count = 4;
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  a.m_U32[0] = 0x12345678;
  a.m_U32[1] = 0xFF00FF00;
  a.m_U32[2] = 0x03030303;
  a.m_U32[3] = 0x80800F0F;
  AvxPackedIntegerShift_(b, a, U32_LL, count);
  cout << "\nResults for ShiftOp::U32_LL (count = " << count << ")\n";
  cout << "a: " << a.ToStringX32() << '\n';
  cout << "b: " << b.ToStringX32() << '\n';
  AvxPackedIntegerShift_(b, a, U32_RL, count);
  cout << "\nResults for ShiftOp::U32_RL (count = " << count << ")\n";
  cout << "a: " << a.ToStringX32() << '\n';
  cout << "b: " << b.ToStringX32() << '\n';
  AvxPackedIntegerShift_(b, a, U32_RA, count);
  cout << "\nResults for ShiftOp::U32_RA (count = " << count << ")\n";
  cout << "a: " << a.ToStringX32() << '\n';
  cout << "b: " << b.ToStringX32() << '\n';
}
int main(void)
{
  string sep(75, '-');
  AvxPackedIntegerShiftU16();
  cout << '\n' << sep << '\n';
  AvxPackedIntegerShiftU32();
  return 0;
}
;-------------------------------------------------
;        Ch07_02.asm
;-------------------------------------------------
; extern "C" bool AvxPackedIntegerShift_(XmmVal& b, const XmmVal& a, ShiftOp shift_op, unsigned int count)
;
; Returns:   0 = invalid shift_op argument, 1 = success
;
; Note:     This module requires linker option /LARGEADDRESSAWARE:NO
;        to be explicitly set.
    .code
AvxPackedIntegerShift_ proc
; Make sure 'shift_op' is valid
    mov r8d,r8d           ;zero extend shift_op
    cmp r8,ShiftOpTableCount    ;compare against table count
    jae Error            ;jump if shift_op is invalid
; Jump to the operation specified by shift_op
    vmovdqa xmm0,xmmword ptr [rdx] ;xmm0 = a
    vmovd xmm1,r9d         ;xmm1[31:0] = shift count
    mov eax,1            ;set success return code
    jmp [ShiftOpTable+r8*8]
; Packed shift left logical - word
U16_LL: vpsllw xmm2,xmm0,xmm1
    vmovdqa xmmword ptr [rcx],xmm2
    ret
; Packed shift right logical - word
U16_RL: vpsrlw xmm2,xmm0,xmm1
    vmovdqa xmmword ptr [rcx],xmm2
    ret
; Packed shift right arithmetic - word
U16_RA: vpsraw xmm2,xmm0,xmm1
    vmovdqa xmmword ptr [rcx],xmm2
    ret
; Packed shift left logical - doubleword
U32_LL: vpslld xmm2,xmm0,xmm1
    vmovdqa xmmword ptr [rcx],xmm2
    ret
; Packed shift right logical - doubleword
U32_RL: vpsrld xmm2,xmm0,xmm1
    vmovdqa xmmword ptr [rcx],xmm2
    ret
; Packed shift right arithmetic - doubleword
U32_RA: vpsrad xmm2,xmm0,xmm1
    vmovdqa xmmword ptr [rcx],xmm2
    ret
Error: xor eax,eax           ;set error code
    vpxor xmm0,xmm0,xmm0
    vmovdqa xmmword ptr [rcx],xmm0 ;set result to zero
    ret
; The order of the labels in the following table must correspond
; to the enums that are defined in .cpp file.
        align 8
ShiftOpTable  qword U16_LL, U16_RL, U16_RA
        qword U32_LL, U32_RL, U32_RA
ShiftOpTableCount equ ($ - ShiftOpTable) / size qword
AvxPackedIntegerShift_ endp
    end
Listing 7-2.

Example Ch07_02

The C++ code that’s shown in Listing 7-2 begins with the definition of an enum named ShiftOp, which is used to select a shift operation. Supported shift operations include logical left, logical right, and arithmetic right using packed word and doubleword values. Following enum ShiftOp is the declaration for the function AvxPackedIntegerShift_. This function carries out the requested shift operation using the supplied XmmVal argument and the specified count value. The C++ functions AvxPackedIntegerShiftU16 and AvxPackedIntegerShiftU32 initialize test cases for performing various shift operations using packed words and doublewords, respectively.

Assembly language function AvxPackedIntegerShift_ uses a jump table to execute the specified shift operation. This is similar to what you saw in source code examples Ch05_06 (Chapter 5) and Ch06_03 (Chapter 6). Upon entry to AvxPackedIntegerShift_, the argument value shift_op is tested for validity. Following validation of shift_op, a vmovdqa xmm0,xmmword ptr [rdx] instruction loads a into register XMM0. The subsequent vmovd xmm1,r9d instruction copies argument value count into the low-order doubleword of register XMM1. This is followed by a jmp [ShiftOpTable+r8*8] instruction that transfers program control to the appropriate code block.

Each distinct code block in AvxPackedIntegerShift_ performs a particular shift operation. For example, the code block adjacent to the label U16_LL uses the AVX instruction vpsllw xmm2,xmm0,xmm1 to perform a logical left shift using packed words. It is important to note that every word element in XMM0 is independently shifted left by the number of bits specified in XMM1[31:0]. The code blocks adjacent to the labels U16_RL and U16_RA carry out logical and arithmetic right shifts of packed words using the instructions vpsrlw and vpsraw, respectively. The function AvxPackedIntegerShift_ employs a similar structure to perform packed shift operations on doublewords using the instructions vpslld, vpslrd, and vpsrad. All of the code blocks in AvxPackedIntegerShift_ conclude with a vmovdqa xmmword ptr [rcx],xmm2 instruction that saves the calculated result. Here is the output for source code example Ch07_02:
Results for ShiftOp::U16_LL (count = 2)
a:   1234  FF00  00CC  8080  |  00FF  AAAA  0F0F  0101
b:   48D0  FC00  0330  0200  |  03FC  AAA8  3C3C  0404
Results for ShiftOp::U16_RL (count = 2)
a:   1234  FF00  00CC  8080  |  00FF  AAAA  0F0F  0101
b:   048D  3FC0  0033  2020  |  003F  2AAA  03C3  0040
Results for ShiftOp::U16_RA (count = 2)
a:   1234  FF00  00CC  8080  |  00FF  AAAA  0F0F  0101
b:   048D  FFC0  0033  E020  |  003F  EAAA  03C3  0040
---------------------------------------------------------------------------
Results for ShiftOp::U32_LL (count = 4)
a:     12345678    FF00FF00  |    03030303    80800F0F
b:     23456780    F00FF000  |    30303030    0800F0F0
Results for ShiftOp::U32_RL (count = 4)
a:     12345678    FF00FF00  |    03030303    80800F0F
b:     01234567    0FF00FF0  |    00303030    080800F0
Results for ShiftOp::U32_RA (count = 4)
a:     12345678    FF00FF00  |    03030303    80800F0F
b:     01234567    FFF00FF0  |    00303030    F80800F0

The AVX instructions vpsllq, vpslrq, and vpsraq can be used to perform shift operations using packed quadwords. Somewhat surprisingly, AVX does not support shift operations using packed byte operands. AVX also includes the shift instructions vps[l|r]dq, which carry out logical left or logical right byte shifts of a 128-bit wide operand in an XMM register. You’ll see how these instructions work in the next section.

Packed Integer Multiplication

Besides packed integer addition and subtraction, AVX also includes instructions that perform packed integer multiplication. These instructions are slightly different than the corresponding packed addition and subtraction instructions. Part of this difference is due to the fact that in order to calculate a non-truncated product, integer multiplication requires a destination operand to be twice the size of the original source operands. For example, the non-truncated product of two signed 16-bit integers is a signed 32-bit integer. Listing 7-3 shows the source code for example Ch07_03. This example demonstrates how to perform packed integer multiplication using signed 16-bit and 32-bit integers.
//------------------------------------------------
//        Ch07_03.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include "XmmVal.h"
using namespace std;
extern "C" void AvxPackedMulI16_(XmmVal c[2], const XmmVal& a, const XmmVal& b);
extern "C" void AvxPackedMulI32A_(XmmVal c[2], const XmmVal& a, const XmmVal& b);
extern "C" void AvxPackedMulI32B_(XmmVal* c, const XmmVal& a, const XmmVal& b);
void AvxPackedMulI16(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  alignas(16) XmmVal c[2];
  a.m_I16[0] = 10;    b.m_I16[0] = -5;
  a.m_I16[1] = 3000;   b.m_I16[1] = 100;
  a.m_I16[2] = -2000;   b.m_I16[2] = -9000;
  a.m_I16[3] = 42;    b.m_I16[3] = 1000;
  a.m_I16[4] = -5000;   b.m_I16[4] = 25000;
  a.m_I16[5] = 8;     b.m_I16[5] = 16384;
  a.m_I16[6] = 10000;   b.m_I16[6] = 3500;
  a.m_I16[7] = -60;    b.m_I16[7] = 6000;
  AvxPackedMulI16_(c, a, b);
  cout << "\nResults for AvxPackedMulI16\n";
  for (size_t i = 0; i < 8; i++)
  {
    cout << "a[" << i << "]: " << setw(8) << a.m_I16[i] << " ";
    cout << "b[" << i << "]: " << setw(8) << b.m_I16[i] << " ";
    if (i < 4)
    {
      cout << "c[0][" << i << "]: ";
      cout << setw(12) << c[0].m_I32[i] << '\n';
    }
    else
    {
      cout << "c[1][" << i - 4 << "]: ";
      cout << setw(12) << c[1].m_I32[i - 4] << '\n';
    }
  }
}
void AvxPackedMulI32A(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  alignas(16) XmmVal c[2];
  a.m_I32[0] = 10;    b.m_I32[0] = -500;
  a.m_I32[1] = 3000;   b.m_I32[1] = 100;
  a.m_I32[2] = -40000;  b.m_I32[2] = -120000;
  a.m_I32[3] = 4200;   b.m_I32[3] = 1000;
  AvxPackedMulI32A_(c, a, b);
  cout << "\nResults for AvxPackedMulI32A\n";
  for (size_t i = 0; i < 4; i++)
  {
    cout << "a[" << i << "]: " << setw(10) << a.m_I32[i] << " ";
    cout << "b[" << i << "]: " << setw(10) << b.m_I32[i] << " ";
    if (i < 2)
    {
      cout << "c[0][" << i << "]: ";
      cout << setw(14) << c[0].m_I64[i] << '\n';
    }
    else
    {
      cout << "c[1][" << i - 2 << "]: ";
      cout << setw(14) << c[1].m_I64[i - 2] << '\n';
    }
  }
}
void AvxPackedMulI32B(void)
{
  alignas(16) XmmVal a;
  alignas(16) XmmVal b;
  alignas(16) XmmVal c;
  a.m_I32[0] = 10;    b.m_I32[0] = -500;
  a.m_I32[1] = 3000;   b.m_I32[1] = 100;
  a.m_I32[2] = -2000;   b.m_I32[2] = -12000;
  a.m_I32[3] = 4200;   b.m_I32[3] = 1000;
  AvxPackedMulI32B_(&c, a, b);
  cout << "\nResults for AvxPackedMulI32B\n";
  for (size_t i = 0; i < 4; i++)
  {
    cout << "a[" << i << "]: " << setw(10) << a.m_I32[i] << " ";
    cout << "b[" << i << "]: " << setw(10) << b.m_I32[i] << " ";
    cout << "c[" << i << "]: " << setw(10) << c.m_I32[i] << '\n';
  }
}
int main()
{
  string sep(75, '-');
  AvxPackedMulI16();
  cout << '\n' << sep << '\n';
  AvxPackedMulI32A();
  cout << '\n' << sep << '\n';
  AvxPackedMulI32B();
  return 0;
}
;-------------------------------------------------
;        Ch07_03.asm
;-------------------------------------------------
; extern "C" void AvxPackedMulI16_(XmmVal c[2], const XmmVal* a, const XmmVal* b)
    .code
AvxPackedMulI16_ proc
    vmovdqa xmm0,xmmword ptr [rdx]   ;xmm0 = a
    vmovdqa xmm1,xmmword ptr [r8]    ;xmm1 = b
    vpmullw xmm2,xmm0,xmm1       ;xmm2 = packed a * b low result
    vpmulhw xmm3,xmm0,xmm1       ;xmm3 = packed a * b high result
    vpunpcklwd xmm4,xmm2,xmm3      ;merge low and high results
    vpunpckhwd xmm5,xmm2,xmm3      ;into final signed dwords
    vmovdqa xmmword ptr [rcx],xmm4   ;save final results
    vmovdqa xmmword ptr [rcx+16],xmm5
    ret
AvxPackedMulI16_ endp
; extern "C" void AvxPackedMulI32A_(XmmVal c[2], const XmmVal* a, const XmmVal* b)
AvxPackedMulI32A_ proc
; Perform packed signed dword multiplication. Note that vpmuldq
; performs following operations:
;
; xmm2[63:0]  = xmm0[31:0] * xmm1[31:0]
; xmm2[127:64] = xmm0[95:64] * xmm1[95:64]
    vmovdqa xmm0,xmmword ptr [rdx]   ;xmm0 = a
    vmovdqa xmm1,xmmword ptr [r8]    ;xmm1 = b
    vpmuldq xmm2,xmm0,xmm1
; Shift source operands right by 4 bytes and repeat vpmuldq
    vpsrldq xmm0,xmm0,4
    vpsrldq xmm1,xmm1,4
    vpmuldq xmm3,xmm0,xmm1
; Save results
    vpextrq qword ptr [rcx],xmm2,0   ;save xmm2[63:0]
    vpextrq qword ptr [rcx+8],xmm3,0  ;save xmm3[63:0]
    vpextrq qword ptr [rcx+16],xmm2,1  ;save xmm2[127:63]
    vpextrq qword ptr [rcx+24],xmm3,1  ;save xmm3[127:63]
    ret
AvxPackedMulI32A_ endp
; extern "C" void AvxPackedMulI32B_(XmmVal*, const XmmVal* a, const XmmVal* b)
AvxPackedMulI32B_ proc
; Perform packed signed integer multiplication and save low packed dword result
    vmovdqa xmm0,xmmword ptr [rdx]       ;xmm0 = a
    vpmulld xmm1,xmm0,xmmword ptr [r8]     ;xmm1 = packed a * b
    vmovdqa xmmword ptr [rcx],xmm1       ;save packed dword result
    ret
AvxPackedMulI32B_ endp
    end
Listing 7-3.

Example Ch07_03

The C++ function AvxPackedMulI16 contains code that initializes XmmVal variables a and b using signed 16-bit integers. This function then invokes the assembly language function AxvPackedMulI16_, which performs packed multiplication using signed 16-bit integers. The results are then streamed to cout. Note that the results displayed by function AvxPackedMulI16 are signed 32-bit integer products. The other two C++ functions in Listing 7-3, AvxPackedMulI32A and AvxPackedMul32B, initialize test cases for performing packed signed 32-bit integer multiplication. The former of these functions computes a packed signed 64-bit integer product, while the latter calculates a packed signed 32-bit integer product.

The assembly language function AvxPackedMulI16_ begins with two vmovdqa instructions that load argument values a and b into registers XMM0 and XMM1, respectively. The ensuing vpmullw xmm2,xmm0,xmm1 instruction multiplies the packed signed 16-bit integers in XMM0 and XMM1 and saves the low-order 16 bits of each 32-bit product in XMM2. This is followed by a vpmulhw xmm3,xmm0,xmm1 instruction that calculates and saves the high-order 16 bits of each 32-bit product. The next two instructions, vpunpcklwd xmm4,xmm2,xmm3 and vpunpckhwd xmm5,xmm2,xmm3, create the final packed 32-bit signed integer products by interleaving the low-order (vpunpcklwd) or high-order (vpunpckhwd) words of their source operands. Figure 7-1 illustrates the instruction sequence that’s employed by AvxPackedMulI16_.
../images/326959_2_En_7_Chapter/326959_2_En_7_Fig1_HTML.jpg
Figure 7-1.

Instruction sequence used in AvxPackedMulI16_ to perform packed 16-bit signed integer multiplication

The next assembly language function in Listing 7-3, AvxPackedMulI32A_, performs packed signed 32-bit integer multiplication. This function begins with two vmovdqa instructions that load XmmVal argument values a and b into registers XMM0 and XMM1, respectively. The vpmuldq xmm2,xmm0,xmm1 instruction that follows performs packed signed 32-bit multiplication using the even numbered elements of the two source operands. It then saves the signed 64-bit products in XMM2. Two vpsrldq instructions are then used to right shift by four bytes the contents of registers XMM0 and XMM1. This is followed by another vpmuldq instruction that calculates the remaining 64-bit products. Figure 7-2 show the execution details of this instruction sequence.
../images/326959_2_En_7_Chapter/326959_2_En_7_Fig2_HTML.jpg
Figure 7-2.

Execution of vpmuldq and vpsrldq instructions

Following the execution of the second vpmuldq instruction, registers XMM2 and XMM3 contain the four signed 64-bit products. These values are then saved to the specified destination buffer using a series of vpextrq (Extract Quadword) instructions. This instruction copies the quadword element that’s specified by the immediate (or second source) operand from the first source operand and saves it to the destination operand. For example, the instruction vpextrq qword ptr [rcx],xmm2,0 saves the low-order quadword of XMM2 to the memory location specified by RCX. The first source operand of a vpextrq instruction must be an XMM register; the destination operand can be a general-purpose register or a memory location. AVX also includes instructions that you can use to extract byte (vpextrb), word (vpextrw), or doubleword (vpextrd) elements.

The final assembly language function in this source code example is named AvxPackedMulI32B_. This function also performs packed signed 32-bit integer multiplication but saves truncated 32-bit products. Function AvxPackedMulI32B_ uses the vpmulld instruction that performs element-by-element doubleword multiplication similar to packed addition or subtraction. The low-order 32 bits of each product are then saved to the destination operand. The results for source code example Ch07_03 are as follows:
Results for AvxPackedMulI16
a[0]:    10 b[0]:    -5 c[0][0]:     -50
a[1]:   3000 b[1]:   100 c[0][1]:    300000
a[2]:  -2000 b[2]:  -9000 c[0][2]:   18000000
a[3]:    42 b[3]:   1000 c[0][3]:    42000
a[4]:  -5000 b[4]:  25000 c[1][0]:  -125000000
a[5]:    8 b[5]:  16384 c[1][1]:    131072
a[6]:  10000 b[6]:   3500 c[1][2]:   35000000
a[7]:   -60 b[7]:   6000 c[1][3]:   -360000
---------------------------------------------------------------------------
Results for AvxPackedMulI32A
a[0]:     10 b[0]:    -500 c[0][0]:     -5000
a[1]:    3000 b[1]:    100 c[0][1]:     300000
a[2]:   -40000 b[2]:  -120000 c[1][0]:   4800000000
a[3]:    4200 b[3]:    1000 c[1][1]:    4200000
---------------------------------------------------------------------------
Results for AvxPackedMulI32B
a[0]:     10 b[0]:    -500 c[0]:   -5000
a[1]:    3000 b[1]:    100 c[1]:   300000
a[2]:   -2000 b[2]:   -12000 c[2]:  24000000
a[3]:    4200 b[3]:    1000 c[3]:  4200000

Packed Integer Image Processing

The source code examples presented thus far were intended to familiarize you with AVX packed integer programming. Each example included a simple assembly language function that demonstrated the operation of several AVX instructions using instances of the structure XmmVal . For some real-world application programs, it may be appropriate to create a small set of functions similar to the ones you’ve seen thus far. However, in order to fully exploit the benefits of the AVX, you need to code functions that implement complete algorithms using common data structures.

The source code examples in this section present algorithms that process arrays of unsigned 8-bit integers using the AVX instruction set. In the first example, you learn how to determine the minimum and maximum value of an array. This sample program has a certain practicality to it since digital images often use arrays of unsigned 8-bit integers to represent images in memory, and many image-processing algorithms (e.g., contrast enhancement) often need to determine the minimum (darkest) and maximum (lightest) pixels in an image. The second sample program illustrates how to calculate the mean value of an array of unsigned 8-bit integers. This is another example of a realistic algorithm that is directly relevant to the province of image processing. The final three source code examples implement universal image processing algorithms, including pixel conversion, histogram creation, and thresholding.

Pixel Minimum-Maximum Values

Source code example Ch07_04, shown in Listing 7-4, demonstrates how to find the minimum and maximum values in an array of unsigned 8-bit integers. This example also explains how to dynamically allocate aligned storage space for an array.
//------------------------------------------------
//        AlignedMem.h
//------------------------------------------------
#pragma once
#include <cstdint>
#include <malloc.h>
#include <stdexcept>
class AlignedMem
{
public:
  static void* Allocate(size_t mem_size, size_t mem_alignment)
  {
    void* p = _aligned_malloc(mem_size, mem_alignment);
    if (p == NULL)
      throw std::runtime_error("Memory allocation error: AllocateAlignedMem()");
    return p;
  }
  static void Release(void* p)
  {
    _aligned_free(p);
  }
  template <typename T> static bool IsAligned(const T* p, size_t alignment)
  {
    if (p == nullptr)
      return false;
    if (((uintptr_t)p % alignment) != 0)
      return false;
    return true;
  }
};
template <class T> class AlignedArray
{
  T* m_Data;
  size_t m_Size;
public:
  AlignedArray(void) = delete;
  AlignedArray(const AlignedArray& aa) = delete;
  AlignedArray(AlignedArray&& aa) = delete;
  AlignedArray& operator = (const AlignedArray& aa) = delete;
  AlignedArray& operator = (AlignedArray&& aa) = delete;
  AlignedArray(size_t size, size_t alignment)
  {
    m_Size = size;
    m_Data = (T*)AlignedMem::Allocate(size * sizeof(T), alignment);
  }
  ∼AlignedArray()
  {
    AlignedMem::Release(m_Data);
  }
  T* Data(void)        { return m_Data; }
  size_t Size(void)      { return m_Size; }
  void Fill(T val)
  {
    for (size_t i = 0; i < m_Size; i++)
      m_Data[i] = val;
  }
};
//------------------------------------------------
//        Ch07_04.h
//------------------------------------------------
#pragma once
#include <cstdint>
// Ch07_04.cpp
extern void Init(uint8_t* x, size_t n, unsigned int seed);
extern bool AvxCalcMinMaxU8Cpp(const uint8_t* x, size_t n, uint8_t* x_min, uint8_t* x_max);
// Ch07_04_BM.cpp
extern void AvxCalcMinMaxU8_BM(void);
// Ch07_04_.asm
extern "C" bool AvxCalcMinMaxU8_(const uint8_t* x, size_t n, uint8_t* x_min, uint8_t* x_max);
// c_NumElements must be > 0 and even multiple of 64
const size_t c_NumElements = 16 * 1024 * 1024;
const unsigned int c_RngSeedVal = 23;
//------------------------------------------------
//        Ch07_04.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <cstdint>
#include <random>
#include "Ch07_04.h"
#include "AlignedMem.h"
using namespace std;
void Init(uint8_t* x, size_t n, unsigned int seed)
{
  uniform_int_distribution<> ui_dist {5, 250};
  default_random_engine rng {seed};
  for (size_t i = 0; i < n; i++)
    x[i] = (uint8_t)ui_dist(rng);
  // Use known values for min & max (for test purposes)
  x[(n / 4) * 3 + 1] = 2;
  x[n / 4 + 11] = 3;
  x[n / 2] = 252;
  x[n / 2 + 13] = 253;
  x[n / 8 + 5] = 4;
  x[n / 8 + 7] = 254;
}
bool AvxCalcMinMaxU8Cpp(const uint8_t* x, size_t n, uint8_t* x_min, uint8_t* x_max)
{
  if (n == 0 || (n & 0x3f) != 0)
    return false;
  if (!AlignedMem::IsAligned(x, 16))
    return false;
  uint8_t x_min_temp = 0xff;
  uint8_t x_max_temp = 0;
  for (size_t i = 0; i < n; i++)
  {
    uint8_t val = *x++;
    if (val < x_min_temp)
      x_min_temp = val;
    else if (val > x_max_temp)
      x_max_temp = val;
  }
  *x_min = x_min_temp;
  *x_max = x_max_temp;
  return true;
}
void AvxCalcMinMaxU8()
{
  size_t n = c_NumElements;
  AlignedArray<uint8_t> x_aa(n, 16);
  uint8_t* x = x_aa.Data();
  Init(x, n, c_RngSeedVal);
  uint8_t x_min1 = 0, x_max1 = 0;
  uint8_t x_min2 = 0, x_max2 = 0;
  bool rc1 = AvxCalcMinMaxU8Cpp(x, n, &x_min1, &x_max1);
  bool rc2 = AvxCalcMinMaxU8_(x, n, &x_min2, &x_max2);
  cout << "\nResults for AvxCalcMinMaxU8\n";
  cout << "rc1: " << rc1 << " x_min1: " << (int)x_min1;
  cout << " x_max1: " << (int)x_max1 << '\n';
  cout << "rc2: " << rc1 << " x_min2: " << (int)x_min2;
  cout << " x_max2: " << (int)x_max2 << '\n';
}
int main()
{
  AvxCalcMinMaxU8();
  AvxCalcMinMaxU8_BM();
  return 0;
}
;-------------------------------------------------
;        Ch07_04_.asm
;-------------------------------------------------
; extern "C" bool AvxCalcMinMaxU8_(uint8_t* x, size_t n, uint8_t* x_min, uint8_t* x_max)
;
; Returns:   0 = invalid n or unaligned array, 1 = success
      .const
      align 16
StartMinVal qword 0ffffffffffffffffh  ;Initial packed min values
      qword 0ffffffffffffffffh
StartMaxVal qword 0000000000000000h   ;Initial packed max values
      qword 0000000000000000h
      .code
AvxCalcMinMaxU8_ proc
; Make sure 'n' is valid
    xor eax,eax             ;set error return code
    or rdx,rdx             ;is n == 0?
    jz Done               ;jump if yes
    test rdx,3fh            ;is n a multiple of 64?
    jnz Done              ;jump if no
    test rcx,0fh            ;is x properly aligned?
    jnz Done              ;jump if no
; Initialize packed min-max values
    vmovdqa xmm2,xmmword ptr [StartMinVal]
    vmovdqa xmm3,xmm2            ;xmm3:xmm2 = packed min values
    vmovdqa xmm4,xmmword ptr [StartMaxVal]
    vmovdqa xmm5,xmm4            ;xmm5:xmm4 = packed max values
; Scan array for min & max values
@@:   vmovdqa xmm0,xmmword ptr [rcx]    ;xmm0 = x[i + 15] : x[i]
    vmovdqa xmm1,xmmword ptr [rcx+16]   ;xmm1 = x[i + 31] : x[i + 16]
    vpminub xmm2,xmm2,xmm0
    vpminub xmm3,xmm3,xmm1        ;xmm3:xmm2 = updated min values
    vpmaxub xmm4,xmm4,xmm0
    vpmaxub xmm5,xmm5,xmm1        ;xmm5:xmm4 = updated max values
    vmovdqa xmm0,xmmword ptr [rcx+32]   ;xmm0 = x[i + 47] : x[i + 32]
    vmovdqa xmm1,xmmword ptr [rcx+48]   ;xmm1 = x[i + 63] : x[i + 48]
    vpminub xmm2,xmm2,xmm0
    vpminub xmm3,xmm3,xmm1        ;xmm3:xmm2 = updated min values
    vpmaxub xmm4,xmm4,xmm0
    vpmaxub xmm5,xmm5,xmm1        ;xmm5:xmm4 = updated max values
    add rcx,64
    sub rdx,64
    jnz @B
; Determine final minimum value
    vpminub xmm0,xmm2,xmm3       ;xmm0[127:0] = final 16 min vals
    vpsrldq xmm1,xmm0,8         ;xmm1[63:0] = xmm0[127:64]
    vpminub xmm2,xmm1,xmm0       ;xmm2[63:0] = final 8 min vals
    vpsrldq xmm3,xmm2,4         ;xmm3[31:0] = xmm2[63:32]
    vpminub xmm0,xmm3,xmm2       ;xmm0[31:0] = final 4 min vals
    vpsrldq xmm1,xmm0,2         ;xmm1[15:0] = xmm0[31:16]
    vpminub xmm2,xmm1,xmm0       ;xmm2[15:0] = final 2 min vals
    vpextrw eax,xmm2,0         ;ax = final 2 min vals
    cmp al,ah
    jbe @F               ;jump if al <= ah
    mov al,ah              ;al = final min value
@@:   mov [r8],al             ;save final min
; Determine final maximum value
    vpmaxub xmm0,xmm4,xmm5       ;xmm0[127:0] = final 16 max vals
    vpsrldq xmm1,xmm0,8         ;xmm1[63:0] = xmm0[127:64]
    vpmaxub xmm2,xmm1,xmm0       ;xmm2[63:0] = final 8 max vals
    vpsrldq xmm3,xmm2,4         ;xmm3[31:0] = xmm2[63:32]
    vpmaxub xmm0,xmm3,xmm2       ;xmm0[31:0] = final 4 max vals
    vpsrldq xmm1,xmm0,2         ;xmm1[15:0] = xmm0[31:16]
    vpmaxub xmm2,xmm1,xmm0       ;xmm2[15:0] = final 2 max vals
    vpextrw eax,xmm2,0         ;ax = final 2 min vals
    cmp al,ah
    jae @F               ;jump if al >= ah
    mov al,ah              ;al = final max value
@@:   mov [r9],al             ;save final max
    mov eax,1              ;set success return code
Done:  ret
AvxCalcMinMaxU8_ endp
    end
Listing 7-4.

Example Ch07_04

Listing 7-4 begins with the source code for the header file AlignedMem .h. This file defines a couple of simple C++ classes that facilitate dynamically allocated aligned arrays. The class AlignedMem is a basic wrapper class for the Visual C++ runtime functions _aligned_malloc and _aligned_free. This class also includes a template member function named AlignedMem::IsAligned that validates the alignment of an array in memory. The header file AlignedMem.h also defines a template class named AlignedArray . Class AlignedArray, which is used in this and subsequent source code examples, contains code that implements and manages dynamically allocated aligned arrays. Note that this class contains only minimal functionality to support the source code examples in this book, which is why many of the standard constructors and assignment operators are disabled.

The primary C++ code in example Ch07_04 begins with the definition of a function name Init. This function initializes an array of unsigned 8-bit integers with random values in order to simulate the pixel values of an image. Function Init uses the C++ standard template library (STL) classes uniform_int_distribution and default_random_engine to generate random values for the array. Appendix A contains a list of references that you can consult if you’re interested in learning more about these classes. Note that function Init sets some of the pixel values in the target array to know values for test purposes.

The function AvxCalcMinMaxU8Cpp implements a C++ version of the pixel value min-max algorithm. Parameters for this function include a pointer to the array, the number of array elements, and pointers for the minimum and maximum values. The algorithm itself consists of an unsophisticated for loop that sweeps though the array to find the minimum and maximum pixel values. Note that function AvxCalcMinMaxU8Cpp (and its counterpart assembly language function AvxCalcMinMaxU8_) requires the size of the array to be an even multiple of 64. The reason for this is that the assembly language function AvxCalcMinMaxU8_ (arbitrarily) processes 64 pixels during each loop iteration, as you’ll soon see. Also note that the source pixel array must be aligned to a 16-byte boundary. The C++ template function AlignedMem::IsAligned performs this check.

The C++ function AvxCalcMinMaxU8 contains code that initializes a test array and exercises the two pixel min-max functions. This function uses the aforementioned template class named AlignedArray to dynamically allocate an array of unsigned 8-bit integers that’s aligned to a 16-byte boundary. The constructor arguments for this class include the number of array elements and the alignment boundary. Following the AlignedArray<uint8_t> x_aa(n, 16) statement, AvxCalcMinMaxU8 obtains a raw C++ pointer to the array buffer using the member function AlignedArray::Data(). This pointer is passed as an argument to the two min-max functions.

The assembly language function AvxCalcMinMaxU8_ implements the same algorithm as its C++ counterpart with one significant difference. It processes array elements using 16-byte packets, which is the maximum number of unsigned 8-bit integers that can be stored in an XMM register. The function AvxCalcMinMaxU8_ begins by validating the size of argument n. It then checks array x for proper alignment. Following argument validation, AvxCalcMinMaxU8_ loads register pairs XMM3:XMM2 and XMM5:XMM4 with the initial packed minimum and maximum values, respectively. This enables the processing loop to track 32 min-max values simultaneously.

During each processing loop iteration, the function AvxCalcMinMaxU8_ loads 32 pixel values into register pair XMM1:XMM0 using the instructions vmovdqa xmm0,xmmword ptr [rcx] and vmovdqa xmm1,xmmword ptr [rcx+16]. The next two instructions, vpminub xmm2,xmm2,xmm0 and vpminub xmm3,xmm3,xmm1, update the current pixel minimums in register pair XMM3:XMM2. The ensuing vpmaxub instructions update the current pixel maximums in register pair XMM5:XMM4. Another sequence of vmovdqa, vpminub, and vpmaxub instructions handles the next group of 32 pixels. The processing of multiple data items during each loop iteration reduces the number of executed jump instructions and often results in faster code. This optimization technique is commonly called loop unrolling (or unwinding). You’ll learn more about loop unrolling and jump instruction optimization techniques in Chapter 15.

Following the execution of the pixel value min-max processing loop, the values in register pairs XMM3:XMM2 and XMM5:XMM4 must be reduced in order to obtain the final minimum and maximum values. The vpminub xmm0,xmm2,xmm3 instruction reduces the number of pixel minimum values from 32 to 16. The next instruction, vpsrldq xmm1,xmm0,8, right shifts the contents of XMM0 by eight bytes and saves the result in register XMM1 (i.e., XMM1[63:0] = XMM0[127:64]). This facilitates the use of the subsequent vpminub xmm2,xmm1,xmm0 instruction that reduces the number of minimum values from 16 to 8. Two more vpsrldq-vpminub instruction sequences are then employed to reduce the number of pixel minimums to two as shown in Figure 7-3. The vpextrw eax,xmm2,0 extracts the low-order word (XMM2[15:0]) from register XMM2 and saves it to the low-order word of register EAX (or register AX). The cmp al,ah, jbe, and mov al,ah instructions determine the final pixel minimum value. AvxCalcMinMaxU8_ uses a similar reduction technique to determine the maximum pixel value.
../images/326959_2_En_7_Chapter/326959_2_En_7_Fig3_HTML.jpg
Figure 7-3.

Reduction of pixel minimum values using the instructions vpminub and vpsrldq

Here is the output for source code example Ch07_04:
Results for AvxCalcMinMaxU8
rc1: 1 x_min1: 2 x_max1: 254
rc2: 1 x_min2: 2 x_max2: 254
Running benchmark function AvxCalcMinMaxU8_BM - please wait
Benchmark times save to file Ch07_04_AvxCalcMinMaxU8_BM_CHROMIUM.csv
Table 7-1 shows some timing measurements for the functions AvxCalcMinMaxU8 and AvxCalcMinMaxU8_ using several different Intel processors. These measurements were made using the procedure that’s described in Chapter 6. The benchmark source code for this and subsequent examples is not shown but included with the chapter download packages.
Table 7-1.

Pixel Value Min-Max Mean Execution Times (Microseconds), Array Size = 16 MB

CPU

AvxCalcMinMaxU8Cpp

AvxCalcMinMaxU8_

i7-4790S

17642

1007

i9-7900X

13638

874

i7-8700K

12622

721

Pixel Mean Intensity

The next source code example, Ch07_05, contains code that calculates the arithmetic mean of an array of 8-bit unsigned integers. This example also illustrates how to size-promote packed unsigned integers. Listing 7-5 shows the source code for example Ch07_05.
//------------------------------------------------
//        Ch07_05.h
//------------------------------------------------
#pragma once
#include <cstdint>
// Ch07_05.cpp
extern void Init(uint8_t* x, size_t n, unsigned int seed);
extern bool AvxCalcMeanU8Cpp(const uint8_t* x, size_t n, int64_t* sum_x, double* mean);
// Ch07_05_BM.cpp
extern void AvxCalcMeanU8_BM(void);
// Ch07_05_.asm
extern "C" bool AvxCalcMeanU8_(const uint8_t* x, size_t n, int64_t* sum_x, double* mean);
// Common constants
const size_t c_NumElements = 16 * 1024 * 1024;   // Must be multiple of 64
const size_t c_NumElementsMax = 64 * 1024 * 1024;  // Used to avoid overflows
const unsigned int c_RngSeedVal = 29;
//------------------------------------------------
//        Ch07_05.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <iomanip>
#include <random>
#include "Ch07_05.h"
#include "AlignedMem.h"
using namespace std;
extern "C" size_t g_NumElementsMax = c_NumElementsMax; // Used in .asm code
void Init(uint8_t* x, size_t n, unsigned int seed)
{
  uniform_int_distribution<> ui_dist {0, 255};
  default_random_engine rng {seed};
  for (size_t i = 0; i < n; i++)
    x[i] = (uint8_t)ui_dist(rng);
}
bool AvxCalcMeanU8Cpp(const uint8_t* x, size_t n, int64_t* sum_x, double* mean_x)
{
  if (n == 0 || n > c_NumElementsMax)
    return false;
  if ((n % 64) != 0)
    return false;
  if (!AlignedMem::IsAligned(x, 16))
    return false;
  int64_t sum_x_temp = 0;
  for (int i = 0; i < n; i++)
    sum_x_temp += x[i];
  *sum_x = sum_x_temp;
  *mean_x = (double)sum_x_temp / n;
  return true;
}
void AvxCalcMeanU8()
{
  const size_t n = c_NumElements;
  AlignedArray<uint8_t> x_aa(n, 16);
  uint8_t* x = x_aa.Data();
  Init(x, n, c_RngSeedVal);
  bool rc1, rc2;
  int64_t sum_x1 = -1, sum_x2 = -1;
  double mean_x1 = -1, mean_x2 = -1;
  rc1 = AvxCalcMeanU8Cpp(x, n, &sum_x1, &mean_x1);
  rc2 = AvxCalcMeanU8_(x, n, &sum_x2, &mean_x2);
  cout << "\nResults for MmxCalcMeanU8\n";
  cout << fixed << setprecision(6);
  cout << "rc1: " << rc1 << " ";
  cout << "sum_x1: " << sum_x1 << " ";
  cout << "mean_x1: " << mean_x1 << '\n';
  cout << "rc2: " << rc2 << " ";
  cout << "sum_x2: " << sum_x2 << " ";
  cout << "mean_x2: " << mean_x2 << '\n';
}
int main()
{
  AvxCalcMeanU8();
  AvxCalcMeanU8_BM();
  return 0;
}
;-------------------------------------------------
;        Ch07_05.asm
;-------------------------------------------------
    include <MacrosX86-64-AVX.asmh>
    extern g_NumElementsMax:qword
; extern "C" bool AvxCalcMeanU8_(const Uint8* x, size_t n, int64_t* sum_x, double* mean);
;
; Returns    0 = invalid n or unaligned array, 1 = success
    .code
AvxCalcMeanU8_ proc frame
    _CreateFrame CM_,0,64
    _SaveXmmRegs xmm6,xmm7,xmm8,xmm9
    _EndProlog
; Verify function arguments
    xor eax,eax             ;set error return code
    or rdx,rdx
    jz Done               ;jump if n == 0
    cmp rdx,[g_NumElementsMax]
    jae Done              ;jump if n > NumElementsMax
    test rdx,3fh
    jnz Done              ;jump if (n % 64) != 0
    test rcx,0fh
    jnz Done              ;jump if x is not properly aligned
; Perform required initializations
    mov r10,rdx             ;save n for later use
    add rdx,rcx             ;rdx = end of array
    vpxor xmm8,xmm8,xmm8        ;xmm8 = packed intermediate sums (4 dwords)
    vpxor xmm9,xmm9,xmm9        ;xmm9 = packed zero for promotions
; Promote 32 pixel values from bytes to words, then sum the words
@@:   vmovdqa xmm0,xmmword ptr [rcx]
    vmovdqa xmm1,xmmword ptr [rcx+16]  ;xmm1:xmm0 = 32 pixels
    vpunpcklbw xmm2,xmm0,xmm9      ;xmm2 = 8 words
    vpunpckhbw xmm3,xmm0,xmm9      ;xmm3 = 8 words
    vpunpcklbw xmm4,xmm1,xmm9      ;xmm4 = 8 words
    vpunpckhbw xmm5,xmm1,xmm9      ;xmm5 = 8 words
    vpaddw xmm0,xmm2,xmm3
    vpaddw xmm1,xmm4,xmm5
    vpaddw xmm6,xmm0,xmm1        ;xmm6 = packed sums (8 words)
; Promote another 32 pixel values from bytes to words, then sum the words
    vmovdqa xmm0,xmmword ptr [rcx+32]
    vmovdqa xmm1,xmmword ptr [rcx+48]  ;xmm1:xmm0 = 32 pixels
    vpunpcklbw xmm2,xmm0,xmm9      ;xmm2 = 8 words
    vpunpckhbw xmm3,xmm0,xmm9      ;xmm3 = 8 words
    vpunpcklbw xmm4,xmm1,xmm9      ;xmm4 = 8 words
    vpunpckhbw xmm5,xmm1,xmm9      ;xmm5 = 8 words
    vpaddw xmm0,xmm2,xmm3
    vpaddw xmm1,xmm4,xmm5
    vpaddw xmm7,xmm0,xmm1        ;xmm7 = packed sums (8 words)
; Promote packed sums to dwords, then update dword sums
    vpaddw xmm0,xmm6,xmm7        ;xmm0 = packed sums (8 words)
    vpunpcklwd xmm1,xmm0,xmm9      ;xmm1 = packed sums (4 dwords)
    vpunpckhwd xmm2,xmm0,xmm9      ;xmm2 = packed sums (4 dwords)
    vpaddd xmm8,xmm8,xmm1
    vpaddd xmm8,xmm8,xmm2
    add rcx,64             ;rcx = next 64 byte block
    cmp rcx,rdx
    jne @B               ;repeat loop if not done
; Compute final sum_x (note vpextrd zero extends extracted dword to 64 bits)
    vpextrd eax,xmm8,0         ;rax = partial sum 0
    vpextrd edx,xmm8,1         ;rdx = partial sum 1
    add rax,rdx
    vpextrd ecx,xmm8,2         ;rcx = partial sum 2
    vpextrd edx,xmm8,3         ;rdx = partial sum 3
    add rax,rcx
    add rax,rdx
    mov [r8],rax            ;save sum_x
; Compute mean value
    vcvtsi2sd xmm0,xmm0,rax       ;xmm0 = sum_x (DPFP)
    vcvtsi2sd xmm1,xmm1,r10       ;xmm1 = n (DPFP)
    vdivsd xmm2,xmm0,xmm1        ;calc mean = sum_x / n
    vmovsd real8 ptr [r9],xmm2     ;save mean
    mov eax,1              ;set success return code
Done:  _RestoreXmmRegs xmm6,xmm7,xmm8,xmm9
    _DeleteFrame
    ret
AvxCalcMeanU8_ endp
    end
Listing 7-5.

Example Ch07_05

The organization of the C++ code in example Ch07_05 is somewhat similar to the previous example. The C++ function AvxCalcMeanU8Cpp uses a simple summing loop and scalar arithmetic to calculate the mean of an array of 8-bit unsigned integers. Like the previous example, the number of array elements must be an integral multiple of 64 and the source array must be aligned to a 16-byte boundary. Note that the function AvxCalcMeanU8Cpp also verifies that the number of array elements is not greater than c_NumElementsMax. This size restriction enables the assembly language function AvcCalcMeanU8_ to carry out its calculations using packed doublewords sans any safeguards for arithmetic overflows. The remaining C++ code that’s shown in Listing 7-5 performs test array initialization and streams results to cout.

The assembly language function AvxCalcMeanU8_ begins by performing the same validations of the array size as its C++ counterpart. The address of the array is also check for proper alignment. Following argument validation, AvxCalcMeanU8_ carries out its required initializations. The add rdx,rcx instruction computes the address of the first byte beyond the end of the array. The function AvxCalcMeanU8_ uses this address instead of a counter to terminate the processing loop. Register XMM8 is then initialized to all zeros. The processing loop uses this register to maintain intermediate packed doubleword sums.

Each processing loop iteration begins with two vmovdqa instructions that load 32 unsigned byte values into registers XMM1:XMM0. The pixel values are then size-promoted to words using the vpunpcklbw (Unpack Low Data) and vpunpckhbw (Unpack High Data). These instructions interleave the byte values contained in the two source operands to form word values, as shown in Figure 7-4. Note that register XMM9 contains all zeros, which means that the unsigned byte values are zero extended during size promotion to words. A series of vpaddw instructions then sums the packed unsigned word values. The function AvxCalcMeanU8_ processes another block of 32 pixels using the same sequence of instructions. The unsigned word sums in registers XMM6 and XMM7 are then summed using a vpaddw instruction, size-promoted to doublewords using vpunpcklwd and vpunpckhwd, and added to the intermediate packed doubleword sums in register XMM8. Figure 7-4 illustrates this instruction sequence in greater detail.
../images/326959_2_En_7_Chapter/326959_2_En_7_Fig4_HTML.jpg
Figure 7-4.

Execution of the vpunpck[h|l]bw, and vpunpck[h|l]wd instructions

Following termination of the processing loop, the intermediate doubleword sums in register XMM8 are totaled to generate the final pixel sum. The function uses several vpextrd instructions to copy each doubleword value from XMM8 to a general-purpose register. Note that this instruction uses an immediate operand to specify which element value to copy. Following computation of the pixel sum, AvxCalcMeanU8_ calculates the final pixel mean using simple scalar arithmetic. Here are the results for source code example Ch07_05:
Results for AvxCalcMeanU8
rc1: 1 sum_x1: 2139023922 mean_x1: 127.495761
rc2: 1 sum_x2: 2139023922 mean_x2: 127.495761
Running benchmark function AvxCalcMeanU8_BM - please wait
Benchmark times save to file Ch07_05_AvxCalcMeanU8_BM_CHROMIUM.csv
Table 7-2 shows some benchmark timing measurements for source code example Ch07_05.
Table 7-2.

Source Code Example Ch07_05 Mean Execution Times (Microseconds), Array Size = 16 MB

CPU

AvxCalcMeanU8Cpp

AvxCalcMeanU8_

i7-4790S

7103

1063

i9-7900X

6332

1048

i7-8700K

5870

861

Pixel Conversions

In order to implement certain image-processing algorithms, it is often necessary to convert the pixels of an 8-bit grayscale image from unsigned integer to single-precision floating-point values and vice versa. The sample code example of this section, Ch07_06, illustrates how to do this using the AVX instruction set. Listing 7-6 shows the source code for example Ch07_06.
//------------------------------------------------
//        Ch07_06.cpp
//------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include <iomanip>
#include <cstdint>
#include <random>
#include "AlignedMem.h"
using namespace std;
// Ch07_06_Misc.cpp
extern uint32_t ConvertImgVerify(const float* src1, const float* src2, uint32_t num_pixels);
extern uint32_t ConvertImgVerify(const uint8_t* src1, const uint8_t* src2, uint32_t num_pixels);
// Ch07_06_.asm
extern "C" bool ConvertImgU8ToF32_(float* des, const uint8_t* src, uint32_t num_pixels);
extern "C" bool ConvertImgF32ToU8_(uint8_t* des, const float* src, uint32_t num_pixels);
extern "C" uint32_t c_NumPixelsMax = 16777216;
template <typename T> void Init(T* x, size_t n, unsigned int seed, T scale)
{
  uniform_int_distribution<> ui_dist {0, 255};
  default_random_engine rng {seed};
  for (size_t i = 0; i < n; i++)
  {
    T temp = (T)ui_dist(rng);
    x[i] = (scale == 1) ? temp : temp / scale;
  }
}
bool ConvertImgU8ToF32Cpp(float* des, const uint8_t* src, uint32_t num_pixels)
{
  // Make sure num_pixels is valid
  if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))
    return false;
  if ((num_pixels % 32) != 0)
    return false;
  // Make sure src and des are aligned to a 16-byte boundary
  if (!AlignedMem::IsAligned(src, 16))
    return false;
  if (!AlignedMem::IsAligned(des, 16))
    return false;
  // Convert the image
  for (uint32_t i = 0; i < num_pixels; i++)
    des[i] = src[i] / 255.0f;
  return true;
}
bool ConvertImgF32ToU8Cpp(uint8_t* des, const float* src, uint32_t num_pixels)
{
  // Make sure num_pixels is valid
  if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))
    return false;
  if ((num_pixels % 32) != 0)
    return false;
  // Make sure src and des are aligned to a 16-byte boundary
  if (!AlignedMem::IsAligned(src, 16))
    return false;
  if (!AlignedMem::IsAligned(des, 16))
    return false;
  for (uint32_t i = 0; i < num_pixels; i++)
  {
    if (src[i] > 1.0f)
      des[i] = 255;
    else if (src[i] < 0.0)
      des[i] = 0;
    else
      des[i] = (uint8_t)(src[i] * 255.0f);
  }
  return true;
}
void ConvertImgU8ToF32(void)
{
  const uint32_t num_pixels = 1024;
  AlignedArray<uint8_t> src_aa(num_pixels, 16);
  AlignedArray<float> des1_aa(num_pixels, 16);
  AlignedArray<float> des2_aa(num_pixels, 16);
  uint8_t* src = src_aa.Data();
  float* des1 = des1_aa.Data();
  float* des2 = des2_aa.Data();
  Init(src, num_pixels, 12, (uint8_t)1);
  bool rc1 = ConvertImgU8ToF32Cpp(des1, src, num_pixels);
  bool rc2 = ConvertImgU8ToF32_(des2, src, num_pixels);
  if (!rc1 || !rc2)
  {
    cout << "Invalid return code - ";
    cout << "rc1 = " << boolalpha << rc1 << ", ";
    cout << "rc2 = " << boolalpha << rc2 << '\n';
    return;
  }
  uint32_t num_diff = ConvertImgVerify(des1, des2, num_pixels);
  cout << "\nResults for ConvertImgU8ToF32\n";
  cout << " num_pixels = " << num_pixels << '\n';
  cout << " num_diff = " << num_diff << '\n';
}
void ConvertImgF32ToU8(void)
{
  const uint32_t num_pixels = 1024;
  AlignedArray<float> src_aa(num_pixels, 16);
  AlignedArray<uint8_t> des1_aa(num_pixels, 16);
  AlignedArray<uint8_t> des2_aa(num_pixels, 16);
  float* src = src_aa.Data();
  uint8_t* des1 = des1_aa.Data();
  uint8_t* des2 = des2_aa.Data();
  // Initialize the src pixel buffer. The first few entries in src
  // are set to known values for test purposes.
  Init(src, num_pixels, 20, 1.0f);
  src[0] = 0.125f;    src[8] = 0.01f;
  src[1] = 0.75f;     src[9] = 0.99f;
  src[2] = -4.0f;     src[10] = 1.1f;
  src[3] = 3.0f;     src[11] = -1.1f;
  src[4] = 0.0f;     src[12] = 0.99999f;
  src[5] = 1.0f;     src[13] = 0.5f;
  src[6] = -0.01f;    src[14] = -0.0;
  src[7] = 1.01f;     src[15] = .333333f;
  bool rc1 = ConvertImgF32ToU8Cpp(des1, src, num_pixels);
  bool rc2 = ConvertImgF32ToU8_(des2, src, num_pixels);
  if (!rc1 || !rc2)
  {
    cout << "Invalid return code - ";
    cout << "rc1 = " << boolalpha << rc1 << ", ";
    cout << "rc2 = " << boolalpha << rc2 << '\n';
    return;
  }
  uint32_t num_diff = ConvertImgVerify(des1, des2, num_pixels);
  cout << "\nResults for ConvertImgF32ToU8\n";
  cout << " num_pixels = " << num_pixels << '\n';
  cout << " num_diff = " << num_diff << '\n';
}
int main()
{
  ConvertImgU8ToF32();
  ConvertImgF32ToU8();
  return 0;
}
;-------------------------------------------------
;        Ch07_06.asm
;-------------------------------------------------
    include <MacrosX86-64-AVX.asmh>
    include <cmpequ.asmh>
          .const
          align 16
Uint8ToFloat    real4 255.0, 255.0, 255.0, 255.0
FloatToUint8Min   real4 0.0, 0.0, 0.0, 0.0
FloatToUint8Max   real4 1.0, 1.0, 1.0, 1.0
FloatToUint8Scale  real4 255.0, 255.0, 255.0, 255.0
    extern c_NumPixelsMax:dword
; extern "C" bool ConvertImgU8ToF32_(float* des, const uint8_t* src, uint32_t num_pixels)
    .code
ConvertImgU8ToF32_ proc frame
    _CreateFrame U2F_,0,160
    _SaveXmmRegs xmm6,xmm7,xmm8,xmm9,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15
    _EndProlog
; Make sure num_pixels is valid and pixel buffers are properly aligned
    xor eax,eax             ;set error return code
    or r8d,r8d
    jz Done               ;jump if num_pixels is zero
    cmp r8d,[c_NumPixelsMax]
    ja Done               ;jump if num_pixels too big
    test r8d,1fh
    jnz Done              ;jump if num_pixels % 32 != 0
    test rcx,0fh
    jnz Done              ;jump if des not aligned
    test rdx,0fh
    jnz Done              ;jump if src not aligned
; Initialize processing loop registers
    shr r8d,5                ;number of pixel blocks
    vmovaps xmm6,xmmword ptr [Uint8ToFloat] ;xmm6 = packed 255.0f
    vpxor xmm7,xmm7,xmm7          ;xmm7 = packed 0
; Load the next block of 32 pixels
@@:   vmovdqa xmm0,xmmword ptr [rdx]   ;xmm0 = 16 pixels (x[i+15]:x[i])
    vmovdqa xmm1,xmmword ptr [rdx+16]  ;xmm8 = 16 pixels (x[i+31]:x[i+16])
; Promote the pixel values in xmm0 from unsigned bytes to unsigned dwords
    vpunpcklbw xmm2,xmm0,xmm7
    vpunpckhbw xmm3,xmm0,xmm7
    vpunpcklwd xmm8,xmm2,xmm7
    vpunpckhwd xmm9,xmm2,xmm7
    vpunpcklwd xmm10,xmm3,xmm7
    vpunpckhwd xmm11,xmm3,xmm7     ;xmm11:xmm8 = 16 dword pixels
; Promote the pixel values in xmm1 from unsigned bytes to unsigned dwords
    vpunpcklbw xmm2,xmm1,xmm7
    vpunpckhbw xmm3,xmm1,xmm7
    vpunpcklwd xmm12,xmm2,xmm7
    vpunpckhwd xmm13,xmm2,xmm7
    vpunpcklwd xmm14,xmm3,xmm7
    vpunpckhwd xmm15,xmm3,xmm7     ;xmm15:xmm12 = 16 dword pixels
; Convert pixel values from dwords to SPFP
    vcvtdq2ps xmm8,xmm8
    vcvtdq2ps xmm9,xmm9
    vcvtdq2ps xmm10,xmm10
    vcvtdq2ps xmm11,xmm11        ;xmm11:xmm8 = 16 SPFP pixels
    vcvtdq2ps xmm12,xmm12
    vcvtdq2ps xmm13,xmm13
    vcvtdq2ps xmm14,xmm14
    vcvtdq2ps xmm15,xmm15        ;xmm15:xmm12 = 16 SPFP pixels
; Normalize all pixel values to [0.0, 1.0] and save the results
    vdivps xmm0,xmm8,xmm6
    vmovaps xmmword ptr [rcx],xmm0   ;save pixels 0 - 3
    vdivps xmm1,xmm9,xmm6
    vmovaps xmmword ptr [rcx+16],xmm1  ;save pixels 4 - 7
    vdivps xmm2,xmm10,xmm6
    vmovaps xmmword ptr [rcx+32],xmm2  ;save pixels 8 - 11
    vdivps xmm3,xmm11,xmm6
    vmovaps xmmword ptr [rcx+48],xmm3  ;save pixels 12 - 15
    vdivps xmm0,xmm12,xmm6
    vmovaps xmmword ptr [rcx+64],xmm0  ;save pixels 16 - 19
    vdivps xmm1,xmm13,xmm6
    vmovaps xmmword ptr [rcx+80],xmm1  ;save pixels 20 - 23
    vdivps xmm2,xmm14,xmm6
    vmovaps xmmword ptr [rcx+96],xmm2  ;save pixels 24 - 27
    vdivps xmm3,xmm15,xmm6
    vmovaps xmmword ptr [rcx+112],xmm3 ;save pixels 28 - 31
    add rdx,32             ;update src ptr
    add rcx,128             ;update des ptr
    sub r8d,1
    jnz @B               ;repeat until done
    mov eax,1              ;set success return code
Done:  _RestoreXmmRegs xmm6,xmm7,xmm8,xmm9,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15
    _DeleteFrame
    ret
ConvertImgU8ToF32_ endp
; extern "C" bool ConvertImgF32ToU8_(uint8_t* des, const float* src, uint32_t num_pixels)
ConvertImgF32ToU8_ proc frame
    _CreateFrame F2U_,0,96
    _SaveXmmRegs xmm6,xmm7,xmm12,xmm13,xmm14,xmm15
    _EndProlog
; Make sure num_pixels is valid and pixel buffers are properly aligned
    xor eax,eax             ;set error return code
    or r8d,r8d
    jz Done               ;jump if num_pixels is zero
    cmp r8d,[c_NumPixelsMax]
    ja Done               ;jump if num_pixels too big
    test r8d,1fh
    jnz Done              ;jump if num_pixels % 32 != 0
    test rcx,0fh
    jnz Done              ;jump if des not aligned
    test rdx,0fh
    jnz Done              ;jump if src not aligned
; Load required packed constants into registers
    vmovaps xmm13,xmmword ptr [FloatToUint8Scale] ;xmm13 = packed 255.0
    vmovaps xmm14,xmmword ptr [FloatToUint8Min]  ;xmm14 = packed 0.0
    vmovaps xmm15,xmmword ptr [FloatToUint8Max]  ;xmm15 = packed 1.0
    shr r8d,4              ;number of pixel blocks
LP1:  mov r9d,4              ;num pixel quartets per block
; Convert 16 float pixels to uint8_t
LP2:  vmovaps xmm0,xmmword ptr [rdx]   ;xmm0 = next pixel quartet
    vcmpps xmm1,xmm0,xmm14,CMP_LT    ;compare pixels to 0.0
    vandnps xmm2,xmm1,xmm0       ;clip pixels < 0.0 to 0.0
    vcmpps xmm3,xmm2,xmm15,CMP_GT    ;compare pixels to 1.0
    vandps xmm4,xmm3,xmm15       ;clip pixels > 1.0 to 1.0
    vandnps xmm5,xmm3,xmm2       ;xmm5 = pixels <= 1.0
    vorps xmm6,xmm5,xmm4        ;xmm6 = final clipped pixels
    vmulps xmm7,xmm6,xmm13       ;xmm7 = FP pixels [0.0, 255.0]
    vcvtps2dq xmm0,xmm7         ;xmm0 = dword pixels [0, 255]
    vpackusdw xmm1,xmm0,xmm0      ;xmm1[63:0] = word pixels
    vpackuswb xmm2,xmm1,xmm1      ;xmm2[31:0] = bytes pixels
; Save the current byte pixel quartet
    vpextrd eax,xmm2,0         ;eax = new pixel quartet
    vpsrldq xmm12,xmm12,4        ;adjust xmm12 for new quartet
    vpinsrd xmm12,xmm12,eax,3      ;xmm12[127:96] = new quartet
    add rdx,16             ;update src ptr
    sub r9d,1
    jnz LP2               ;repeat until done
; Save the current byte pixel block (16 pixels)
    vmovdqa xmmword ptr [rcx],xmm12   ;save current pixel block
    add rcx,16             ;update des ptr
    sub r8d,1
    jnz LP1               ;repeat until done
    mov eax,1              ;set success return code
Done:  _RestoreXmmRegs xmm6,xmm7,xmm12,xmm13,xmm14,xmm15
    _DeleteFrame
    ret
ConvertImgF32ToU8_ endp
    end
Listing 7-6.

Example Ch07_06

The C++ code in Listing 7-6 is straightforward. The function ConvertImgU8ToF32Cpp contains code that converts pixel values from uint8_t [0, 255] to single-precision floating-point [0.0, 1.0]. This function contains a simple for loop that calculates des[i] = src[i] / 255.0. The counterpart function ConvertImgF32ToU8Cpp performs the inverse operation. Note that this function clips any pixel values greater than 1.0 or less than 0.0 before performing the floating-point to uint8_t conversion. The functions ConvertImgU8ToF32 and ConvertImgF32ToU8 contain code that initialize test arrays and exercise the C++ and assembly language conversion routines. Note that the latter function initializes the first few entries of the source buffer to known values in order to demonstrate the aforementioned clipping operation.

The processing loop of the assembly language function ConvertImgU8ToF32_ converts 32 pixels from uint8_t (or byte) to single-precision floating-point during each iteration. The conversion technique begins with the size promotion of packed pixels from unsigned byte to unsigned doubleword integers using a series of vpunpck[h|l]bw and vpunpck[h|l]wd instructions. The doubleword values are then converted to single-precision floating-point values using the instruction vcvtdq2ps (Convert Packed Doubleword Integers to Packed Single-Precision Floating-Point Values). The resultant packed floating-point values are normalized to [0.0, 1.0] and saved to the destination buffer.

The assembly language function ConvertImgF32ToU8_ performs packed single-precision floating-point to packed unsigned byte conversions. The inner loop (starting at the label LP2) of this conversion function uses the instructions vcmpps xmm1,xmm0,xmm14,CMP_LT, vcmpps xmm3,xmm2,xmm15,CMP_GT, and some Boolean logic to clip any pixels values less than 0.0 or greater than 1.0. Figure 7-5 illustrates this technique in greater detail. The vcvtps2dq xmm0,xmm7 instruction converts the four single-precision floating-point values in XMM7 to doubleword integers and saves the results in register XMM0. The next two instructions, vpackusdw xmm1,xmm0,xmm0 and vpackuswb xmm2,xmm1,xmm1, size-reduce the packed doubleword integers to packed unsigned bytes. Following the execution of the vpackuswb instruction, register XMM2[31:0] contains four packed unsigned byte values. This pixel quartet is then copied to XMM12[127:96] using the instruction sequence vpextrd eax,xmm2,0, vpsrldq xmm12,xmm12,4, and vpinsrd xmm12,xmm12,eax,3. The vpinsrd (Insert Dword) instructions that’s used here copies the doubleword value in register EAX to doubleword element position 3 in register XMM12 (or XMM12[127:96]).
../images/326959_2_En_7_Chapter/326959_2_En_7_Fig5_HTML.jpg
Figure 7-5.

Illustration of floating-point clipping technique used in function ConvertImgF32ToU8_

The inner loop conversion process that’s described in the previous paragraph executes for four iterations. Following the completion of the inner loop, XMM12 contains 16 unsigned byte pixel values. This pixel block is then saved to the destination buffer using a vmovdqa xmmword ptr [rcx],xmm12 instruction. The outer loop repeats until all pixels have been converted. Here is the output for source code example Ch07_06.
Results for ConvertImgU8ToF32
 num_pixels = 1024
 num_diff = 0
Results for ConvertImgF32ToU8
 num_pixels = 1024
 num_diff = 0

Image Histograms

Many image-processing algorithms require a histogram of an image’s pixel intensity values. Figure 7-6 shows a sample grayscale image and its histogram. The next source code example, Ch07_07, illustrates how to build a histogram of pixel intensity values for an image containing 8-bit grayscale pixel values. This example also explains how to use the stack in an assembly language function to store intermediate results. Listing 7-7 shows the source code for example Ch07_07.
../images/326959_2_En_7_Chapter/326959_2_En_7_Fig6_HTML.jpg
Figure 7-6.

Sample grayscale image and its histogram

//------------------------------------------------
//        Ch07_07.h
//------------------------------------------------
#pragma once
#include <cstdint>
// Ch07_07.cpp
extern bool AvxBuildImageHistogramCpp(uint32_t* histo, const uint8_t* pixel_buff, uint32_t num_pixels);
// Ch07_07_.asm
// Functions defined in Sse64ImageHistogram_.asm
extern "C" bool AvxBuildImageHistogram_(uint32_t* histo, const uint8_t* pixel_buff, uint32_t num_pixels);
// Ch07_07_BM.cpp
extern void AvxBuildImageHistogram_BM(void);
//------------------------------------------------
//        Ch07_07.cpp
//------------------------------------------------
#include "stdafx.h"
#include <cstdint>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <string>
#include "Ch07_07.h"
#include "AlignedMem.h"
#include "ImageMatrix.h"
using namespace std;
extern "C" uint32_t c_NumPixelsMax = 16777216;
bool AvxBuildImageHistogramCpp(uint32_t* histo, const uint8_t* pixel_buff, uint32_t num_pixels)
{
  // Make sure num_pixels is valid
  if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))
    return false;
  if (num_pixels % 32 != 0)
    return false;
  // Make sure histo is aligned to a 16-byte boundary
  if (!AlignedMem::IsAligned(histo, 16))
    return false;
  // Make sure pixel_buff is aligned to a 16-byte boundary
  if (!AlignedMem::IsAligned(pixel_buff, 16))
    return false;
  // Build the histogram
  memset(histo, 0, 256 * sizeof(uint32_t));
  for (uint32_t i = 0; i < num_pixels; i++)
    histo[pixel_buff[i]]++;
  return true;
}
void AvxBuildImageHistogram(void)
{
  const wchar_t* image_fn = L"..\\Ch07_Data\\TestImage1.bmp";
  const wchar_t* csv_fn = L"Ch07_07_AvxBuildImageHistogram_Histograms.csv";
  ImageMatrix im(image_fn);
  uint32_t num_pixels = im.GetNumPixels();
  uint8_t* pixel_buff = im.GetPixelBuffer<uint8_t>();
  AlignedArray<uint32_t> histo1_aa(256, 16);
  AlignedArray<uint32_t> histo2_aa(256, 16);
  bool rc1 = AvxBuildImageHistogramCpp(histo1_aa.Data(), pixel_buff, num_pixels);
  bool rc2 = AvxBuildImageHistogram_(histo2_aa.Data(), pixel_buff, num_pixels);
  cout << "\nResults for AvxBuildImageHistogram\n";
  if (!rc1 || !rc2)
  {
    cout << "Bad return code: ";
    cout << "rc1 = " << rc1 << ", rc2 = " << rc2 << '\n';
    return;
  }
  ofstream ofs(csv_fn);
  if (ofs.bad())
    cout << "File create error - " << csv_fn << '\n';
  else
  {
    bool compare_error = false;
    uint32_t* histo1 = histo1_aa.Data();
    uint32_t* histo2 = histo2_aa.Data();
    const char* delim = ", ";
    for (uint32_t i = 0; i < 256; i++)
    {
      ofs << i << delim;
      ofs << histo1[i] << delim << histo2[i] << '\n';
      if (histo1[i] != histo2[i])
      {
        compare_error = true;
        cout << " Histogram compare error at index " << i << '\n';
        cout << " counts: " << histo1[i] << delim << histo2[i] << '\n';
      }
    }
    if (!compare_error)
      cout << " Histograms are identical\n";
    ofs.close();
  }
}
int main()
{
  try
  {
    AvxBuildImageHistogram();
    AvxBuildImageHistogram_BM();
  }
  catch (...)
  {
    cout << "Unexpected exception has occurred\n";
    cout << "File = " << __FILE__ << '\n';
  }
  return 0;
}
;-------------------------------------------------
;        Ch07_07.asm
;-------------------------------------------------
    include <MacrosX86-64-AVX.asmh>
; extern bool AvxBuildImageHistogram_(uint32_t* histo, const uint8_t* pixel_buff, uint32_t num_pixels)
;
; Returns:   0 = invalid argument value, 1 = success
    .code
    extern c_NumPixelsMax:dword
AvxBuildImageHistogram_ proc frame
    _CreateFrame BIH_,1024,0,rbx,rsi,rdi
    _EndProlog
; Make sure num_pixels is valid
    xor eax,eax             ;set error code
    test r8d,r8d
    jz Done               ;jump if num_pixels is zero
    cmp r8d,[c_NumPixelsMax]
    ja Done               ;jump if num_pixels too big
    test r8d,1fh
    jnz Done              ;jump if num_pixels % 32 != 0
; Make sure histo & pixel_buff are properly aligned
    mov rsi,rcx             ;rsi = ptr to histo
    test rsi,0fh
    jnz Done              ;jump if histo misaligned
    mov r9,rdx
    test r9,0fh
    jnz Done              ;jump if pixel_buff misaligned
; Initialize local histogram buffers (set all entries to zero)
    xor eax,eax
    mov rdi,rsi             ;rdi = ptr to histo
    mov rcx,128             ;rcx = size in qwords
    rep stosq              ;zero histo
    mov rdi,rbp             ;rdi = ptr to histo2
    mov rcx,128             ;rcx = size in qwords
    rep stosq              ;zero histo2
; Perform processing loop initializations
    shr r8d,5              ;number of pixel blocks (32 pixels/block)
    mov rdi,rbp             ;ptr to histo2
; Build the histograms
    align 16              ;align jump target
@@:   vmovdqa xmm0,xmmword ptr [r9]    ;load pixel block
    vmovdqa xmm1,xmmword ptr [r9+16]  ;load pixel block
; Process pixels 0 - 3
    vpextrb rax,xmm0,0
    add dword ptr [rsi+rax*4],1     ;count pixel 0
    vpextrb rbx,xmm0,1
    add dword ptr [rdi+rbx*4],1     ;count pixel 1
    vpextrb rcx,xmm0,2
    add dword ptr [rsi+rcx*4],1     ;count pixel 2
    vpextrb rdx,xmm0,3
    add dword ptr [rdi+rdx*4],1     ;count pixel 3
; Process pixels 4 - 7
    vpextrb rax,xmm0,4
    add dword ptr [rsi+rax*4],1     ;count pixel 4
    vpextrb rbx,xmm0,5
    add dword ptr [rdi+rbx*4],1     ;count pixel 5
    vpextrb rcx,xmm0,6
    add dword ptr [rsi+rcx*4],1     ;count pixel 6
    vpextrb rdx,xmm0,7
    add dword ptr [rdi+rdx*4],1     ;count pixel 7
; Process pixels 8 - 11
    vpextrb rax,xmm0,8
    add dword ptr [rsi+rax*4],1     ;count pixel 8
    vpextrb rbx,xmm0,9
    add dword ptr [rdi+rbx*4],1     ;count pixel 9
    vpextrb rcx,xmm0,10
    add dword ptr [rsi+rcx*4],1     ;count pixel 10
    vpextrb rdx,xmm0,11
    add dword ptr [rdi+rdx*4],1     ;count pixel 11
; Process pixels 12 - 15
    vpextrb rax,xmm0,12
    add dword ptr [rsi+rax*4],1     ;count pixel 12
    vpextrb rbx,xmm0,13
    add dword ptr [rdi+rbx*4],1     ;count pixel 13
    vpextrb rcx,xmm0,14
    add dword ptr [rsi+rcx*4],1     ;count pixel 14
    vpextrb rdx,xmm0,15
    add dword ptr [rdi+rdx*4],1     ;count pixel 15
; Process pixels 16 - 19
    vpextrb rax,xmm1,0
    add dword ptr [rsi+rax*4],1     ;count pixel 16
    vpextrb rbx,xmm1,1
    add dword ptr [rdi+rbx*4],1     ;count pixel 17
    vpextrb rcx,xmm1,2
    add dword ptr [rsi+rcx*4],1     ;count pixel 18
    vpextrb rdx,xmm1,3
    add dword ptr [rdi+rdx*4],1     ;count pixel 19
; Process pixels 20 - 23
    vpextrb rax,xmm1,4
    add dword ptr [rsi+rax*4],1     ;count pixel 20
    vpextrb rbx,xmm1,5
    add dword ptr [rdi+rbx*4],1     ;count pixel 21
    vpextrb rcx,xmm1,6
    add dword ptr [rsi+rcx*4],1     ;count pixel 22
    vpextrb rdx,xmm1,7
    add dword ptr [rdi+rdx*4],1     ;count pixel 23
; Process pixels 24 - 27
    vpextrb rax,xmm1,8
    add dword ptr [rsi+rax*4],1     ;count pixel 24
    vpextrb rbx,xmm1,9
    add dword ptr [rdi+rbx*4],1     ;count pixel 25
    vpextrb rcx,xmm1,10
    add dword ptr [rsi+rcx*4],1     ;count pixel 26
    vpextrb rdx,xmm1,11
    add dword ptr [rdi+rdx*4],1     ;count pixel 27
; Process pixels 28 - 31
    vpextrb rax,xmm1,12
    add dword ptr [rsi+rax*4],1     ;count pixel 28
    vpextrb rbx,xmm1,13
    add dword ptr [rdi+rbx*4],1     ;count pixel 29
    vpextrb rcx,xmm1,14
    add dword ptr [rsi+rcx*4],1     ;count pixel 30
    vpextrb rdx,xmm1,15
    add dword ptr [rdi+rdx*4],1     ;count pixel 31
    add r9,32              ;r9 = next pixel block
    sub r8d,1
    jnz @B               ;repeat loop if not done
; Merge intermediate histograms into final histogram
    mov ecx,32             ;ecx = num iterations
    xor eax,eax             ;rax = common offset
@@:   vmovdqa xmm0,xmmword ptr [rsi+rax]     ;load histo counts
    vmovdqa xmm1,xmmword ptr [rsi+rax+16]
    vpaddd xmm0,xmm0,xmmword ptr [rdi+rax]   ;add counts from histo2
    vpaddd xmm1,xmm1,xmmword ptr [rdi+rax+16]
    vmovdqa xmmword ptr [rsi+rax],xmm0     ;save final result
    vmovdqa xmmword ptr [rsi+rax+16],xmm1
    add rax,32
    sub ecx,1
    jnz @B
    mov eax,1              ;set success return code
Done: _DeleteFrame rbx,rsi,rdi
    ret
AvxBuildImageHistogram_ endp
    end
Listing 7-7.

Example Ch07_07

Near the top of the C++ code is a function named AvxBuildImageHistogramCpp. This function constructs an image histogram using a rudimentary technique. Prior to the histogram’s actual construction, the number of image pixels is validated for size (greater than 0 and not greater than c_NumPixelMax) and divisibility by 32. The divisibility test is performed to ensure compatibility with the assembly language function AvxBuildImageHistogram_. Next, the addresses of histo and pixel_buff are verified for proper alignment. The call to memset initializes each histogram pixel count bin to zero. A simple for loop is then used to construct the histogram.

The function AvxBuildImageHistogram uses a C++ class named ImageMatrix to load the pixels of an image into memory. (The source code for ImageMatrix is not shown but included as part of the chapter download package.) The variables num_pixels and pixel_buff are then initialized using the member functions ImageMatrix::GetNumPixels and ImageMatrix::GetPixelBuffer. Two histogram buffers then are allocated using the C++ template class AlignedArray<uint32_t>. Following the construction of the histograms using the functions AvxBuildImageHistogramCpp and AvxBuildImageHistogram_, the pixel counts in the two histogram buffers are compared for equivalence and written to a comma-separated-value text file.

The assembly language function AvxBuildImageHistogram_ constructs an image histogram using the AVX instruction set. In order to improve performance, this function builds two intermediate histograms and merges them into a final histogram. AvxBuildImageHistogram_ begins by creating a stack frame using the _CreateFrame macro. Note that the stack frame created by _CreateFrame includes 1024 bytes (256 doublewords, one for each grayscale intensity level) of local storage space, which is used for one of the intermediate histogram buffers. Following the execution of the code generated by _CreateFrame, register RBP points to the intermediate histogram on the stack (see Figure 5-6). The caller-provided buffer histo is used as the second intermediate histogram buffer. Following the _EndProlog macro, the function AvxBuildImageHistogram_ validates num_pixels for size and divisibility by 32; it the checks the addresses of histo and pixel_buff for proper alignment. The count values in both intermediate histograms are then initialized to zero using the stosq instruction.

The main processing loop begins with two vmovdqa instructions that load 32 image pixels into registers XMM1:XMM0. Note that prior to the first vmovdqa instruction, the MASM directive align 16 is used to align this instruction on a 16-byte boundary. Aligning the target of a jump instruction on a 16-byte boundary is an optimization technique that often improves performance. Chapter 15 discusses this and other optimization techniques in greater detail. Next, a vpextrb rax,xmm0,0 instruction extracts pixel element 0 (i.e., XMM0[7:0]) from register XMM0 and copies it to the low-order bits of register RAX; the high-order bits of RAX are set to zero. The ensuing add dword ptr [rsi+rax*4],1 instruction updates the appropriate pixel count bin in the first intermediate histogram. The next two instructions, vpextrb rbx,xmm0,1 and add dword ptr [rdi+rbx*4],1, process pixel element 1 in the same manner using the second intermediate histogram. This pixel-processing technique is then repeated for the remaining pixels in the current block.

Following the execution of the processing loop, the pixel count values in the two intermediate histograms are summed using packed integer arithmetic to create the final histogram. The _DeleteFrame macro is then used to release the local stack frame and restore the previously-saved non-volatile general-purpose registers. Here is the output for source code example Ch07_07:
Results for AvxBuildImageHistogram
 Histograms are identical
Running benchmark function AvxBuildImageHistogram_BM - please wait
Benchmark times save to file Ch07_07_AvxBuildImageHistogram_BM_CHROMIUM.csv
Table 7-3 shows benchmark timing measurements for the histogram build functions.
Table 7-3.

Histogram Build Mean Execution Times (Microseconds) Using TestImage1.bmp

CPU

AvxBuildImageHistogramCpp

AvxBuildImageHistogram_

i7-4790S

277

230

i9-7900X

255

199

i7-8700K

241

191

Image Thresholding

Image thresholding is an image-processing technique that creates a binary image (i.e., an image with only two colors) from a grayscale image. This binary (or mask) image signifies which pixels in the original image are greater than a predetermined or algorithmically derived intensity threshold value. Figure 7-7 illustrates a thresholding operation. Mask images are often employed to perform additional calculations using the grayscale pixels values of the original image. For example, one typical use of the mask image that’s shown in Figure 7-7 is to compute the mean intensity value of all above-threshold pixels in the original image. The application of a mask image simplifies calculating the mean since it facilitates the use of simple Boolean expressions to exclude unwanted pixels from the computations.
../images/326959_2_En_7_Chapter/326959_2_En_7_Fig7_HTML.jpg
Figure 7-7.

Sample grayscale and mask images

Source code example Ch07_08 demonstrates how to calculate the mean intensity of image pixels above a specified threshold. It also shows how to call a C++ function from an assembly language function. Listing 7-8 shows the source code for example Ch07_08.
//------------------------------------------------
//        Ch07_08.h
//------------------------------------------------
#pragma once
#include <cstdint>
// Image threshold data structure. This structure must agree with the
// structure that's defined in Ch07_08_.asm
struct ITD
{
  uint8_t* m_PbSrc;         // Source image pixel buffer
  uint8_t* m_PbMask;        // Mask mask pixel buffer
  uint32_t m_NumPixels;       // Number of source image pixels
  uint32_t m_NumMaskedPixels;    // Number of masked pixels
  uint32_t m_SumMaskedPixels;    // Sum of masked pixels
  uint8_t m_Threshold;       // Image threshold value
  uint8_t m_Pad[3];         // Available for future use
  double m_MeanMaskedPixels;    // Mean of masked pixels
};
// Functions defined in Ch07_08.cpp
extern bool AvxThresholdImageCpp(ITD* itd);
extern bool AvxCalcImageMeanCpp(ITD* itd);
extern "C" bool IsValid(uint32_t num_pixels, const uint8_t* pb_src, const uint8_t* pb_mask);
// Functions defined in Ch07_08_.asm
extern "C" bool AvxThresholdImage_(ITD* itd);
extern "C" bool AvxCalcImageMean_(ITD* itd);
// Functions defined in Ch07_08_BM.cpp
extern void AvxThreshold_BM(void);
// Miscellaneous constants
const uint8_t c_TestThreshold = 96;
//------------------------------------------------
//        Ch07_08.cpp
//------------------------------------------------
#include "stdafx.h"
#include <cstdint>
#include <iostream>
#include <iomanip>
#include "Ch07_08.h"
#include "AlignedMem.h"
#include "ImageMatrix.h"
using namespace std;
extern "C" uint32_t c_NumPixelsMax = 16777216;
bool IsValid(uint32_t num_pixels, const uint8_t* pb_src, const uint8_t* pb_mask)
{
  const size_t alignment = 16;
  // Make sure num_pixels is valid
  if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))
    return false;
  if ((num_pixels % 64) != 0)
    return false;
  // Make sure image buffers are properly aligned
  if (!AlignedMem::IsAligned(pb_src, alignment))
    return false;
  if (!AlignedMem::IsAligned(pb_mask, alignment))
    return false;
  return true;
}
bool AvxThresholdImageCpp(ITD* itd)
{
  uint8_t* pb_src = itd->m_PbSrc;
  uint8_t* pb_mask = itd->m_PbMask;
  uint8_t threshold = itd->m_Threshold;
  uint32_t num_pixels = itd->m_NumPixels;
  // Verify pixel count and buffer alignment
  if (!IsValid(num_pixels, pb_src, pb_mask))
    return false;
  // Threshold the image
  for (uint32_t i = 0; i < num_pixels; i++)
    *pb_mask++ = (*pb_src++ > threshold) ? 0xff : 0x00;
  return true;
}
bool AvxCalcImageMeanCpp(ITD* itd)
{
  uint8_t* pb_src = itd->m_PbSrc;
  uint8_t* pb_mask = itd->m_PbMask;
  uint32_t num_pixels = itd->m_NumPixels;
  // Verify pixel count and buffer alignment
  if (!IsValid(num_pixels, pb_src, pb_mask))
    return false;
  // Calculate mean of masked pixels
  uint32_t sum_masked_pixels = 0;
  uint32_t num_masked_pixels = 0;
  for (uint32_t i = 0; i < num_pixels; i++)
  {
    uint8_t mask_val = *pb_mask++;
    num_masked_pixels += mask_val & 1;
    sum_masked_pixels += (*pb_src++ & mask_val);
  }
  itd->m_NumMaskedPixels = num_masked_pixels;
  itd->m_SumMaskedPixels = sum_masked_pixels;
  if (num_masked_pixels > 0)
    itd->m_MeanMaskedPixels = (double)sum_masked_pixels / num_masked_pixels;
  else
    itd->m_MeanMaskedPixels = -1.0;
  return true;
}
void AvxThreshold(void)
{
  const wchar_t* fn_src = L"..\\Ch07_Data\\TestImage2.bmp";
  const wchar_t* fn_mask1 = L"Ch07_08_AvxThreshold_TestImage2_Mask1.bmp";
  const wchar_t* fn_mask2 = L"Ch07_08_AvxThreshold_TestImage2_Mask2.bmp";
  ImageMatrix im_src(fn_src);
  int im_h = im_src.GetHeight();
  int im_w = im_src.GetWidth();
  ImageMatrix im_mask1(im_h, im_w, PixelType::Gray8);
  ImageMatrix im_mask2(im_h, im_w, PixelType::Gray8);
  ITD itd1, itd2;
  itd1.m_PbSrc = im_src.GetPixelBuffer<uint8_t>();
  itd1.m_PbMask = im_mask1.GetPixelBuffer<uint8_t>();
  itd1.m_NumPixels = im_src.GetNumPixels();
  itd1.m_Threshold = c_TestThreshold;
  itd2.m_PbSrc = im_src.GetPixelBuffer<uint8_t>();
  itd2.m_PbMask = im_mask2.GetPixelBuffer<uint8_t>();
  itd2.m_NumPixels = im_src.GetNumPixels();
  itd2.m_Threshold = c_TestThreshold;
  // Threshold image
  bool rc1 = AvxThresholdImageCpp(&itd1);
  bool rc2 = AvxThresholdImage_(&itd2);
  if (!rc1 || !rc2)
  {
    cout << "\nInvalid return code: ";
    cout << "rc1 = " << rc1 << ", rc2 = " << rc2 << '\n';
    return;
  }
  im_mask1.SaveToBitmapFile(fn_mask1);
  im_mask2.SaveToBitmapFile(fn_mask2);
  // Calculate mean of masked pixels
  rc1 = AvxCalcImageMeanCpp(&itd1);
  rc2 = AvxCalcImageMean_(&itd2);
  if (!rc1 || !rc2)
  {
    cout << "\nInvalid return code: ";
    cout << "rc1 = " << rc1 << ", rc2 = " << rc2 << '\n';
    return;
  }
  // Print results
  const int w = 12;
  cout << fixed << setprecision(4);
  cout << "\nResults for AvxThreshold\n\n";
  cout << "              C++    X86-AVX\n";
  cout << "---------------------------------------------\n";
  cout << "SumPixelsMasked:  ";
  cout << setw(w) << itd1.m_SumMaskedPixels << " ";
  cout << setw(w) << itd2.m_SumMaskedPixels << '\n';
  cout << "NumPixelsMasked:  ";
  cout << setw(w) << itd1.m_NumMaskedPixels << " ";
  cout << setw(w) << itd2.m_NumMaskedPixels << '\n';
  cout << "MeanMaskedPixels: ";
  cout << setw(w) << itd1.m_MeanMaskedPixels << " ";
  cout << setw(w) << itd2.m_MeanMaskedPixels << '\n';
}
int main()
{
  try
  {
    AvxThreshold();
    AvxThreshold_BM();
  }
  catch (...)
  {
    cout << "Unexpected exception has occurred\n";
  }
  return 0;
}
;-------------------------------------------------
;        Ch07_08.asm
;-------------------------------------------------
    include <MacrosX86-64-AVX.asmh>
; Image threshold data structure (see Ch07_08.h)
ITD         struct
PbSrc        qword ?
PbMask       qword ?
NumPixels      dword ?
NumMaskedPixels   dword ?
SumMaskedPixels   dword ?
Threshold      byte ?
Pad         byte 3 dup(?)
MeanMaskedPixels  real8 ?
ITD         ends
        .const
        align 16
PixelScale   byte 16 dup(80h)      ;uint8 to int8 scale value
CountPixelsMask byte 16 dup(01h)      ;mask to count pixels
R8_MinusOne   real8 -1.0         ;invalid mean value
        .code
        extern IsValid:proc
; extern "C" bool AvxThresholdImage_(ITD* itd);
;
; Returns:   0 = invalid size or unaligned image buffer, 1 = success
AvxThresholdImage_ proc frame
    _CreateFrame TI_,0,0,rbx
    _EndProlog
; Verify the arguments in the ITD structure
    mov rbx,rcx             ;copy itd ptr to non-volatile register
    mov ecx,[rbx+ITD.NumPixels]     ;ecx = num_pixels
    mov rdx,[rbx+ITD.PbSrc]       ;rdx = pb_src
    mov r8,[rbx+ITD.PbMask]       ;r8 = pb_mask
    sub rsp,32             ;allocate home area for IsValid
    call IsValid            ;validate args
    or al,al
    jz Done               ;jump if invalid
; Initialize registers for processing loop
    mov ecx,[rbx+ITD.NumPixels]       ;ecx = num_pixels
    shr ecx,6                ;ecx = number of 64b pixel blocks
    mov rdx,[rbx+ITD.PbSrc]         ;rdx = pb_src
    mov r8,[rbx+ITD.PbMask]         ;r8 = pb_mask
    movzx r9d,byte ptr [rbx+ITD.Threshold] ;r9d = threshold
    vmovd xmm1,r9d             ;xmm1[7:0] = threshold
    vpxor xmm0,xmm0,xmm0          ;mask for vpshufb
    vpshufb xmm1,xmm1,xmm0         ;xmm1 = packed threshold
    vmovdqa xmm4,xmmword ptr [PixelScale]  ;packed pixel scale factor
    vpsubb xmm5,xmm1,xmm4          ;scaled threshold
; Create the mask image
@@:   vmovdqa xmm0,xmmword ptr [rdx]   ;original image pixels
    vpsubb xmm1,xmm0,xmm4        ;scaled image pixels
    vpcmpgtb xmm2,xmm1,xmm5       ;mask pixels
    vmovdqa xmmword ptr [r8],xmm2    ;save mask result
    vmovdqa xmm0,xmmword ptr [rdx+16]
    vpsubb xmm1,xmm0,xmm4
    vpcmpgtb xmm2,xmm1,xmm5
    vmovdqa xmmword ptr [r8+16],xmm2
    vmovdqa xmm0,xmmword ptr [rdx+32]
    vpsubb xmm1,xmm0,xmm4
    vpcmpgtb xmm2,xmm1,xmm5
    vmovdqa xmmword ptr [r8+32],xmm2
    vmovdqa xmm0,xmmword ptr [rdx+48]
    vpsubb xmm1,xmm0,xmm4
    vpcmpgtb xmm2,xmm1,xmm5
    vmovdqa xmmword ptr [r8+48],xmm2
    add rdx,64
    add r8,64              ;update pointers
    sub ecx,1              ;update counter
    jnz @B               ;repeat until done
    mov eax,1              ;set success return code
Done:  _DeleteFrame rbx
    ret
AvxThresholdImage_ endp
;
; Macro _UpdateBlockSums
;
_UpdateBlockSums macro disp
    vmovdqa xmm0,xmmword ptr [rdx+disp] ;xmm0 = 16 image pixels
    vmovdqa xmm1,xmmword ptr [r8+disp] ;xmm1 = 16 mask pixels
    vpand xmm2,xmm1,xmm8        ;xmm2 = 16 mask pixels (0x00 or 0x01)
    vpaddb xmm6,xmm6,xmm2        ;update block num_masked_pixels
    vpand xmm2,xmm0,xmm1        ;zero out unmasked image pixel
    vpunpcklbw xmm3,xmm2,xmm9      ;promote image pixels from byte to word
    vpunpckhbw xmm4,xmm2,xmm9
    vpaddw xmm4,xmm4,xmm3
    vpaddw xmm7,xmm7,xmm4        ;update block sum_mask_pixels
    endm
; extern "C" bool AvxCalcImageMean_(ITD* itd);
;
; Returns: 0 = invalid image size or unaligned image buffer, 1 = success
AvxCalcImageMean_ proc frame
    _CreateFrame CIM_,0,64,rbx
    _SaveXmmRegs xmm6,xmm7,xmm8,xmm9
    _EndProlog
; Verify the arguments in the ITD structure
    mov rbx,rcx             ;rbx = itd ptr
    mov ecx,[rbx+ITD.NumPixels]     ;ecx = num_pixels
    mov rdx,[rbx+ITD.PbSrc]       ;rdx = pb_src
    mov r8,[rbx+ITD.PbMask]       ;r8 = pb_mask
    sub rsp,32             ;allocate home area for IsValid
    call IsValid            ;validate args
    or al,al
    jz Done               ;jump if invalid
; Initialize registers for processing loop
    mov ecx,[rbx+ITD.NumPixels]     ;ecx = num_pixels
    shr ecx,6              ;ecx = number of 64b pixel blocks
    mov rdx,[rbx+ITD.PbSrc]       ;rdx = pb_src
    mov r8,[rbx+ITD.PbMask]       ;r8 = pb_mask
    vmovdqa xmm8,xmmword ptr [CountPixelsMask] ;mask for counting pixels
    vpxor xmm9,xmm9,xmm9            ;xmm9 = packed zero
    xor r10d,r10d            ;r10d = num_masked_pixels (1 dword)
    vpxor xmm5,xmm5,xmm5        ;sum_masked_pixels (4 dwords)
;Calculate num_mask_pixels and sum_mask_pixels
LP1:  vpxor xmm6,xmm6,xmm6        ;num_masked_pixels_tmp (16 byte values)
    vpxor xmm7,xmm7,xmm7        ;sum_masked_pixels_tmp (8 word values)
    _UpdateBlockSums 0
    _UpdateBlockSums 16
    _UpdateBlockSums 32
    _UpdateBlockSums 48
; Update num_masked_pixels
    vpsrldq xmm0,xmm6,8
    vpaddb xmm6,xmm6,xmm0        ;num_mask_pixels_tmp (8 byte vals)
    vpsrldq xmm0,xmm6,4
    vpaddb xmm6,xmm6,xmm0        ;num_mask_pixels_tmp (4 byte vals)
    vpsrldq xmm0,xmm6,2
    vpaddb xmm6,xmm6,xmm0        ;num_mask_pixels_tmp (2 byte vals)
    vpsrldq xmm0,xmm6,1
    vpaddb xmm6,xmm6,xmm0        ;num_mask_pixels_tmp (1 byte val)
    vpextrb eax,xmm6,0
    add r10d,eax            ;num_mask_pixels += num_mask_pixels_tmp
; Update sum_masked_pixels
    vpunpcklwd xmm0,xmm7,xmm9      ;promote sum_mask_pixels_tmp to dwords
    vpunpckhwd xmm1,xmm7,xmm9
    vpaddd xmm5,xmm5,xmm0
    vpaddd xmm5,xmm5,xmm1        ;sum_mask_pixels += sum_masked_pixels_tmp
    add rdx,64             ;update pb_src pointer
    add r8,64              ;update pb_mask pointer
    sub rcx,1              ;update loop counter
    jnz LP1               ;repeat if not done
; Compute mean of masked pixels
    vphaddd xmm0,xmm5,xmm5
    vphaddd xmm1,xmm0,xmm0
    vmovd eax,xmm1           ;eax = final sum_mask_pixels
    test r10d,r10d           ;is num_mask_pixels zero?
    jz NoMean              ;if yes, skip calc of mean
    vcvtsi2sd xmm0,xmm0,eax       ;xmm0 = sum_masked_pixels
    vcvtsi2sd xmm1,xmm1,r10d      ;xmm1 = num_masked_pixels
    vdivsd xmm2,xmm0,xmm1        ;xmm2 = mean_masked_pixels
    jmp @F
NoMean: vmovsd xmm2,[R8_MinusOne]        ;use -1.0 for no mean
@@:   mov [rbx+ITD.SumMaskedPixels],eax    ;save sum_masked_pixels
    mov [rbx+ITD.NumMaskedPixels],r10d   ;save num_masked_pixels
    vmovsd [rbx+ITD.MeanMaskedPixels],xmm2 ;save mean
    mov eax,1                ;set success return code
Done:  _RestoreXmmRegs xmm6,xmm7,xmm8,xmm9
    _DeleteFrame rbx
    ret
AvxCalcImageMean_ endp
    end
Listing 7-8.

Example Ch07_08

The algorithm that’s used in example Ch07_08 consists of two phases. Phase 1 constructs the mask image that’s shown in Figure 7-7. Phase 2 computes the mean intensity of all pixels in the grayscale image whose corresponding mask image pixel is white (i.e., above the specified threshold). The file Ch07_08.h that’s shown in Listing 7-8 defines a structure named ITD that maintains data required by the algorithm. Note this structure contains two count values: m_NumPixels and m_NumMaskedPixels. The former value is the total number of image pixels, while the latter value represents the number of image pixels greater than m_Threshold.

The C++ code in Listing 7-8 contains separate thresholding and mean calculating functions. The function AvxThresholdImageCpp constructs the mask image by comparing each pixel in the grayscale image to the threshold value that’s specified by itd->m_Threshold. If a grayscale image pixel is greater than this value, its corresponding pixel in the mask image is set to 0xff; otherwise, the mask image pixel is set to 0x00. The function AvxCalcImageMeanCpp uses this mask image to calculate the mean intensity value of all grayscale image pixels greater than the threshold value. Note that the for loop in this function computes num_mask_pixels and sum_mask_pixels using simple Boolean expressions instead of logical compare operations. The former technique is often faster and easier to implement using SIMD arithmetic.

Listing 7-8 also shows assembly language implementations of the thresholding and mean calculating functions. Following its prolog, the function AvxThresholdImage_ validates the arguments in the supplied ITD structure by calling the C++ function IsValid. Prior to the call instruction, AvxThresholdImage_ loads the required argument values for IsValid into the appropriate registers and allocates a home area using a sub rsp,32 instruction. After argument validation, the movzx r9d,byte ptr [rbx+ITD.Threshold] instruction loads the threshold value into register R9D. The ensuing vpshufb xmm1,xmm1,xmm0 instruction “broadcasts” the threshold value to all byte positions in register XMM1. The vpshufb instruction uses the low-order four bits of each byte in the second source operand as an index to permute the bytes in the destination operand (a zero is copied if the high-order bit is set in the seconds source operand byte). Figure 7-8 illustrates this process. The packed threshold value is then scaled using a vpsubb instruction. The reason for doing this is explained in the next paragraph.
../images/326959_2_En_7_Chapter/326959_2_En_7_Fig8_HTML.jpg
Figure 7-8.

Execution examples of the instruction vpshufb

The processing loop in function AvxThreshholdImage_ uses the vpcmpgtb (Compare Packed Signed Integers for Greater Than) instruction to create the mask image. This instruction performs pairwise compares of the byte elements in the two source operands. If a byte in the first source operand is greater than the corresponding byte in the second operand, the destination operand byte is set to 0xff; otherwise, the destination operand byte is set to 0x00. Figure 7-9 illustrates execution of the vpcmpgtb instruction. It is important to note that vpcmpgtb executes its compares using signed integer arithmetic. This means that the pixels values in the grayscale image, which are unsigned byte values, must be re-scaled for compatibility with the vpcmpgtb instruction. The vpsubb instruction remaps the image’s grayscale pixels values from [0, 255] to [-128, 127]. This is also the reason that a vpsubb instruction was used on the packed threshold value prior to the start of the loop. Following each compare operation, the vmovdqa instruction saves the mask pixels to the specified buffer. Similar to example Ch07_04, the function AvxThresholdImage_ uses a partially unrolled processing loop to handle 64 pixels per iteration.
../images/326959_2_En_7_Chapter/326959_2_En_7_Fig9_HTML.jpg
Figure 7-9.

Execution of the instruction vpcmpgtb

The assembly language function AvxCalcImageMean_ also begins by validating its arguments using the C++ function IsValid. Following argument validation, the xor r10d,r10d and vpxor xmm5,xmm5,xmm5 instructions initialize num_masked_pixels and sum_masked_pixels (four doublewords) to zero, respectively. The processing loop in function AvxCalcImageMean_ uses a macro named _UpdateBlockSums to compute the intermediate values num_masked_pixels_tmp and sum_masked_pixels_tmp for a block of 64 pixels. This macro performs its calculations using packed byte and word arithmetic, which reduces the number of byte to doubleword size-promotions that must be carried out. Figure 7-10 illustrates the arithmetic and Boolean operations that are performed by _UpdateBlockSums. The values num_masked_pixels (R10D) and sum_masked_pixels (XMM5) are then updated and the processing loop repeats until all pixels have been processed.
../images/326959_2_En_7_Chapter/326959_2_En_7_Fig10_HTML.jpg
Figure 7-10.

Masked pixel sum and pixel count calculations performed by macro _UpdateBlockSums

Following completion of its processing loop, function AvxCalcImageMean_ calculates the final mean intensity value using scalar double-precision floating-point arithmetic. Note that num_mask_pixels is tested prior to calculating the mean in order to avoid a division-by-zero error. Here is the output for source code example Ch07_08:
Results for AvxThreshold
              C++    X86-AVX
---------------------------------------------
SumPixelsMasked:    23813043   23813043
NumPixelsMasked:     138220    138220
MeanMaskedPixels:   172.2836   172.2836
Running benchmark function AvxThreshold_BM - please wait
Benchmark times save to file Ch07_08_AvxThreshold_BM_CHROMIUM.csv
Table 7-4 shows timing measurements for the source code example Ch07_08. Note that the measurements in this table are for an entire image thresholding and mean calculation sequence.
Table 7-4.

Mean Execution Times (Microseconds) to Perform Image Thresholding and Mean Calculation Using TestImage2.bmp

CPU

C++

Assembly Language

i7-4790S

289

50

i9-7900X

250

40

i7-8700K

242

39

Summary

Here are the key learning points for Chapter 7:
  • The vpadd[b|w|d|q] instructions perform packed addition. The vpadds[b|w] and vpaddus[b|w] instructions perform packed signed and unsigned saturated addition.

  • The vpsub[b|w|d|q] instructions perform packed subtraction. The vpsubs[b|w] and vpsubus[b|w] instructions perform packed signed and unsigned saturated subtraction.

  • The vpmul[h|l]w instructions carry out multiplication using packed word operands. The vpmuldq and vpmulld instructions carry out multiplication using packed doubleword operands.

  • The vpsll[w|d|q] and vpsrl[w|d|q] instructions execute logical left and right shifts using packed operands. The vpsra[w|d|q] instructions execute arithmetic right shifts using packed operands. The vps[l|r]dq instructions execute logical left and right shifts using 128-bit wide operands.

  • Assembly language functions can use the vpand, vpor, and vpxor instructions to perform bitwise AND, inclusive OR, and exclusive OR operations using packed integer operands.

  • The instructions vpextr[b|w|d|q] extract an element value from a packed operand. The vpinsr[b|w|d|q] instructions insert an element value into a packed operand.

  • The vpunpckl[bw|dw|dq] and vpunpckh[bw|dw|dq] instructions unpack and interleave the contents of their two source operands. These instructions are frequently used to size-promote packed integer operands. The vpackus[bw|dw] instructions size-reduce packed integer operands using unsigned saturated arithmetic.

  • The vpminu[b|w|d] and vpmaxu[b|w|d] instructions perform packed unsigned integer minimum-maximum compares.

  • The vpshufb instruction rearranges the bytes of a packed operand according to a control mask.

  • The vpcmpgt[b|w|d|q] instructions perform signed integer greater than compares using packed operands.

  • Aligning the target of a jump instruction to a 16-byte boundary often results in faster executing for loops.