Daniel KusswurmModern X86 Assembly Language Programminghttps://doi.org/10.1007/978-1-4842-4063-2_7

7. AVX Programming – Packed Integers

Daniel Kusswurm¹

(1)

Geneva, IL, USA

In the previous chapter, you learned how to use the AVX instruction set to perform calculations using packed floating-point operands. In this chapter, you learn how to carry out computations using packed integer operands. Similar to the previous chapter, the first few source code examples in this chapter demonstrate basic arithmetic operations using packed integers. The remaining source code examples illustrate how to use the computational resources of AVX to perform common image processing operations, including histogram creation and thresholding.

AVX supports packed integer operations using 128-bit wide operands, and that is the focus of the source code examples in this chapter. Performing packed integer operations using 256-bit operands requires a processor that supports AVX2. You learn about AVX2 programming with packed integers in Chapter 10.

Packed Integer Addition and Subtraction

Listing 7-1 shows the C++ and assembly language source code for example Ch07_01. This example demonstrates how to perform packed integer addition and subtraction using signed and unsigned 16-bit integers. It also illustrates both wraparound and saturated arithmetic.

//------------------------------------------------

// Ch07_01.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <string>

#include "XmmVal.h"

using namespace std;

extern "C" void AvxPackedAddI16_(const XmmVal& a, const XmmVal& b, XmmVal c[2]);

extern "C" void AvxPackedSubI16_(const XmmVal& a, const XmmVal& b, XmmVal c[2]);

extern "C" void AvxPackedAddU16_(const XmmVal& a, const XmmVal& b, XmmVal c[2]);

extern "C" void AvxPackedSubU16_(const XmmVal& a, const XmmVal& b, XmmVal c[2]);

// Signed packed addition and subtraction

void AvxPackedAddI16(void)

{

alignas(16) XmmVal a;

alignas(16) XmmVal b;

alignas(16) XmmVal c[2];

a.m_I16[0] = 10; b.m_I16[0] = 100;

a.m_I16[1] = 200; b.m_I16[1] = -200;

a.m_I16[2] = 30; b.m_I16[2] = 32760;

a.m_I16[3] = -32766; b.m_I16[3] = -400;

a.m_I16[4] = 50; b.m_I16[4] = 500;

a.m_I16[5] = 60; b.m_I16[5] = -600;

a.m_I16[6] = 32000; b.m_I16[6] = 1200;

a.m_I16[7] = -32000; b.m_I16[7] = -950;

AvxPackedAddI16_(a, b, c);

cout << "\nResults for AxvPackedAddI16 - Wraparound Addition\n";

cout << "a: " << a.ToStringI16() << '\n';

cout << "b: " << b.ToStringI16() << '\n';

cout << "c[0]: " << c[0].ToStringI16() << '\n';

cout << "\nResults for AxvPackedAddI16 - Saturated Addition\n";

cout << "a: " << a.ToStringI16() << '\n';

cout << "b: " << b.ToStringI16() << '\n';

cout << "c[1]: " << c[1].ToStringI16() << '\n';

}

void AvxPackedSubI16(void)

{

alignas(16) XmmVal a;

alignas(16) XmmVal b;

alignas(16) XmmVal c[2];

a.m_I16[0] = 10; b.m_I16[0] = 100;

a.m_I16[1] = 200; b.m_I16[1] = -200;

a.m_I16[2] = -30; b.m_I16[2] = 32760;

a.m_I16[3] = -32766; b.m_I16[3] = 400;

a.m_I16[4] = 50; b.m_I16[4] = 500;

a.m_I16[5] = 60; b.m_I16[5] = -600;

a.m_I16[6] = 32000; b.m_I16[6] = 1200;

a.m_I16[7] = -32000; b.m_I16[7] = 950;

AvxPackedSubI16_(a, b, c);

cout << "\nResults for AxvPackedSubI16 - Wraparound Subtraction\n";

cout << "a: " << a.ToStringI16() << '\n';

cout << "b: " << b.ToStringI16() << '\n';

cout << "c[0]: " << c[0].ToStringI16() << '\n';

cout << "\nResults for AxvPackedSubI16 - Saturated Subtraction\n";

cout << "a: " << a.ToStringI16() << '\n';

cout << "b: " << b.ToStringI16() << '\n';

cout << "c[1]: " << c[1].ToStringI16() << '\n';

}

// Unsigned packed addition and subtraction

void AvxPackedAddU16(void)

{

XmmVal a;

XmmVal b;

XmmVal c[2];

a.m_U16[0] = 10; b.m_U16[0] = 100;

a.m_U16[1] = 200; b.m_U16[1] = 200;

a.m_U16[2] = 300; b.m_U16[2] = 65530;

a.m_U16[3] = 32766; b.m_U16[3] = 40000;

a.m_U16[4] = 50; b.m_U16[4] = 500;

a.m_U16[5] = 20000; b.m_U16[5] = 25000;

a.m_U16[6] = 32000; b.m_U16[6] = 1200;

a.m_U16[7] = 32000; b.m_U16[7] = 50000;

AvxPackedAddU16_(a, b, c);

cout << "\nResults for AxvPackedAddU16 - Wraparound Addition\n";

cout << "a: " << a.ToStringU16() << '\n';

cout << "b: " << b.ToStringU16() << '\n';

cout << "c[0]: " << c[0].ToStringU16() << '\n';

cout << "\nResults for AxvPackedAddU16 - Saturated Addition\n";

cout << "a: " << a.ToStringU16() << '\n';

cout << "b: " << b.ToStringU16() << '\n';

cout << "c[1]: " << c[1].ToStringU16() << '\n';

}

void AvxPackedSubU16(void)

{

XmmVal a;

XmmVal b;

XmmVal c[2];

a.m_U16[0] = 10; b.m_U16[0] = 100;

a.m_U16[1] = 200; b.m_U16[1] = 200;

a.m_U16[2] = 30; b.m_U16[2] = 7;

a.m_U16[3] = 65000; b.m_U16[3] = 5000;

a.m_U16[4] = 60; b.m_U16[4] = 500;

a.m_U16[5] = 25000; b.m_U16[5] = 28000;

a.m_U16[6] = 32000; b.m_U16[6] = 1200;

a.m_U16[7] = 1200; b.m_U16[7] = 950;

AvxPackedSubU16_(a, b, c);

cout << "\nResults for AxvPackedSubU16 - Wraparound Subtraction\n";

cout << "a: " << a.ToStringU16() << '\n';

cout << "b: " << b.ToStringU16() << '\n';

cout << "c[0]: " << c[0].ToStringU16() << '\n';

cout << "\nResults for AxvPackedSubI16 - Saturated Subtraction\n";

cout << "a: " << a.ToStringU16() << '\n';

cout << "b: " << b.ToStringU16() << '\n';

cout << "c[1]: " << c[1].ToStringU16() << '\n';

}

int main()

{

string sep(75, '-');

AvxPackedAddI16();

AvxPackedSubI16();

cout << '\n' << sep << '\n';

AvxPackedAddU16();

AvxPackedSubU16();

return 0;

}

;-------------------------------------------------

; Ch07_01.asm

;-------------------------------------------------

; extern "C" void AvxPackedAddI16_(const XmmVal& a, const XmmVal& b, XmmVal c[2])

.code

AvxPackedAddI16_ proc

; Packed signed word addition

vmovdqa xmm0,xmmword ptr [rcx] ;xmm0 = a

vmovdqa xmm1,xmmword ptr [rdx] ;xmm1 = b

vpaddw xmm2,xmm0,xmm1 ;packed add - wraparound

vpaddsw xmm3,xmm0,xmm1 ;packed add - saturated

vmovdqa xmmword ptr [r8],xmm2 ;save c[0]

vmovdqa xmmword ptr [r8+16],xmm3 ;save c[1]

ret

AvxPackedAddI16_ endp

; extern "C" void AvxPackedSubI16_(const XmmVal& a, const XmmVal& b, XmmVal c[2])

AvxPackedSubI16_ proc

; Packed signed word subtraction

vmovdqa xmm0,xmmword ptr [rcx] ;xmm0 = a

vmovdqa xmm1,xmmword ptr [rdx] ;xmm1 = b

vpsubw xmm2,xmm0,xmm1 ;packed sub - wraparound

vpsubsw xmm3,xmm0,xmm1 ;packed sub - saturated

vmovdqa xmmword ptr [r8],xmm2 ;save c[0]

vmovdqa xmmword ptr [r8+16],xmm3 ;save c[1]

ret

AvxPackedSubI16_ endp

; extern "C" void AvxPackedAddU16_(const XmmVal& a, const XmmVal& b, XmmVal c[2])

AvxPackedAddU16_ proc

; Packed unsigned word addition

vmovdqu xmm0,xmmword ptr [rcx] ;xmm0 = a

vmovdqu xmm1,xmmword ptr [rdx] ;xmm1 = b

vpaddw xmm2,xmm0,xmm1 ;packed add - wraparound

vpaddusw xmm3,xmm0,xmm1 ;packed add - saturated

vmovdqu xmmword ptr [r8],xmm2 ;save c[0]

vmovdqu xmmword ptr [r8+16],xmm3 ;save c[1]

ret

AvxPackedAddU16_ endp

; extern "C" void AvxPackedSubU16_(const XmmVal& a, const XmmVal& b, XmmVal c[2])

AvxPackedSubU16_ proc

; Packed unsigned word subtraction

vmovdqu xmm0,xmmword ptr [rcx] ;xmm0 = a

vmovdqu xmm1,xmmword ptr [rdx] ;xmm1 = b

vpsubw xmm2,xmm0,xmm1 ;packed sub - wraparound

vpsubusw xmm3,xmm0,xmm1 ;packed sub - saturated

vmovdqu xmmword ptr [r8],xmm2 ;save c[0]

vmovdqu xmmword ptr [r8+16],xmm3 ;save c[1]

ret

AvxPackedSubU16_ endp

end

Listing 7-1.

Example Ch07_01

Toward the top of the C++ code are the declarations for the assembly language functions that perform packed integer addition and subtraction. Each function takes two XmmVal arguments and saves its results to an XmmVal array. The structure XmmVal, which you learned about in Chapter 6 (see Listing 6-1), contains a publicly-accessible anonymous union with members that correspond to the packed data types that can be used with an XMM register. The XmmVal structure also defines several member functions that format the contents of an XmmVal for display.

The C++ function AvxPackedAddI16 contains test code that exercises the assembly language function AvxPackedAddI16_. This function performs packed signed 16-bit integer (word) addition using both wraparound and saturated arithmetic. Note that the XmmVal variables a, b, and c are all defined using the C++ specifier alignas(16), which aligns each XmmVal to a 16-byte boundary. Following the execution of the function AvxPackedaddI16_, the results are displayed using a series of stream writes to cout. The C++ function AvxPackedSubI16, which is similar to AvxPackedAddI16, uses the assembly language function AvxPackedSubI16_.

A parallel set of C++ functions, AvxPackedAddU16 and AvxPackedSubU16, contain code that exercise the assembly language functions AvxPackedAddU16_ and AvxPackedSubU16_. These functions perform packed unsigned 16-bit integer addition and subtraction, respectively. Note that the XmmVal variables in AvxPackedAddU16 and AvxPackedSubU16 do not use the alignas(16) specifier, which means that these values are not guaranteed to be aligned on a 16-byte boundary. The reason for doing this is to demonstrate the use of the AVX instruction vmovdqu (Move Unaligned Packed Integer Values), as you’ll soon see.

The assembly language function AvxPackedAddI6_ starts with a vmovdqa xmm0,xmmword ptr [rcx] instruction that loads argument value a into register XMM0. The ensuing vmovdqa xmm1,xmmword ptr [rdx] instruction copies b into register XMM1. The next two instructions, vpaddw xmm2,xmm0,xmm1 and vpaddsw xmm3,xmm0,xmm1, carry out packed signed 16-bit integer addition using wrapround and saturated arithmetic, respectively. The final two vmovdqa instructions save the calculated results to XmmVal array c. Assembly language function AvxPackedSubI16_ is similar to AvxPackedAddI16_ and uses the instructions vpsubw and vpsubsw to carry out packed signed 16-bit integer subtraction.

The assembly language function AvxPackedAddU16_ begins with a vmovdqu xmm0,xmmword ptr [rcx] instruction that loads a into register XMM0. A vmovdqu instruction is used here since XmmVal a was defined in the C++ code without the alignas(16) specifier. Note that function AvxPackedAddU16_ uses vmovdqu for demonstration purposes only; a properly aligned XmmVal and a vmovdqa instruction should have been used instead. It’s already been mentioned a number of times in this book but warrants repeating due to its importance: SIMD operands in memory should be properly aligned whenever possible in order to avoid potential performance penalties that can occur whenever the processor accesses an unaligned operand in memory.

Following the loading of argument values a and b into register XMM0 and XMM1, function AvxPackedAddU16_ performs packed unsigned 16-bit integer addition using the instructions vpaddw xmm2,xmm0,xmm1 (wraparound arithmetic) and vpaddusw xmm3,xmm0,xmm1 (saturated arithmetic). Two vmovdqu instructions save the results to array c. The function AvxPackedSubU16_ implements packed unsigned 16-bit integer subtraction using the vpsubw and vpsubusw instructions. This function also uses the vmovdqu instruction to load argument values and save results. Here are the results for source code example Ch07_01:

Results for AxvPackedAddI16 - Wraparound Addition

a: 10 200 30 -32766 | 50 60 32000 -32000

b: 100 -200 32760 -400 | 500 -600 1200 -950

c[0]: 110 0 -32746 32370 | 550 -540 -32336 32586

Results for AxvPackedAddI16 - Saturated Addition

a: 10 200 30 -32766 | 50 60 32000 -32000

b: 100 -200 32760 -400 | 500 -600 1200 -950

c[1]: 110 0 32767 -32768 | 550 -540 32767 -32768

Results for AxvPackedSubI16 - Wraparound Subtraction

a: 10 200 -30 -32766 | 50 60 32000 -32000

b: 100 -200 32760 400 | 500 -600 1200 950

c[0]: -90 400 32746 32370 | -450 660 30800 32586

Results for AxvPackedSubI16 - Saturated Subtraction

a: 10 200 -30 -32766 | 50 60 32000 -32000

b: 100 -200 32760 400 | 500 -600 1200 950

c[1]: -90 400 -32768 -32768 | -450 660 30800 -32768

---------------------------------------------------------------------------

Results for AxvPackedAddU16 - Wraparound Addition

a: 10 200 300 32766 | 50 20000 32000 32000

b: 100 200 65530 40000 | 500 25000 1200 50000

c[0]: 110 400 294 7230 | 550 45000 33200 16464

Results for AxvPackedAddU16 - Saturated Addition

a: 10 200 300 32766 | 50 20000 32000 32000

b: 100 200 65530 40000 | 500 25000 1200 50000

c[1]: 110 400 65535 65535 | 550 45000 33200 65535

Results for AxvPackedSubU16 - Wraparound Subtraction

a: 10 200 30 65000 | 60 25000 32000 1200

b: 100 200 7 5000 | 500 28000 1200 950

c[0]: 65446 0 23 60000 | 65096 62536 30800 250

Results for AxvPackedSubI16 - Saturated Subtraction

a: 10 200 30 65000 | 60 25000 32000 1200

b: 100 200 7 5000 | 500 28000 1200 950

c[1]: 0 0 23 60000 | 0 0 30800 250

AVX also supports packed integer addition and subtraction using 8-, 32-, and 64-bit integers. The vpaddb, vpaddsb, vpaddusb, vpsubb, vpsubsb, and vpsubusb instructions are the 8-bit (byte) versions of the packed 16-bit instructions that were demonstrated in this example. The vpadd[d|q] and vpsub[d|q] instructions can be employed to perform packed 32-bit (doubleword) or 64-bit (quadword) addition and subtraction using wraparound arithmetic. AVX does not support saturated addition and subtraction using packed doubleword or quadword integers.

Packed Integer Shifts

The next source code example is named Ch07_02. This example illustrates how to perform logical and arithmetic shift operations using packed integer operands. Listing 7-2 shows the C++ and assembly language source code for example Ch07_02.

//------------------------------------------------

// Ch07_02.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include "XmmVal.h"

using namespace std;

// The order of the name constants in the following enum must

// correspond to the table values defined in .asm file.

enum ShiftOp : unsigned int

{

U16_LL, // shift left logical - word

U16_RL, // shift right logical - word

U16_RA, // shift right arithmetic - word

U32_LL, // shift left logical - doubleword

U32_RL, // shift right logical - doubleword

U32_RA, // shift right arithmetic - doubleword

};

extern "C" bool AvxPackedIntegerShift_(XmmVal& b, const XmmVal& a, ShiftOp shift_op, unsigned int count);

void AvxPackedIntegerShiftU16(void)

{

unsigned int count = 2;

alignas(16) XmmVal a;

alignas(16) XmmVal b;

a.m_U16[0] = 0x1234;

a.m_U16[1] = 0xFF00;

a.m_U16[2] = 0x00CC;

a.m_U16[3] = 0x8080;

a.m_U16[4] = 0x00FF;

a.m_U16[5] = 0xAAAA;

a.m_U16[6] = 0x0F0F;

a.m_U16[7] = 0x0101;

AvxPackedIntegerShift_(b, a, U16_LL, count);

cout << "\nResults for ShiftOp::U16_LL (count = " << count << ")\n";

cout << "a: " << a.ToStringX16() << '\n';

cout << "b: " << b.ToStringX16() << '\n';

AvxPackedIntegerShift_(b, a, U16_RL, count);

cout << "\nResults for ShiftOp::U16_RL (count = " << count << ")\n";

cout << "a: " << a.ToStringX16() << '\n';

cout << "b: " << b.ToStringX16() << '\n';

AvxPackedIntegerShift_(b, a, U16_RA, count);

cout << "\nResults for ShiftOp::U16_RA (count = " << count << ")\n";

cout << "a: " << a.ToStringX16() << '\n';

cout << "b: " << b.ToStringX16() << '\n';

}

void AvxPackedIntegerShiftU32(void)

{

unsigned int count = 4;

alignas(16) XmmVal a;

alignas(16) XmmVal b;

a.m_U32[0] = 0x12345678;

a.m_U32[1] = 0xFF00FF00;

a.m_U32[2] = 0x03030303;

a.m_U32[3] = 0x80800F0F;

AvxPackedIntegerShift_(b, a, U32_LL, count);

cout << "\nResults for ShiftOp::U32_LL (count = " << count << ")\n";

cout << "a: " << a.ToStringX32() << '\n';

cout << "b: " << b.ToStringX32() << '\n';

AvxPackedIntegerShift_(b, a, U32_RL, count);

cout << "\nResults for ShiftOp::U32_RL (count = " << count << ")\n";

cout << "a: " << a.ToStringX32() << '\n';

cout << "b: " << b.ToStringX32() << '\n';

AvxPackedIntegerShift_(b, a, U32_RA, count);

cout << "\nResults for ShiftOp::U32_RA (count = " << count << ")\n";

cout << "a: " << a.ToStringX32() << '\n';

cout << "b: " << b.ToStringX32() << '\n';

}

int main(void)

{

string sep(75, '-');

AvxPackedIntegerShiftU16();

cout << '\n' << sep << '\n';

AvxPackedIntegerShiftU32();

return 0;

}

;-------------------------------------------------

; Ch07_02.asm

;-------------------------------------------------

; extern "C" bool AvxPackedIntegerShift_(XmmVal& b, const XmmVal& a, ShiftOp shift_op, unsigned int count)

;

; Returns: 0 = invalid shift_op argument, 1 = success

;

; Note: This module requires linker option /LARGEADDRESSAWARE:NO

; to be explicitly set.

.code

AvxPackedIntegerShift_ proc

; Make sure 'shift_op' is valid

mov r8d,r8d ;zero extend shift_op

cmp r8,ShiftOpTableCount ;compare against table count

jae Error ;jump if shift_op is invalid

; Jump to the operation specified by shift_op

vmovdqa xmm0,xmmword ptr [rdx] ;xmm0 = a

vmovd xmm1,r9d ;xmm1[31:0] = shift count

mov eax,1 ;set success return code

jmp [ShiftOpTable+r8*8]

; Packed shift left logical - word

U16_LL: vpsllw xmm2,xmm0,xmm1

vmovdqa xmmword ptr [rcx],xmm2

ret

; Packed shift right logical - word

U16_RL: vpsrlw xmm2,xmm0,xmm1

vmovdqa xmmword ptr [rcx],xmm2

ret

; Packed shift right arithmetic - word

U16_RA: vpsraw xmm2,xmm0,xmm1

vmovdqa xmmword ptr [rcx],xmm2

ret

; Packed shift left logical - doubleword

U32_LL: vpslld xmm2,xmm0,xmm1

vmovdqa xmmword ptr [rcx],xmm2

ret

; Packed shift right logical - doubleword

U32_RL: vpsrld xmm2,xmm0,xmm1

vmovdqa xmmword ptr [rcx],xmm2

ret

; Packed shift right arithmetic - doubleword

U32_RA: vpsrad xmm2,xmm0,xmm1

vmovdqa xmmword ptr [rcx],xmm2

ret

Error: xor eax,eax ;set error code

vpxor xmm0,xmm0,xmm0

vmovdqa xmmword ptr [rcx],xmm0 ;set result to zero

ret

; The order of the labels in the following table must correspond

; to the enums that are defined in .cpp file.

align 8

ShiftOpTable qword U16_LL, U16_RL, U16_RA

qword U32_LL, U32_RL, U32_RA

ShiftOpTableCount equ ($ - ShiftOpTable) / size qword

AvxPackedIntegerShift_ endp

end

Listing 7-2.

Example Ch07_02

The C++ code that’s shown in Listing 7-2 begins with the definition of an enum named ShiftOp, which is used to select a shift operation. Supported shift operations include logical left, logical right, and arithmetic right using packed word and doubleword values. Following enum ShiftOp is the declaration for the function AvxPackedIntegerShift_. This function carries out the requested shift operation using the supplied XmmVal argument and the specified count value. The C++ functions AvxPackedIntegerShiftU16 and AvxPackedIntegerShiftU32 initialize test cases for performing various shift operations using packed words and doublewords, respectively.

Assembly language function AvxPackedIntegerShift_ uses a jump table to execute the specified shift operation. This is similar to what you saw in source code examples Ch05_06 (Chapter 5) and Ch06_03 (Chapter 6). Upon entry to AvxPackedIntegerShift_, the argument value shift_op is tested for validity. Following validation of shift_op, a vmovdqa xmm0,xmmword ptr [rdx] instruction loads a into register XMM0. The subsequent vmovd xmm1,r9d instruction copies argument value count into the low-order doubleword of register XMM1. This is followed by a jmp [ShiftOpTable+r8*8] instruction that transfers program control to the appropriate code block.

Each distinct code block in AvxPackedIntegerShift_ performs a particular shift operation. For example, the code block adjacent to the label U16_LL uses the AVX instruction vpsllw xmm2,xmm0,xmm1 to perform a logical left shift using packed words. It is important to note that every word element in XMM0 is independently shifted left by the number of bits specified in XMM1[31:0]. The code blocks adjacent to the labels U16_RL and U16_RA carry out logical and arithmetic right shifts of packed words using the instructions vpsrlw and vpsraw, respectively. The function AvxPackedIntegerShift_ employs a similar structure to perform packed shift operations on doublewords using the instructions vpslld, vpslrd, and vpsrad. All of the code blocks in AvxPackedIntegerShift_ conclude with a vmovdqa xmmword ptr [rcx],xmm2 instruction that saves the calculated result. Here is the output for source code example Ch07_02:

Results for ShiftOp::U16_LL (count = 2)

a: 1234 FF00 00CC 8080 | 00FF AAAA 0F0F 0101

b: 48D0 FC00 0330 0200 | 03FC AAA8 3C3C 0404

Results for ShiftOp::U16_RL (count = 2)

a: 1234 FF00 00CC 8080 | 00FF AAAA 0F0F 0101

b: 048D 3FC0 0033 2020 | 003F 2AAA 03C3 0040

Results for ShiftOp::U16_RA (count = 2)

a: 1234 FF00 00CC 8080 | 00FF AAAA 0F0F 0101

b: 048D FFC0 0033 E020 | 003F EAAA 03C3 0040

---------------------------------------------------------------------------

Results for ShiftOp::U32_LL (count = 4)

a: 12345678 FF00FF00 | 03030303 80800F0F

b: 23456780 F00FF000 | 30303030 0800F0F0

Results for ShiftOp::U32_RL (count = 4)

a: 12345678 FF00FF00 | 03030303 80800F0F

b: 01234567 0FF00FF0 | 00303030 080800F0

Results for ShiftOp::U32_RA (count = 4)

a: 12345678 FF00FF00 | 03030303 80800F0F

b: 01234567 FFF00FF0 | 00303030 F80800F0

The AVX instructions vpsllq, vpslrq, and vpsraq can be used to perform shift operations using packed quadwords. Somewhat surprisingly, AVX does not support shift operations using packed byte operands. AVX also includes the shift instructions vps[l|r]dq, which carry out logical left or logical right byte shifts of a 128-bit wide operand in an XMM register. You’ll see how these instructions work in the next section.

Packed Integer Multiplication

Besides packed integer addition and subtraction, AVX also includes instructions that perform packed integer multiplication. These instructions are slightly different than the corresponding packed addition and subtraction instructions. Part of this difference is due to the fact that in order to calculate a non-truncated product, integer multiplication requires a destination operand to be twice the size of the original source operands. For example, the non-truncated product of two signed 16-bit integers is a signed 32-bit integer. Listing 7-3 shows the source code for example Ch07_03. This example demonstrates how to perform packed integer multiplication using signed 16-bit and 32-bit integers.

//------------------------------------------------

// Ch07_03.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include "XmmVal.h"

using namespace std;

extern "C" void AvxPackedMulI16_(XmmVal c[2], const XmmVal& a, const XmmVal& b);

extern "C" void AvxPackedMulI32A_(XmmVal c[2], const XmmVal& a, const XmmVal& b);

extern "C" void AvxPackedMulI32B_(XmmVal* c, const XmmVal& a, const XmmVal& b);

void AvxPackedMulI16(void)

{

alignas(16) XmmVal a;

alignas(16) XmmVal b;

alignas(16) XmmVal c[2];

a.m_I16[0] = 10; b.m_I16[0] = -5;

a.m_I16[1] = 3000; b.m_I16[1] = 100;

a.m_I16[2] = -2000; b.m_I16[2] = -9000;

a.m_I16[3] = 42; b.m_I16[3] = 1000;

a.m_I16[4] = -5000; b.m_I16[4] = 25000;

a.m_I16[5] = 8; b.m_I16[5] = 16384;

a.m_I16[6] = 10000; b.m_I16[6] = 3500;

a.m_I16[7] = -60; b.m_I16[7] = 6000;

AvxPackedMulI16_(c, a, b);

cout << "\nResults for AvxPackedMulI16\n";

for (size_t i = 0; i < 8; i++)

{

cout << "a[" << i << "]: " << setw(8) << a.m_I16[i] << " ";

cout << "b[" << i << "]: " << setw(8) << b.m_I16[i] << " ";

if (i < 4)

{

cout << "c[0][" << i << "]: ";

cout << setw(12) << c[0].m_I32[i] << '\n';

}

else

{

cout << "c[1][" << i - 4 << "]: ";

cout << setw(12) << c[1].m_I32[i - 4] << '\n';

}

void AvxPackedMulI32A(void)

{

alignas(16) XmmVal a;

alignas(16) XmmVal b;

alignas(16) XmmVal c[2];

a.m_I32[0] = 10; b.m_I32[0] = -500;

a.m_I32[1] = 3000; b.m_I32[1] = 100;

a.m_I32[2] = -40000; b.m_I32[2] = -120000;

a.m_I32[3] = 4200; b.m_I32[3] = 1000;

AvxPackedMulI32A_(c, a, b);

cout << "\nResults for AvxPackedMulI32A\n";

for (size_t i = 0; i < 4; i++)

{

cout << "a[" << i << "]: " << setw(10) << a.m_I32[i] << " ";

cout << "b[" << i << "]: " << setw(10) << b.m_I32[i] << " ";

if (i < 2)

{

cout << "c[0][" << i << "]: ";

cout << setw(14) << c[0].m_I64[i] << '\n';

}

else

{

cout << "c[1][" << i - 2 << "]: ";

cout << setw(14) << c[1].m_I64[i - 2] << '\n';

}

void AvxPackedMulI32B(void)

{

alignas(16) XmmVal a;

alignas(16) XmmVal b;

alignas(16) XmmVal c;

a.m_I32[0] = 10; b.m_I32[0] = -500;

a.m_I32[1] = 3000; b.m_I32[1] = 100;

a.m_I32[2] = -2000; b.m_I32[2] = -12000;

a.m_I32[3] = 4200; b.m_I32[3] = 1000;

AvxPackedMulI32B_(&c, a, b);

cout << "\nResults for AvxPackedMulI32B\n";

for (size_t i = 0; i < 4; i++)

{

cout << "a[" << i << "]: " << setw(10) << a.m_I32[i] << " ";

cout << "b[" << i << "]: " << setw(10) << b.m_I32[i] << " ";

cout << "c[" << i << "]: " << setw(10) << c.m_I32[i] << '\n';

}

int main()

{

string sep(75, '-');

AvxPackedMulI16();

cout << '\n' << sep << '\n';

AvxPackedMulI32A();

cout << '\n' << sep << '\n';

AvxPackedMulI32B();

return 0;

}

;-------------------------------------------------

; Ch07_03.asm

;-------------------------------------------------

; extern "C" void AvxPackedMulI16_(XmmVal c[2], const XmmVal* a, const XmmVal* b)

.code

AvxPackedMulI16_ proc

vmovdqa xmm0,xmmword ptr [rdx] ;xmm0 = a

vmovdqa xmm1,xmmword ptr [r8] ;xmm1 = b

vpmullw xmm2,xmm0,xmm1 ;xmm2 = packed a * b low result

vpmulhw xmm3,xmm0,xmm1 ;xmm3 = packed a * b high result

vpunpcklwd xmm4,xmm2,xmm3 ;merge low and high results

vpunpckhwd xmm5,xmm2,xmm3 ;into final signed dwords

vmovdqa xmmword ptr [rcx],xmm4 ;save final results

vmovdqa xmmword ptr [rcx+16],xmm5

ret

AvxPackedMulI16_ endp

; extern "C" void AvxPackedMulI32A_(XmmVal c[2], const XmmVal* a, const XmmVal* b)

AvxPackedMulI32A_ proc

; Perform packed signed dword multiplication. Note that vpmuldq

; performs following operations:

;

; xmm2[63:0] = xmm0[31:0] * xmm1[31:0]

; xmm2[127:64] = xmm0[95:64] * xmm1[95:64]

vmovdqa xmm0,xmmword ptr [rdx] ;xmm0 = a

vmovdqa xmm1,xmmword ptr [r8] ;xmm1 = b

vpmuldq xmm2,xmm0,xmm1

; Shift source operands right by 4 bytes and repeat vpmuldq

vpsrldq xmm0,xmm0,4

vpsrldq xmm1,xmm1,4

vpmuldq xmm3,xmm0,xmm1

; Save results

vpextrq qword ptr [rcx],xmm2,0 ;save xmm2[63:0]

vpextrq qword ptr [rcx+8],xmm3,0 ;save xmm3[63:0]

vpextrq qword ptr [rcx+16],xmm2,1 ;save xmm2[127:63]

vpextrq qword ptr [rcx+24],xmm3,1 ;save xmm3[127:63]

ret

AvxPackedMulI32A_ endp

; extern "C" void AvxPackedMulI32B_(XmmVal*, const XmmVal* a, const XmmVal* b)

AvxPackedMulI32B_ proc

; Perform packed signed integer multiplication and save low packed dword result

vmovdqa xmm0,xmmword ptr [rdx] ;xmm0 = a

vpmulld xmm1,xmm0,xmmword ptr [r8] ;xmm1 = packed a * b

vmovdqa xmmword ptr [rcx],xmm1 ;save packed dword result

ret

AvxPackedMulI32B_ endp

end

Listing 7-3.

Example Ch07_03

The C++ function AvxPackedMulI16 contains code that initializes XmmVal variables a and b using signed 16-bit integers. This function then invokes the assembly language function AxvPackedMulI16_, which performs packed multiplication using signed 16-bit integers. The results are then streamed to cout. Note that the results displayed by function AvxPackedMulI16 are signed 32-bit integer products. The other two C++ functions in Listing 7-3, AvxPackedMulI32A and AvxPackedMul32B, initialize test cases for performing packed signed 32-bit integer multiplication. The former of these functions computes a packed signed 64-bit integer product, while the latter calculates a packed signed 32-bit integer product.

The assembly language function AvxPackedMulI16_ begins with two vmovdqa instructions that load argument values a and b into registers XMM0 and XMM1, respectively. The ensuing vpmullw xmm2,xmm0,xmm1 instruction multiplies the packed signed 16-bit integers in XMM0 and XMM1 and saves the low-order 16 bits of each 32-bit product in XMM2. This is followed by a vpmulhw xmm3,xmm0,xmm1 instruction that calculates and saves the high-order 16 bits of each 32-bit product. The next two instructions, vpunpcklwd xmm4,xmm2,xmm3 and vpunpckhwd xmm5,xmm2,xmm3, create the final packed 32-bit signed integer products by interleaving the low-order (vpunpcklwd) or high-order (vpunpckhwd) words of their source operands. Figure 7-1 illustrates the instruction sequence that’s employed by AvxPackedMulI16_.

../images/326959_2_En_7_Chapter/326959_2_En_7_Fig1_HTML.jpg — Figure 7-1.
Instruction sequence used in *AvxPackedMulI16_* to perform packed 16-bit signed integer multiplication

The next assembly language function in Listing 7-3, AvxPackedMulI32A_, performs packed signed 32-bit integer multiplication. This function begins with two vmovdqa instructions that load XmmVal argument values a and b into registers XMM0 and XMM1, respectively. The vpmuldq xmm2,xmm0,xmm1 instruction that follows performs packed signed 32-bit multiplication using the even numbered elements of the two source operands. It then saves the signed 64-bit products in XMM2. Two vpsrldq instructions are then used to right shift by four bytes the contents of registers XMM0 and XMM1. This is followed by another vpmuldq instruction that calculates the remaining 64-bit products. Figure 7-2 show the execution details of this instruction sequence.

../images/326959_2_En_7_Chapter/326959_2_En_7_Fig2_HTML.jpg — Figure 7-2.
Execution of *vpmuldq* and *vpsrldq* instructions

Following the execution of the second vpmuldq instruction, registers XMM2 and XMM3 contain the four signed 64-bit products. These values are then saved to the specified destination buffer using a series of vpextrq (Extract Quadword) instructions. This instruction copies the quadword element that’s specified by the immediate (or second source) operand from the first source operand and saves it to the destination operand. For example, the instruction vpextrq qword ptr [rcx],xmm2,0 saves the low-order quadword of XMM2 to the memory location specified by RCX. The first source operand of a vpextrq instruction must be an XMM register; the destination operand can be a general-purpose register or a memory location. AVX also includes instructions that you can use to extract byte (vpextrb), word (vpextrw), or doubleword (vpextrd) elements.

The final assembly language function in this source code example is named AvxPackedMulI32B_. This function also performs packed signed 32-bit integer multiplication but saves truncated 32-bit products. Function AvxPackedMulI32B_ uses the vpmulld instruction that performs element-by-element doubleword multiplication similar to packed addition or subtraction. The low-order 32 bits of each product are then saved to the destination operand. The results for source code example Ch07_03 are as follows:

Results for AvxPackedMulI16

a[0]: 10 b[0]: -5 c[0][0]: -50

a[1]: 3000 b[1]: 100 c[0][1]: 300000

a[2]: -2000 b[2]: -9000 c[0][2]: 18000000

a[3]: 42 b[3]: 1000 c[0][3]: 42000

a[4]: -5000 b[4]: 25000 c[1][0]: -125000000

a[5]: 8 b[5]: 16384 c[1][1]: 131072

a[6]: 10000 b[6]: 3500 c[1][2]: 35000000

a[7]: -60 b[7]: 6000 c[1][3]: -360000

---------------------------------------------------------------------------

Results for AvxPackedMulI32A

a[0]: 10 b[0]: -500 c[0][0]: -5000

a[1]: 3000 b[1]: 100 c[0][1]: 300000

a[2]: -40000 b[2]: -120000 c[1][0]: 4800000000

a[3]: 4200 b[3]: 1000 c[1][1]: 4200000

---------------------------------------------------------------------------

Results for AvxPackedMulI32B

a[0]: 10 b[0]: -500 c[0]: -5000

a[1]: 3000 b[1]: 100 c[1]: 300000

a[2]: -2000 b[2]: -12000 c[2]: 24000000

a[3]: 4200 b[3]: 1000 c[3]: 4200000

Packed Integer Image Processing

The source code examples presented thus far were intended to familiarize you with AVX packed integer programming. Each example included a simple assembly language function that demonstrated the operation of several AVX instructions using instances of the structure XmmVal . For some real-world application programs, it may be appropriate to create a small set of functions similar to the ones you’ve seen thus far. However, in order to fully exploit the benefits of the AVX, you need to code functions that implement complete algorithms using common data structures.

The source code examples in this section present algorithms that process arrays of unsigned 8-bit integers using the AVX instruction set. In the first example, you learn how to determine the minimum and maximum value of an array. This sample program has a certain practicality to it since digital images often use arrays of unsigned 8-bit integers to represent images in memory, and many image-processing algorithms (e.g., contrast enhancement) often need to determine the minimum (darkest) and maximum (lightest) pixels in an image. The second sample program illustrates how to calculate the mean value of an array of unsigned 8-bit integers. This is another example of a realistic algorithm that is directly relevant to the province of image processing. The final three source code examples implement universal image processing algorithms, including pixel conversion, histogram creation, and thresholding.

Pixel Minimum-Maximum Values

Source code example Ch07_04, shown in Listing 7-4, demonstrates how to find the minimum and maximum values in an array of unsigned 8-bit integers. This example also explains how to dynamically allocate aligned storage space for an array.

//------------------------------------------------

// AlignedMem.h

//------------------------------------------------

#pragma once

#include <cstdint>

#include <malloc.h>

#include <stdexcept>

class AlignedMem

{

public:

static void* Allocate(size_t mem_size, size_t mem_alignment)

{

void* p = _aligned_malloc(mem_size, mem_alignment);

if (p == NULL)

throw std::runtime_error("Memory allocation error: AllocateAlignedMem()");

return p;

}

static void Release(void* p)

{

_aligned_free(p);

}

template <typename T> static bool IsAligned(const T* p, size_t alignment)

{

if (p == nullptr)

return false;

if (((uintptr_t)p % alignment) != 0)

return false;

return true;

}

};

template <class T> class AlignedArray

{

T* m_Data;

size_t m_Size;

public:

AlignedArray(void) = delete;

AlignedArray(const AlignedArray& aa) = delete;

AlignedArray(AlignedArray&& aa) = delete;

AlignedArray& operator = (const AlignedArray& aa) = delete;

AlignedArray& operator = (AlignedArray&& aa) = delete;

AlignedArray(size_t size, size_t alignment)

{

m_Size = size;

m_Data = (T*)AlignedMem::Allocate(size * sizeof(T), alignment);

}

∼AlignedArray()

{

AlignedMem::Release(m_Data);

}

T* Data(void) { return m_Data; }

size_t Size(void) { return m_Size; }

void Fill(T val)

{

for (size_t i = 0; i < m_Size; i++)

m_Data[i] = val;

}

};

//------------------------------------------------

// Ch07_04.h

//------------------------------------------------

#pragma once

#include <cstdint>

// Ch07_04.cpp

extern void Init(uint8_t* x, size_t n, unsigned int seed);

extern bool AvxCalcMinMaxU8Cpp(const uint8_t* x, size_t n, uint8_t* x_min, uint8_t* x_max);

// Ch07_04_BM.cpp

extern void AvxCalcMinMaxU8_BM(void);

// Ch07_04_.asm

extern "C" bool AvxCalcMinMaxU8_(const uint8_t* x, size_t n, uint8_t* x_min, uint8_t* x_max);

// c_NumElements must be > 0 and even multiple of 64

const size_t c_NumElements = 16 * 1024 * 1024;

const unsigned int c_RngSeedVal = 23;

//------------------------------------------------

// Ch07_04.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <cstdint>

#include <random>

#include "Ch07_04.h"

#include "AlignedMem.h"

using namespace std;

void Init(uint8_t* x, size_t n, unsigned int seed)

{

uniform_int_distribution<> ui_dist {5, 250};

default_random_engine rng {seed};

for (size_t i = 0; i < n; i++)

x[i] = (uint8_t)ui_dist(rng);

// Use known values for min & max (for test purposes)

x[(n / 4) * 3 + 1] = 2;

x[n / 4 + 11] = 3;

x[n / 2] = 252;

x[n / 2 + 13] = 253;

x[n / 8 + 5] = 4;

x[n / 8 + 7] = 254;

}

bool AvxCalcMinMaxU8Cpp(const uint8_t* x, size_t n, uint8_t* x_min, uint8_t* x_max)

{

if (n == 0 || (n & 0x3f) != 0)

return false;

if (!AlignedMem::IsAligned(x, 16))

return false;

uint8_t x_min_temp = 0xff;

uint8_t x_max_temp = 0;

for (size_t i = 0; i < n; i++)

{

uint8_t val = *x++;

if (val < x_min_temp)

x_min_temp = val;

else if (val > x_max_temp)

x_max_temp = val;

}

*x_min = x_min_temp;

*x_max = x_max_temp;

return true;

}

void AvxCalcMinMaxU8()

{

size_t n = c_NumElements;

AlignedArray<uint8_t> x_aa(n, 16);

uint8_t* x = x_aa.Data();

Init(x, n, c_RngSeedVal);

uint8_t x_min1 = 0, x_max1 = 0;

uint8_t x_min2 = 0, x_max2 = 0;

bool rc1 = AvxCalcMinMaxU8Cpp(x, n, &x_min1, &x_max1);

bool rc2 = AvxCalcMinMaxU8_(x, n, &x_min2, &x_max2);

cout << "\nResults for AvxCalcMinMaxU8\n";

cout << "rc1: " << rc1 << " x_min1: " << (int)x_min1;

cout << " x_max1: " << (int)x_max1 << '\n';

cout << "rc2: " << rc1 << " x_min2: " << (int)x_min2;

cout << " x_max2: " << (int)x_max2 << '\n';

}

int main()

{

AvxCalcMinMaxU8();

AvxCalcMinMaxU8_BM();

return 0;

}

;-------------------------------------------------

; Ch07_04_.asm

;-------------------------------------------------

; extern "C" bool AvxCalcMinMaxU8_(uint8_t* x, size_t n, uint8_t* x_min, uint8_t* x_max)

;

; Returns: 0 = invalid n or unaligned array, 1 = success

.const

align 16

StartMinVal qword 0ffffffffffffffffh ;Initial packed min values

qword 0ffffffffffffffffh

StartMaxVal qword 0000000000000000h ;Initial packed max values

qword 0000000000000000h

.code

AvxCalcMinMaxU8_ proc

; Make sure 'n' is valid

xor eax,eax ;set error return code

or rdx,rdx ;is n == 0?

jz Done ;jump if yes

test rdx,3fh ;is n a multiple of 64?

jnz Done ;jump if no

test rcx,0fh ;is x properly aligned?

jnz Done ;jump if no

; Initialize packed min-max values

vmovdqa xmm2,xmmword ptr [StartMinVal]

vmovdqa xmm3,xmm2 ;xmm3:xmm2 = packed min values

vmovdqa xmm4,xmmword ptr [StartMaxVal]

vmovdqa xmm5,xmm4 ;xmm5:xmm4 = packed max values

; Scan array for min & max values

@@: vmovdqa xmm0,xmmword ptr [rcx] ;xmm0 = x[i + 15] : x[i]

vmovdqa xmm1,xmmword ptr [rcx+16] ;xmm1 = x[i + 31] : x[i + 16]

vpminub xmm2,xmm2,xmm0

vpminub xmm3,xmm3,xmm1 ;xmm3:xmm2 = updated min values

vpmaxub xmm4,xmm4,xmm0

vpmaxub xmm5,xmm5,xmm1 ;xmm5:xmm4 = updated max values

vmovdqa xmm0,xmmword ptr [rcx+32] ;xmm0 = x[i + 47] : x[i + 32]

vmovdqa xmm1,xmmword ptr [rcx+48] ;xmm1 = x[i + 63] : x[i + 48]

vpminub xmm2,xmm2,xmm0

vpminub xmm3,xmm3,xmm1 ;xmm3:xmm2 = updated min values

vpmaxub xmm4,xmm4,xmm0

vpmaxub xmm5,xmm5,xmm1 ;xmm5:xmm4 = updated max values

add rcx,64

sub rdx,64

jnz @B

; Determine final minimum value

vpminub xmm0,xmm2,xmm3 ;xmm0[127:0] = final 16 min vals

vpsrldq xmm1,xmm0,8 ;xmm1[63:0] = xmm0[127:64]

vpminub xmm2,xmm1,xmm0 ;xmm2[63:0] = final 8 min vals

vpsrldq xmm3,xmm2,4 ;xmm3[31:0] = xmm2[63:32]

vpminub xmm0,xmm3,xmm2 ;xmm0[31:0] = final 4 min vals

vpsrldq xmm1,xmm0,2 ;xmm1[15:0] = xmm0[31:16]

vpminub xmm2,xmm1,xmm0 ;xmm2[15:0] = final 2 min vals

vpextrw eax,xmm2,0 ;ax = final 2 min vals

cmp al,ah

jbe @F ;jump if al <= ah

mov al,ah ;al = final min value

@@: mov [r8],al ;save final min

; Determine final maximum value

vpmaxub xmm0,xmm4,xmm5 ;xmm0[127:0] = final 16 max vals

vpsrldq xmm1,xmm0,8 ;xmm1[63:0] = xmm0[127:64]

vpmaxub xmm2,xmm1,xmm0 ;xmm2[63:0] = final 8 max vals

vpsrldq xmm3,xmm2,4 ;xmm3[31:0] = xmm2[63:32]

vpmaxub xmm0,xmm3,xmm2 ;xmm0[31:0] = final 4 max vals

vpsrldq xmm1,xmm0,2 ;xmm1[15:0] = xmm0[31:16]

vpmaxub xmm2,xmm1,xmm0 ;xmm2[15:0] = final 2 max vals

vpextrw eax,xmm2,0 ;ax = final 2 min vals

cmp al,ah

jae @F ;jump if al >= ah

mov al,ah ;al = final max value

@@: mov [r9],al ;save final max

mov eax,1 ;set success return code

Done: ret

AvxCalcMinMaxU8_ endp

end

Listing 7-4.

Example Ch07_04

Listing 7-4 begins with the source code for the header file AlignedMem .h. This file defines a couple of simple C++ classes that facilitate dynamically allocated aligned arrays. The class AlignedMem is a basic wrapper class for the Visual C++ runtime functions _aligned_malloc and _aligned_free. This class also includes a template member function named AlignedMem::IsAligned that validates the alignment of an array in memory. The header file AlignedMem.h also defines a template class named AlignedArray . Class AlignedArray, which is used in this and subsequent source code examples, contains code that implements and manages dynamically allocated aligned arrays. Note that this class contains only minimal functionality to support the source code examples in this book, which is why many of the standard constructors and assignment operators are disabled.

The primary C++ code in example Ch07_04 begins with the definition of a function name Init. This function initializes an array of unsigned 8-bit integers with random values in order to simulate the pixel values of an image. Function Init uses the C++ standard template library (STL) classes uniform_int_distribution and default_random_engine to generate random values for the array. Appendix A contains a list of references that you can consult if you’re interested in learning more about these classes. Note that function Init sets some of the pixel values in the target array to know values for test purposes.

The function AvxCalcMinMaxU8Cpp implements a C++ version of the pixel value min-max algorithm. Parameters for this function include a pointer to the array, the number of array elements, and pointers for the minimum and maximum values. The algorithm itself consists of an unsophisticated for loop that sweeps though the array to find the minimum and maximum pixel values. Note that function AvxCalcMinMaxU8Cpp (and its counterpart assembly language function AvxCalcMinMaxU8_) requires the size of the array to be an even multiple of 64. The reason for this is that the assembly language function AvxCalcMinMaxU8_ (arbitrarily) processes 64 pixels during each loop iteration, as you’ll soon see. Also note that the source pixel array must be aligned to a 16-byte boundary. The C++ template function AlignedMem::IsAligned performs this check.

The C++ function AvxCalcMinMaxU8 contains code that initializes a test array and exercises the two pixel min-max functions. This function uses the aforementioned template class named AlignedArray to dynamically allocate an array of unsigned 8-bit integers that’s aligned to a 16-byte boundary. The constructor arguments for this class include the number of array elements and the alignment boundary. Following the AlignedArray<uint8_t> x_aa(n, 16) statement, AvxCalcMinMaxU8 obtains a raw C++ pointer to the array buffer using the member function AlignedArray::Data(). This pointer is passed as an argument to the two min-max functions.

The assembly language function AvxCalcMinMaxU8_ implements the same algorithm as its C++ counterpart with one significant difference. It processes array elements using 16-byte packets, which is the maximum number of unsigned 8-bit integers that can be stored in an XMM register. The function AvxCalcMinMaxU8_ begins by validating the size of argument n. It then checks array x for proper alignment. Following argument validation, AvxCalcMinMaxU8_ loads register pairs XMM3:XMM2 and XMM5:XMM4 with the initial packed minimum and maximum values, respectively. This enables the processing loop to track 32 min-max values simultaneously.

During each processing loop iteration, the function AvxCalcMinMaxU8_ loads 32 pixel values into register pair XMM1:XMM0 using the instructions vmovdqa xmm0,xmmword ptr [rcx] and vmovdqa xmm1,xmmword ptr [rcx+16]. The next two instructions, vpminub xmm2,xmm2,xmm0 and vpminub xmm3,xmm3,xmm1, update the current pixel minimums in register pair XMM3:XMM2. The ensuing vpmaxub instructions update the current pixel maximums in register pair XMM5:XMM4. Another sequence of vmovdqa, vpminub, and vpmaxub instructions handles the next group of 32 pixels. The processing of multiple data items during each loop iteration reduces the number of executed jump instructions and often results in faster code. This optimization technique is commonly called loop unrolling (or unwinding). You’ll learn more about loop unrolling and jump instruction optimization techniques in Chapter 15.

Following the execution of the pixel value min-max processing loop, the values in register pairs XMM3:XMM2 and XMM5:XMM4 must be reduced in order to obtain the final minimum and maximum values. The vpminub xmm0,xmm2,xmm3 instruction reduces the number of pixel minimum values from 32 to 16. The next instruction, vpsrldq xmm1,xmm0,8, right shifts the contents of XMM0 by eight bytes and saves the result in register XMM1 (i.e., XMM1[63:0] = XMM0[127:64]). This facilitates the use of the subsequent vpminub xmm2,xmm1,xmm0 instruction that reduces the number of minimum values from 16 to 8. Two more vpsrldq-vpminub instruction sequences are then employed to reduce the number of pixel minimums to two as shown in Figure 7-3. The vpextrw eax,xmm2,0 extracts the low-order word (XMM2[15:0]) from register XMM2 and saves it to the low-order word of register EAX (or register AX). The cmp al,ah, jbe, and mov al,ah instructions determine the final pixel minimum value. AvxCalcMinMaxU8_ uses a similar reduction technique to determine the maximum pixel value.

../images/326959_2_En_7_Chapter/326959_2_En_7_Fig3_HTML.jpg — Figure 7-3.
Reduction of pixel minimum values using the instructions *vpminub* and *vpsrldq*

Here is the output for source code example Ch07_04:

Results for AvxCalcMinMaxU8

rc1: 1 x_min1: 2 x_max1: 254

rc2: 1 x_min2: 2 x_max2: 254

Running benchmark function AvxCalcMinMaxU8_BM - please wait

Benchmark times save to file Ch07_04_AvxCalcMinMaxU8_BM_CHROMIUM.csv

Table 7-1 shows some timing measurements for the functions AvxCalcMinMaxU8 and AvxCalcMinMaxU8_ using several different Intel processors. These measurements were made using the procedure that’s described in Chapter 6. The benchmark source code for this and subsequent examples is not shown but included with the chapter download packages.

Table 7-1.

Pixel Value Min-Max Mean Execution Times (Microseconds), Array Size = 16 MB

CPU	AvxCalcMinMaxU8Cpp	AvxCalcMinMaxU8_
i7-4790S	17642	1007
i9-7900X	13638	874
i7-8700K	12622	721

Pixel Mean Intensity

The next source code example, Ch07_05, contains code that calculates the arithmetic mean of an array of 8-bit unsigned integers. This example also illustrates how to size-promote packed unsigned integers. Listing 7-5 shows the source code for example Ch07_05.

//------------------------------------------------

// Ch07_05.h

//------------------------------------------------

#pragma once

#include <cstdint>

// Ch07_05.cpp

extern void Init(uint8_t* x, size_t n, unsigned int seed);

extern bool AvxCalcMeanU8Cpp(const uint8_t* x, size_t n, int64_t* sum_x, double* mean);

// Ch07_05_BM.cpp

extern void AvxCalcMeanU8_BM(void);

// Ch07_05_.asm

extern "C" bool AvxCalcMeanU8_(const uint8_t* x, size_t n, int64_t* sum_x, double* mean);

// Common constants

const size_t c_NumElements = 16 * 1024 * 1024; // Must be multiple of 64

const size_t c_NumElementsMax = 64 * 1024 * 1024; // Used to avoid overflows

const unsigned int c_RngSeedVal = 29;

//------------------------------------------------

// Ch07_05.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <iomanip>

#include <random>

#include "Ch07_05.h"

#include "AlignedMem.h"

using namespace std;

extern "C" size_t g_NumElementsMax = c_NumElementsMax; // Used in .asm code

void Init(uint8_t* x, size_t n, unsigned int seed)

{

uniform_int_distribution<> ui_dist {0, 255};

default_random_engine rng {seed};

for (size_t i = 0; i < n; i++)

x[i] = (uint8_t)ui_dist(rng);

}

bool AvxCalcMeanU8Cpp(const uint8_t* x, size_t n, int64_t* sum_x, double* mean_x)

{

if (n == 0 || n > c_NumElementsMax)

return false;

if ((n % 64) != 0)

return false;

if (!AlignedMem::IsAligned(x, 16))

return false;

int64_t sum_x_temp = 0;

for (int i = 0; i < n; i++)

sum_x_temp += x[i];

*sum_x = sum_x_temp;

*mean_x = (double)sum_x_temp / n;

return true;

}

void AvxCalcMeanU8()

{

const size_t n = c_NumElements;

AlignedArray<uint8_t> x_aa(n, 16);

uint8_t* x = x_aa.Data();

Init(x, n, c_RngSeedVal);

bool rc1, rc2;

int64_t sum_x1 = -1, sum_x2 = -1;

double mean_x1 = -1, mean_x2 = -1;

rc1 = AvxCalcMeanU8Cpp(x, n, &sum_x1, &mean_x1);

rc2 = AvxCalcMeanU8_(x, n, &sum_x2, &mean_x2);

cout << "\nResults for MmxCalcMeanU8\n";

cout << fixed << setprecision(6);

cout << "rc1: " << rc1 << " ";

cout << "sum_x1: " << sum_x1 << " ";

cout << "mean_x1: " << mean_x1 << '\n';

cout << "rc2: " << rc2 << " ";

cout << "sum_x2: " << sum_x2 << " ";

cout << "mean_x2: " << mean_x2 << '\n';

}

int main()

{

AvxCalcMeanU8();

AvxCalcMeanU8_BM();

return 0;

}

;-------------------------------------------------

; Ch07_05.asm

;-------------------------------------------------

include <MacrosX86-64-AVX.asmh>

extern g_NumElementsMax:qword

; extern "C" bool AvxCalcMeanU8_(const Uint8* x, size_t n, int64_t* sum_x, double* mean);

;

; Returns 0 = invalid n or unaligned array, 1 = success

.code

AvxCalcMeanU8_ proc frame

_CreateFrame CM_,0,64

_SaveXmmRegs xmm6,xmm7,xmm8,xmm9

_EndProlog

; Verify function arguments

xor eax,eax ;set error return code

or rdx,rdx

jz Done ;jump if n == 0

cmp rdx,[g_NumElementsMax]

jae Done ;jump if n > NumElementsMax

test rdx,3fh

jnz Done ;jump if (n % 64) != 0

test rcx,0fh

jnz Done ;jump if x is not properly aligned

; Perform required initializations

mov r10,rdx ;save n for later use

add rdx,rcx ;rdx = end of array

vpxor xmm8,xmm8,xmm8 ;xmm8 = packed intermediate sums (4 dwords)

vpxor xmm9,xmm9,xmm9 ;xmm9 = packed zero for promotions

; Promote 32 pixel values from bytes to words, then sum the words

@@: vmovdqa xmm0,xmmword ptr [rcx]

vmovdqa xmm1,xmmword ptr [rcx+16] ;xmm1:xmm0 = 32 pixels

vpunpcklbw xmm2,xmm0,xmm9 ;xmm2 = 8 words

vpunpckhbw xmm3,xmm0,xmm9 ;xmm3 = 8 words

vpunpcklbw xmm4,xmm1,xmm9 ;xmm4 = 8 words

vpunpckhbw xmm5,xmm1,xmm9 ;xmm5 = 8 words

vpaddw xmm0,xmm2,xmm3

vpaddw xmm1,xmm4,xmm5

vpaddw xmm6,xmm0,xmm1 ;xmm6 = packed sums (8 words)

; Promote another 32 pixel values from bytes to words, then sum the words

vmovdqa xmm0,xmmword ptr [rcx+32]

vmovdqa xmm1,xmmword ptr [rcx+48] ;xmm1:xmm0 = 32 pixels

vpunpcklbw xmm2,xmm0,xmm9 ;xmm2 = 8 words

vpunpckhbw xmm3,xmm0,xmm9 ;xmm3 = 8 words

vpunpcklbw xmm4,xmm1,xmm9 ;xmm4 = 8 words

vpunpckhbw xmm5,xmm1,xmm9 ;xmm5 = 8 words

vpaddw xmm0,xmm2,xmm3

vpaddw xmm1,xmm4,xmm5

vpaddw xmm7,xmm0,xmm1 ;xmm7 = packed sums (8 words)

; Promote packed sums to dwords, then update dword sums

vpaddw xmm0,xmm6,xmm7 ;xmm0 = packed sums (8 words)

vpunpcklwd xmm1,xmm0,xmm9 ;xmm1 = packed sums (4 dwords)

vpunpckhwd xmm2,xmm0,xmm9 ;xmm2 = packed sums (4 dwords)

vpaddd xmm8,xmm8,xmm1

vpaddd xmm8,xmm8,xmm2

add rcx,64 ;rcx = next 64 byte block

cmp rcx,rdx

jne @B ;repeat loop if not done

; Compute final sum_x (note vpextrd zero extends extracted dword to 64 bits)

vpextrd eax,xmm8,0 ;rax = partial sum 0

vpextrd edx,xmm8,1 ;rdx = partial sum 1

add rax,rdx

vpextrd ecx,xmm8,2 ;rcx = partial sum 2

vpextrd edx,xmm8,3 ;rdx = partial sum 3

add rax,rcx

add rax,rdx

mov [r8],rax ;save sum_x

; Compute mean value

vcvtsi2sd xmm0,xmm0,rax ;xmm0 = sum_x (DPFP)

vcvtsi2sd xmm1,xmm1,r10 ;xmm1 = n (DPFP)

vdivsd xmm2,xmm0,xmm1 ;calc mean = sum_x / n

vmovsd real8 ptr [r9],xmm2 ;save mean

mov eax,1 ;set success return code

Done: _RestoreXmmRegs xmm6,xmm7,xmm8,xmm9

_DeleteFrame

ret

AvxCalcMeanU8_ endp

end

Listing 7-5.

Example Ch07_05

The organization of the C++ code in example Ch07_05 is somewhat similar to the previous example. The C++ function AvxCalcMeanU8Cpp uses a simple summing loop and scalar arithmetic to calculate the mean of an array of 8-bit unsigned integers. Like the previous example, the number of array elements must be an integral multiple of 64 and the source array must be aligned to a 16-byte boundary. Note that the function AvxCalcMeanU8Cpp also verifies that the number of array elements is not greater than c_NumElementsMax. This size restriction enables the assembly language function AvcCalcMeanU8_ to carry out its calculations using packed doublewords sans any safeguards for arithmetic overflows. The remaining C++ code that’s shown in Listing 7-5 performs test array initialization and streams results to cout.

The assembly language function AvxCalcMeanU8_ begins by performing the same validations of the array size as its C++ counterpart. The address of the array is also check for proper alignment. Following argument validation, AvxCalcMeanU8_ carries out its required initializations. The add rdx,rcx instruction computes the address of the first byte beyond the end of the array. The function AvxCalcMeanU8_ uses this address instead of a counter to terminate the processing loop. Register XMM8 is then initialized to all zeros. The processing loop uses this register to maintain intermediate packed doubleword sums.

Each processing loop iteration begins with two vmovdqa instructions that load 32 unsigned byte values into registers XMM1:XMM0. The pixel values are then size-promoted to words using the vpunpcklbw (Unpack Low Data) and vpunpckhbw (Unpack High Data). These instructions interleave the byte values contained in the two source operands to form word values, as shown in Figure 7-4. Note that register XMM9 contains all zeros, which means that the unsigned byte values are zero extended during size promotion to words. A series of vpaddw instructions then sums the packed unsigned word values. The function AvxCalcMeanU8_ processes another block of 32 pixels using the same sequence of instructions. The unsigned word sums in registers XMM6 and XMM7 are then summed using a vpaddw instruction, size-promoted to doublewords using vpunpcklwd and vpunpckhwd, and added to the intermediate packed doubleword sums in register XMM8. Figure 7-4 illustrates this instruction sequence in greater detail.

../images/326959_2_En_7_Chapter/326959_2_En_7_Fig4_HTML.jpg — Figure 7-4.
Execution of the *vpunpck[h|l]bw*, and *vpunpck[h|l]wd* instructions

Following termination of the processing loop, the intermediate doubleword sums in register XMM8 are totaled to generate the final pixel sum. The function uses several vpextrd instructions to copy each doubleword value from XMM8 to a general-purpose register. Note that this instruction uses an immediate operand to specify which element value to copy. Following computation of the pixel sum, AvxCalcMeanU8_ calculates the final pixel mean using simple scalar arithmetic. Here are the results for source code example Ch07_05:

Results for AvxCalcMeanU8

rc1: 1 sum_x1: 2139023922 mean_x1: 127.495761

rc2: 1 sum_x2: 2139023922 mean_x2: 127.495761

Running benchmark function AvxCalcMeanU8_BM - please wait

Benchmark times save to file Ch07_05_AvxCalcMeanU8_BM_CHROMIUM.csv

Table 7-2 shows some benchmark timing measurements for source code example Ch07_05.

Table 7-2.

Source Code Example Ch07_05 Mean Execution Times (Microseconds), Array Size = 16 MB

CPU	AvxCalcMeanU8Cpp	AvxCalcMeanU8_
i7-4790S	7103	1063
i9-7900X	6332	1048
i7-8700K	5870	861

Pixel Conversions

In order to implement certain image-processing algorithms, it is often necessary to convert the pixels of an 8-bit grayscale image from unsigned integer to single-precision floating-point values and vice versa. The sample code example of this section, Ch07_06, illustrates how to do this using the AVX instruction set. Listing 7-6 shows the source code for example Ch07_06.

//------------------------------------------------

// Ch07_06.cpp

//------------------------------------------------

#include "stdafx.h"

#include <iostream>

#include <iomanip>

#include <cstdint>

#include <random>

#include "AlignedMem.h"

using namespace std;

// Ch07_06_Misc.cpp

extern uint32_t ConvertImgVerify(const float* src1, const float* src2, uint32_t num_pixels);

extern uint32_t ConvertImgVerify(const uint8_t* src1, const uint8_t* src2, uint32_t num_pixels);

// Ch07_06_.asm

extern "C" bool ConvertImgU8ToF32_(float* des, const uint8_t* src, uint32_t num_pixels);

extern "C" bool ConvertImgF32ToU8_(uint8_t* des, const float* src, uint32_t num_pixels);

extern "C" uint32_t c_NumPixelsMax = 16777216;

template <typename T> void Init(T* x, size_t n, unsigned int seed, T scale)

{

uniform_int_distribution<> ui_dist {0, 255};

default_random_engine rng {seed};

for (size_t i = 0; i < n; i++)

{

T temp = (T)ui_dist(rng);

x[i] = (scale == 1) ? temp : temp / scale;

}

bool ConvertImgU8ToF32Cpp(float* des, const uint8_t* src, uint32_t num_pixels)

{

// Make sure num_pixels is valid

if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))

return false;

if ((num_pixels % 32) != 0)

return false;

// Make sure src and des are aligned to a 16-byte boundary

if (!AlignedMem::IsAligned(src, 16))

return false;

if (!AlignedMem::IsAligned(des, 16))

return false;

// Convert the image

for (uint32_t i = 0; i < num_pixels; i++)

des[i] = src[i] / 255.0f;

return true;

}

bool ConvertImgF32ToU8Cpp(uint8_t* des, const float* src, uint32_t num_pixels)

{

// Make sure num_pixels is valid

if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))

return false;

if ((num_pixels % 32) != 0)

return false;

// Make sure src and des are aligned to a 16-byte boundary

if (!AlignedMem::IsAligned(src, 16))

return false;

if (!AlignedMem::IsAligned(des, 16))

return false;

for (uint32_t i = 0; i < num_pixels; i++)

{

if (src[i] > 1.0f)

des[i] = 255;

else if (src[i] < 0.0)

des[i] = 0;

else

des[i] = (uint8_t)(src[i] * 255.0f);

}

return true;

}

void ConvertImgU8ToF32(void)

{

const uint32_t num_pixels = 1024;

AlignedArray<uint8_t> src_aa(num_pixels, 16);

AlignedArray<float> des1_aa(num_pixels, 16);

AlignedArray<float> des2_aa(num_pixels, 16);

uint8_t* src = src_aa.Data();

float* des1 = des1_aa.Data();

float* des2 = des2_aa.Data();

Init(src, num_pixels, 12, (uint8_t)1);

bool rc1 = ConvertImgU8ToF32Cpp(des1, src, num_pixels);

bool rc2 = ConvertImgU8ToF32_(des2, src, num_pixels);

if (!rc1 || !rc2)

{

cout << "Invalid return code - ";

cout << "rc1 = " << boolalpha << rc1 << ", ";

cout << "rc2 = " << boolalpha << rc2 << '\n';

return;

}

uint32_t num_diff = ConvertImgVerify(des1, des2, num_pixels);

cout << "\nResults for ConvertImgU8ToF32\n";

cout << " num_pixels = " << num_pixels << '\n';

cout << " num_diff = " << num_diff << '\n';

}

void ConvertImgF32ToU8(void)

{

const uint32_t num_pixels = 1024;

AlignedArray<float> src_aa(num_pixels, 16);

AlignedArray<uint8_t> des1_aa(num_pixels, 16);

AlignedArray<uint8_t> des2_aa(num_pixels, 16);

float* src = src_aa.Data();

uint8_t* des1 = des1_aa.Data();

uint8_t* des2 = des2_aa.Data();

// Initialize the src pixel buffer. The first few entries in src

// are set to known values for test purposes.

Init(src, num_pixels, 20, 1.0f);

src[0] = 0.125f; src[8] = 0.01f;

src[1] = 0.75f; src[9] = 0.99f;

src[2] = -4.0f; src[10] = 1.1f;

src[3] = 3.0f; src[11] = -1.1f;

src[4] = 0.0f; src[12] = 0.99999f;

src[5] = 1.0f; src[13] = 0.5f;

src[6] = -0.01f; src[14] = -0.0;

src[7] = 1.01f; src[15] = .333333f;

bool rc1 = ConvertImgF32ToU8Cpp(des1, src, num_pixels);

bool rc2 = ConvertImgF32ToU8_(des2, src, num_pixels);

if (!rc1 || !rc2)

{

cout << "Invalid return code - ";

cout << "rc1 = " << boolalpha << rc1 << ", ";

cout << "rc2 = " << boolalpha << rc2 << '\n';

return;

}

uint32_t num_diff = ConvertImgVerify(des1, des2, num_pixels);

cout << "\nResults for ConvertImgF32ToU8\n";

cout << " num_pixels = " << num_pixels << '\n';

cout << " num_diff = " << num_diff << '\n';

}

int main()

{

ConvertImgU8ToF32();

ConvertImgF32ToU8();

return 0;

}

;-------------------------------------------------

; Ch07_06.asm

;-------------------------------------------------

include <MacrosX86-64-AVX.asmh>

include <cmpequ.asmh>

.const

align 16

Uint8ToFloat real4 255.0, 255.0, 255.0, 255.0

FloatToUint8Min real4 0.0, 0.0, 0.0, 0.0

FloatToUint8Max real4 1.0, 1.0, 1.0, 1.0

FloatToUint8Scale real4 255.0, 255.0, 255.0, 255.0

extern c_NumPixelsMax:dword

; extern "C" bool ConvertImgU8ToF32_(float* des, const uint8_t* src, uint32_t num_pixels)

.code

ConvertImgU8ToF32_ proc frame

_CreateFrame U2F_,0,160

_SaveXmmRegs xmm6,xmm7,xmm8,xmm9,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15

_EndProlog

; Make sure num_pixels is valid and pixel buffers are properly aligned

xor eax,eax ;set error return code

or r8d,r8d

jz Done ;jump if num_pixels is zero

cmp r8d,[c_NumPixelsMax]

ja Done ;jump if num_pixels too big

test r8d,1fh

jnz Done ;jump if num_pixels % 32 != 0

test rcx,0fh

jnz Done ;jump if des not aligned

test rdx,0fh

jnz Done ;jump if src not aligned

; Initialize processing loop registers

shr r8d,5 ;number of pixel blocks

vmovaps xmm6,xmmword ptr [Uint8ToFloat] ;xmm6 = packed 255.0f

vpxor xmm7,xmm7,xmm7 ;xmm7 = packed 0

; Load the next block of 32 pixels

@@: vmovdqa xmm0,xmmword ptr [rdx] ;xmm0 = 16 pixels (x[i+15]:x[i])

vmovdqa xmm1,xmmword ptr [rdx+16] ;xmm8 = 16 pixels (x[i+31]:x[i+16])

; Promote the pixel values in xmm0 from unsigned bytes to unsigned dwords

vpunpcklbw xmm2,xmm0,xmm7

vpunpckhbw xmm3,xmm0,xmm7

vpunpcklwd xmm8,xmm2,xmm7

vpunpckhwd xmm9,xmm2,xmm7

vpunpcklwd xmm10,xmm3,xmm7

vpunpckhwd xmm11,xmm3,xmm7 ;xmm11:xmm8 = 16 dword pixels

; Promote the pixel values in xmm1 from unsigned bytes to unsigned dwords

vpunpcklbw xmm2,xmm1,xmm7

vpunpckhbw xmm3,xmm1,xmm7

vpunpcklwd xmm12,xmm2,xmm7

vpunpckhwd xmm13,xmm2,xmm7

vpunpcklwd xmm14,xmm3,xmm7

vpunpckhwd xmm15,xmm3,xmm7 ;xmm15:xmm12 = 16 dword pixels

; Convert pixel values from dwords to SPFP

vcvtdq2ps xmm8,xmm8

vcvtdq2ps xmm9,xmm9

vcvtdq2ps xmm10,xmm10

vcvtdq2ps xmm11,xmm11 ;xmm11:xmm8 = 16 SPFP pixels

vcvtdq2ps xmm12,xmm12

vcvtdq2ps xmm13,xmm13

vcvtdq2ps xmm14,xmm14

vcvtdq2ps xmm15,xmm15 ;xmm15:xmm12 = 16 SPFP pixels

; Normalize all pixel values to [0.0, 1.0] and save the results

vdivps xmm0,xmm8,xmm6

vmovaps xmmword ptr [rcx],xmm0 ;save pixels 0 - 3

vdivps xmm1,xmm9,xmm6

vmovaps xmmword ptr [rcx+16],xmm1 ;save pixels 4 - 7

vdivps xmm2,xmm10,xmm6

vmovaps xmmword ptr [rcx+32],xmm2 ;save pixels 8 - 11

vdivps xmm3,xmm11,xmm6

vmovaps xmmword ptr [rcx+48],xmm3 ;save pixels 12 - 15

vdivps xmm0,xmm12,xmm6

vmovaps xmmword ptr [rcx+64],xmm0 ;save pixels 16 - 19

vdivps xmm1,xmm13,xmm6

vmovaps xmmword ptr [rcx+80],xmm1 ;save pixels 20 - 23

vdivps xmm2,xmm14,xmm6

vmovaps xmmword ptr [rcx+96],xmm2 ;save pixels 24 - 27

vdivps xmm3,xmm15,xmm6

vmovaps xmmword ptr [rcx+112],xmm3 ;save pixels 28 - 31

add rdx,32 ;update src ptr

add rcx,128 ;update des ptr

sub r8d,1

jnz @B ;repeat until done

mov eax,1 ;set success return code

Done: _RestoreXmmRegs xmm6,xmm7,xmm8,xmm9,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15

_DeleteFrame

ret

ConvertImgU8ToF32_ endp

; extern "C" bool ConvertImgF32ToU8_(uint8_t* des, const float* src, uint32_t num_pixels)

ConvertImgF32ToU8_ proc frame

_CreateFrame F2U_,0,96

_SaveXmmRegs xmm6,xmm7,xmm12,xmm13,xmm14,xmm15

_EndProlog

; Make sure num_pixels is valid and pixel buffers are properly aligned

xor eax,eax ;set error return code

or r8d,r8d

jz Done ;jump if num_pixels is zero

cmp r8d,[c_NumPixelsMax]

ja Done ;jump if num_pixels too big

test r8d,1fh

jnz Done ;jump if num_pixels % 32 != 0

test rcx,0fh

jnz Done ;jump if des not aligned

test rdx,0fh

jnz Done ;jump if src not aligned

; Load required packed constants into registers

vmovaps xmm13,xmmword ptr [FloatToUint8Scale] ;xmm13 = packed 255.0

vmovaps xmm14,xmmword ptr [FloatToUint8Min] ;xmm14 = packed 0.0

vmovaps xmm15,xmmword ptr [FloatToUint8Max] ;xmm15 = packed 1.0

shr r8d,4 ;number of pixel blocks

LP1: mov r9d,4 ;num pixel quartets per block

; Convert 16 float pixels to uint8_t

LP2: vmovaps xmm0,xmmword ptr [rdx] ;xmm0 = next pixel quartet

vcmpps xmm1,xmm0,xmm14,CMP_LT ;compare pixels to 0.0

vandnps xmm2,xmm1,xmm0 ;clip pixels < 0.0 to 0.0

vcmpps xmm3,xmm2,xmm15,CMP_GT ;compare pixels to 1.0

vandps xmm4,xmm3,xmm15 ;clip pixels > 1.0 to 1.0

vandnps xmm5,xmm3,xmm2 ;xmm5 = pixels <= 1.0

vorps xmm6,xmm5,xmm4 ;xmm6 = final clipped pixels

vmulps xmm7,xmm6,xmm13 ;xmm7 = FP pixels [0.0, 255.0]

vcvtps2dq xmm0,xmm7 ;xmm0 = dword pixels [0, 255]

vpackusdw xmm1,xmm0,xmm0 ;xmm1[63:0] = word pixels

vpackuswb xmm2,xmm1,xmm1 ;xmm2[31:0] = bytes pixels

; Save the current byte pixel quartet

vpextrd eax,xmm2,0 ;eax = new pixel quartet

vpsrldq xmm12,xmm12,4 ;adjust xmm12 for new quartet

vpinsrd xmm12,xmm12,eax,3 ;xmm12[127:96] = new quartet

add rdx,16 ;update src ptr

sub r9d,1

jnz LP2 ;repeat until done

; Save the current byte pixel block (16 pixels)

vmovdqa xmmword ptr [rcx],xmm12 ;save current pixel block

add rcx,16 ;update des ptr

sub r8d,1

jnz LP1 ;repeat until done

mov eax,1 ;set success return code

Done: _RestoreXmmRegs xmm6,xmm7,xmm12,xmm13,xmm14,xmm15

_DeleteFrame

ret

ConvertImgF32ToU8_ endp

end

Listing 7-6.

Example Ch07_06

The C++ code in Listing 7-6 is straightforward. The function ConvertImgU8ToF32Cpp contains code that converts pixel values from uint8_t [0, 255] to single-precision floating-point [0.0, 1.0]. This function contains a simple for loop that calculates des[i] = src[i] / 255.0. The counterpart function ConvertImgF32ToU8Cpp performs the inverse operation. Note that this function clips any pixel values greater than 1.0 or less than 0.0 before performing the floating-point to uint8_t conversion. The functions ConvertImgU8ToF32 and ConvertImgF32ToU8 contain code that initialize test arrays and exercise the C++ and assembly language conversion routines. Note that the latter function initializes the first few entries of the source buffer to known values in order to demonstrate the aforementioned clipping operation.

The processing loop of the assembly language function ConvertImgU8ToF32_ converts 32 pixels from uint8_t (or byte) to single-precision floating-point during each iteration. The conversion technique begins with the size promotion of packed pixels from unsigned byte to unsigned doubleword integers using a series of vpunpck[h|l]bw and vpunpck[h|l]wd instructions. The doubleword values are then converted to single-precision floating-point values using the instruction vcvtdq2ps (Convert Packed Doubleword Integers to Packed Single-Precision Floating-Point Values). The resultant packed floating-point values are normalized to [0.0, 1.0] and saved to the destination buffer.

The assembly language function ConvertImgF32ToU8_ performs packed single-precision floating-point to packed unsigned byte conversions. The inner loop (starting at the label LP2) of this conversion function uses the instructions vcmpps xmm1,xmm0,xmm14,CMP_LT, vcmpps xmm3,xmm2,xmm15,CMP_GT, and some Boolean logic to clip any pixels values less than 0.0 or greater than 1.0. Figure 7-5 illustrates this technique in greater detail. The vcvtps2dq xmm0,xmm7 instruction converts the four single-precision floating-point values in XMM7 to doubleword integers and saves the results in register XMM0. The next two instructions, vpackusdw xmm1,xmm0,xmm0 and vpackuswb xmm2,xmm1,xmm1, size-reduce the packed doubleword integers to packed unsigned bytes. Following the execution of the vpackuswb instruction, register XMM2[31:0] contains four packed unsigned byte values. This pixel quartet is then copied to XMM12[127:96] using the instruction sequence vpextrd eax,xmm2,0, vpsrldq xmm12,xmm12,4, and vpinsrd xmm12,xmm12,eax,3. The vpinsrd (Insert Dword) instructions that’s used here copies the doubleword value in register EAX to doubleword element position 3 in register XMM12 (or XMM12[127:96]).

../images/326959_2_En_7_Chapter/326959_2_En_7_Fig5_HTML.jpg — Figure 7-5.
Illustration of floating-point clipping technique used in function *ConvertImgF32ToU8_*

The inner loop conversion process that’s described in the previous paragraph executes for four iterations. Following the completion of the inner loop, XMM12 contains 16 unsigned byte pixel values. This pixel block is then saved to the destination buffer using a vmovdqa xmmword ptr [rcx],xmm12 instruction. The outer loop repeats until all pixels have been converted. Here is the output for source code example Ch07_06.

Results for ConvertImgU8ToF32

num_pixels = 1024

num_diff = 0

Results for ConvertImgF32ToU8

num_pixels = 1024

num_diff = 0

Image Histograms

Many image-processing algorithms require a histogram of an image’s pixel intensity values. Figure 7-6 shows a sample grayscale image and its histogram. The next source code example, Ch07_07, illustrates how to build a histogram of pixel intensity values for an image containing 8-bit grayscale pixel values. This example also explains how to use the stack in an assembly language function to store intermediate results. Listing 7-7 shows the source code for example Ch07_07.

../images/326959_2_En_7_Chapter/326959_2_En_7_Fig6_HTML.jpg — Figure 7-6.
Sample grayscale image and its histogram

//------------------------------------------------

// Ch07_07.h

//------------------------------------------------

#pragma once

#include <cstdint>

// Ch07_07.cpp

extern bool AvxBuildImageHistogramCpp(uint32_t* histo, const uint8_t* pixel_buff, uint32_t num_pixels);

// Ch07_07_.asm

// Functions defined in Sse64ImageHistogram_.asm

extern "C" bool AvxBuildImageHistogram_(uint32_t* histo, const uint8_t* pixel_buff, uint32_t num_pixels);

// Ch07_07_BM.cpp

extern void AvxBuildImageHistogram_BM(void);

//------------------------------------------------

// Ch07_07.cpp

//------------------------------------------------

#include "stdafx.h"

#include <cstdint>

#include <iostream>

#include <iomanip>

#include <fstream>

#include <string>

#include "Ch07_07.h"

#include "AlignedMem.h"

#include "ImageMatrix.h"

using namespace std;

extern "C" uint32_t c_NumPixelsMax = 16777216;

bool AvxBuildImageHistogramCpp(uint32_t* histo, const uint8_t* pixel_buff, uint32_t num_pixels)

{

// Make sure num_pixels is valid

if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))

return false;

if (num_pixels % 32 != 0)

return false;

// Make sure histo is aligned to a 16-byte boundary

if (!AlignedMem::IsAligned(histo, 16))

return false;

// Make sure pixel_buff is aligned to a 16-byte boundary

if (!AlignedMem::IsAligned(pixel_buff, 16))

return false;

// Build the histogram

memset(histo, 0, 256 * sizeof(uint32_t));

for (uint32_t i = 0; i < num_pixels; i++)

histo[pixel_buff[i]]++;

return true;

}

void AvxBuildImageHistogram(void)

{

const wchar_t* image_fn = L"..\\Ch07_Data\\TestImage1.bmp";

const wchar_t* csv_fn = L"Ch07_07_AvxBuildImageHistogram_Histograms.csv";

ImageMatrix im(image_fn);

uint32_t num_pixels = im.GetNumPixels();

uint8_t* pixel_buff = im.GetPixelBuffer<uint8_t>();

AlignedArray<uint32_t> histo1_aa(256, 16);

AlignedArray<uint32_t> histo2_aa(256, 16);

bool rc1 = AvxBuildImageHistogramCpp(histo1_aa.Data(), pixel_buff, num_pixels);

bool rc2 = AvxBuildImageHistogram_(histo2_aa.Data(), pixel_buff, num_pixels);

cout << "\nResults for AvxBuildImageHistogram\n";

if (!rc1 || !rc2)

{

cout << "Bad return code: ";

cout << "rc1 = " << rc1 << ", rc2 = " << rc2 << '\n';

return;

}

ofstream ofs(csv_fn);

if (ofs.bad())

cout << "File create error - " << csv_fn << '\n';

else

{

bool compare_error = false;

uint32_t* histo1 = histo1_aa.Data();

uint32_t* histo2 = histo2_aa.Data();

const char* delim = ", ";

for (uint32_t i = 0; i < 256; i++)

{

ofs << i << delim;

ofs << histo1[i] << delim << histo2[i] << '\n';

if (histo1[i] != histo2[i])

{

compare_error = true;

cout << " Histogram compare error at index " << i << '\n';

cout << " counts: " << histo1[i] << delim << histo2[i] << '\n';

}

if (!compare_error)

cout << " Histograms are identical\n";

ofs.close();

}

int main()

{

try

{

AvxBuildImageHistogram();

AvxBuildImageHistogram_BM();

}

catch (...)

{

cout << "Unexpected exception has occurred\n";

cout << "File = " << __FILE__ << '\n';

}

return 0;

}

;-------------------------------------------------

; Ch07_07.asm

;-------------------------------------------------

include <MacrosX86-64-AVX.asmh>

; extern bool AvxBuildImageHistogram_(uint32_t* histo, const uint8_t* pixel_buff, uint32_t num_pixels)

;

; Returns: 0 = invalid argument value, 1 = success

.code

extern c_NumPixelsMax:dword

AvxBuildImageHistogram_ proc frame

_CreateFrame BIH_,1024,0,rbx,rsi,rdi

_EndProlog

; Make sure num_pixels is valid

xor eax,eax ;set error code

test r8d,r8d

jz Done ;jump if num_pixels is zero

cmp r8d,[c_NumPixelsMax]

ja Done ;jump if num_pixels too big

test r8d,1fh

jnz Done ;jump if num_pixels % 32 != 0

; Make sure histo & pixel_buff are properly aligned

mov rsi,rcx ;rsi = ptr to histo

test rsi,0fh

jnz Done ;jump if histo misaligned

mov r9,rdx

test r9,0fh

jnz Done ;jump if pixel_buff misaligned

; Initialize local histogram buffers (set all entries to zero)

xor eax,eax

mov rdi,rsi ;rdi = ptr to histo

mov rcx,128 ;rcx = size in qwords

rep stosq ;zero histo

mov rdi,rbp ;rdi = ptr to histo2

mov rcx,128 ;rcx = size in qwords

rep stosq ;zero histo2

; Perform processing loop initializations

shr r8d,5 ;number of pixel blocks (32 pixels/block)

mov rdi,rbp ;ptr to histo2

; Build the histograms

align 16 ;align jump target

@@: vmovdqa xmm0,xmmword ptr [r9] ;load pixel block

vmovdqa xmm1,xmmword ptr [r9+16] ;load pixel block

; Process pixels 0 - 3

vpextrb rax,xmm0,0

add dword ptr [rsi+rax*4],1 ;count pixel 0

vpextrb rbx,xmm0,1

add dword ptr [rdi+rbx*4],1 ;count pixel 1

vpextrb rcx,xmm0,2

add dword ptr [rsi+rcx*4],1 ;count pixel 2

vpextrb rdx,xmm0,3

add dword ptr [rdi+rdx*4],1 ;count pixel 3

; Process pixels 4 - 7

vpextrb rax,xmm0,4

add dword ptr [rsi+rax*4],1 ;count pixel 4

vpextrb rbx,xmm0,5

add dword ptr [rdi+rbx*4],1 ;count pixel 5

vpextrb rcx,xmm0,6

add dword ptr [rsi+rcx*4],1 ;count pixel 6

vpextrb rdx,xmm0,7

add dword ptr [rdi+rdx*4],1 ;count pixel 7

; Process pixels 8 - 11

vpextrb rax,xmm0,8

add dword ptr [rsi+rax*4],1 ;count pixel 8

vpextrb rbx,xmm0,9

add dword ptr [rdi+rbx*4],1 ;count pixel 9

vpextrb rcx,xmm0,10

add dword ptr [rsi+rcx*4],1 ;count pixel 10

vpextrb rdx,xmm0,11

add dword ptr [rdi+rdx*4],1 ;count pixel 11

; Process pixels 12 - 15

vpextrb rax,xmm0,12

add dword ptr [rsi+rax*4],1 ;count pixel 12

vpextrb rbx,xmm0,13

add dword ptr [rdi+rbx*4],1 ;count pixel 13

vpextrb rcx,xmm0,14

add dword ptr [rsi+rcx*4],1 ;count pixel 14

vpextrb rdx,xmm0,15

add dword ptr [rdi+rdx*4],1 ;count pixel 15

; Process pixels 16 - 19

vpextrb rax,xmm1,0

add dword ptr [rsi+rax*4],1 ;count pixel 16

vpextrb rbx,xmm1,1

add dword ptr [rdi+rbx*4],1 ;count pixel 17

vpextrb rcx,xmm1,2

add dword ptr [rsi+rcx*4],1 ;count pixel 18

vpextrb rdx,xmm1,3

add dword ptr [rdi+rdx*4],1 ;count pixel 19

; Process pixels 20 - 23

vpextrb rax,xmm1,4

add dword ptr [rsi+rax*4],1 ;count pixel 20

vpextrb rbx,xmm1,5

add dword ptr [rdi+rbx*4],1 ;count pixel 21

vpextrb rcx,xmm1,6

add dword ptr [rsi+rcx*4],1 ;count pixel 22

vpextrb rdx,xmm1,7

add dword ptr [rdi+rdx*4],1 ;count pixel 23

; Process pixels 24 - 27

vpextrb rax,xmm1,8

add dword ptr [rsi+rax*4],1 ;count pixel 24

vpextrb rbx,xmm1,9

add dword ptr [rdi+rbx*4],1 ;count pixel 25

vpextrb rcx,xmm1,10

add dword ptr [rsi+rcx*4],1 ;count pixel 26

vpextrb rdx,xmm1,11

add dword ptr [rdi+rdx*4],1 ;count pixel 27

; Process pixels 28 - 31

vpextrb rax,xmm1,12

add dword ptr [rsi+rax*4],1 ;count pixel 28

vpextrb rbx,xmm1,13

add dword ptr [rdi+rbx*4],1 ;count pixel 29

vpextrb rcx,xmm1,14

add dword ptr [rsi+rcx*4],1 ;count pixel 30

vpextrb rdx,xmm1,15

add dword ptr [rdi+rdx*4],1 ;count pixel 31

add r9,32 ;r9 = next pixel block

sub r8d,1

jnz @B ;repeat loop if not done

; Merge intermediate histograms into final histogram

mov ecx,32 ;ecx = num iterations

xor eax,eax ;rax = common offset

@@: vmovdqa xmm0,xmmword ptr [rsi+rax] ;load histo counts

vmovdqa xmm1,xmmword ptr [rsi+rax+16]

vpaddd xmm0,xmm0,xmmword ptr [rdi+rax] ;add counts from histo2

vpaddd xmm1,xmm1,xmmword ptr [rdi+rax+16]

vmovdqa xmmword ptr [rsi+rax],xmm0 ;save final result

vmovdqa xmmword ptr [rsi+rax+16],xmm1

add rax,32

sub ecx,1

jnz @B

mov eax,1 ;set success return code

Done: _DeleteFrame rbx,rsi,rdi

ret

AvxBuildImageHistogram_ endp

end

Listing 7-7.

Example Ch07_07

Near the top of the C++ code is a function named AvxBuildImageHistogramCpp. This function constructs an image histogram using a rudimentary technique. Prior to the histogram’s actual construction, the number of image pixels is validated for size (greater than 0 and not greater than c_NumPixelMax) and divisibility by 32. The divisibility test is performed to ensure compatibility with the assembly language function AvxBuildImageHistogram_. Next, the addresses of histo and pixel_buff are verified for proper alignment. The call to memset initializes each histogram pixel count bin to zero. A simple for loop is then used to construct the histogram.

The function AvxBuildImageHistogram uses a C++ class named ImageMatrix to load the pixels of an image into memory. (The source code for ImageMatrix is not shown but included as part of the chapter download package.) The variables num_pixels and pixel_buff are then initialized using the member functions ImageMatrix::GetNumPixels and ImageMatrix::GetPixelBuffer. Two histogram buffers then are allocated using the C++ template class AlignedArray<uint32_t>. Following the construction of the histograms using the functions AvxBuildImageHistogramCpp and AvxBuildImageHistogram_, the pixel counts in the two histogram buffers are compared for equivalence and written to a comma-separated-value text file.

The assembly language function AvxBuildImageHistogram_ constructs an image histogram using the AVX instruction set. In order to improve performance, this function builds two intermediate histograms and merges them into a final histogram. AvxBuildImageHistogram_ begins by creating a stack frame using the _CreateFrame macro. Note that the stack frame created by _CreateFrame includes 1024 bytes (256 doublewords, one for each grayscale intensity level) of local storage space, which is used for one of the intermediate histogram buffers. Following the execution of the code generated by _CreateFrame, register RBP points to the intermediate histogram on the stack (see Figure 5-6). The caller-provided buffer histo is used as the second intermediate histogram buffer. Following the _EndProlog macro, the function AvxBuildImageHistogram_ validates num_pixels for size and divisibility by 32; it the checks the addresses of histo and pixel_buff for proper alignment. The count values in both intermediate histograms are then initialized to zero using the stosq instruction.

The main processing loop begins with two vmovdqa instructions that load 32 image pixels into registers XMM1:XMM0. Note that prior to the first vmovdqa instruction, the MASM directive align 16 is used to align this instruction on a 16-byte boundary. Aligning the target of a jump instruction on a 16-byte boundary is an optimization technique that often improves performance. Chapter 15 discusses this and other optimization techniques in greater detail. Next, a vpextrb rax,xmm0,0 instruction extracts pixel element 0 (i.e., XMM0[7:0]) from register XMM0 and copies it to the low-order bits of register RAX; the high-order bits of RAX are set to zero. The ensuing add dword ptr [rsi+rax*4],1 instruction updates the appropriate pixel count bin in the first intermediate histogram. The next two instructions, vpextrb rbx,xmm0,1 and add dword ptr [rdi+rbx*4],1, process pixel element 1 in the same manner using the second intermediate histogram. This pixel-processing technique is then repeated for the remaining pixels in the current block.

Following the execution of the processing loop, the pixel count values in the two intermediate histograms are summed using packed integer arithmetic to create the final histogram. The _DeleteFrame macro is then used to release the local stack frame and restore the previously-saved non-volatile general-purpose registers. Here is the output for source code example Ch07_07:

Results for AvxBuildImageHistogram

Histograms are identical

Running benchmark function AvxBuildImageHistogram_BM - please wait

Benchmark times save to file Ch07_07_AvxBuildImageHistogram_BM_CHROMIUM.csv

Table 7-3 shows benchmark timing measurements for the histogram build functions.

Table 7-3.

Histogram Build Mean Execution Times (Microseconds) Using TestImage1.bmp

CPU	AvxBuildImageHistogramCpp	AvxBuildImageHistogram_
i7-4790S	277	230
i9-7900X	255	199
i7-8700K	241	191

Image Thresholding

Image thresholding is an image-processing technique that creates a binary image (i.e., an image with only two colors) from a grayscale image. This binary (or mask) image signifies which pixels in the original image are greater than a predetermined or algorithmically derived intensity threshold value. Figure 7-7 illustrates a thresholding operation. Mask images are often employed to perform additional calculations using the grayscale pixels values of the original image. For example, one typical use of the mask image that’s shown in Figure 7-7 is to compute the mean intensity value of all above-threshold pixels in the original image. The application of a mask image simplifies calculating the mean since it facilitates the use of simple Boolean expressions to exclude unwanted pixels from the computations.

../images/326959_2_En_7_Chapter/326959_2_En_7_Fig7_HTML.jpg — Figure 7-7.
Sample grayscale and mask images

Source code example Ch07_08 demonstrates how to calculate the mean intensity of image pixels above a specified threshold. It also shows how to call a C++ function from an assembly language function. Listing 7-8 shows the source code for example Ch07_08.

//------------------------------------------------

// Ch07_08.h

//------------------------------------------------

#pragma once

#include <cstdint>

// Image threshold data structure. This structure must agree with the

// structure that's defined in Ch07_08_.asm

struct ITD

{

uint8_t* m_PbSrc; // Source image pixel buffer

uint8_t* m_PbMask; // Mask mask pixel buffer

uint32_t m_NumPixels; // Number of source image pixels

uint32_t m_NumMaskedPixels; // Number of masked pixels

uint32_t m_SumMaskedPixels; // Sum of masked pixels

uint8_t m_Threshold; // Image threshold value

uint8_t m_Pad[3]; // Available for future use

double m_MeanMaskedPixels; // Mean of masked pixels

};

// Functions defined in Ch07_08.cpp

extern bool AvxThresholdImageCpp(ITD* itd);

extern bool AvxCalcImageMeanCpp(ITD* itd);

extern "C" bool IsValid(uint32_t num_pixels, const uint8_t* pb_src, const uint8_t* pb_mask);

// Functions defined in Ch07_08_.asm

extern "C" bool AvxThresholdImage_(ITD* itd);

extern "C" bool AvxCalcImageMean_(ITD* itd);

// Functions defined in Ch07_08_BM.cpp

extern void AvxThreshold_BM(void);

// Miscellaneous constants

const uint8_t c_TestThreshold = 96;

//------------------------------------------------

// Ch07_08.cpp

//------------------------------------------------

#include "stdafx.h"

#include <cstdint>

#include <iostream>

#include <iomanip>

#include "Ch07_08.h"

#include "AlignedMem.h"

#include "ImageMatrix.h"

using namespace std;

extern "C" uint32_t c_NumPixelsMax = 16777216;

bool IsValid(uint32_t num_pixels, const uint8_t* pb_src, const uint8_t* pb_mask)

{

const size_t alignment = 16;

// Make sure num_pixels is valid

if ((num_pixels == 0) || (num_pixels > c_NumPixelsMax))

return false;

if ((num_pixels % 64) != 0)

return false;

// Make sure image buffers are properly aligned

if (!AlignedMem::IsAligned(pb_src, alignment))

return false;

if (!AlignedMem::IsAligned(pb_mask, alignment))

return false;

return true;

}

bool AvxThresholdImageCpp(ITD* itd)

{

uint8_t* pb_src = itd->m_PbSrc;

uint8_t* pb_mask = itd->m_PbMask;

uint8_t threshold = itd->m_Threshold;

uint32_t num_pixels = itd->m_NumPixels;

// Verify pixel count and buffer alignment

if (!IsValid(num_pixels, pb_src, pb_mask))

return false;

// Threshold the image

for (uint32_t i = 0; i < num_pixels; i++)

*pb_mask++ = (*pb_src++ > threshold) ? 0xff : 0x00;

return true;

}

bool AvxCalcImageMeanCpp(ITD* itd)

{

uint8_t* pb_src = itd->m_PbSrc;

uint8_t* pb_mask = itd->m_PbMask;

uint32_t num_pixels = itd->m_NumPixels;

// Verify pixel count and buffer alignment

if (!IsValid(num_pixels, pb_src, pb_mask))

return false;

// Calculate mean of masked pixels

uint32_t sum_masked_pixels = 0;

uint32_t num_masked_pixels = 0;

for (uint32_t i = 0; i < num_pixels; i++)

{

uint8_t mask_val = *pb_mask++;

num_masked_pixels += mask_val & 1;

sum_masked_pixels += (*pb_src++ & mask_val);

}

itd->m_NumMaskedPixels = num_masked_pixels;

itd->m_SumMaskedPixels = sum_masked_pixels;

if (num_masked_pixels > 0)

itd->m_MeanMaskedPixels = (double)sum_masked_pixels / num_masked_pixels;

else

itd->m_MeanMaskedPixels = -1.0;

return true;

}

void AvxThreshold(void)

{

const wchar_t* fn_src = L"..\\Ch07_Data\\TestImage2.bmp";

const wchar_t* fn_mask1 = L"Ch07_08_AvxThreshold_TestImage2_Mask1.bmp";

const wchar_t* fn_mask2 = L"Ch07_08_AvxThreshold_TestImage2_Mask2.bmp";

ImageMatrix im_src(fn_src);

int im_h = im_src.GetHeight();

int im_w = im_src.GetWidth();

ImageMatrix im_mask1(im_h, im_w, PixelType::Gray8);

ImageMatrix im_mask2(im_h, im_w, PixelType::Gray8);

ITD itd1, itd2;

itd1.m_PbSrc = im_src.GetPixelBuffer<uint8_t>();

itd1.m_PbMask = im_mask1.GetPixelBuffer<uint8_t>();

itd1.m_NumPixels = im_src.GetNumPixels();

itd1.m_Threshold = c_TestThreshold;

itd2.m_PbSrc = im_src.GetPixelBuffer<uint8_t>();

itd2.m_PbMask = im_mask2.GetPixelBuffer<uint8_t>();

itd2.m_NumPixels = im_src.GetNumPixels();

itd2.m_Threshold = c_TestThreshold;

// Threshold image

bool rc1 = AvxThresholdImageCpp(&itd1);

bool rc2 = AvxThresholdImage_(&itd2);

if (!rc1 || !rc2)

{

cout << "\nInvalid return code: ";

cout << "rc1 = " << rc1 << ", rc2 = " << rc2 << '\n';

return;

}

im_mask1.SaveToBitmapFile(fn_mask1);

im_mask2.SaveToBitmapFile(fn_mask2);

// Calculate mean of masked pixels

rc1 = AvxCalcImageMeanCpp(&itd1);

rc2 = AvxCalcImageMean_(&itd2);

if (!rc1 || !rc2)

{

cout << "\nInvalid return code: ";

cout << "rc1 = " << rc1 << ", rc2 = " << rc2 << '\n';

return;

}

// Print results

const int w = 12;

cout << fixed << setprecision(4);

cout << "\nResults for AvxThreshold\n\n";

cout << " C++ X86-AVX\n";

cout << "---------------------------------------------\n";

cout << "SumPixelsMasked: ";

cout << setw(w) << itd1.m_SumMaskedPixels << " ";

cout << setw(w) << itd2.m_SumMaskedPixels << '\n';

cout << "NumPixelsMasked: ";

cout << setw(w) << itd1.m_NumMaskedPixels << " ";

cout << setw(w) << itd2.m_NumMaskedPixels << '\n';

cout << "MeanMaskedPixels: ";

cout << setw(w) << itd1.m_MeanMaskedPixels << " ";

cout << setw(w) << itd2.m_MeanMaskedPixels << '\n';

}

int main()

{

try

{

AvxThreshold();

AvxThreshold_BM();

}

catch (...)

{

cout << "Unexpected exception has occurred\n";

}

return 0;

}

;-------------------------------------------------

; Ch07_08.asm

;-------------------------------------------------

include <MacrosX86-64-AVX.asmh>

; Image threshold data structure (see Ch07_08.h)

ITD struct

PbSrc qword ?

PbMask qword ?

NumPixels dword ?

NumMaskedPixels dword ?

SumMaskedPixels dword ?

Threshold byte ?

Pad byte 3 dup(?)

MeanMaskedPixels real8 ?

ITD ends

.const

align 16

PixelScale byte 16 dup(80h) ;uint8 to int8 scale value

CountPixelsMask byte 16 dup(01h) ;mask to count pixels

R8_MinusOne real8 -1.0 ;invalid mean value

.code

extern IsValid:proc

; extern "C" bool AvxThresholdImage_(ITD* itd);

;

; Returns: 0 = invalid size or unaligned image buffer, 1 = success

AvxThresholdImage_ proc frame

_CreateFrame TI_,0,0,rbx

_EndProlog

; Verify the arguments in the ITD structure

mov rbx,rcx ;copy itd ptr to non-volatile register

mov ecx,[rbx+ITD.NumPixels] ;ecx = num_pixels

mov rdx,[rbx+ITD.PbSrc] ;rdx = pb_src

mov r8,[rbx+ITD.PbMask] ;r8 = pb_mask

sub rsp,32 ;allocate home area for IsValid

call IsValid ;validate args

or al,al

jz Done ;jump if invalid

; Initialize registers for processing loop

mov ecx,[rbx+ITD.NumPixels] ;ecx = num_pixels

shr ecx,6 ;ecx = number of 64b pixel blocks

mov rdx,[rbx+ITD.PbSrc] ;rdx = pb_src

mov r8,[rbx+ITD.PbMask] ;r8 = pb_mask

movzx r9d,byte ptr [rbx+ITD.Threshold] ;r9d = threshold

vmovd xmm1,r9d ;xmm1[7:0] = threshold

vpxor xmm0,xmm0,xmm0 ;mask for vpshufb

vpshufb xmm1,xmm1,xmm0 ;xmm1 = packed threshold

vmovdqa xmm4,xmmword ptr [PixelScale] ;packed pixel scale factor

vpsubb xmm5,xmm1,xmm4 ;scaled threshold

; Create the mask image

@@: vmovdqa xmm0,xmmword ptr [rdx] ;original image pixels

vpsubb xmm1,xmm0,xmm4 ;scaled image pixels

vpcmpgtb xmm2,xmm1,xmm5 ;mask pixels

vmovdqa xmmword ptr [r8],xmm2 ;save mask result

vmovdqa xmm0,xmmword ptr [rdx+16]

vpsubb xmm1,xmm0,xmm4

vpcmpgtb xmm2,xmm1,xmm5

vmovdqa xmmword ptr [r8+16],xmm2

vmovdqa xmm0,xmmword ptr [rdx+32]

vpsubb xmm1,xmm0,xmm4

vpcmpgtb xmm2,xmm1,xmm5

vmovdqa xmmword ptr [r8+32],xmm2

vmovdqa xmm0,xmmword ptr [rdx+48]

vpsubb xmm1,xmm0,xmm4

vpcmpgtb xmm2,xmm1,xmm5

vmovdqa xmmword ptr [r8+48],xmm2

add rdx,64

add r8,64 ;update pointers

sub ecx,1 ;update counter

jnz @B ;repeat until done

mov eax,1 ;set success return code

Done: _DeleteFrame rbx

ret

AvxThresholdImage_ endp

;

; Macro _UpdateBlockSums

;

_UpdateBlockSums macro disp

vmovdqa xmm0,xmmword ptr [rdx+disp] ;xmm0 = 16 image pixels

vmovdqa xmm1,xmmword ptr [r8+disp] ;xmm1 = 16 mask pixels

vpand xmm2,xmm1,xmm8 ;xmm2 = 16 mask pixels (0x00 or 0x01)

vpaddb xmm6,xmm6,xmm2 ;update block num_masked_pixels

vpand xmm2,xmm0,xmm1 ;zero out unmasked image pixel

vpunpcklbw xmm3,xmm2,xmm9 ;promote image pixels from byte to word

vpunpckhbw xmm4,xmm2,xmm9

vpaddw xmm4,xmm4,xmm3

vpaddw xmm7,xmm7,xmm4 ;update block sum_mask_pixels

endm

; extern "C" bool AvxCalcImageMean_(ITD* itd);

;

; Returns: 0 = invalid image size or unaligned image buffer, 1 = success

AvxCalcImageMean_ proc frame

_CreateFrame CIM_,0,64,rbx

_SaveXmmRegs xmm6,xmm7,xmm8,xmm9

_EndProlog

; Verify the arguments in the ITD structure

mov rbx,rcx ;rbx = itd ptr

mov ecx,[rbx+ITD.NumPixels] ;ecx = num_pixels

mov rdx,[rbx+ITD.PbSrc] ;rdx = pb_src

mov r8,[rbx+ITD.PbMask] ;r8 = pb_mask

sub rsp,32 ;allocate home area for IsValid

call IsValid ;validate args

or al,al

jz Done ;jump if invalid

; Initialize registers for processing loop

mov ecx,[rbx+ITD.NumPixels] ;ecx = num_pixels

shr ecx,6 ;ecx = number of 64b pixel blocks

mov rdx,[rbx+ITD.PbSrc] ;rdx = pb_src

mov r8,[rbx+ITD.PbMask] ;r8 = pb_mask

vmovdqa xmm8,xmmword ptr [CountPixelsMask] ;mask for counting pixels

vpxor xmm9,xmm9,xmm9 ;xmm9 = packed zero

xor r10d,r10d ;r10d = num_masked_pixels (1 dword)

vpxor xmm5,xmm5,xmm5 ;sum_masked_pixels (4 dwords)

;Calculate num_mask_pixels and sum_mask_pixels

LP1: vpxor xmm6,xmm6,xmm6 ;num_masked_pixels_tmp (16 byte values)

vpxor xmm7,xmm7,xmm7 ;sum_masked_pixels_tmp (8 word values)

_UpdateBlockSums 0

_UpdateBlockSums 16

_UpdateBlockSums 32

_UpdateBlockSums 48

; Update num_masked_pixels

vpsrldq xmm0,xmm6,8

vpaddb xmm6,xmm6,xmm0 ;num_mask_pixels_tmp (8 byte vals)

vpsrldq xmm0,xmm6,4

vpaddb xmm6,xmm6,xmm0 ;num_mask_pixels_tmp (4 byte vals)

vpsrldq xmm0,xmm6,2

vpaddb xmm6,xmm6,xmm0 ;num_mask_pixels_tmp (2 byte vals)

vpsrldq xmm0,xmm6,1

vpaddb xmm6,xmm6,xmm0 ;num_mask_pixels_tmp (1 byte val)

vpextrb eax,xmm6,0

add r10d,eax ;num_mask_pixels += num_mask_pixels_tmp

; Update sum_masked_pixels

vpunpcklwd xmm0,xmm7,xmm9 ;promote sum_mask_pixels_tmp to dwords

vpunpckhwd xmm1,xmm7,xmm9

vpaddd xmm5,xmm5,xmm0

vpaddd xmm5,xmm5,xmm1 ;sum_mask_pixels += sum_masked_pixels_tmp

add rdx,64 ;update pb_src pointer

add r8,64 ;update pb_mask pointer

sub rcx,1 ;update loop counter

jnz LP1 ;repeat if not done

; Compute mean of masked pixels

vphaddd xmm0,xmm5,xmm5

vphaddd xmm1,xmm0,xmm0

vmovd eax,xmm1 ;eax = final sum_mask_pixels

test r10d,r10d ;is num_mask_pixels zero?

jz NoMean ;if yes, skip calc of mean

vcvtsi2sd xmm0,xmm0,eax ;xmm0 = sum_masked_pixels

vcvtsi2sd xmm1,xmm1,r10d ;xmm1 = num_masked_pixels

vdivsd xmm2,xmm0,xmm1 ;xmm2 = mean_masked_pixels

jmp @F

NoMean: vmovsd xmm2,[R8_MinusOne] ;use -1.0 for no mean

@@: mov [rbx+ITD.SumMaskedPixels],eax ;save sum_masked_pixels

mov [rbx+ITD.NumMaskedPixels],r10d ;save num_masked_pixels

vmovsd [rbx+ITD.MeanMaskedPixels],xmm2 ;save mean

mov eax,1 ;set success return code

Done: _RestoreXmmRegs xmm6,xmm7,xmm8,xmm9

_DeleteFrame rbx

ret

AvxCalcImageMean_ endp

end

Listing 7-8.

Example Ch07_08

The algorithm that’s used in example Ch07_08 consists of two phases. Phase 1 constructs the mask image that’s shown in Figure 7-7. Phase 2 computes the mean intensity of all pixels in the grayscale image whose corresponding mask image pixel is white (i.e., above the specified threshold). The file Ch07_08.h that’s shown in Listing 7-8 defines a structure named ITD that maintains data required by the algorithm. Note this structure contains two count values: m_NumPixels and m_NumMaskedPixels. The former value is the total number of image pixels, while the latter value represents the number of image pixels greater than m_Threshold.

The C++ code in Listing 7-8 contains separate thresholding and mean calculating functions. The function AvxThresholdImageCpp constructs the mask image by comparing each pixel in the grayscale image to the threshold value that’s specified by itd->m_Threshold. If a grayscale image pixel is greater than this value, its corresponding pixel in the mask image is set to 0xff; otherwise, the mask image pixel is set to 0x00. The function AvxCalcImageMeanCpp uses this mask image to calculate the mean intensity value of all grayscale image pixels greater than the threshold value. Note that the for loop in this function computes num_mask_pixels and sum_mask_pixels using simple Boolean expressions instead of logical compare operations. The former technique is often faster and easier to implement using SIMD arithmetic.

Listing 7-8 also shows assembly language implementations of the thresholding and mean calculating functions. Following its prolog, the function AvxThresholdImage_ validates the arguments in the supplied ITD structure by calling the C++ function IsValid. Prior to the call instruction, AvxThresholdImage_ loads the required argument values for IsValid into the appropriate registers and allocates a home area using a sub rsp,32 instruction. After argument validation, the movzx r9d,byte ptr [rbx+ITD.Threshold] instruction loads the threshold value into register R9D. The ensuing vpshufb xmm1,xmm1,xmm0 instruction “broadcasts” the threshold value to all byte positions in register XMM1. The vpshufb instruction uses the low-order four bits of each byte in the second source operand as an index to permute the bytes in the destination operand (a zero is copied if the high-order bit is set in the seconds source operand byte). Figure 7-8 illustrates this process. The packed threshold value is then scaled using a vpsubb instruction. The reason for doing this is explained in the next paragraph.

../images/326959_2_En_7_Chapter/326959_2_En_7_Fig8_HTML.jpg — Figure 7-8.
Execution examples of the instruction *vpshufb*

The processing loop in function AvxThreshholdImage_ uses the vpcmpgtb (Compare Packed Signed Integers for Greater Than) instruction to create the mask image. This instruction performs pairwise compares of the byte elements in the two source operands. If a byte in the first source operand is greater than the corresponding byte in the second operand, the destination operand byte is set to 0xff; otherwise, the destination operand byte is set to 0x00. Figure 7-9 illustrates execution of the vpcmpgtb instruction. It is important to note that vpcmpgtb executes its compares using signed integer arithmetic. This means that the pixels values in the grayscale image, which are unsigned byte values, must be re-scaled for compatibility with the vpcmpgtb instruction. The vpsubb instruction remaps the image’s grayscale pixels values from [0, 255] to [-128, 127]. This is also the reason that a vpsubb instruction was used on the packed threshold value prior to the start of the loop. Following each compare operation, the vmovdqa instruction saves the mask pixels to the specified buffer. Similar to example Ch07_04, the function AvxThresholdImage_ uses a partially unrolled processing loop to handle 64 pixels per iteration.

../images/326959_2_En_7_Chapter/326959_2_En_7_Fig9_HTML.jpg — Figure 7-9.
Execution of the instruction *vpcmpgtb*

The assembly language function AvxCalcImageMean_ also begins by validating its arguments using the C++ function IsValid. Following argument validation, the xor r10d,r10d and vpxor xmm5,xmm5,xmm5 instructions initialize num_masked_pixels and sum_masked_pixels (four doublewords) to zero, respectively. The processing loop in function AvxCalcImageMean_ uses a macro named _UpdateBlockSums to compute the intermediate values num_masked_pixels_tmp and sum_masked_pixels_tmp for a block of 64 pixels. This macro performs its calculations using packed byte and word arithmetic, which reduces the number of byte to doubleword size-promotions that must be carried out. Figure 7-10 illustrates the arithmetic and Boolean operations that are performed by _UpdateBlockSums. The values num_masked_pixels (R10D) and sum_masked_pixels (XMM5) are then updated and the processing loop repeats until all pixels have been processed.

../images/326959_2_En_7_Chapter/326959_2_En_7_Fig10_HTML.jpg — Figure 7-10.
Masked pixel sum and pixel count calculations performed by macro *_UpdateBlockSums*

Following completion of its processing loop, function AvxCalcImageMean_ calculates the final mean intensity value using scalar double-precision floating-point arithmetic. Note that num_mask_pixels is tested prior to calculating the mean in order to avoid a division-by-zero error. Here is the output for source code example Ch07_08:

Results for AvxThreshold

C++ X86-AVX

---------------------------------------------

SumPixelsMasked: 23813043 23813043

NumPixelsMasked: 138220 138220

MeanMaskedPixels: 172.2836 172.2836

Running benchmark function AvxThreshold_BM - please wait

Benchmark times save to file Ch07_08_AvxThreshold_BM_CHROMIUM.csv

Table 7-4 shows timing measurements for the source code example Ch07_08. Note that the measurements in this table are for an entire image thresholding and mean calculation sequence.

Table 7-4.

Mean Execution Times (Microseconds) to Perform Image Thresholding and Mean Calculation Using TestImage2.bmp

CPU	C++	Assembly Language
i7-4790S	289	50
i9-7900X	250	40
i7-8700K	242	39

Summary

Here are the key learning points for Chapter 7:

The vpadd[b|w|d|q] instructions perform packed addition. The vpadds[b|w] and vpaddus[b|w] instructions perform packed signed and unsigned saturated addition.
The vpsub[b|w|d|q] instructions perform packed subtraction. The vpsubs[b|w] and vpsubus[b|w] instructions perform packed signed and unsigned saturated subtraction.
The vpmul[h|l]w instructions carry out multiplication using packed word operands. The vpmuldq and vpmulld instructions carry out multiplication using packed doubleword operands.
The vpsll[w|d|q] and vpsrl[w|d|q] instructions execute logical left and right shifts using packed operands. The vpsra[w|d|q] instructions execute arithmetic right shifts using packed operands. The vps[l|r]dq instructions execute logical left and right shifts using 128-bit wide operands.
Assembly language functions can use the vpand, vpor, and vpxor instructions to perform bitwise AND, inclusive OR, and exclusive OR operations using packed integer operands.
The instructions vpextr[b|w|d|q] extract an element value from a packed operand. The vpinsr[b|w|d|q] instructions insert an element value into a packed operand.
The vpunpckl[bw|dw|dq] and vpunpckh[bw|dw|dq] instructions unpack and interleave the contents of their two source operands. These instructions are frequently used to size-promote packed integer operands. The vpackus[bw|dw] instructions size-reduce packed integer operands using unsigned saturated arithmetic.
The vpminu[b|w|d] and vpmaxu[b|w|d] instructions perform packed unsigned integer minimum-maximum compares.
The vpshufb instruction rearranges the bytes of a packed operand according to a control mask.
The vpcmpgt[b|w|d|q] instructions perform signed integer greater than compares using packed operands.
Aligning the target of a jump instruction to a 16-byte boundary often results in faster executing for loops.

Previous Chapter

6. AVX Programming – Packed Floating-Point

Next Chapter

8. Advanced Vector Extensions 2

Table of Contents for Modern X86 Assembly Language Programming: Covers x86 64-bit, AVX, AVX2, and AVX-512

7. AVX Programming – Packed Integers

Packed Integer Addition and Subtraction

Packed Integer Shifts

Packed Integer Multiplication

Packed Integer Image Processing

Pixel Minimum-Maximum Values

Pixel Mean Intensity

Pixel Conversions

Image Histograms

Image Thresholding

Summary

Table of Contents for
Modern X86 Assembly Language Programming: Covers x86 64-bit, AVX, AVX2, and AVX-512