x86 彙編/SSE

x86 彙編
快速連結：暫存器 • 移動 • 跳轉 • 計算 • 邏輯 • 重排 • 其他 • FPU

SSE 代表流式 SIMD 擴充套件。它本質上是MMX 指令的浮點等效指令。SSE 暫存器為 128 位，可用於對各種資料大小和型別執行操作。與 MMX 不同，SSE 暫存器不與浮點棧重疊。

暫存器

SSE 由英特爾於 1999 年在奔騰 III 中推出，建立了八個新的 128 位暫存器

XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7

最初，SSE 暫存器只能用作四個 32 位單精度浮點數（相當於 C 中的 float）。SSE2 擴充套件了 XMM 暫存器的功能，因此現在可以將它們用作

2 個 64 位浮點數（雙精度）
2 個 64 位整數
4 個 32 位浮點數（單精度）
4 個 32 位整數
8 個 16 位整數
16 個 8 位字元（位元組）

資料移動示例

以下程式（使用NASM 語法）使用 SIMD 指令執行資料移動。

;
; sseMove.asm - SSE / SSE2 data movement demonstration
;
; Syntax: NASM (Intel operand order: op dst, src)
; Target: 32-bit Linux ELF, entered at _start (no C runtime)
;
; Build:
;	nasm -felf32 -g sseMove.asm
;	ld -g sseMove.o
;	(on an x86-64 host the linker additionally needs: -m elf_i386)
;
; The program produces no output; the register and memory contents are
; meant to be inspected while single-stepping in a debugger.
;
global _start

SYS_EXIT	equ 1			; Linux i386 system call number for exit

section .data
	align 16
	v1:	dd 1.1, 2.2, 3.3, 4.4	; Four Single precision floats 32 bits each
	v1dp:	dq 1.1, 2.2		; Two Double precision floats 64 bits each
	v2:	dd 5.5, 6.6, 7.7, 8.8
	v2s1:	dd 5.5, 6.6, 7.7, -8.8	; One negative element   (sign mask 1000b)
	v2s2:	dd 5.5, 6.6, -7.7, -8.8	; Two negative elements  (sign mask 1100b)
	v2s3:	dd 5.5, -6.6, -7.7, -8.8 ; Three negative elements (sign mask 1110b)
	v2s4:	dd -5.5, -6.6, -7.7, -8.8 ; Four negative elements (sign mask 1111b)
	num1:	dd 1.2			; A single 4 byte float: everything after it is off the 16 byte grid
	v3:	dd 1.2, 2.3, 4.5, 6.7	; No longer 16 byte aligned
	v3dp:	dq 1.2, 2.3		; No longer 16 byte aligned

section .bss
	mask1:	resd 1			; Sign masks produced by movmskps below
	mask2:	resd 1
	mask3:	resd 1
	mask4:	resd 1

section .text
	_start:

;
;	op	dst,  src
;
				;
				; SSE
				;
				; Using movaps since vectors are 16 byte aligned
	movaps	xmm0, [v1]	; Move four 32-bit(single precision) floats to xmm0
	movaps	xmm1, [v2]
	movups	xmm2, [v3]	; Need to use movups since v3 is not 16 byte aligned
	;movaps	xmm3, [v3]	; This would seg fault if uncommented
	movss	xmm3, [num1]	; Move 32-bit float num1 to the least significant element of xmm3
	movss	xmm3, [v3]	; Move first 32-bit float of v3 to the least significant element of xmm3
	movlps	xmm4, [v3]	; Move 64-bits(two single precision floats) from memory to the lower 64-bit elements of xmm4
	movhps	xmm4, [v2]	; Move 64-bits(two single precision floats) from memory to the higher 64-bit elements of xmm4

				; Source and destination for movhlps and movlhps must be xmm registers
	movhlps	xmm5, xmm4	; Transfers the higher 64-bits of the source xmm4 to the lower 64-bits of the destination xmm5
	movlhps	xmm5, xmm4	; Transfers the lower 64-bits of the source xmm4 to the higher 64-bits of the destination xmm5


	movaps	xmm6, [v2s1]
	movmskps eax, xmm6	; Extract the sign bits from four 32-bit floats in xmm6 and create 4 bit mask in eax
	mov	[mask1], eax	; Should be 8
	movaps	xmm6, [v2s2]
	movmskps eax, xmm6	; Extract the sign bits from four 32-bit floats in xmm6 and create 4 bit mask in eax
	mov	[mask2], eax	; Should be 12
	movaps	xmm6, [v2s3]
	movmskps eax, xmm6	; Extract the sign bits from four 32-bit floats in xmm6 and create 4 bit mask in eax
	mov	[mask3], eax	; Should be 14
	movaps	xmm6, [v2s4]
	movmskps eax, xmm6	; Extract the sign bits from four 32-bit floats in xmm6 and create 4 bit mask in eax
	mov	[mask4], eax	; Should be 15


				;
				; SSE2
				;
	movapd	xmm6, [v1dp]	; Move two 64-bit(double precision) floats to xmm6, using movapd since vector is 16 byte aligned
				; Next two instructions should have equivalent results to movapd xmm6, [v1dp]
	movhpd	xmm6, [v1dp+8]	; Move a 64-bit(double precision) float into the higher 64-bit elements of xmm6
	movlpd	xmm6, [v1dp]	; Move a 64-bit(double precision) float into the lower 64-bit elements of xmm6
	movupd	xmm6, [v3dp]	; Move two 64-bit floats to xmm6, using movupd since vector is not 16 byte aligned

				;
				; _start has no caller to return to, so end the
				; process explicitly instead of running off the
				; end of the code.
				;
	xor	ebx, ebx	; Arg one: exit status 0
	mov	eax, SYS_EXIT	; Syscall number
	int	0x80

使用打包單精度浮點數進行算術運算的示例

以下程式（使用NASM 語法）對一些數字執行一些 SIMD 操作。

;
; ssedemo.asm - packed single-precision arithmetic: v3 = ((v1 + v2) * v2) - v2
;
; Syntax: NASM (op dst, src). Target: 32-bit Linux ELF, entered at _start.
;
global _start

section .data
    v1: dd 1.1, 2.2, 3.3, 4.4    ;first set of 4 numbers
    v2: dd 5.5, 6.6, 7.7, 8.8    ;second set
    
section .bss
    v3: resd 4    ;result
    
section .text
    _start:
    
    ;movups is used for both loads: v1 and v2 carry no alignment directive
    movups xmm0, [v1]   ;load v1 into xmm0
    movups xmm1, [v2]   ;load v2 into xmm1
    
    addps xmm0, xmm1    ;add the 4 numbers in xmm1 (from v2) to the 4 numbers in xmm0 (from v1), store in xmm0. for the first float the result will be 5.5+1.1=6.6
    mulps xmm0, xmm1    ;multiply the four numbers in xmm1 (from v2, unchanged) with the results from the previous calculation (in xmm0), store in xmm0. for the first float the result will be 5.5*6.6=36.3
    subps xmm0, xmm1    ;subtract the four numbers in v2 (in xmm1, still unchanged) from the result of the previous calculation (in xmm0), store in xmm0. for the first float, the result will be 36.3-5.5=30.8
    
    movups [v3], xmm0   ;store the result (xmm0) in v3
    
    ;end program
    ;_start is entered without a return address on the stack, so "ret"
    ;would jump to a garbage address; use the exit system call instead
    xor ebx, ebx        ;exit status 0
    mov eax, 1          ;Linux i386 system call number for exit
    int 0x80

結果值應為

30.800    51.480    77.000    107.360

使用 GNU 工具鏈，你可以像這樣除錯和單步執行

 % nasm -felf32 -g ssedemo.asm
 % ld -g ssedemo.o            
 % gdb -q ./a.out                
Reading symbols from a.out...done.
(gdb) break _start
Breakpoint 1 at 0x8048080
(gdb) r
Starting program: a.out 

Breakpoint 1, 0x08048080 in _start ()
(gdb) disass
Dump of assembler code for function _start:
=> 0x08048080 <+0>:	movups 0x80490a0,%xmm0
   0x08048087 <+7>:	movups 0x80490b0,%xmm1
   0x0804808e <+14>:	addps  %xmm1,%xmm0
   0x08048091 <+17>:	mulps  %xmm1,%xmm0
   0x08048094 <+20>:	subps  %xmm1,%xmm0
   0x08048097 <+23>:	movups %xmm0,0x80490c0
End of assembler dump.
(gdb) stepi
0x08048087 in _start ()
(gdb) 
0x0804808e in _start ()
(gdb) p $xmm0
$1 = {v4_float = {1.10000002, 2.20000005, 3.29999995, 4.4000001}, v2_double = {3.6000008549541236, 921.60022034645078}, v16_int8 = {-51, -52, -116, 63, 
    -51, -52, 12, 64, 51, 51, 83, 64, -51, -52, -116, 64}, v8_int16 = {-13107, 16268, -13107, 16396, 13107, 16467, -13107, 16524}, v4_int32 = {1066192077, 
    1074580685, 1079194419, 1082969293}, v2_int64 = {4615288900054469837, 4651317697086436147}, uint128 = 0x408ccccd40533333400ccccd3f8ccccd}
(gdb) x/4f &v1
0x80490a0 <v1>:	1.10000002	2.20000005	3.29999995	4.4000001
(gdb) stepi
0x08048091 in _start ()
(gdb) p $xmm0
$2 = {v4_float = {6.5999999, 8.80000019, 11, 13.2000008}, v2_double = {235929.65665283203, 5033169.0185546875}, v16_int8 = {51, 51, -45, 64, -51, -52, 12, 
    65, 0, 0, 48, 65, 52, 51, 83, 65}, v8_int16 = {13107, 16595, -13107, 16652, 0, 16688, 13108, 16723}, v4_int32 = {1087583027, 1091357901, 1093664768, 
    1095971636}, v2_int64 = {4687346494113788723, 4707162335057281024}, uint128 = 0x4153333441300000410ccccd40d33333}
(gdb)

偵錯程式命令解釋

break: 在本例中，在給定標籤處設定斷點
stepi: 在程式中向前執行一步指令
p: print 的縮寫，列印給定暫存器或變數。在 GDB 中，暫存器以 $ 為字首。
x: examine 的縮寫，檢查給定記憶體地址。"/4f" 表示 "4 個浮點數"（GDB 中的浮點數為 32 位）。你可以使用 c 表示字元，x 表示十六進位制，當然也可以使用任何其他數字代替 4。"&" 獲取 v1 的地址，與 C 中相同。

使用 `shufps` 進行亂序的示例

shufps IMM8, arg1, arg2	GAS 語法
shufps arg2, arg1, IMM8	英特爾語法

shufps 可用於對打包單精度浮點數進行亂序。該指令採用三個引數：arg2 為目的運算元，必須是 xmm 暫存器；arg1 為來源運算元，可以是 xmm 暫存器或 128 位記憶體位置；IMM8 為 8 位立即數控制位元組。shufps 會從 arg2 和 arg1 各選取兩個元素，並將結果寫回 arg2：結果中較低的兩個元素選自 arg2（目的運算元原本的內容），較高的兩個元素選自 arg1（來源運算元）。

IMM8 控制位元組描述

IMM8 控制位元組被分成四個位欄位組，它們控制輸出到 arg2，如下所示

IMM8[1:0] 指定 arg2（目的運算元）中哪個元素最終位於 arg2 的最低有效元素中

IMM8[1:0] 描述

00b 複製最低有效元素

01b 複製第二個元素

10b 複製第三個元素

11b 複製最高有效元素
IMM8[3:2] 指定 arg2（目的運算元）中哪個元素最終位於 arg2 的第二個元素中

IMM8[3:2] 描述

00b 複製最低有效元素

01b 複製第二個元素

10b 複製第三個元素

11b 複製最高有效元素
IMM8[5:4] 指定 arg1（來源運算元）中哪個元素最終位於 arg2 的第三個元素中

IMM8[5:4] 描述

00b 複製最低有效元素

01b 複製第二個元素

10b 複製第三個元素

11b 複製最高有效元素
IMM8[7:6] 指定 arg1（來源運算元）中哪個元素最終位於 arg2 的最高有效元素中

IMM8[7:6] 描述

00b 複製最低有效元素

01b 複製第二個元素

10b 複製第三個元素

11b 複製最高有效元素

IMM8 示例

考慮位元組 0x1B

位號（0 為 LSB）	7	6	5	4	3	2	1	0
位元組值	0x1B
四位位元組值	0x1				0xB
2 位整數（十進位制）值	0		1		2		3
位值	0	0	0	1	1	0	1	1

上面顯示的 2 位值用於確定哪些元素被複制到 arg2。位 7-4 是 arg1（來源運算元）中的 "索引"，位 3-0 是 arg2（目的運算元）中的 "索引"。

由於位 7-6 為 0，因此 arg1 的最低有效元素被複制到 arg2 的最高有效元素中，即位 127-96。
由於位 5-4 為 1，因此 arg1 的第二個元素被複制到 arg2 的第三個元素中，即位 95-64。
由於位 3-2 為 2，因此 arg2 的第三個元素被複制到 arg2 的第二個元素中，即位 63-32。
由於位 1-0 為 3，因此 arg2 的第四個元素被複制到 arg2 的最低有效元素中，即位 31-0。

請注意，由於以下示例中的第一個和第二個引數相等，因此掩碼 0x1B 將有效地反轉 XMM 暫存器中浮點數的順序，因為 2 位整數為 0、1、2、3。如果是 3、2、1、0 (0xE4)，它將是一個無操作。如果是 0、0、0、0 (0x00)，它將是最低有效 32 位的廣播。

示例

# shufps.S - SHUFPS demonstration
#
# Syntax: GAS, AT&T operand order (op src, dst). Entered at _start, no C runtime.
# Every shuffle below uses the same register as source and destination, so
# all four selected elements come from that one register.
.data
	.align 16
        v1: .float 1.1, 2.2, 3.3, 4.4
        v2: .float 5.5, 6.6, 7.7, 8.8
        v3: .float 0, 0, 0, 0
 
.text
.global _start 
_start:   
        movaps  v1,%xmm0        # load v1 into xmm0 to xmm6
        movaps  v1,%xmm1	# using movaps since v1 is 16 byte aligned
        movaps  v1,%xmm2
        movaps  v1,%xmm3
        movaps  v1,%xmm4
        movaps  v1,%xmm5
        movaps  v1,%xmm6
 
        shufps $0x1b, %xmm0, %xmm0 # reverse order of the 4 floats
        shufps $0x00, %xmm1, %xmm1 # Broadcast least significant element to all elements
        shufps $0x55, %xmm2, %xmm2 # Broadcast second element to all elements
        shufps $0xAA, %xmm3, %xmm3 # Broadcast third element to all elements
        shufps $0xFF, %xmm4, %xmm4 # Broadcast most significant element to all elements
        shufps $0x39, %xmm5, %xmm5 # Rotate elements right
        shufps $0x93, %xmm6, %xmm6 # Rotate elements left 

        movups  %xmm0,v3        # store xmm0 (v1 in reversed order) in v3

        # _start is entered without a return address on the stack, so "ret"
        # would jump to a garbage address; end the process with the exit
        # system call instead.
        xorl    %ebx,%ebx       # exit status 0
        movl    $1,%eax         # Linux i386 system call number for exit
        int     $0x80

使用 GAS 構建 ELF 可執行檔案

as -g shufps.S -o shufps.o
ld -g shufps.o

文字處理指令

SSE 4.2 添加了四個字串文字處理指令 PCMPISTRI、PCMPISTRM、PCMPESTRI 和 PCMPESTRM。這些指令採用三個引數，arg1 為 xmm 暫存器，arg2 為 xmm 或 128 位記憶體位置，IMM8 為 8 位立即數控制位元組。這些指令將對 arg1 和 arg2 的打包內容執行算術比較。IMM8 指定輸入/輸出格式以及兩個中間處理階段的操作。中間處理階段 1 和階段 2 的結果將分別稱為 IntRes1 和 IntRes2。這些指令還透過對算術標誌（AF、CF、OF、PF、SF 和 ZF）的過載使用提供有關結果的附加資訊。

這些指令分多個步驟進行

比較 arg1 和 arg2
將聚合操作應用於比較結果，結果流入 IntRes1
執行可選的否定操作，結果流入 IntRes2
生成一個索引（在ECX中）或掩碼（在XMM0中）形式的輸出

IMM8 控制位元組描述

IMM8 控制位元組被分成四組位域，控制以下設定

IMM8[1:0] 指定 128 位源資料的格式（arg1 和 arg2）

IMM8[1:0]	描述
00b	無符號位元組（16 個打包的無符號位元組）
01b	無符號字（8 個打包的無符號字）
10b	有符號位元組（16 個打包的有符號位元組）
11b	有符號字（8 個打包的有符號字）

IMM8[3:2] 指定聚合操作，其結果將被放置在中間結果 1 中，我們將稱之為 IntRes1。IntRes1 的大小將取決於源資料的格式，打包位元組為 16 位，打包字為 8 位

IMM8[3:2]	描述
00b	等於任何，arg1 是一個字元集，arg2 是要搜尋的字串。如果 arg2[i] 位於 arg1 表示的集合中，則 IntRes1[i] 設定為 1 arg1 = "aeiou" arg2 = "Example string 1" IntRes1 = 0010001000010000
01b	範圍，arg1 是一組字元範圍，例如 "09az" 表示從 0 到 9 和從 a 到 z 的所有字元，arg2 是要搜尋的字串。如果 arg2[i] 位於 arg1 表示的任何範圍內，則 IntRes1[i] 設定為 1 arg1 = "09az" arg2 = "Testing 1 2 3, T" IntRes1 = 0111111010101000
10b	每個都相等，arg1 是字串一，arg2 是字串二。如果 arg1[i] == arg2[i]，則 IntRes1[i] 設定為 1 arg1 = "The quick brown " arg2 = "The quack green " IntRes1 = 1111110111010011
11b	有序相等，arg1 是要搜尋的子字串，arg2 是要搜尋的字串。如果子字串 arg1 可以在位置 arg2[i] 處找到，則 IntRes1[i] 設定為 1 arg1 = "he" arg2 = ", he helped her " IntRes1 = 0010010000001000

IMM8[5:4] 指定 IntRes1 的極性或處理，到中間結果 2，將被稱為 IntRes2

IMM8[5:4]	描述
00b	正極性	IntRes2 = IntRes1
01b	負極性	IntRes2 = -1 XOR IntRes1
10b	掩碼正	IntRes2 = IntRes1
11b	掩碼負	如果 reg/mem[i] 無效，則 IntRes2 = IntRes1，否則 ~IntRes1

IMM8[6] 指定輸出選擇，或 IntRes2 如何處理到輸出中。對於 PCMPESTRI 和 PCMPISTRI，輸出是當前由 arg2 引用的資料的索引

IMM8[6] 描述

0b 最低有效索引 ECX 包含 IntRes2 中被設定（為 1）的最低有效位的索引

1b 最高有效索引 ECX 包含 IntRes2 中被設定（為 1）的最高有效位的索引

對於 PCMPESTRM 和 PCMPISTRM，輸出是一個掩碼，反映了 IntRes2 中所有設定的位

IMM8[6]	描述
0b	最低有效索引	位掩碼，XMM0 的最低有效位包含 IntRes2 16(8) 位掩碼。XMM0 被零擴充套件到 128 位。
1b	最高有效索引	位元組/字掩碼，XMM0 包含擴充套件到位元組/字掩碼的 IntRes2

IMM8[7] 應設定為零，因為它沒有設計的含義。

四個指令

pcmpistri IMM8, arg2, arg1	GAS 語法
pcmpistri arg1, arg2, IMM8	英特爾語法

PCMPISTRI，打包比較隱式長度字串，返回索引。比較隱式長度的字串並在 ECX 中生成索引。

運算元

arg1

XMM 暫存器

arg2

XMM 暫存器
記憶體

IMM8

8 位立即值

修改後的標誌

如果 IntRes2 為零，則 CF 被重置，否則被設定
如果在 arg2 中找到空終止字元，則 ZF 被設定，否則被重置
如果在 arg1 中找到空終止字元，則 SF 被設定，否則被重置
OF 設定為 IntRes2[0]
AF 被重置
PF 被重置

示例

;
; nasm -felf32 -g sse4_2StrPcmpistri.asm -l sse4_2StrPcmpistri.lst
; gcc -o sse4_2StrPcmpistri sse4_2StrPcmpistri.o
;
global main 

extern printf
extern strlen
extern strcmp

section .data
	align 4
	;
	; Fill buf1 with a repeating pattern of ABCD
	;
	; 10 dwords = 40 bytes. 0x44434241 is stored little-endian, i.e. as
	; the bytes 'A','B','C','D'. main later writes a zero byte at
	; buf1+39 to turn the buffer into a C string.
	;
	buf1:		times 10 dd 0x44434241
	;
	; Test strings for the compare routines: s2 extends s1, s3 is a
	; prefix of s1.
	;
	s1:		db "This is a string", 0
	s2:		db "This is a string slightly different string", 0
	s3:		db "This is a str", 0
	;
	; printf format strings (0x0A = newline). The "b" variants label
	; the output of the libc reference calls strlen(3) / strcmp(3).
	;
	fmtStr1:	db "String: %s len: %d", 0x0A, 0
	fmtStr1b:	db "strlen(3): String: %s len: %d", 0x0A, 0
	fmtStr2:	db "s1: =%s= and s2: =%s= compare: %d", 0x0A, 0
	fmtStr2b:	db "strcmp(3): s1: =%s= and s2: =%s= compare: %d", 0x0A, 0

;
; Functions will follow the cdecl call convention
;
section .text
;
; main: test driver. For buf1 it prints the length computed by
; strlenSSE42 and by libc strlen; for the pairs (s1, s2) and (s1, s3) it
; prints the result of strcmpSSE42 and of libc strcmp.
;
; Stack: one 16 byte outgoing-argument area at [esp]..[esp+12] is
; reserved once and reused for every call.
; ebx is used as scratch without being saved; that is only safe because
; main never returns (it ends by calling exit).
;
	main:			; Using main since we are using gcc to link

	and	esp, -16	; 16 byte align the stack (clear the low four bits)
	sub	esp, 16		; space for four 4 byte parameters

	;
	; Null terminate buf1, make it proper C string, length is now 39
	;
	mov	[buf1+39], byte 0x00

	lea	eax, [buf1]
	mov	[esp], eax	; Arg1: pointer of string to calculate the length of
	mov	ebx, eax	; Save pointer in ebx since we will use it again
	call	strlenSSE42
	mov	edx, eax	; Copy length of arg1 into edx
	
	mov	[esp+8], edx	; Arg3: length of string
	mov	[esp+4], ebx	; Arg2: pointer to string
	lea	eax, [fmtStr1]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf		; Call printf(3):
				;	int printf(const char *format, ...);

	lea	eax, [buf1]
	mov	[esp], eax	; Arg1: pointer of string to calculate the length of
	mov	ebx, eax	; Save pointer in ebx since we will use it again
	call	strlen		; Call strlen(3):
				;	size_t strlen(const char *s);
	mov	edx, eax	; Copy length of arg1 into edx
	
	mov	[esp+8], edx	; Arg3: length of string
	mov	[esp+4], ebx	; Arg2: pointer to string
	lea	eax, [fmtStr1b]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf		; Call printf(3):
				;	int printf(const char *format, ...);

	lea	eax, [s2]
	mov	[esp+4], eax	; Arg2: pointer to second string to compare
	lea	eax, [s1]
	mov	[esp], eax	; Arg1: pointer to first string to compare
	call	strcmpSSE42

	mov	[esp+12], eax	; Arg4: result from strcmpSSE42
	lea	eax, [s2]
	mov	[esp+8], eax	; Arg3: pointer to second string
	lea	eax, [s1]
	mov	[esp+4], eax	; Arg2: pointer to first string
	lea	eax, [fmtStr2]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf

	lea	eax, [s2]
	mov	[esp+4], eax	; Arg2: pointer to second string to compare
	lea	eax, [s1]
	mov	[esp], eax	; Arg1: pointer to first string to compare
	call	strcmp		; Call strcmp(3):
				;	int strcmp(const char *s1, const char *s2);

	mov	[esp+12], eax	; Arg4: result from strcmp(3)
	lea	eax, [s2]
	mov	[esp+8], eax	; Arg3: pointer to second string
	lea	eax, [s1]
	mov	[esp+4], eax	; Arg2: pointer to first string
	lea	eax, [fmtStr2b]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf

	lea	eax, [s3]
	mov	[esp+4], eax	; Arg2: pointer to second string to compare
	lea	eax, [s1]
	mov	[esp], eax	; Arg1: pointer to first string to compare
	call	strcmpSSE42

	mov	[esp+12], eax	; Arg4: result from strcmpSSE42
	lea	eax, [s3]
	mov	[esp+8], eax	; Arg3: pointer to second string
	lea	eax, [s1]
	mov	[esp+4], eax	; Arg2: pointer to first string
	lea	eax, [fmtStr2]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf

	lea	eax, [s3]
	mov	[esp+4], eax	; Arg2: pointer to second string to compare
	lea	eax, [s1]
	mov	[esp], eax	; Arg1: pointer to first string to compare
	call	strcmp		; Call strcmp(3):
				;	int strcmp(const char *s1, const char *s2);

	mov	[esp+12], eax	; Arg4: result from strcmp(3)
	lea	eax, [s3]
	mov	[esp+8], eax	; Arg3: pointer to second string
	lea	eax, [s1]
	mov	[esp+4], eax	; Arg2: pointer to first string
	lea	eax, [fmtStr2b]
	mov	[esp], eax	; Arg1: pointer to format string
	call	printf

	call	exit		; The exit label defined in this file; does not return


;
; size_t strlen(const char *s);
;
; strlenSSE42: length of a null terminated string, scanned 16 bytes per
; iteration with PCMPISTRI (SSE4.2).
;
; Convention: cdecl
; In:	[ebp+8]	= s, pointer to the string
; Out:	eax	= number of bytes before the first null byte
; Overwrites: ecx, edx, xmm0, flags
;
; NOTE(review): every iteration reads a full 16 bytes from [edx+eax], so
; the final read can extend up to 15 bytes past the terminator - confirm
; that is acceptable for the buffers passed in.
;
strlenSSE42:
	push	ebp
	mov	ebp, esp

	mov	edx, [ebp+8]	; Arg1: copy s(pointer to string) to edx 
	;
	; We are looking for null terminating char, so set xmm0 to zero
	;
	pxor	xmm0, xmm0
	mov	eax, -16	; Avoid extra jump in main loop

strlenLoop:
	add	eax, 16		; eax = offset of the 16 byte chunk examined next
	;
	; IMM8[1:0]	= 00b
	;	Src data is unsigned bytes(16 packed unsigned bytes)
	; IMM8[3:2]	= 10b
	; 	We are using Equal Each aggregation
	; IMM8[5:4]	= 00b
	;	Positive Polarity, IntRes2	= IntRes1
	; IMM8[6]	= 0b
	;	ECX contains the least significant set bit in IntRes2
	;
	pcmpistri	xmm0,[edx+eax], 0001000b
	;
	; Loop while ZF = 0 (jnz), which means none of the 16 bytes pointed
	; to by edx+eax are zero. PCMPISTRI sets ZF once the memory operand
	; contains a null byte.
	;
	jnz	strlenLoop
	
	;
	; ecx will contain the offset from edx+eax where the first null
	; terminating character was found.
	;
	add	eax, ecx	; length = offset of the last chunk + index of the null inside it
	pop	ebp
	ret

;
; int strcmp(const char *s1, const char *s2);
;
; strcmpSSE42: compare two null terminated strings 16 bytes per
; iteration with PCMPISTRI (SSE4.2).
;
; Convention: cdecl
; In:	[ebp+8]	 = s1, [ebp+12] = s2
; Out:	eax	 = 0 when the strings are equal, otherwise the first
;		   differing byte of s1 minus the corresponding byte of
;		   s2 (both zero extended)
; Overwrites: ecx, edx, xmm0, flags
;
; NOTE(review): both strings are read in full 16 byte chunks, so reads
; can extend past either terminator - confirm that is acceptable for the
; buffers passed in.
;
strcmpSSE42:
	push	ebp
	mov	ebp, esp

	mov	eax, [ebp+8]	; Arg1: copy s1(pointer to string) to eax
	mov	edx, [ebp+12]	; Arg2: copy s2(pointer to string) to edx
	;
	; Subtract s2(edx) from s1(eax). This admittedly looks odd, but we
	; can now use edx to index into s1 and s2. As we adjust edx to move
	; forward into s2, we can then add edx to eax and this will give us
	; the comparable offset into s1 i.e. if we take edx + 16 then:
	;
	;	edx 	= edx + 16		= edx + 16
	;	eax+edx	= eax -edx + edx + 16	= eax + 16
	;
	; therefore edx points to s2 + 16 and eax + edx points to s1 + 16.
	; We thus only need one index, convoluted but effective.
	;
	sub	eax, edx
	sub	edx, 16		; Avoid extra jump in main loop

strcmpLoop:
	add	edx, 16
	movdqu	xmm0, [edx]	; xmm0 = current 16 bytes of s2 (unaligned load)
	;
	; IMM8[1:0]	= 00b
	;	Src data is unsigned bytes(16 packed unsigned bytes)
	; IMM8[3:2]	= 10b
	; 	We are using Equal Each aggregation
	; IMM8[5:4]	= 01b
	;	Negative Polarity, IntRes2	= -1 XOR IntRes1
	; IMM8[6]	= 0b
	;	ECX contains the least significant set bit in IntRes2
	;
	pcmpistri	xmm0, [edx+eax], 0011000b
	;
	; Loop while ZF=0 and CF=0 (ja). The loop ends as soon as either:
	;
	;	1) We find a null in s1(edx+eax) ZF=1
	;	2) We find a char that does not match CF=1
	;
	ja	strcmpLoop

	;
	; Jump if CF=1, we found a mismatched char
	;
	jc	strcmpDiff

	;
	; We terminated loop due to a null character i.e. CF=0 and ZF=1
	;
	xor	eax, eax	; They are equal so return zero
	jmp	exitStrcmp

strcmpDiff:
	add	eax, edx	; Set offset into s1 to match s2
	;
	; ecx is offset from current position where two strings do not match,
	; so copy the respective non-matching byte into eax and edx and fill
	; in remaining bits w/ zero.
	;
	movzx	eax, byte[eax+ecx]
	movzx	edx, byte[edx+ecx]
	;
	; If s1 is less than s2 return integer less than zero, otherwise return
	; integer greater than zero.
	;
	sub	eax, edx

exitStrcmp:
	pop	ebp
	ret

extern fflush			; libc: int fflush(FILE *stream);

;
; exit: terminate the process with status 0. Does not return.
;
; The raw exit system call ends the process without running any libc
; cleanup, so text that printf has buffered is lost when stdout is not a
; terminal (e.g. redirected to a file or a pipe). fflush(NULL) writes out
; every open output stream before the process is terminated.
;
exit:
	sub	esp, 12		; 12 + the 4 byte return address = 16, so esp keeps
				; the alignment the caller had at its call
	mov	dword [esp], 0	; Arg1: NULL, flush all open output streams
	call	fflush		; Call fflush(3):
				;	int fflush(FILE *stream);
				;
				; Exit system call (Linux i386, int 0x80):
				;	void _exit(int status)
				;
	xor	ebx, ebx	; Arg one: the status
	mov	eax, 1		; Syscall number:
	int 	0x80

預期輸出

String: ABCDABCDABCDABCDABCDABCDABCDABCDABCDABC len: 39
strlen(3): String: ABCDABCDABCDABCDABCDABCDABCDABCDABCDABC len: 39
s1: =This is a string= and s2: =This is a string slightly different string= compare: -32
strcmp(3): s1: =This is a string= and s2: =This is a string slightly different string= compare: -32
s1: =This is a string= and s2: =This is a str= compare: 105
strcmp(3): s1: =This is a string= and s2: =This is a str= compare: 105

pcmpistrm IMM8, arg2, arg1	GAS 語法
pcmpistrm arg1, arg2, IMM8	英特爾語法

PCMPISTRM，打包比較隱式長度字串，返回掩碼。比較隱式長度的字串並在 XMM0 中生成掩碼。

運算元

arg1

XMM 暫存器

arg2

XMM 暫存器
記憶體

IMM8

8 位立即值

修改後的標誌

如果 IntRes2 為零，則 CF 被重置，否則被設定
如果在 arg2 中找到空終止字元，則 ZF 被設定，否則被重置
如果在 arg1 中找到空終止字元，則 SF 被設定，否則被重置
OF 設定為 IntRes2[0]
AF 被重置
PF 被重置

pcmpestri IMM8, arg2, arg1	GAS 語法
pcmpestri arg1, arg2, IMM8	英特爾語法

PCMPESTRI，打包比較顯式長度字串，返回索引。比較顯式長度的字串並在 ECX 中生成索引。

運算元

arg1

XMM 暫存器

arg2

XMM 暫存器
記憶體

IMM8

8 位立即值

隱式運算元

EAX 儲存 arg1 的長度
EDX 儲存 arg2 的長度

修改後的標誌

如果 IntRes2 為零，則 CF 被重置，否則被設定
如果 EDX < 16（對於位元組）或 8（對於字），則 ZF 被設定，否則被重置
如果 EAX < 16（對於位元組）或 8（對於字），則 SF 被設定，否則被重置
OF 設定為 IntRes2[0]
AF 被重置
PF 被重置

pcmpestrm IMM8, arg2, arg1	GAS 語法
pcmpestrm arg1, arg2, IMM8	英特爾語法

PCMPESTRM，打包比較顯式長度字串，返回掩碼。比較顯式長度的字串並在 XMM0 中生成掩碼。

運算元

arg1

XMM 暫存器

arg2

XMM 暫存器
記憶體

IMM8

8 位立即值

隱式運算元

EAX 儲存 arg1 的長度
EDX 儲存 arg2 的長度

修改後的標誌

如果 IntRes2 為零，則 CF 被重置，否則被設定
如果 EDX < 16（對於位元組）或 8（對於字），則 ZF 被設定，否則被重置
如果 EAX < 16（對於位元組）或 8（對於字），則 SF 被設定，否則被重置
OF 設定為 IntRes2[0]
AF 被重置
PF 被重置

SSE 指令集

實際上有數百個 SSE 指令，其中一些能夠完成比簡單的 SIMD 算術運算更復雜的操作。有關更深入的參考資料，請檢視本書的資源章節。

您可能會注意到許多浮點 SSE 指令以 PS 或 SD 之類的結尾。這些字尾區分操作的不同版本。第一個字母描述指令應該是Packed（打包）還是Scalar（標量）。打包操作應用於暫存器的每個成員，而標量操作僅應用於第一個值。例如，在虛擬碼中，打包加法將被執行為

v1[0] = v1[0] + v2[0]
v1[1] = v1[1] + v2[1]
v1[2] = v1[2] + v2[2]
v1[3] = v1[3] + v2[3]

而標量加法將僅為

v1[0] = v1[0] + v2[0]

第二個字母表示資料大小：Single（單精度）或Double（雙精度）。這只是告訴處理器分別使用暫存器作為四個 32 位浮點數或兩個 64 位雙精度數。

SSE：在奔騰 III 中新增

浮點指令

ADDPS，ADDSS，CMPPS，CMPSS，COMISS，CVTPI2PS，CVTPS2PI，CVTSI2SS，CVTSS2SI，CVTTPS2PI，CVTTSS2SI，DIVPS，DIVSS，LDMXCSR，MAXPS，MAXSS，MINPS，MINSS，MOVAPS，MOVHLPS，MOVHPS，MOVLHPS，MOVLPS，MOVMSKPS，MOVNTPS，MOVSS，MOVUPS，MULPS，MULSS，RCPPS，RCPSS，RSQRTPS，RSQRTSS，SHUFPS，SQRTPS，SQRTSS，STMXCSR，SUBPS，SUBSS，UCOMISS，UNPCKHPS，UNPCKLPS

整數指令

ANDNPS，ANDPS，ORPS，PAVGB，PAVGW，PEXTRW，PINSRW，PMAXSW，PMAXUB，PMINSW，PMINUB，PMOVMSKB，PMULHUW，PSADBW，PSHUFW，XORPS

SSE2：在奔騰 4 中新增

浮點指令

ADDPD，ADDSD，ANDNPD，ANDPD，CMPPD，CMPSD*，COMISD，CVTDQ2PD，CVTDQ2PS，CVTPD2DQ，CVTPD2PI，CVTPD2PS，CVTPI2PD，CVTPS2DQ，CVTPS2PD，CVTSD2SI，CVTSD2SS，CVTSI2SD，CVTSS2SD，CVTTPD2DQ，CVTTPD2PI，CVTTPS2DQ，CVTTSD2SI，DIVPD，DIVSD，MAXPD，MAXSD，MINPD，MINSD，MOVAPD，MOVHPD，MOVLPD，MOVMSKPD，MOVSD*，MOVUPD，MULPD，MULSD，ORPD，SHUFPD，SQRTPD，SQRTSD，SUBPD，SUBSD，UCOMISD，UNPCKHPD，UNPCKLPD，XORPD

* CMPSD 和 MOVSD 與字串指令助記符 CMPSD（CMPS）和 MOVSD（MOVS）具有相同的名稱；但是，前者指的是標量雙精度浮點數，而後者指的是雙字字串。

整數指令

MOVDQ2Q，MOVDQA，MOVDQU，MOVQ2DQ，PADDQ，PSUBQ，PMULUDQ，PSHUFHW，PSHUFLW，PSHUFD，PSLLDQ，PSRLDQ，PUNPCKHQDQ，PUNPCKLQDQ

SSE3：在後來的奔騰 4 中新增

ADDSUBPD，ADDSUBPS，HADDPD，HADDPS，HSUBPD，HSUBPS，MOVDDUP，MOVSHDUP，MOVSLDUP

SSSE3：在至強 5100 和早期酷睿 2 中新增

PSIGNW，PSIGND，PSIGNB，PSHUFB，PMULHRSW，PMADDUBSW，PHSUBW，PHSUBSW，PHSUBD，PHADDW，PHADDSW，PHADDD，PALIGNR，PABSW，PABSD，PABSB

SSE4

SSE4.1：在後來的酷睿 2 中新增

MPSADBW，PHMINPOSUW，PMULLD，PMULDQ，DPPS，DPPD，BLENDPS，BLENDPD，BLENDVPS，BLENDVPD，PBLENDVB，PBLENDW，PMINSB，PMAXSB，PMINUW，PMAXUW，PMINUD，PMAXUD，PMINSD，PMAXSD，ROUNDPS，ROUNDSS，ROUNDPD，ROUNDSD，INSERTPS，PINSRB，PINSRD，PINSRQ，EXTRACTPS，PEXTRB，PEXTRW，PEXTRD，PEXTRQ，PMOVSXBW，PMOVZXBW，PMOVSXBD，PMOVZXBD，PMOVSXBQ，PMOVZXBQ，PMOVSXWD，PMOVZXWD，PMOVSXWQ，PMOVZXWQ，PMOVSXDQ，PMOVZXDQ，PTEST，PCMPEQQ，PACKUSDW，MOVNTDQA

SSE4a：在羿龍中新增

LZCNT，POPCNT，EXTRQ，INSERTQ，MOVNTSD，MOVNTSS

SSE4.2：在 Nehalem 中新增

CRC32, PCMPESTRI, PCMPESTRM, PCMPISTRI, PCMPISTRM, PCMPGTQ

IMM8[7:6]	描述
00b	複製到最低有效元素
01b	複製到第二個元素
10b	複製到第三個元素
11b	複製到最高有效元素

暫存器

資料移動示例

使用打包單精度浮點數進行算術運算的示例

偵錯程式命令解釋

使用 shufps 進行亂序的示例

IMM8 控制位元組描述

文字處理指令

IMM8 控制位元組描述

四個指令

SSE 指令集

SSE：在奔騰 III 中新增

SSE2：在奔騰 4 中新增

SSE3：在後來的奔騰 4 中新增

SSSE3：在至強 5100 和早期酷睿 2 中新增

SSE4

SSE4.1：在後來的酷睿 2 中新增

SSE4a：在羿龍中新增

SSE4.2：在 Nehalem 中新增

使用 `shufps` 進行亂序的示例