但是,失败的(慢速路径)存储转发似乎不会阻塞成功的存储转发。我在 Skylake (i7-6700k) 上做了测试,构造了一个包含两条依赖链的测试循环:
;; nasm -felf64 testloop.asm
;; ld -o testloop testloop.o
;; taskset -c 3 perf stat -etask-clock:u,context-switches:u,cpu-migrations:u,page-faults:u,cycles:u,branches:u,instructions:u,uops_issued.any:u,uops_executed.thread:u,idq.dsb_uops:u -r1 ./testloop
; Store-forwarding micro-benchmark: static x86-64 Linux executable, NASM syntax
; (the __YASM_VER__ branch keeps the file buildable with YASM).
;
; Each loop iteration runs two chains through memory:
;   * a dword store to [rdi+64] followed, later in the iteration, by a qword
;     reload of the same address (the reload is wider than the store);
;   * three dword store/reload round trips through [rdi] using eax.
;
; Register roles: rdi = buf, ebp = iterations remaining, eax / rcx = chain data.
default rel

%ifdef __YASM_VER__
    CPU Conroe AMD
    CPU Skylake AMD
%else
%use smartalign
alignmode p6, 64
%endif

global _start
_start:
    lea     rdi, [buf]
    mov     ebp, 100000000          ; iteration count

align 64
.loop:
    mov     [rdi+64], ecx           ; dword store feeding the wider qword reload below
;   mov     rcx, [rdi+64]           ; reload here: 16c. Or 16.8 if we *also* reload after the %rep block

%rep 3
    mov     [rdi], eax              ; same-size store ...
    mov     eax, [rdi]              ; ... and reload of the same dword
%endrep

    mov     rcx, [rdi+64]           ; reload here: 15c

    dec     ebp
    jnz     .loop
.end:

;;NASM-only, not YASM: %if __BITS__ == 32
%ifidn __OUTPUT_FORMAT__, elf32
    mov     eax, 1
    xor     ebx, ebx
    int     0x80                    ; sys_exit(0) 32-bit ABI
%else
    xor     edi, edi
    mov     eax, 231                ; __NR_exit_group from /usr/include/asm/unistd_64.h
    syscall                         ; sys_exit_group(0)
%endif

section .bss
align 4096
buf:    resb 4096
$ t=testloop; asm-link -dn "$t".asm && taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,instructions,uops_issued.any,uops_executed.thread,ld_blocks.store_forward,resource_stalls.sb -r2 ./"$t"
+ nasm -felf64 -Worphan-labels testloop.asm
+ ld -o testloop testloop.o
testloop: file format elf64-x86-64
Disassembly of section .text:
0000000000401000 <_start>:
401000: 48 8d 3d f9 0f 00 00 lea rdi,[rip+0xff9] # 402000 <__bss_start>
401007: bd 00 e1 f5 05 mov ebp,0x5f5e100
40100c: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
401014: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
40101c: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
401024: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
40102c: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
401034: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
40103c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
0000000000401040 <_start.loop>:
401040: 89 4f 40 mov DWORD PTR [rdi+0x40],ecx
401043: 89 07 mov DWORD PTR [rdi],eax
401045: 8b 07 mov eax,DWORD PTR [rdi]
401047: 89 07 mov DWORD PTR [rdi],eax
401049: 8b 07 mov eax,DWORD PTR [rdi]
40104b: 89 07 mov DWORD PTR [rdi],eax
40104d: 8b 07 mov eax,DWORD PTR [rdi]
40104f: 48 8b 4f 40 mov rcx,QWORD PTR [rdi+0x40]
401053: ff cd dec ebp
401055: 75 e9 jne 401040 <_start.loop>
0000000000401057 <_start.end>:
401057: 31 ff xor edi,edi
401059: b8 e7 00 00 00 mov eax,0xe7
40105e: 0f 05 syscall
Performance counter stats for './testloop' (two runs):
385.85 msec task-clock # 0.999 CPUs utilized ( +- 0.02% )
0 context-switches # 0.000 /sec
0 cpu-migrations # 0.000 /sec
2 page-faults # 5.183 /sec
1,503,701,305 cycles # 3.897 GHz ( +- 0.01% )
1,000,000,130 instructions # 0.67 instructions per cycle ( +- 0.00% )
900,084,383 uops_issued.any # 2.333 G/sec ( +- 0.00% )
1,300,091,135 uops_executed.thread # 3.369 G/sec ( +- 0.00% )
99,933,928 ld_blocks.store_forward # 258.998 M/sec ( +- 0.02% )
443,686,304 resource_stalls.sb # 1.150 G/sec ( +- 4.87% )
0.386139 +- 0.000119 seconds time elapsed ( +- 0.03% )
MSVC 2022 基准测试,编译选项为 /std:c++latest
#include <chrono>
#include <iostream>
// Two-pointer aggregate aligned to 16 bytes; the benchmark routines take it by reference.
struct alignas(16) S
{
    char* a;
    int* b;
};
// Benchmark kernels implemented in the assembly listing of this document.
// Each one repeats n times: initialize s1, then copy s1 into s2.
// In the listing, "fused" means a single 16-byte movups access and "unfused"
// means two separate 8-byte mov accesses. The first word of each name describes
// how s1 is initialized, the second how it is copied.
// n must be at least 1: the listing decrements the counter before testing it.
extern "C" void init_fused_copy_unfused(int n, S & s2, S & s1);
extern "C" void init_fused_copy_fused(int n, S & s2, S & s1);
extern "C" void init_unfused_copy_unfused(int n, S & s2, S & s1);
extern "C" void init_unfused_copy_fused(int n, S & s2, S & s1);
int main()
using namespace std::chrono;
S s1, s2;
constexpr int N = 1'000'000'000;
auto t1 = system_clock::now();
init_fused_copy_fused(N, s2, s1);
auto t2 = system_clock::now();
init_fused_copy_unfused(N, s2, s1);
auto t3 = system_clock::now();
init_unfused_copy_fused(N, s2, s1);
auto t4 = system_clock::now();
init_unfused_copy_unfused(N, s2, s1);
auto t5 = system_clock::now();
<< "init fused copy fused " << duration_cast<duration<double>>(t2 - t1) << "\n"
<< "init fused copy unfused " << duration_cast<duration<double>>(t3 - t2) << "\n"
<< "init unfused copy fused " << duration_cast<duration<double>>(t4 - t3) << "\n"
<< "init unfused copy unfused " << duration_cast<duration<double>>(t5 - t4) << "\n";
; MASM (ml64) listing. Data definitions must sit inside a segment, so the
; .data / .code directives are stated explicitly.
.data

; Objects referenced by the initializer below and by the lea instructions
; in the "unfused" init routines.
c db 0
i dd 0

; Two consecutive qwords (16 bytes) that the "fused" init routines load with a
; single movups. Presumably they hold the addresses of c and i, mirroring what
; the "unfused" routines build with lea - confirm against the assembler output.
s dq byte ptr [c], dword ptr [i]

.code
;-----------------------------------------------------------------------
; void init_fused_copy_fused(int n, S& s2, S& s1)
; ABI:   Microsoft x64
; In:    ecx = n (must be >= 1), rdx = &s2, r8 = &s1
; Does:  n times: store 16 bytes from s into s1 with one movups, then copy
;        s1 to s2 with one 16-byte load and one 16-byte store.
; Clobb: ecx, xmm0, xmm1, flags
; Note:  the loop branches back to the procedure entry, so the load from s
;        is repeated on every iteration.
;-----------------------------------------------------------------------
init_fused_copy_fused PROC
    movups  xmm0, xmmword ptr [s]
    movups  xmmword ptr [r8], xmm0          ; init s1: single 16-byte store

    movups  xmm1, xmmword ptr [r8]          ; copy: single 16-byte reload of s1
    movups  xmmword ptr [rdx], xmm1

    dec     ecx
    jnz     init_fused_copy_fused
    ret                                     ; return to the caller once the counter reaches zero
init_fused_copy_fused ENDP
;-----------------------------------------------------------------------
; void init_unfused_copy_fused(int n, S& s2, S& s1)
; ABI:   Microsoft x64
; In:    ecx = n (must be >= 1), rdx = &s2, r8 = &s1
; Does:  n times: write s1 with two separate 8-byte stores, then copy s1 to
;        s2 with one 16-byte load and one 16-byte store. The 16-byte reload
;        spans both preceding 8-byte stores.
; Clobb: rax, ecx, xmm1, flags
;-----------------------------------------------------------------------
init_unfused_copy_fused PROC
    lea     rax, byte ptr [c]
    mov     qword ptr [r8], rax             ; init s1, low 8 bytes
    lea     rax, dword ptr [i]
    mov     qword ptr [r8 + 8], rax         ; init s1, high 8 bytes

    movups  xmm1, xmmword ptr [r8]          ; copy: one 16-byte reload covering both stores
    movups  xmmword ptr [rdx], xmm1

    dec     ecx
    jnz     init_unfused_copy_fused
    ret                                     ; return to the caller once the counter reaches zero
init_unfused_copy_fused ENDP
;-----------------------------------------------------------------------
; void init_fused_copy_unfused(int n, S& s2, S& s1)
; ABI:   Microsoft x64
; In:    ecx = n (must be >= 1), rdx = &s2, r8 = &s1
; Does:  n times: store 16 bytes from s into s1 with one movups, then copy
;        s1 to s2 as two separate 8-byte load/store pairs.
; Clobb: rax, ecx, xmm0, flags
; Note:  the loop branches back to the procedure entry, so the load from s
;        is repeated on every iteration.
;-----------------------------------------------------------------------
init_fused_copy_unfused PROC
    movups  xmm0, xmmword ptr [s]
    movups  xmmword ptr [r8], xmm0          ; init s1: single 16-byte store

    mov     rax, qword ptr [r8]             ; copy low 8 bytes
    mov     qword ptr [rdx], rax
    mov     rax, qword ptr [r8 + 8]         ; copy high 8 bytes
    mov     qword ptr [rdx + 8], rax

    dec     ecx
    jnz     init_fused_copy_unfused
    ret                                     ; return to the caller once the counter reaches zero
init_fused_copy_unfused ENDP
;-----------------------------------------------------------------------
; void init_unfused_copy_unfused(int n, S& s2, S& s1)
; ABI:   Microsoft x64
; In:    ecx = n (must be >= 1), rdx = &s2, r8 = &s1
; Does:  n times: write s1 with two separate 8-byte stores, then copy s1 to
;        s2 as two separate 8-byte load/store pairs.
; Clobb: rax, ecx, flags
;-----------------------------------------------------------------------
init_unfused_copy_unfused PROC
    lea     rax, byte ptr [c]
    mov     qword ptr [r8], rax             ; init s1, low 8 bytes
    lea     rax, dword ptr [i]
    mov     qword ptr [r8 + 8], rax         ; init s1, high 8 bytes

    mov     rax, qword ptr [r8]             ; copy low 8 bytes
    mov     qword ptr [rdx], rax
    mov     rax, qword ptr [r8 + 8]         ; copy high 8 bytes
    mov     qword ptr [rdx + 8], rax

    dec     ecx
    jnz     init_unfused_copy_unfused
    ret                                     ; return to the caller once the counter reaches zero
init_unfused_copy_unfused ENDP

END                                         ; MASM requires END to close the source file
init fused copy fused 0.664739s
init fused copy unfused 0.935631s
init unfused copy fused 4.34326s
init unfused copy unfused 1.02741s
CPU: Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz 2.21 GHz
最后一个 2 字节加载的第二个字节来自紧邻的前一条存储指令,而第一个字节来自更早的那条存储指令。这个加载能被存储转发吗,还是必须等到前面两条存储都提交到 L1?请注意,这里所说的“存储转发”包括任何能够用仍在存储缓冲区中的存储数据来满足读取的机制,而不必等待这些存储提交到 L1,即使该路径比“从单个存储转发”的最佳情况更慢。
