using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Running; using System; using System.Runtime.CompilerServices; [DisassemblyDiagnoser] public partial class Program { static void Main(string[] args) { // 堆代码 www.duidaima.com BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); } [Benchmark] public void Compute() { for (int thousands = 0; thousands < 10; thousands++) { for (int hundreds = 0; hundreds < 10; hundreds++) { for (int tens = 0; tens < 10; tens++) { for (int ones = 0; ones < 10; ones++) { int n = ComputeNumber(thousands, hundreds, tens, ones); Process(n); } } } } } static int ComputeNumber(int thousands, int hundreds, int tens, int ones) => (thousands * 1000) + (hundreds * 100) + (tens * 10) + ones; [MethodImpl(MethodImplOptions.NoInlining)] static void Process(int n) { } }
运行:
dotnet run -c Release -f net7.0 --filter *Program*可以看到compute这个函数有四层循环,最里面的一层循环调用了ComputeNumber和Process函数。.Net6的Compute函数的ASM如下:
xor esi,esi M00_L00: xor edi,edi M00_L01: xor ebx,ebx M00_L02: xor ebp,ebp imul ecx,esi,3E8 imul eax,edi,64 add ecx,eax lea eax,[rbx+rbx*4] lea r14d,[rcx+rax*2] M00_L03: lea ecx,[r14+rbp] call Program.Process(Int32) inc ebp cmp ebp,0A jl short M00_L03 inc ebx cmp ebx,0A jl short M00_L02 inc edi cmp edi,0A jl short M00_L01 inc esi cmp esi,0A jl short M00_L00 add rsp,20Compute函数最里面的循环调用的ComputeNumber函数计算的代码:
(thousands * 1000) + (hundreds * 100) + (tens * 10) + ones; imul ecx,esi,3E8 imul eax,edi,64注意看这两个imul,它只是被提升到了有里到外,第二层循环的位置。这样就导致了这两段代码多运行了10*10次,完全是可以避免的。所以它在.Net6里面只能叫循环提升。
xor esi,esi M00_L00: xor edi,edi imul ebx,esi,3E8 M00_L01: xor ebp,ebp imul r14d,edi,64 add r14d,ebx M00_L02: xor r15d,r15d lea ecx,[rbp+rbp*4] lea r12d,[r14+rcx*2] M00_L03: lea ecx,[r12+r15] call qword ptr [7FFEE5471168]; Program.Process(Int32) inc r15d cmp r15d,0A jl short M00_L03 inc ebp cmp ebp,0A jl short M00_L02 inc edi cmp edi,0A jl short M00_L01 inc esi cmp esi,0A jl short M00_L00 add rsp,20这个 imul ecx,esi,3E8提升到第一层位置,imul eax,edi,64提升到第二层的位置。减少了冗余的运行次数,在.Net7及8里面进行了大幅度优化,所以叫做深度循环提升。
private int[] _values = Enumerable.Range(0, 1000).ToArray(); [Benchmark] [Arguments(0, 0, 1000)] public int LastIndexOf(int arg, int offset, int count) { int[] values = _values; for (int i = offset + count - 1; i >= offset; i--) if (values[i] == arg) return i; return 0; } .Net6 ASM: sub rsp,28 mov rcx,[rcx+8] lea eax,[r8+r9-1] cmp eax,r8d jl short M00_L01 mov r9d,[rcx+8] nop word ptr [rax+rax] M00_L00: cmp eax,r9d jae short M00_L03 movsxd r10,eax cmp [rcx+r10*4+10],edx je short M00_L02 dec eax cmp eax,r8d jge short M00_L00 M00_L01: xor eax,eax add rsp,28 ret M00_L02: add rsp,28 ret M00_L03: call CORINFO_HELP_RNGCHKFAIL int 3在判断values[i] == arg之前,进行了边界检查。但是这个边界检查并不是必要的,所以在.Net8优化如下:
sub rsp,28 mov rax,[rcx+8] lea ecx,[r8+r9-1] cmp ecx,r8d jl short M00_L02 test rax,rax je short M00_L01 mov r9d,ecx or r9d,r8d jl short M00_L01 cmp [rax+8],ecx jle short M00_L01 M00_L00: mov r9d,ecx cmp [rax+r9*4+10],edx je short M00_L03 dec ecx cmp ecx,r8d jge short M00_L00 jmp short M00_L02 M00_L01: cmp ecx,[rax+8] jae short M00_L04 mov r9d,ecx cmp [rax+r9*4+10],edx je short M00_L03 dec ecx cmp ecx,r8d jge short M00_L01 M00_L02: xor eax,eax add rsp,28 ret M00_L03: mov eax,ecx add rsp,28优化之后的代码,先是判断:
if(i >= offset + count - 1 && offset!=0 &&value!=0 && value[0] > offset + count - 1 ) { if(value[i]==arg) //不进行边界检查 { return; } } else { if(value[i]==arg) //进行边界检查 { return; } }