Now results are
Method | Mean | Error | StdDev
---|---|---|---
MasterBranch | 241.94 ns | 4.832 ns | 5.564 ns
PrBranch | 24.65 ns | 0.305 ns | 0.270 ns
But now the feedback I got is that BenchmarkTest.precomputed
might be stored in CPU-registry or some other fast cache. Then the benchmark results would be incorrect because PrBranch
benchmark is essentially "cheating". In reality, many processes/threads/etc. compete to store data in the fast cache, as it is a scarce resource.
Do you agree with the feedback I got? Could anyone give me a tip how to improve the benchmark a bit?
Thank you!
","upvoteCount":1,"answerCount":1,"acceptedAnswer":{"@type":"Answer","text":"Take 3 (I told you it's difficult!)
\n\n
public class BenchmarkTest\n{\n private static readonly decimal[] precomputedCache = new decimal[]\n {\n 1m,\n 0.1m,\n 0.01m,\n 0.001m,\n 0.0001m,\n 0.00001m,\n 0.000001m,\n 0.0000001m,\n 0.00000001m,\n 0.000000001m,\n 0.0000000001m,\n 0.00000000001m,\n 0.000000000001m,\n 0.0000000000001m,\n 0.00000000000001m,\n 0.000000000000001m,\n 0.0000000000000001m,\n 0.00000000000000001m,\n 0.000000000000000001m,\n 0.0000000000000000001m,\n };\n\n [Benchmark]\n public decimal Calculate()\n {\n decimal r = 0;\n\n for (int decimals = 0; decimals < 5; decimals++)\n {\n r += (decimal) Math.Pow(10, -decimals);\n }\n\n return r;\n }\n\n [Benchmark]\n public decimal Cache()\n {\n decimal r = 0;\n\n for (int decimals = 0; decimals < 5; decimals++)\n {\n r += precomputedCache[decimals];\n }\n\n return r;\n }\n\n [Benchmark]\n public decimal Ram()\n {\n decimal r = 0;\n\n for (int decimals = 0; decimals < 5; decimals++)\n {\n CacheHelper.FlushCache();\n r += precomputedCache[decimals];\n }\n\n return r;\n }\n\n [Benchmark]\n public void FlushCacheOverhead()\n {\n for (int decimals = 0; decimals < 5; decimals++)\n {\n CacheHelper.FlushCache();\n }\n }\n}\n\nstatic class CacheHelper\n{\n\n private static readonly byte[] cacheFlusher1 = GetCacheFlusher();\n private static readonly byte[] cacheFlusher2 = GetCacheFlusher();\n\n private static byte[] GetCacheFlusher()\n {\n Processor.GetPerCoreCacheSizes(out var l1, out var l2, out var l3);\n var totalCacheSize = l1 + l2 + l3;\n return new byte[totalCacheSize];\n }\n\n // You can probably get the actual cache line size from the processor information, but I just used the common 64 size because I'm lazy.\n const int cacheLineSize = 64;\n // Write to field to prevent dead code elimination.\n public static byte holder;\n private static bool flusher1;\n\n public static void FlushCache()\n {\n flusher1 = !flusher1;\n var array = flusher1 ? 
cacheFlusher1 : cacheFlusher2;\n // Touch every cache line to flush the cache.\n for (int i = 0, max = array.Length; i < max; i += cacheLineSize)\n {\n holder = array[i];\n }\n }\n}\n\nclass Processor\n{\n [DllImport(\"kernel32.dll\")]\n public static extern int GetCurrentThreadId();\n\n //[DllImport(\"kernel32.dll\")]\n //public static extern int GetCurrentProcessorNumber();\n\n [StructLayout(LayoutKind.Sequential, Pack = 4)]\n private struct GROUP_AFFINITY\n {\n public UIntPtr Mask;\n\n [MarshalAs(UnmanagedType.U2)]\n public ushort Group;\n\n [MarshalAs(UnmanagedType.ByValArray, SizeConst = 3, ArraySubType = UnmanagedType.U2)]\n public ushort[] Reserved;\n }\n\n [DllImport(\"kernel32\", SetLastError = true)]\n private static extern Boolean SetThreadGroupAffinity(IntPtr hThread, ref GROUP_AFFINITY GroupAffinity, ref GROUP_AFFINITY PreviousGroupAffinity);\n\n [StructLayout(LayoutKind.Sequential)]\n public struct PROCESSORCORE\n {\n public byte Flags;\n };\n\n [StructLayout(LayoutKind.Sequential)]\n public struct NUMANODE\n {\n public uint NodeNumber;\n }\n\n public enum PROCESSOR_CACHE_TYPE\n {\n CacheUnified,\n CacheInstruction,\n CacheData,\n CacheTrace\n }\n\n [StructLayout(LayoutKind.Sequential)]\n public struct CACHE_DESCRIPTOR\n {\n public byte Level;\n public byte Associativity;\n public ushort LineSize;\n public uint Size;\n public PROCESSOR_CACHE_TYPE Type;\n }\n\n [StructLayout(LayoutKind.Explicit)]\n public struct SYSTEM_LOGICAL_PROCESSOR_INFORMATION_UNION\n {\n [FieldOffset(0)]\n public PROCESSORCORE ProcessorCore;\n [FieldOffset(0)]\n public NUMANODE NumaNode;\n [FieldOffset(0)]\n public CACHE_DESCRIPTOR Cache;\n [FieldOffset(0)]\n private UInt64 Reserved1;\n [FieldOffset(8)]\n private UInt64 Reserved2;\n }\n\n public enum LOGICAL_PROCESSOR_RELATIONSHIP\n {\n RelationProcessorCore,\n RelationNumaNode,\n RelationCache,\n RelationProcessorPackage,\n RelationGroup,\n RelationAll = 0xffff\n }\n\n public struct SYSTEM_LOGICAL_PROCESSOR_INFORMATION\n 
{\n#pragma warning disable 0649\n public UIntPtr ProcessorMask;\n public LOGICAL_PROCESSOR_RELATIONSHIP Relationship;\n public SYSTEM_LOGICAL_PROCESSOR_INFORMATION_UNION ProcessorInformation;\n#pragma warning restore 0649\n }\n\n [DllImport(@\"kernel32.dll\", SetLastError = true)]\n public static extern bool GetLogicalProcessorInformation(IntPtr Buffer, ref uint ReturnLength);\n\n private const int ERROR_INSUFFICIENT_BUFFER = 122;\n\n private static SYSTEM_LOGICAL_PROCESSOR_INFORMATION[] _logicalProcessorInformation = null;\n\n public static SYSTEM_LOGICAL_PROCESSOR_INFORMATION[] LogicalProcessorInformation\n {\n get\n {\n if (_logicalProcessorInformation != null)\n return _logicalProcessorInformation;\n\n uint ReturnLength = 0;\n\n GetLogicalProcessorInformation(IntPtr.Zero, ref ReturnLength);\n\n if (Marshal.GetLastWin32Error() == ERROR_INSUFFICIENT_BUFFER)\n {\n IntPtr Ptr = Marshal.AllocHGlobal((int) ReturnLength);\n try\n {\n if (GetLogicalProcessorInformation(Ptr, ref ReturnLength))\n {\n int size = Marshal.SizeOf(typeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));\n int len = (int) ReturnLength / size;\n _logicalProcessorInformation = new SYSTEM_LOGICAL_PROCESSOR_INFORMATION[len];\n IntPtr Item = Ptr;\n\n for (int i = 0; i < len; i++)\n {\n _logicalProcessorInformation[i] = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION) Marshal.PtrToStructure(Item, typeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));\n Item += size;\n }\n\n return _logicalProcessorInformation;\n }\n }\n finally\n {\n Marshal.FreeHGlobal(Ptr);\n }\n }\n return null;\n }\n }\n\n public static void GetPerCoreCacheSizes(out Int64 L1, out Int64 L2, out Int64 L3)\n {\n L1 = 0;\n L2 = 0;\n L3 = 0;\n\n var info = Processor.LogicalProcessorInformation;\n foreach (var entry in info)\n {\n if (entry.Relationship != Processor.LOGICAL_PROCESSOR_RELATIONSHIP.RelationCache)\n continue;\n Int64 mask = (Int64) entry.ProcessorMask;\n if ((mask & (Int64) 1) == 0)\n continue;\n var cache = entry.ProcessorInformation.Cache;\n 
switch (cache.Level)\n {\n case 1: L1 = L1 + cache.Size; break;\n case 2: L2 = L2 + cache.Size; break;\n case 3: L3 = L3 + cache.Size; break;\n default:\n break;\n }\n }\n }\n}
Method | Mean | Error | StdDev
---|---|---|---
Calculate | 457.24 ns | 2.536 ns | 2.372 ns
Cache | 55.43 ns | 0.410 ns | 0.364 ns
Ram | 5,026,508.40 ns | 18,058.436 ns | 14,098.839 ns
FlushCacheOverhead | 5,019,991.08 ns | 26,890.458 ns | 20,994.301 ns
You can see I flushed the cpu cache before each access. This of course has its own overhead, so I measured that overhead in its own benchmark. Subtracting that overhead we get these results:
Method | Mean | Error | StdDev
---|---|---|---
Calculate | 457.24 ns | 2.536 ns | 2.372 ns
Cache | 55.43 ns | 0.410 ns | 0.364 ns
Ram | 6,517.32 ns | ? | ?
It looks like this time ram is ~117x slower than cache. That's much more believable to me considering the common 10-100x.
","upvoteCount":0,"url":"https://github.com/dotnet/BenchmarkDotNet/discussions/2513#discussioncomment-8259443"}}}-
I wrote a very simple benchmark for comparing whether it makes sense to precompute values.
using System;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
namespace Benchmark;
/// <summary>
/// Console entry point that launches the benchmark suite.
/// </summary>
public static class Program
{
    /// <summary>
    /// Entry point of the benchmark.
    /// </summary>
    public static void Main() => BenchmarkRunner.Run<BenchmarkTest>();
}
/// <summary>
/// Benchmark class for a comparison of current code and a proposed patch.
/// </summary>
/// <seealso href="https://benchmarkdotnet.org/articles/guides/"/>
public class BenchmarkTest
{
    // Table of powers of ten from 10^0 down to 10^-19, indexed by the number
    // of decimal places.
    private static readonly decimal[] precomputed =
    {
        1m,
        0.1m,
        0.01m,
        0.001m,
        0.0001m,
        0.00001m,
        0.000001m,
        0.0000001m,
        0.00000001m,
        0.000000001m,
        0.0000000001m,
        0.00000000001m,
        0.000000000001m,
        0.0000000000001m,
        0.00000000000001m,
        0.000000000000001m,
        0.0000000000000001m,
        0.00000000000000001m,
        0.000000000000000001m,
        0.0000000000000000001m,
    };

    /// <summary>
    /// Executed once before any benchmark runs; intentionally empty.
    /// </summary>
    [GlobalSetup]
    public void GlobalSetup()
    {
    }

    /// <summary>
    /// Current (master) implementation: derives each power of ten on the fly
    /// via <see cref="Math.Pow"/> and converts the double result to decimal.
    /// </summary>
    [Benchmark]
    public decimal MasterBranch()
    {
        var total = 0m;
        for (var exponent = 0; exponent < 5; exponent++)
        {
            total += (decimal)Math.Pow(10, -exponent);
        }
        return total;
    }

    /// <summary>
    /// Proposed (PR) implementation: looks the power of ten up in the
    /// precomputed table instead of recomputing it.
    /// </summary>
    [Benchmark]
    public decimal PrBranch()
    {
        var total = 0m;
        for (var exponent = 0; exponent < 5; exponent++)
        {
            total += precomputed[exponent];
        }
        return total;
    }
} Now results are
But now the feedback I got is that the precomputed array might be kept in a CPU register or some other fast cache. Do you agree with the feedback I got? Could anyone give me a tip on how to improve the benchmark a bit? Thank you! |
Beta Was this translation helpful? Give feedback.
-
The feedback is correct. It is a difficult thing to measure the speed of cache vs ram in C#. I gave it a try and got these results:
Code
/// <summary>
/// Benchmark comparing three ways of obtaining small powers of ten:
/// computing them (Calculate), reading a single cache-resident table (Cache),
/// and reading spread-out copies intended to live only in RAM (Ram).
/// </summary>
public class BenchmarkTest
{
// Number of table entries each benchmark reads per invocation.
const int CountToMeasure = 5;
// Single table expected to stay hot in the CPU cache across iterations.
private static readonly decimal[] precomputedCache = new decimal[]
{
1m,
0.1m,
0.01m,
0.001m,
0.0001m,
0.00001m,
0.000001m,
0.0000001m,
0.00000001m,
0.000000001m,
0.0000000001m,
0.00000000001m,
0.000000000001m,
0.0000000000001m,
0.00000000000001m,
0.000000000000001m,
0.0000000000000001m,
0.00000000000000001m,
0.000000000000000001m,
0.0000000000000000001m,
};
// Many copies of the table, padded apart so that reading a different copy
// each iteration should miss the CPU caches and hit main memory.
private static readonly decimal[][] precomputedRAM = GetPrecomputedRAM();
// Keeps the spacer arrays reachable so the GC cannot collect the padding
// between the table copies.
private static List<byte[]> spacers;
/// <summary>
/// Builds enough copies of <see cref="precomputedCache"/> (with spacer
/// allocations between them) to exceed the total L1+L2+L3 cache size.
/// </summary>
private static decimal[][] GetPrecomputedRAM()
{
Processor.GetPerCoreCacheSizes(out var l1, out var l2, out var l3);
var totalCacheSize = l1 + l2 + l3;
// Estimated bytes per copy. NOTE(review): the original comment said a
// decimal is 16 bytes, but the code multiplies by 128 — the overestimate
// only produces extra copies, which is harmless; confirm intent.
var precomputedSize = (precomputedCache.Length * 128) + IntPtr.Size * 3;
// We copy the array as many times as needed to fill the cache.
int copyCount = 0;
for (int cacheFillSize = 0; cacheFillSize < totalCacheSize; cacheFillSize += precomputedSize)
{
++copyCount;
}
// + CountToMeasure more to be sure that we're pulling from RAM and not cache and that we have a minimum size.
copyCount += CountToMeasure;
var array = new decimal[copyCount][];
spacers = new List<byte[]>();
for (int i = 0; i < copyCount; ++i)
{
// Allocate a 4 KB (64*64) spacer between copies to stop the CPU from
// pulling multiple copies into the cache with a single read. The real
// value could be read from the processor information; 4 KB is simply a
// common, safe overestimate.
spacers.Add(new byte[64*64]);
// Copy the cache.
array[i] = precomputedCache.ToArray();
}
return array;
}
// Rotates through the RAM copies so consecutive iterations touch different,
// presumably-uncached, memory.
int currentIndex = 0;
// Safe to read precomputedRAM here: C# runs static field initializers in
// textual order, and precomputedRAM is declared above.
static readonly int ramMod = precomputedRAM.Length;
/// <summary>
/// Baseline: computes each power of ten with Math.Pow. The index
/// bookkeeping mirrors Ram() so all three benchmarks do the same extra work.
/// </summary>
[Benchmark]
public decimal Calculate()
{
decimal r = 0;
for (int decimals = 0; decimals < CountToMeasure; decimals++)
{
r += (decimal) Math.Pow(10, -decimals);
// We do the index calculations to match the extra work of Ram.
++currentIndex;
currentIndex %= ramMod;
}
return r;
}
/// <summary>
/// Reads the single table, which should be served from the CPU cache.
/// </summary>
[Benchmark]
public decimal Cache()
{
decimal r = 0;
for (int decimals = 0; decimals < CountToMeasure; decimals++)
{
r += precomputedCache[decimals];
// We do the index calculations to match the extra work of Ram.
++currentIndex;
currentIndex %= ramMod;
}
return r;
}
/// <summary>
/// Reads a different table copy each iteration, aiming to measure RAM
/// (cache-miss) latency rather than cache-hit latency.
/// </summary>
[Benchmark]
public decimal Ram()
{
decimal r = 0;
for (int decimals = 0; decimals < CountToMeasure; decimals++)
{
r += precomputedRAM[currentIndex][decimals];
++currentIndex;
currentIndex %= ramMod;
}
return r;
}
}
/// <summary>
/// Win32 interop helpers for querying the logical-processor topology via
/// <c>GetLogicalProcessorInformation</c>, used here to read the L1/L2/L3
/// cache sizes seen by the first logical processor.
/// </summary>
class Processor
{
    [DllImport("kernel32.dll")]
    public static extern int GetCurrentThreadId();

    //[DllImport("kernel32.dll")]
    //public static extern int GetCurrentProcessorNumber();

    // Mirrors the native GROUP_AFFINITY structure (layout must not change).
    [StructLayout(LayoutKind.Sequential, Pack = 4)]
    private struct GROUP_AFFINITY
    {
        public UIntPtr Mask;

        [MarshalAs(UnmanagedType.U2)]
        public ushort Group;

        [MarshalAs(UnmanagedType.ByValArray, SizeConst = 3, ArraySubType = UnmanagedType.U2)]
        public ushort[] Reserved;
    }

    [DllImport("kernel32", SetLastError = true)]
    private static extern Boolean SetThreadGroupAffinity(IntPtr hThread, ref GROUP_AFFINITY GroupAffinity, ref GROUP_AFFINITY PreviousGroupAffinity);

    /// <summary>Mirrors the native PROCESSORCORE record.</summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct PROCESSORCORE
    {
        public byte Flags;
    };

    /// <summary>Mirrors the native NUMANODE record.</summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct NUMANODE
    {
        public uint NodeNumber;
    }

    /// <summary>Native PROCESSOR_CACHE_TYPE enumeration.</summary>
    public enum PROCESSOR_CACHE_TYPE
    {
        CacheUnified,
        CacheInstruction,
        CacheData,
        CacheTrace
    }

    /// <summary>Mirrors the native CACHE_DESCRIPTOR record (Size is in bytes).</summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct CACHE_DESCRIPTOR
    {
        public byte Level;
        public byte Associativity;
        public ushort LineSize;
        public uint Size;
        public PROCESSOR_CACHE_TYPE Type;
    }

    /// <summary>
    /// Union member of SYSTEM_LOGICAL_PROCESSOR_INFORMATION; all fields
    /// overlap at offset 0 as in the native definition.
    /// </summary>
    [StructLayout(LayoutKind.Explicit)]
    public struct SYSTEM_LOGICAL_PROCESSOR_INFORMATION_UNION
    {
        [FieldOffset(0)]
        public PROCESSORCORE ProcessorCore;
        [FieldOffset(0)]
        public NUMANODE NumaNode;
        [FieldOffset(0)]
        public CACHE_DESCRIPTOR Cache;
        [FieldOffset(0)]
        private UInt64 Reserved1;
        [FieldOffset(8)]
        private UInt64 Reserved2;
    }

    /// <summary>Native LOGICAL_PROCESSOR_RELATIONSHIP enumeration.</summary>
    public enum LOGICAL_PROCESSOR_RELATIONSHIP
    {
        RelationProcessorCore,
        RelationNumaNode,
        RelationCache,
        RelationProcessorPackage,
        RelationGroup,
        RelationAll = 0xffff
    }

    /// <summary>Mirrors the native SYSTEM_LOGICAL_PROCESSOR_INFORMATION record.</summary>
    public struct SYSTEM_LOGICAL_PROCESSOR_INFORMATION
    {
#pragma warning disable 0649
        public UIntPtr ProcessorMask;
        public LOGICAL_PROCESSOR_RELATIONSHIP Relationship;
        public SYSTEM_LOGICAL_PROCESSOR_INFORMATION_UNION ProcessorInformation;
#pragma warning restore 0649
    }

    [DllImport(@"kernel32.dll", SetLastError = true)]
    public static extern bool GetLogicalProcessorInformation(IntPtr Buffer, ref uint ReturnLength);

    private const int ERROR_INSUFFICIENT_BUFFER = 122;

    // Cached result of the (relatively expensive) native query.
    private static SYSTEM_LOGICAL_PROCESSOR_INFORMATION[] _logicalProcessorInformation = null;

    /// <summary>
    /// Lazily queries and caches the system's logical-processor information.
    /// Returns <c>null</c> when the native call fails; callers must handle that.
    /// </summary>
    public static SYSTEM_LOGICAL_PROCESSOR_INFORMATION[] LogicalProcessorInformation
    {
        get
        {
            if (_logicalProcessorInformation != null)
                return _logicalProcessorInformation;

            uint ReturnLength = 0;

            // First call with a null buffer just reports the required size.
            GetLogicalProcessorInformation(IntPtr.Zero, ref ReturnLength);

            if (Marshal.GetLastWin32Error() == ERROR_INSUFFICIENT_BUFFER)
            {
                IntPtr Ptr = Marshal.AllocHGlobal((int) ReturnLength);
                try
                {
                    if (GetLogicalProcessorInformation(Ptr, ref ReturnLength))
                    {
                        int size = Marshal.SizeOf(typeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
                        int len = (int) ReturnLength / size;
                        _logicalProcessorInformation = new SYSTEM_LOGICAL_PROCESSOR_INFORMATION[len];
                        IntPtr Item = Ptr;

                        // Marshal each fixed-size record out of the native buffer.
                        for (int i = 0; i < len; i++)
                        {
                            _logicalProcessorInformation[i] = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION) Marshal.PtrToStructure(Item, typeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
                            Item += size;
                        }

                        return _logicalProcessorInformation;
                    }
                }
                finally
                {
                    Marshal.FreeHGlobal(Ptr);
                }
            }
            return null;
        }
    }

    /// <summary>
    /// Sums the reported L1/L2/L3 cache sizes (in bytes) for cache entries
    /// whose processor mask includes the first logical processor (bit 0).
    /// All three outputs are zero when the topology lookup fails.
    /// </summary>
    public static void GetPerCoreCacheSizes(out Int64 L1, out Int64 L2, out Int64 L3)
    {
        L1 = 0;
        L2 = 0;
        L3 = 0;

        var info = Processor.LogicalProcessorInformation;
        // Fix: LogicalProcessorInformation returns null when the Win32 call
        // fails; the original code iterated it unguarded and would throw a
        // NullReferenceException. Report zero sizes instead.
        if (info == null)
        {
            return;
        }
        foreach (var entry in info)
        {
            if (entry.Relationship != Processor.LOGICAL_PROCESSOR_RELATIONSHIP.RelationCache)
                continue;
            Int64 mask = (Int64) entry.ProcessorMask;
            if ((mask & (Int64) 1) == 0)
                continue;
            var cache = entry.ProcessorInformation.Cache;
            switch (cache.Level)
            {
                case 1: L1 = L1 + cache.Size; break;
                case 2: L2 = L2 + cache.Size; break;
                case 3: L3 = L3 + cache.Size; break;
                default:
                    break;
            }
        }
    }
} Credit for the Processor cache-size interop code goes to its original author. And honestly, I don't even trust my own results, because it's common knowledge that RAM is 10-100x slower than cache. |
Beta Was this translation helpful? Give feedback.
Take 3 (I told you it's difficult!)
Code