Question

I've tested Interlocked and some other alternatives. The results are below:

ForSum: 16145,47 ticks
ForeachSum: 17702,01 ticks
ForEachSum: 66530,06 ticks
ParallelInterlockedForEachSum: 484235,95 ticks
ParallelLockingForeachSum: 965239,91 ticks
LinqSum: 97682,97 ticks
ParallelLinqSum: 23436,28 ticks
ManualParallelSum: 5959,83 ticks

So Interlocked is 5 times slower than even non-parallel LINQ and 20 times slower than parallel LINQ, and that is compared to "slow and ugly" LINQ. The manual method is orders of magnitude faster still, so I see no point in comparing them at all. How is this possible? And if it's true, why should I use this class instead of manual or parallel LINQ summing, especially since LINQ lets me do everything, while Interlocked offers only a handful of methods?

The benchmark code is here:

using System;
using System.Diagnostics;
using System.Linq;
using System.Reflection;
using System.Runtime;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Tasks;

namespace InterlockedTest
{
    internal static class Program
    {
        private static void Main()
        {
            DoBenchmark();
            Console.ReadKey();
        }

        private static void DoBenchmark()
        {
            Process.GetCurrentProcess().PriorityClass = ProcessPriorityClass.RealTime;
            DisableGC();

            var arr = Enumerable.Repeat(6, 1005000*6).ToArray();
            int correctAnswer = 6*arr.Length;

            var methods = new Func<int[], int>[]
                          {
                              ForSum, ForeachSum, ForEachSum, ParallelInterlockedForEachSum, ParallelLockingForeachSum,
                              LinqSum, ParallelLinqSum, ManualParallelSum
                          };

            foreach (var method in methods)
            {
                GC.Collect();
                GC.WaitForPendingFinalizers();
                GC.Collect();

                var result = new long[100];

                for (int i = 0; i < result.Length; ++i)
                {
                    result[i] = TestMethod(method, arr, correctAnswer);
                }

                Console.WriteLine("{0}: {1} ticks", method.GetMethodInfo().Name, result.Average());
            }
        }

        private static void DisableGC()
        {
            GCLatencyMode oldMode = GCSettings.LatencyMode;

            // PrepareConstrainedRegions is normally followed by a try/finally
            // that restores `oldMode`; this benchmark never restores it, so the
            // process stays in low-latency mode until it exits.
            RuntimeHelpers.PrepareConstrainedRegions();

            GCSettings.LatencyMode = GCLatencyMode.LowLatency;
        }

        private static long TestMethod(Func<int[], int> foo, int[] arr, int correctAnswer)
        {
            var watch = Stopwatch.StartNew();

            if (foo(arr) != correctAnswer)
            {
                return -1;
            }

            watch.Stop();
            return watch.ElapsedTicks;
        }

        private static int ForSum(int[] arr)
        {
            int res = 0;

            for (int i = 0; i < arr.Length; ++i)
            {
                res += arr[i];
            }

            return res;
        }

        private static int ForeachSum(int[] arr)
        {
            int res = 0;

            foreach (var x in arr)
            {
                res += x;
            }

            return res;
        }

        private static int ForEachSum(int[] arr)
        {
            int res = 0;

            Array.ForEach(arr, x => res += x);

            return res;
        }

        private static int ParallelInterlockedForEachSum(int[] arr)
        {
            int res = 0;

            Parallel.ForEach(arr, x => Interlocked.Add(ref res, x));

            return res;
        }

        private static int ParallelLockingForeachSum(int[] arr)
        {
            int res = 0;
            object syncroot = new object();
            Parallel.ForEach(arr, i =>
                                  {
                                      lock (syncroot)
                                      {
                                          res += i;
                                      }
                                  });
            return res;
        }

        private static int LinqSum(int[] arr)
        {
            return arr.Sum();
        }

        private static int ParallelLinqSum(int[] arr)
        {
            return arr.AsParallel().Sum();
        }

        static int ManualParallelSum(int[] arr)
        {
            int blockSize = arr.Length / Environment.ProcessorCount;

            // one block per full chunk, plus one extra block for any leftover elements
            int blockCount = arr.Length / blockSize + (arr.Length % blockSize == 0 ? 0 : 1);

            var wHandlers = new ManualResetEvent[blockCount];

            int[] tempResults = new int[blockCount];

            for (int i = 0; i < blockCount; i++)
            {
                ManualResetEvent handler = (wHandlers[i] = new ManualResetEvent(false));

                ThreadPool.UnsafeQueueUserWorkItem(param =>
                {
                    int subResult = 0;
                    int blockIndex = (int)param;
                    int endBlock = Math.Min(arr.Length, blockSize * blockIndex + blockSize);
                    for (int j = blockIndex * blockSize; j < endBlock; j++)
                    {
                        subResult += arr[j];
                    }
                    tempResults[blockIndex] = subResult;

                    handler.Set();
                }, i);
            }

            int res = 0;

            for (int block = 0; block < blockCount; ++block)
            {
                wHandlers[block].WaitOne();
                res += tempResults[block];
            }

            return res;
        }
    }
}

Solution

The problem here is that every single addition has to be synchronized, which is a huge amount of overhead.

Microsoft provides a Partitioner class that is intended to supply some of the chunking logic you implemented by hand in ManualParallelSum().

Using a Partitioner simplifies the code considerably, and it runs in roughly the same time.

Here's a sample implementation (Partitioner lives in the System.Collections.Concurrent namespace, so you'll need that using directive). If you add it to your test program you should see results similar to your ManualParallelSum():

private static int PartitionSum(int[] numbers)
{
    int result = 0;
    var rangePartitioner = Partitioner.Create(0, numbers.Length);

    Parallel.ForEach(rangePartitioner, (range, loopState) =>
    {
        int subtotal = 0;

        for (int i = range.Item1; i < range.Item2; i++)
            subtotal += numbers[i];

        Interlocked.Add(ref result, subtotal);
    });

    return result;
}

Other tips

Interlocked and lock are fast operations when there is no contention.
In this sample there is a lot of contention, so the synchronization overhead becomes far more important than the underlying operation (which is a really small one).
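If you want to keep a per-element Parallel.ForEach but get rid of that contention, one option is its localInit/localFinally overload, which gives each worker thread a private subtotal and touches the shared total only once per worker. A minimal sketch (the method name is mine, and it assumes it is dropped into the benchmark program above):

private static int ParallelThreadLocalForEachSum(int[] arr)
{
    int res = 0;

    Parallel.ForEach(
        arr,
        () => 0,                                  // localInit: a private accumulator per worker thread
        (x, loopState, subtotal) => subtotal + x, // body: no locking on the hot path
        subtotal => Interlocked.Add(ref res, subtotal)); // localFinally: one synchronized add per worker

    return res;
}

This removes the per-element synchronization, though the per-element delegate call still keeps it well behind the chunked approaches.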

Interlocked.Add does add a small overhead even without parallelism, but not much.

private static int InterlockedSum(int[] arr)
{
    int res = 0;

    for (int i = 0; i < arr.Length; ++i)
    {
        Interlocked.Add(ref res, arr[i]);
    }

    return res;
}

The results:

ForSum: 6682.45 ticks
InterlockedSum: 15309.63 ticks

The comparison with the manual implementation does not look fair: you split the work into blocks because you know the nature of the operation, and a general-purpose implementation cannot assume that.
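If you do want to hand that knowledge to a general-purpose implementation rather than write the chunking yourself, PLINQ's Aggregate overload with a per-partition seed and a combiner lets you state explicitly that the operation can be folded within each partition and the partial results merged afterwards. A sketch (the method name is mine, not from the benchmark):

private static int ParallelLinqAggregateSum(int[] arr)
{
    return arr.AsParallel().Aggregate(
        () => 0,                               // seed factory: one accumulator per partition
        (subtotal, x) => subtotal + x,         // fold each element into its partition's subtotal
        (total, subtotal) => total + subtotal, // combine the per-partition subtotals
        total => total);                       // result selector
}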

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow