C# OpenCL GPU implementation for double array math
How do I make the for loop of this function run on the GPU with OpenCL?
public static double[] Calculate(double[] num, int period)
{
var final = new double[num.Length];
double sum = num[0];
double coeff = 2.0 / (1.0 + period);
for (int i = 0; i < num.Length; i++)
{
sum += coeff * (num[i] - sum);
final[i] = sum;
}
return final;
}
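(For reference: the loop implements the standard exponential-moving-average recurrence, which makes the serial dependency explicit. Writing x for num, s for sum, and c for coeff:

s_i = s_{i-1} + c * (x_i - s_{i-1}) = (1 - c) * s_{i-1} + c * x_i,   c = 2 / (1 + period),   s_0 = x_0

Each output final[i] = s_i depends on s_{i-1}, which is what the answers below have to work around.)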
As commenter Cory stated, refer to this link for the setup.
Here is how you would use the project:
- Add the NuGet package Cloo
- Add a reference to OpenCLlib.dll
- Download OpenCLLib.zip
- Add using OpenCL
static void Main(string[] args)
{
int[] Primes = { 1,2,3,4,5,6,7 };
EasyCL cl = new EasyCL();
cl.Accelerator = AcceleratorDevice.GPU;
cl.LoadKernel(IsPrime);
cl.Invoke("GetIfPrime", 0, Primes.Length, Primes, 1); // period passed as an int to match the kernel signature
}
static string IsPrime
{
get
{
return @"
kernel void GetIfPrime(global int* num, int period)
{
int index = get_global_id(0);
double sum = (2.0 / (1.0 + period)) * (num[index] - num[0]); // use double; the int in the original truncated the result
printf("" %f \n"",sum);
}";
}
}
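A note on the harness above: judging from the call, cl.Invoke appears to take the kernel name, a starting index, the number of work items to launch, and then the kernel arguments in declaration order, so each work item sees its own get_global_id(0). The kernel here is only a skeleton to verify the pipeline runs; it does not implement the moving-average loop itself.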
The problem as you have written it is not well suited to running on a GPU. You cannot parallelize the operation over a single array (in a way that improves performance), because the value of the nth element depends on elements 1 through n. However, you can utilize the GPU to process many arrays at once, where each GPU core operates on a separate array.
The full code for the solution is at the end of the answer, but as for the test results: computing 10,000 arrays of 10,000 elements each generates the following (on a GTX1080M and an i7 7700k with 32GB RAM):
Task Generating Data: 1096.4583ms
Task CPU Single Thread: 596.2624ms
Task CPU Parallel: 179.1717ms
GPU CPU->GPU: 89ms
GPU Execute: 86ms
GPU GPU->CPU: 29ms
Task Running GPU: 921.4781ms
Finished
In this test we measure how fast we can produce the results in managed C# arrays, using the CPU with one thread, the CPU with all threads, and finally the GPU using all cores. We verify that the results of every test are identical, using the function AreTheSame.
The fastest time was processing the arrays on the CPU using all threads (Task CPU Parallel: 179ms).
The GPU was actually the slowest (Task Running GPU: 922ms), but that is because of the time spent reformatting the C# arrays into a form that can be transferred to the GPU.
If that bottleneck were removed (which may well be possible, depending on your use case), the GPU could be the fastest. If the data were already formatted in a way that could be transferred to the GPU immediately, the total GPU processing time would be 204ms (CPU->GPU: 89ms + Execute: 86ms + GPU->CPU: 29ms = 204ms). That is still slower than the parallel CPU option, but on a different kind of dataset it could be faster.
To get the data back from the GPU (the most important part of actually using a GPU), we use the function ComputeCommandQueue.Read. This transfers the altered array on the GPU back to the CPU.
The code below runs against the Cloo NuGet package (I used 0.9.1). Make sure you compile for x64 (you will need the memory). You may also need to update your graphics card drivers if no OpenCL device is found.
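If no device is found, a quick diagnostic (a minimal sketch using the same Cloo API as the listing below) is to enumerate what Cloo can see:
foreach (var platform in ComputePlatform.Platforms)
{
    Console.WriteLine(platform.Name + " - " + platform.Vendor);
    foreach (var device in platform.Devices)
        Console.WriteLine("  " + device.Name + " (" + device.Type + ")");
}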
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
using Cloo;
class Program
{
static string CalculateKernel
{
get
{
return @"
kernel void Calc(global int* offsets, global int* lengths, global double* doubles, double periodFactor)
{
int id = get_global_id(0);
int start = offsets[id];
int length = lengths[id];
int end = start + length;
double sum = doubles[start];
for(int i = start; i < end; i++)
{
sum = sum + periodFactor * ( doubles[i] - sum );
doubles[i] = sum;
}
}";
}
}
public static double[] Calculate(double[] num, int period)
{
var final = new double[num.Length];
double sum = num[0];
double coeff = 2.0 / (1.0 + period);
for (int i = 0; i < num.Length; i++)
{
sum += coeff * (num[i] - sum);
final[i] = sum;
}
return final;
}
static void Main(string[] args)
{
int maxElements = 10000;
int numArrays = 10000;
int computeCores = 2048;
double[][] sets = new double[numArrays][];
using (Timer("Generating Data"))
{
Random elementRand = new Random(1);
for (int i = 0; i < numArrays; i++)
{
sets[i] = GetRandomDoubles(elementRand.Next((int)(maxElements * 0.9), maxElements), randomSeed: i);
}
}
int period = 14;
double[][] singleResults;
using (Timer("CPU Single Thread"))
{
singleResults = CalculateCPU(sets, period);
}
double[][] parallelResults;
using (Timer("CPU Parallel"))
{
parallelResults = CalculateCPUParallel(sets, period);
}
if (!AreTheSame(singleResults, parallelResults)) throw new Exception();
double[][] gpuResults;
using (Timer("Running GPU"))
{
gpuResults = CalculateGPU(computeCores, sets, period);
}
if (!AreTheSame(singleResults, gpuResults)) throw new Exception();
Console.WriteLine("Finished");
Console.ReadKey();
}
public static bool AreTheSame(double[][] a1, double[][] a2)
{
if (a1.Length != a2.Length) return false;
for (int i = 0; i < a1.Length; i++)
{
var ar1 = a1[i];
var ar2 = a2[i];
if (ar1.Length != ar2.Length) return false;
for (int j = 0; j < ar1.Length; j++)
if (Math.Abs(ar1[j] - ar2[j]) > 0.0000001) return false;
}
return true;
}
public static double[][] CalculateGPU(int partitionSize, double[][] sets, int period)
{
ComputeContextPropertyList cpl = new ComputeContextPropertyList(ComputePlatform.Platforms[0]);
ComputeContext context = new ComputeContext(ComputeDeviceTypes.Gpu, cpl, null, IntPtr.Zero);
ComputeProgram program = new ComputeProgram(context, new string[] { CalculateKernel });
program.Build(null, null, null, IntPtr.Zero);
ComputeCommandQueue commands = new ComputeCommandQueue(context, context.Devices[0], ComputeCommandQueueFlags.None);
ComputeEventList events = new ComputeEventList();
ComputeKernel kernel = program.CreateKernel("Calc");
double[][] results = new double[sets.Length][];
double periodFactor = 2d / (1d + period);
Stopwatch sendStopWatch = new Stopwatch();
Stopwatch executeStopWatch = new Stopwatch();
Stopwatch receiveStopWatch = new Stopwatch();
int offset = 0;
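// Each pass of this loop processes one partition of up to 'partitionSize' arrays
// (computeCores = 2048 above, roughly one array per GPU core): Merge flattens the
// partition into a single doubles buffer plus per-array offset/length tables, so
// that each work item can run the sequential loop over its own array.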
while (true)
{
int first = offset;
int last = Math.Min(offset + partitionSize, sets.Length);
int length = last - first;
var merged = Merge(sets, first, length);
sendStopWatch.Start();
ComputeBuffer<int> offsetBuffer = new ComputeBuffer<int>(
context,
ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.UseHostPointer,
merged.Offsets);
ComputeBuffer<int> lengthsBuffer = new ComputeBuffer<int>(
context,
ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.UseHostPointer,
merged.Lengths);
ComputeBuffer<double> doublesBuffer = new ComputeBuffer<double>(
context,
ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.UseHostPointer,
merged.Doubles);
kernel.SetMemoryArgument(0, offsetBuffer);
kernel.SetMemoryArgument(1, lengthsBuffer);
kernel.SetMemoryArgument(2, doublesBuffer);
kernel.SetValueArgument(3, periodFactor);
sendStopWatch.Stop();
executeStopWatch.Start();
commands.Execute(kernel, null, new long[] { merged.Lengths.Length }, null, events);
executeStopWatch.Stop();
using (var pin = Pinned(merged.Doubles))
{
receiveStopWatch.Start();
commands.Read(doublesBuffer, false, 0, merged.Doubles.Length, pin.Address, events);
commands.Finish();
receiveStopWatch.Stop();
}
for (int i = 0; i < merged.Lengths.Length; i++)
{
int len = merged.Lengths[i];
int off = merged.Offsets[i];
var res = new double[len];
Array.Copy(merged.Doubles,off,res,0,len);
results[first + i] = res;
}
offset += partitionSize;
if (offset >= sets.Length) break;
}
Console.WriteLine("GPU CPU->GPU: " + recieveStopWatch.ElapsedMilliseconds + "ms");
Console.WriteLine("GPU Execute: " + executeStopWatch.ElapsedMilliseconds + "ms");
Console.WriteLine("GPU GPU->CPU: " + sendStopWatch.ElapsedMilliseconds + "ms");
return results;
}
public static PinnedHandle Pinned(object obj) => new PinnedHandle(obj);
public class PinnedHandle : IDisposable
{
public IntPtr Address => handle.AddrOfPinnedObject();
private GCHandle handle;
public PinnedHandle(object val)
{
handle = GCHandle.Alloc(val, GCHandleType.Pinned);
}
public void Dispose()
{
handle.Free();
}
}
public class MergedResults
{
public double[] Doubles { get; set; }
public int[] Lengths { get; set; }
public int[] Offsets { get; set; }
}
public static MergedResults Merge(double[][] sets, int offset, int length)
{
List<int> lengths = new List<int>(length);
List<int> offsets = new List<int>(length);
for (int i = 0; i < length; i++)
{
var arr = sets[i + offset];
lengths.Add(arr.Length);
}
var totalLength = lengths.Sum();
double[] doubles = new double[totalLength];
int dataOffset = 0;
for (int i = 0; i < length; i++)
{
var arr = sets[i + offset];
Array.Copy(arr, 0, doubles, dataOffset, arr.Length);
offsets.Add(dataOffset);
dataOffset += arr.Length;
}
return new MergedResults()
{
Doubles = doubles,
Lengths = lengths.ToArray(),
Offsets = offsets.ToArray(),
};
}
public static IDisposable Timer(string name)
{
return new SWTimer(name);
}
public class SWTimer : IDisposable
{
private Stopwatch _sw;
private string _name;
public SWTimer(string name)
{
_name = name;
_sw = Stopwatch.StartNew();
}
public void Dispose()
{
_sw.Stop();
Console.WriteLine("Task " + _name + ": " + _sw.Elapsed.TotalMilliseconds + "ms");
}
}
public static double[][] CalculateCPU(double[][] arrays, int period)
{
double[][] results = new double[arrays.Length][];
for (var index = 0; index < arrays.Length; index++)
{
var arr = arrays[index];
results[index] = Calculate(arr, period);
}
return results;
}
public static double[][] CalculateCPUParallel(double[][] arrays, int period)
{
double[][] results = new double[arrays.Length][];
Parallel.For(0, arrays.Length, i =>
{
var arr = arrays[i];
results[i] = Calculate(arr, period);
});
return results;
}
static double[] GetRandomDoubles(int num, int randomSeed)
{
Random r = new Random(randomSeed);
var res = new double[num];
for (int i = 0; i < num; i++)
res[i] = r.NextDouble() * 0.9 + 0.05;
return res;
}
}
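One design note on the listing: for brevity it never disposes the Cloo objects it creates (ComputeBuffer, ComputeKernel, ComputeProgram, ComputeCommandQueue, and ComputeContext all implement IDisposable), so in long-running code you would want to wrap them in using blocks or dispose them per partition.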
for (int i = 0; i < num.Length; i++)
{
sum += coeff * (num[i] - sum);
final[i] = sum;
}
means that the first element is multiplied by coeff once and subtracted from the second element. The first element is also multiplied by the square of coeff, this time added to the third element. Then the first element is multiplied by the cube of coeff and subtracted from the fourth element.
It goes like this:
-e0*c*c*c + e1*c*c - e2*c = f3
e0*c*c*c*c - e1*c*c*c + e2*c*c - e3*c = f4
-e0*c*c*c*c*c + e1*c*c*c*c - e2*c*c*c + e3*c*c - e4*c = f5
For every element, scan all elements with a smaller id and compute the following:
If the difference between the elements' id values (let's call it k) is odd, subtract; if it is even, add. Before adding or subtracting, multiply the value by the kth power of coeff. Finally, multiply the current num value by coeff and add it to the current cell. The current cell's value is final(i).
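Written out (with e for the elements of num and c for coeff), the closed form this answer's kernel evaluates for output t is:

final[t] = c*e[t] + sum over i < t of (-c)^(t-i) * e[i]

which is exactly the weighted all-pairs sum the kernel below loops over.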
This is O(N*N) and looks like an all-pairs compute kernel. An example using the open source C# OpenCL project (Cekirdekler):
ClNumberCruncher cruncher = new ClNumberCruncher(ClPlatforms.all().gpus(), @"
__kernel void foo(__global double * num, __global double * final, __global int *parameters)
{
int threadId = get_global_id(0);
int period = parameters[0];
double coeff = 2.0 / (1.0 + period);
double sumOfElements = 0.0;
for(int i=0;i<threadId;i++)
{
// negativity of coeff is to select addition or subtraction for different powers of coeff
double powKofCoeff = pow(-coeff,threadId-i);
sumOfElements += powKofCoeff * num[i];
}
final[threadId] = sumOfElements + num[threadId] * coeff;
}
");
cruncher.performanceFeed = true; // getting benchmark feedback on console
double[] numArray = new double[10000];
double[] finalArray = new double[10000];
int[] parameters = new int[10];
int period = 15;
parameters[0] = period;
ClArray<double> numGpuArray = numArray;
numGpuArray.readOnly = true; // gpus read this from host
ClArray<double> finalGpuArray = finalArray; // finalArray will have results
finalGpuArray.writeOnly = true; // gpus write this to host
ClArray<int> parametersGpu = parameters;
parametersGpu.readOnly = true;
// calculate kernels with exact same ordering of parameters
// num(double),final(double),parameters(int)
// finalGpuArray points to __global double * final
numGpuArray.nextParam(finalGpuArray, parametersGpu).compute(cruncher, 1, "foo", 10000, 100);
// first compute always lags because of compiling the kernel so here are repeated computes to get actual performance
numGpuArray.nextParam(finalGpuArray, parametersGpu).compute(cruncher, 1, "foo", 10000, 100);
numGpuArray.nextParam(finalGpuArray, parametersGpu).compute(cruncher, 1, "foo", 10000, 100);
The results are in the finalArray array of 10000 elements, computed with 100 work items per work group.
The GPGPU part takes 82ms on an RX550 GPU, which has a very low ratio of 64-bit to 32-bit compute performance (because consumer gaming cards are not good at double precision in recent generations). An Nvidia Tesla or an AMD Vega could compute this kernel easily, without crippled performance. An Fx8150 (8 cores) finishes it in 683ms. If you need to specifically select only an integrated GPU and its CPU, you can use
ClPlatforms.all().gpus().devicesWithHostMemorySharing() + ClPlatforms.all().cpus()
when creating the ClNumberCruncher instance.
Binaries of the API:
https://www.codeproject.com/Articles/1181213/Easy-OpenCL-Multiple-Device-Load-Balancing-and-Pip
or the source code, to compile on your own PC:
https://github.com/tugrul512bit/Cekirdekler
If you have multiple GPUs, it uses them without any extra code. Including the CPU in the computation reduces GPU efficiency for the first iteration in this example (repeats finish in 76ms with CPU+GPU), so it is better to use 2-3 GPUs rather than CPU+GPU.
I did not check numerical stability (you should use Kahan summation when adding millions or more values into the same variable, but I did not use it here for readability, and I don't know whether 64-bit values need it as much as 32-bit ones do) or the correctness of any values; you should do that. The foo kernel is also not optimized: it leaves 50% of core time idle, so it should be scheduled better, like this:
thread-0: compute element 0 and element N-1
thread-1: compute element 1 and element N-2
thread-m: compute element N/2-1 and element N/2
so that all work items do about the same amount of work. On top of that, using 100 as the work group size is not optimal; it should be 128, 256, 512, or 1024 (for Nvidia), but that would also require the array size to be an integer multiple of it, which in turn needs extra bounds-checking logic in the kernel so it does not read past the end of the array. For even more performance, the for loop could keep multiple partial sums to do "loop unrolling".
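As a rough sketch of those two suggestions combined (the pairing schedule plus Kahan summation), assuming the host stores the element count in parameters[1], that N is even, and that only N/2 work items are launched (all assumptions, not part of the answer's tested code):
__kernel void fooBalanced(__global double * num, __global double * final, __global int * parameters)
{
    int m = get_global_id(0);           // one of N/2 work items
    int period = parameters[0];
    int n = parameters[1];              // assumed: element count, stored by the host
    double coeff = 2.0 / (1.0 + period);
    int ids[2];
    ids[0] = m;                         // cheap element: m inner iterations
    ids[1] = n - 1 - m;                 // expensive element: n-1-m inner iterations
    for(int j = 0; j < 2; j++)
    {
        int t = ids[j];
        double sum = 0.0;
        double comp = 0.0;              // Kahan compensation term
        for(int i = 0; i < t; i++)
        {
            double term = pow(-coeff, (double)(t - i)) * num[i] - comp;
            double next = sum + term;
            comp = (next - sum) - term; // recovers the low-order bits lost in the add
            sum = next;
        }
        final[t] = sum + num[t] * coeff;
    }
}
Each work item then performs about N-1 inner iterations regardless of its id, which is the thread-0/thread-m pairing described above.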