Parallel.ForEach 丢失数据
Parallel.ForEach losing data
我的代码运行速度太慢:
// Pivots the reader's records into `result`: one row per (A, D) pair, with each
// record's value "c" stored in the column named after its "b" value.
DataTable result = GetDataTable();
while (reader.Read())
{
    var keyA = reader.Field<int>("a").ToString();
    var columnName = reader.Field<int>("b").ToString();
    var cellValue = reader.Field<double>("c");
    var keyD = reader.Field<string>("d");
    var extra = reader.Field<string>("e");

    // Scans the table from the top for every record read; this lookup is the
    // part that grows with the number of rows already in `result`.
    DataRow target = result.AsEnumerable()
        .Where(r => r.Field<string>("A") == keyA)
        .FirstOrDefault(r => r.Field<string>("D") == keyD);

    if (target == null)
    {
        // First record for this (A, D) pair: create and register its row.
        target = result.NewRow();
        target["A"] = keyA;
        target["D"] = keyD;
        target["E"] = extra;
        result.Rows.Add(target);
    }

    target[columnName] = cellValue;
}
return result;
我将其改为使用 TPL,现在的代码如下:
// Parallel variant of the pivot: records are grouped by the string key "a|d" in a
// ConcurrentDictionary, and each group is then copied into a new row of `result`.
var result = GetDataTable();
var concurrentCollection = new ConcurrentDictionary<string, SomeClass>();
Parallel.ForEach(reader.ToDataTable().AsEnumerable(), new ParallelOptions { MaxDegreeOfParallelism = 2 }, row =>
{
// NOTE(review): the fields below are read from `reader`, not from the lambda
// parameter `row`; presumably `row` was intended — confirm.
var a = reader.Field<int>("a").ToString();
var b = reader.Field<int>("b").ToString();
var c = reader.Field<double>("c");
var d = reader.Field<string>("d");
// Enumerates the dictionary's entries looking for key "a|d" instead of using a keyed lookup.
var values = concurrentCollection.FirstOrDefault(r => r.Key.ToString() == $"{a}|{d}");
// A default KeyValuePair has a null Key, so this branch means "no entry found".
if (values.Key == null)
{
var data = new SomeClass
{
Dictionary =
{
// NOTE(review): the entries below end with ';' — C# initializers separate entries with ','.
["A"] = a;
["D"] = d;
["E"] = reader.Field<string>("e")
}
};
values = new KeyValuePair<string, SomeClass>($"{a}|{d}", data);
}
// The find / mutate / AddOrUpdate sequence is not atomic. Two threads that both miss the
// same key each build their own SomeClass, and the update delegate below replaces the
// stored value with the caller's instance, so a column written to the other instance is lost.
// NOTE(review): when the key already exists, several threads may write to the same
// `Dictionary` at once; whether that is safe depends on SomeClass — confirm.
values.Value.Dictionary[b] = c;
concurrentCollection.AddOrUpdate(values.Key, values.Value, (key, oldValue) => values.Value);
});
// Copies each group into `result`, ordered by "D" and then by "A".
// NOTE(review): every group becomes a new row; unlike the sequential version, rows
// already present in `result` are not searched for a match — confirm this is intended.
foreach (var ins in concurrentCollection.OrderBy(x => x.Value.Dictionary["D"]).ThenBy(x => x.Value.Dictionary["A"]))
{
var datarow = result.NewRow();
foreach (var key in ins.Value.Dictionary.Keys)
{
datarow[key.ToString()] = ins.Value.Dictionary[key];
}
result.Rows.Add(datarow);
}
concurrentCollection.Clear();
return result;
如果我将 MaxDegreeOfParallelism 设为 1,两段代码的结果相同。但当我把 MaxDegreeOfParallelism 改为其他值时,结果数据就开始出现差异,而且 MaxDegreeOfParallelism 的值越大,差异越多。
下面是将 result 变量转换为 JSON 后的结果。
第一个代码部分的部分结果:
[{
"A": "1010",
"1": "744",
"2": "736",
"3": "8",
"4": null,
"5": null,
"6": null,
"7": null,
"8": null,
"9": null,
"10": null,
"B": " Data",
"E": "0.4"
},...]
第二个代码部分的部分结果:
[{
"A": "1010",
"1": "744",
"2": null,
"3": null,
"4": null,
"5": null,
"6": null,
"7": null,
"8": null,
"9": null,
"10": null,
"B": " Data",
"E": "0.4"
},...]
结果 JSON 数组中不匹配对象的数量每次运行都不同。
也许您的处理方式本身就有问题。我猜较慢的部分是在 result 中查找匹配行。可以尝试以需要查找的字段作为键创建一个字典,字典的查找接近 O(1)。
如果字段 A 和 D 的组合在 result 中不唯一,请改用 ToLookup(),并从查找结果中取第一行(与您目前的逻辑等价)。
// Same pivot as the sequential version, but existing rows are located through a
// dictionary keyed by the (A, D) pair instead of scanning the table per record.
DataTable result = GetDataTable();

// Index the rows already in the table. ToDictionary throws if two rows share the
// same (A, D) pair.
var rowsByKey = result.AsEnumerable()
    .ToDictionary(r => new { A = r.Field<string>("A"), D = r.Field<string>("D") });

while (reader.Read())
{
    var a = reader.Field<int>("a").ToString();
    var b = reader.Field<int>("b").ToString();
    var c = reader.Field<double>("c");
    var d = reader.Field<string>("d");
    var e = reader.Field<string>("e");

    // Anonymous types with the same property names and types compare by value,
    // so this key matches the one built by the index above.
    var key = new { A = a, D = d };

    DataRow row;
    if (!rowsByKey.TryGetValue(key, out row))
    {
        // First record for this pair: create the row and add it to both the
        // table and the index.
        row = result.NewRow();
        row["A"] = a;
        row["D"] = d;
        row["E"] = e;
        result.Rows.Add(row);
        rowsByKey.Add(key, row);
    }

    row[b] = c;
}
return result;
我会使用一个重写了 Equals 和 GetHashCode 的类,并用 HashSet 实现 O(1) 查找。
/// <summary>
/// Reads every record from the data reader and collects them into a set of
/// <see cref="Drow"/> values; records that compare equal to one already in the
/// set are not added again.
/// </summary>
/// <returns>The set of distinct rows read.</returns>
public static HashSet<Drow> GetRows()
{
    var rows = new HashSet<Drow>();

    // NOTE(review): the command is created without a connection or command text;
    // presumably those are placeholders to be filled in — confirm.
    // Both the command and the reader are disposed when the block exits, including
    // when an exception is thrown while reading.
    using (SqlCommand cmd = new SqlCommand())
    using (SqlDataReader rdr = cmd.ExecuteReader())
    {
        while (rdr.Read())
        {
            int a = rdr.GetInt32(0);
            int b = rdr.GetInt32(1);
            double c = rdr.GetDouble(2);
            string d = rdr.GetString(3);

            // HashSet.Add is a no-op when an equal row is already present.
            rows.Add(new Drow(a, b, c, d));
        }
    }

    return rows;
}
}
/// <summary>
/// One record with value equality: two instances are equal when they were built
/// from the same <c>a</c> and <c>b</c> integers. <see cref="C"/> and
/// <see cref="D"/> do not take part in equality.
/// </summary>
/// <remarks>
/// NOTE(review): the grouping in the original question is keyed on (A, D), while
/// this type is keyed on (A, B) — confirm which pair is intended.
/// </remarks>
class Drow : IEquatable<Drow>
{
    // Integer forms of A and B, kept so equality and hashing avoid string work.
    private readonly int ai;
    private readonly int bi;

    public string A { get; }
    public string B { get; }
    public double C { get; }
    public string D { get; }

    /// <summary>Creates a record; A and B are the string forms of a and b.</summary>
    public Drow(int a, int b, double c, string d)
    {
        ai = a;
        bi = b;
        A = a.ToString();
        B = b.ToString();
        C = c;
        D = d;
    }

    /// <summary>Typed equality; used directly by HashSet and Dictionary.</summary>
    public bool Equals(Drow other)
    {
        // Null and different run-time types are never equal.
        if (ReferenceEquals(other, null) || GetType() != other.GetType())
        {
            return false;
        }

        // Compare the same fields GetHashCode uses so the two always agree.
        return ai == other.ai && bi == other.bi;
    }

    public override bool Equals(Object obj)
    {
        // A non-Drow argument becomes null here and is rejected by the typed overload.
        return Equals(obj as Drow);
    }

    public override int GetHashCode()
    {
        // Order-sensitive combination: a plain XOR maps (a, b) and (b, a) to the
        // same bucket and every a == b pair to 0.
        unchecked
        {
            return (ai * 397) ^ bi;
        }
    }
}
我的代码运行速度太慢:
// Pivots the reader's records into `result`: one row per (A, D) pair, with each
// record's value "c" stored in the column named after its "b" value.
DataTable result = GetDataTable();
while (reader.Read())
{
    var keyA = reader.Field<int>("a").ToString();
    var columnName = reader.Field<int>("b").ToString();
    var cellValue = reader.Field<double>("c");
    var keyD = reader.Field<string>("d");
    var extra = reader.Field<string>("e");

    // Scans the table from the top for every record read; this lookup is the
    // part that grows with the number of rows already in `result`.
    DataRow target = result.AsEnumerable()
        .Where(r => r.Field<string>("A") == keyA)
        .FirstOrDefault(r => r.Field<string>("D") == keyD);

    if (target == null)
    {
        // First record for this (A, D) pair: create and register its row.
        target = result.NewRow();
        target["A"] = keyA;
        target["D"] = keyD;
        target["E"] = extra;
        result.Rows.Add(target);
    }

    target[columnName] = cellValue;
}
return result;
我将其改为使用 TPL,现在的代码如下:
// Parallel variant of the pivot: records are grouped by the string key "a|d" in a
// ConcurrentDictionary, and each group is then copied into a new row of `result`.
var result = GetDataTable();
var concurrentCollection = new ConcurrentDictionary<string, SomeClass>();
Parallel.ForEach(reader.ToDataTable().AsEnumerable(), new ParallelOptions { MaxDegreeOfParallelism = 2 }, row =>
{
// NOTE(review): the fields below are read from `reader`, not from the lambda
// parameter `row`; presumably `row` was intended — confirm.
var a = reader.Field<int>("a").ToString();
var b = reader.Field<int>("b").ToString();
var c = reader.Field<double>("c");
var d = reader.Field<string>("d");
// Enumerates the dictionary's entries looking for key "a|d" instead of using a keyed lookup.
var values = concurrentCollection.FirstOrDefault(r => r.Key.ToString() == $"{a}|{d}");
// A default KeyValuePair has a null Key, so this branch means "no entry found".
if (values.Key == null)
{
var data = new SomeClass
{
Dictionary =
{
// NOTE(review): the entries below end with ';' — C# initializers separate entries with ','.
["A"] = a;
["D"] = d;
["E"] = reader.Field<string>("e")
}
};
values = new KeyValuePair<string, SomeClass>($"{a}|{d}", data);
}
// The find / mutate / AddOrUpdate sequence is not atomic. Two threads that both miss the
// same key each build their own SomeClass, and the update delegate below replaces the
// stored value with the caller's instance, so a column written to the other instance is lost.
// NOTE(review): when the key already exists, several threads may write to the same
// `Dictionary` at once; whether that is safe depends on SomeClass — confirm.
values.Value.Dictionary[b] = c;
concurrentCollection.AddOrUpdate(values.Key, values.Value, (key, oldValue) => values.Value);
});
// Copies each group into `result`, ordered by "D" and then by "A".
// NOTE(review): every group becomes a new row; unlike the sequential version, rows
// already present in `result` are not searched for a match — confirm this is intended.
foreach (var ins in concurrentCollection.OrderBy(x => x.Value.Dictionary["D"]).ThenBy(x => x.Value.Dictionary["A"]))
{
var datarow = result.NewRow();
foreach (var key in ins.Value.Dictionary.Keys)
{
datarow[key.ToString()] = ins.Value.Dictionary[key];
}
result.Rows.Add(datarow);
}
concurrentCollection.Clear();
return result;
如果我将 MaxDegreeOfParallelism 设为 1,两段代码的结果相同。但当我把 MaxDegreeOfParallelism 改为其他值时,结果数据就开始出现差异,而且 MaxDegreeOfParallelism 的值越大,差异越多。
下面是将 result 变量转换为 JSON 后的结果。
第一个代码部分的部分结果:
[{ "A": "1010", "1": "744", "2": "736", "3": "8", "4": null, "5": null, "6": null, "7": null, "8": null, "9": null, "10": null, "B": " Data", "E": "0.4" },...]
第二个代码部分的部分结果:
[{ "A": "1010", "1": "744", "2": null, "3": null, "4": null, "5": null, "6": null, "7": null, "8": null, "9": null, "10": null, "B": " Data", "E": "0.4" },...]
结果 JSON 数组中不匹配对象的数量每次运行都不同。
也许您的处理方式本身就有问题。我猜较慢的部分是在 result 中查找匹配行。可以尝试以需要查找的字段作为键创建一个字典,字典的查找接近 O(1)。
如果字段 A 和 D 的组合在 result 中不唯一,请改用 ToLookup(),并从查找结果中取第一行(与您目前的逻辑等价)。
// Same pivot as the sequential version, but existing rows are located through a
// dictionary keyed by the (A, D) pair instead of scanning the table per record.
DataTable result = GetDataTable();

// Index the rows already in the table. ToDictionary throws if two rows share the
// same (A, D) pair.
var rowsByKey = result.AsEnumerable()
    .ToDictionary(r => new { A = r.Field<string>("A"), D = r.Field<string>("D") });

while (reader.Read())
{
    var a = reader.Field<int>("a").ToString();
    var b = reader.Field<int>("b").ToString();
    var c = reader.Field<double>("c");
    var d = reader.Field<string>("d");
    var e = reader.Field<string>("e");

    // Anonymous types with the same property names and types compare by value,
    // so this key matches the one built by the index above.
    var key = new { A = a, D = d };

    DataRow row;
    if (!rowsByKey.TryGetValue(key, out row))
    {
        // First record for this pair: create the row and add it to both the
        // table and the index.
        row = result.NewRow();
        row["A"] = a;
        row["D"] = d;
        row["E"] = e;
        result.Rows.Add(row);
        rowsByKey.Add(key, row);
    }

    row[b] = c;
}
return result;
我会使用一个重写了 Equals 和 GetHashCode 的类,并用 HashSet 实现 O(1) 查找。
/// <summary>
/// Reads every record from the data reader and collects them into a set of
/// <see cref="Drow"/> values; records that compare equal to one already in the
/// set are not added again.
/// </summary>
/// <returns>The set of distinct rows read.</returns>
public static HashSet<Drow> GetRows()
{
    var rows = new HashSet<Drow>();

    // NOTE(review): the command is created without a connection or command text;
    // presumably those are placeholders to be filled in — confirm.
    // Both the command and the reader are disposed when the block exits, including
    // when an exception is thrown while reading.
    using (SqlCommand cmd = new SqlCommand())
    using (SqlDataReader rdr = cmd.ExecuteReader())
    {
        while (rdr.Read())
        {
            int a = rdr.GetInt32(0);
            int b = rdr.GetInt32(1);
            double c = rdr.GetDouble(2);
            string d = rdr.GetString(3);

            // HashSet.Add is a no-op when an equal row is already present.
            rows.Add(new Drow(a, b, c, d));
        }
    }

    return rows;
}
}
/// <summary>
/// One record with value equality: two instances are equal when they were built
/// from the same <c>a</c> and <c>b</c> integers. <see cref="C"/> and
/// <see cref="D"/> do not take part in equality.
/// </summary>
/// <remarks>
/// NOTE(review): the grouping in the original question is keyed on (A, D), while
/// this type is keyed on (A, B) — confirm which pair is intended.
/// </remarks>
class Drow : IEquatable<Drow>
{
    // Integer forms of A and B, kept so equality and hashing avoid string work.
    private readonly int ai;
    private readonly int bi;

    public string A { get; }
    public string B { get; }
    public double C { get; }
    public string D { get; }

    /// <summary>Creates a record; A and B are the string forms of a and b.</summary>
    public Drow(int a, int b, double c, string d)
    {
        ai = a;
        bi = b;
        A = a.ToString();
        B = b.ToString();
        C = c;
        D = d;
    }

    /// <summary>Typed equality; used directly by HashSet and Dictionary.</summary>
    public bool Equals(Drow other)
    {
        // Null and different run-time types are never equal.
        if (ReferenceEquals(other, null) || GetType() != other.GetType())
        {
            return false;
        }

        // Compare the same fields GetHashCode uses so the two always agree.
        return ai == other.ai && bi == other.bi;
    }

    public override bool Equals(Object obj)
    {
        // A non-Drow argument becomes null here and is rejected by the typed overload.
        return Equals(obj as Drow);
    }

    public override int GetHashCode()
    {
        // Order-sensitive combination: a plain XOR maps (a, b) and (b, a) to the
        // same bucket and every a == b pair to 0.
        unchecked
        {
            return (ai * 397) ^ bi;
        }
    }
}