用于解析 CSV 文本行的 Linq Lookup
Linq Lookup to parse a CSV text line
问题
前一段时间我曾问过 this question,从那时起要求发生了一些变化。
现在,可能会有一个包含以下行的文件:
Bryar22053;ADDPWN;Bryar.Suarez@company.com;ACTIVE
Nicole49927;ADDPWN;Nicole.Acosta@company.com;ACTIVE
Rashad58323;ADDPWN;Rashad.Everett@company.com;ACTIVE
取第一行。跳过第一个值 Bryar22053
并使用相同的查找:
// Number of value columns that follow the leading user identifier (element 0 of dataRow).
var columnCount = dataRow.Skip(1).Count();
var modular = 0;
// Simple enum describing the row layout; starts at the enum's default value.
var rightsFileType = new RightsFileType();
if (columnCount % 2 == 0)
{
    // Even count: treated as repeating (System, Username) pairs.
    // Counts divisible by both 2 and 3 (6, 12, ...) also land here because this test runs first.
    rightsFileType = RightsFileType.WithoutStatus;
    modular = 2;
}
else if (columnCount % 3 == 0)
{
    // Odd multiple of three: treated as repeating (System, Username, Status) triples.
    rightsFileType = RightsFileType.WithStatus;
    modular = 3;
}
else
{
    // Without this guard modular would stay 0 and `index % modular` below
    // would fail with a DivideByZeroException that hides the real cause.
    throw new FormatException(
        "Unsupported column count " + columnCount +
        ": expected a multiple of 2 or 3 after the user identifier.");
}
// Bucket every value by its position inside a record (index % modular) while
// remembering its absolute index so the buckets can be joined back together later.
var lookup = dataRow.Skip(1).Select((data, index) => new
{
    lookup = index % modular,
    index,
    data
}).ToLookup(d => d.lookup);
查找对象现在有三组:
> ? lookup[0].ToList()
> Count = 1
> [0]: { lookup = 0, index = 0, data = "ADDPWN" }
> ? lookup[1].ToList()
> Count = 1
> [0]: { lookup = 1, index = 1, data = "Bryar.Suarez@company.com" }
> ? lookup[2].ToList()
> Count = 1
> [0]: { lookup = 2, index = 2, data = "ACTIVE" }
如果是原来的情况,它只是 System1、User1、System2、User2... lookup
将有两个组,下面的代码将起作用:
// Pair every system token (group 0) with the username token that directly
// follows it (group 1), drop pairs whose system value is null or empty, and
// project each remaining pair to a RightObjectRetrieved.
List<RightObjectRetrieved> rights =
    (from systemToken in lookup[0]
     join userToken in lookup[1] on systemToken.index + 1 equals userToken.index
     where !string.IsNullOrEmpty(systemToken.data)
     select new RightObjectRetrieved
     {
         UserIdentifier = userIdentifier,
         SystemIdentifer = systemToken.data,
         Username = userToken.data,
         RightType = rightsFileType
     }).ToList();
// Each entry of rights carries a system identifier together with its username.
但是加入第三个字段 'status' 之后,数据变为 System1,User1,Status1,System2,User2,Status2...,我在尝试通过联接(Join)同时获取这三个值时遇到了问题。请帮忙。
编辑
这是我的原始数据:
// Method has parameter localReadLine (string) that has this:
// Bryar22053;ADDPWN;Bryar.Suarez@company.com;ACTIVE
// Data line: split on the configured separator, keeping empty entries.
var dataRow = localReadLine.Split(new[] { ToolSettings.RightsSeperator }, StringSplitOptions.None);
// Trim each element in place. A plain index loop replaces the earlier
// Array.ForEach + Array.IndexOf combination, which searched the array again
// for every element (quadratic work) to produce the same result.
for (var i = 0; i < dataRow.Length; i++)
{
    dataRow[i] = dataRow[i].Trim();
}
到目前为止已尝试(失败)
// Failed attempt 1, kept as posted to illustrate the problem.
// A single Join call is given three key selectors and a three-parameter result
// selector, but only one inner sequence (lookup[1]); lookup[2] is never referenced.
// Enumerable.Join takes exactly one outer and one inner key selector plus a
// two-parameter result selector, so no overload matches this call.
rights = lookup[0].Join(lookup[1], system => system.index + 1, username => username.index, status => status.index, (system, username, status) => new
{
system = system.data,
useraname = username.data,
status = status.data
}).Where(d => !string.IsNullOrEmpty(d.system)).Select(d => new RightObjectRetrieved {UserIdentifier = userIdentifier, SystemIdentifer = d.system, Username = d.useraname, RightType = rightsFileType}).ToList();
和
// Failed attempt 2, kept as posted to illustrate the problem.
// The first Join pairs system and username tokens as before. The second Join
// receives lookup[2] with only two lambdas: it lacks separate outer/inner key
// selectors and a two-parameter result selector. Its projection would also keep
// only `status`, while the later stages read d.system and d.useraname, and
// `status` is not a variable in scope inside the final Select.
rights = lookup[0].Join(lookup[1], system => system.index + 1, username => username.index, (system, username) => new
{
system = system.data,
useraname = username.data
}).Join(lookup[2], status => status.index, (status) => new
{
status = status.data
}).Where(d => !string.IsNullOrEmpty(d.system)).Select(d => new RightObjectRetrieved {UserIdentifier = userIdentifier, SystemIdentifer = d.system, Username = d.useraname, RightType = rightsFileType, Status = ParseStatus(status)}).ToList();
我认为您需要稍微拆分一下您的实施。
让我们声明一个 class 来保存数据:
/// <summary>
/// One parsed record of a rights line: a system, the username on that system,
/// and an optional status value.
/// </summary>
class Data
{
    /// <summary>System token of the record.</summary>
    public string System
    {
        get;
        set;
    }

    /// <summary>Username token of the record.</summary>
    public string Username
    {
        get;
        set;
    }

    /// <summary>Status token of the record; stays unset unless a parser assigns it.</summary>
    public string Status
    {
        get;
        set;
    }
}
现在,让我们定义几个解析函数来解析一行。
第一个将解析包含状态的行:
// Parses the value tokens of one line laid out as repeating
// (System, Username, Status) triples and returns one Data per triple.
var withStatus = (IEnumerable<string> line) =>
{
    var records = new List<Data>();
    foreach (var (value, position) in line.Select((token, idx) => (token, idx)))
    {
        switch (position % 3)
        {
            case 0:
                // First token of a triple opens a new record.
                records.Add(new Data { System = value });
                break;
            case 1:
                // Second token belongs to the record opened just before it.
                records[records.Count - 1].Username = value;
                break;
            default:
                // Third token completes the current record.
                records[records.Count - 1].Status = value;
                break;
        }
    }
    return records;
};
第二个将解析不包含状态的行:
// Parses the value tokens of one line laid out as repeating
// (System, Username) pairs and returns one Data per pair; Status is never set.
var withoutStatus = (IEnumerable<string> line) =>
{
    var records = new List<Data>();
    foreach (var (value, position) in line.Select((token, idx) => (token, idx)))
    {
        var opensRecord = position % 2 == 0;
        if (opensRecord)
        {
            // Even positions start a new record with its system token.
            records.Add(new Data { System = value });
        }
        else
        {
            // Odd positions supply the username of the record just opened.
            records[records.Count - 1].Username = value;
        }
    }
    return records;
};
准备好所有这些后,您将需要以下内容:
- 确定模数
- 迭代文件的行并解析每一行
- 分组并汇总结果
剩余的代码如下所示:
// Collect every line from the reader. StreamReader has no ReadAllLines method,
// so lines are pulled with ReadLine until it returns null (end of stream).
// Mind the system resources here: the whole file is held in memory.
var lines = new List<string>();
string currentLine;
while ((currentLine = streamReader.ReadLine()) != null)
{
    lines.Add(currentLine);
}
// Choose the parser from the number of value columns, i.e. without the leading
// user id. Counting the id as well would classify a 4-column
// "id;system;user;status" row as the two-column layout.
var valueColumnCount = lines.First().Split(';').Length - 1;
var parser = valueColumnCount % 2 == 0 ? withoutStatus : withStatus;
// NOTE(review): the sample file in the question starts directly with a data
// row; drop Skip(1) if the input has no header line - confirm with the caller.
var data = lines.Skip(1) // skip the header
    .Select(line =>
    {
        var parts = line.Split(';');
        return new
        {
            // First column identifies the user; the rest are that user's records.
            UserId = parts.First(),
            Data = parser(parts.Skip(1))
        };
    })
    // Merge all rows that share a user id into one entry per user.
    .GroupBy(x => x.UserId)
    .ToDictionary(g => g.Key, g => g.SelectMany(x => x.Data));
现在您有一个 Dictionary<string, IEnumerable<Data>>
,其中包含每个用户 ID 及其对应的信息。
当然,更优雅的解决方案是将每个解析函数分离到各自的 class 中,再让这些 class 实现一个通用接口,以便将来需要添加更多信息时进行扩展;不过上面的代码应该可以工作,并能让您了解应该怎么做。
如果您想使用联接:
// Rebuild (system, username, status) records by joining the three lookup groups
// on the absolute index of the system token: the username sits one position
// after it and the status two positions after it.
var result =
    (from sys in lookup[0]
     join user in lookup[1] on sys.index equals user.index - 1
     join state in lookup[2] on sys.index equals state.index - 2
     select new { system = sys.data, username = user.data, status = state.data })
    .ToList();
另一种选择是按记录分组,然后从每个分组中选取数据(在我看来这样可读性更好):
// Skip the user identifier, number the remaining values, and group every three
// consecutive values (index / 3) into one record of system, username and status.
var result = dataRow
    .Skip(1)
    .Select((data, index) => new { data, record = index / 3 })
    .GroupBy(entry => entry.record, entry => entry.data)
    .Select(group => group.ToArray())
    .Select(fields => new
    {
        system = fields[0],
        username = fields[1],
        status = fields[2]
    })
    .ToList();
问题
前一段时间我曾问过 this question,从那时起要求发生了一些变化。
现在,可能会有一个包含以下行的文件:
Bryar22053;ADDPWN;Bryar.Suarez@company.com;ACTIVE
Nicole49927;ADDPWN;Nicole.Acosta@company.com;ACTIVE
Rashad58323;ADDPWN;Rashad.Everett@company.com;ACTIVE
取第一行。跳过第一个值 Bryar22053
并使用相同的查找:
// Number of value columns that follow the leading user identifier (element 0 of dataRow).
var columnCount = dataRow.Skip(1).Count();
var modular = 0;
// Simple enum describing the row layout; starts at the enum's default value.
var rightsFileType = new RightsFileType();
if (columnCount % 2 == 0)
{
    // Even count: treated as repeating (System, Username) pairs.
    // Counts divisible by both 2 and 3 (6, 12, ...) also land here because this test runs first.
    rightsFileType = RightsFileType.WithoutStatus;
    modular = 2;
}
else if (columnCount % 3 == 0)
{
    // Odd multiple of three: treated as repeating (System, Username, Status) triples.
    rightsFileType = RightsFileType.WithStatus;
    modular = 3;
}
else
{
    // Without this guard modular would stay 0 and `index % modular` below
    // would fail with a DivideByZeroException that hides the real cause.
    throw new FormatException(
        "Unsupported column count " + columnCount +
        ": expected a multiple of 2 or 3 after the user identifier.");
}
// Bucket every value by its position inside a record (index % modular) while
// remembering its absolute index so the buckets can be joined back together later.
var lookup = dataRow.Skip(1).Select((data, index) => new
{
    lookup = index % modular,
    index,
    data
}).ToLookup(d => d.lookup);
查找对象现在有三组:
> ? lookup[0].ToList()
> Count = 1
> [0]: { lookup = 0, index = 0, data = "ADDPWN" }
> ? lookup[1].ToList()
> Count = 1
> [0]: { lookup = 1, index = 1, data = "Bryar.Suarez@company.com" }
> ? lookup[2].ToList()
> Count = 1
> [0]: { lookup = 2, index = 2, data = "ACTIVE" }
如果是原来的情况,它只是 System1、User1、System2、User2... lookup
将有两个组,下面的代码将起作用:
// Pair every system token (group 0) with the username token that directly
// follows it (group 1), drop pairs whose system value is null or empty, and
// project each remaining pair to a RightObjectRetrieved.
List<RightObjectRetrieved> rights =
    (from systemToken in lookup[0]
     join userToken in lookup[1] on systemToken.index + 1 equals userToken.index
     where !string.IsNullOrEmpty(systemToken.data)
     select new RightObjectRetrieved
     {
         UserIdentifier = userIdentifier,
         SystemIdentifer = systemToken.data,
         Username = userToken.data,
         RightType = rightsFileType
     }).ToList();
// Each entry of rights carries a system identifier together with its username.
但是加入第三个字段 'status' 之后,数据变为 System1,User1,Status1,System2,User2,Status2...,我在尝试通过联接(Join)同时获取这三个值时遇到了问题。请帮忙。
编辑 这是我的原始数据:
// Method has parameter localReadLine (string) that has this:
// Bryar22053;ADDPWN;Bryar.Suarez@company.com;ACTIVE
// Data line: split on the configured separator, keeping empty entries.
var dataRow = localReadLine.Split(new[] { ToolSettings.RightsSeperator }, StringSplitOptions.None);
// Trim each element in place. A plain index loop replaces the earlier
// Array.ForEach + Array.IndexOf combination, which searched the array again
// for every element (quadratic work) to produce the same result.
for (var i = 0; i < dataRow.Length; i++)
{
    dataRow[i] = dataRow[i].Trim();
}
到目前为止已尝试(失败)
// Failed attempt 1, kept as posted to illustrate the problem.
// A single Join call is given three key selectors and a three-parameter result
// selector, but only one inner sequence (lookup[1]); lookup[2] is never referenced.
// Enumerable.Join takes exactly one outer and one inner key selector plus a
// two-parameter result selector, so no overload matches this call.
rights = lookup[0].Join(lookup[1], system => system.index + 1, username => username.index, status => status.index, (system, username, status) => new
{
system = system.data,
useraname = username.data,
status = status.data
}).Where(d => !string.IsNullOrEmpty(d.system)).Select(d => new RightObjectRetrieved {UserIdentifier = userIdentifier, SystemIdentifer = d.system, Username = d.useraname, RightType = rightsFileType}).ToList();
和
// Failed attempt 2, kept as posted to illustrate the problem.
// The first Join pairs system and username tokens as before. The second Join
// receives lookup[2] with only two lambdas: it lacks separate outer/inner key
// selectors and a two-parameter result selector. Its projection would also keep
// only `status`, while the later stages read d.system and d.useraname, and
// `status` is not a variable in scope inside the final Select.
rights = lookup[0].Join(lookup[1], system => system.index + 1, username => username.index, (system, username) => new
{
system = system.data,
useraname = username.data
}).Join(lookup[2], status => status.index, (status) => new
{
status = status.data
}).Where(d => !string.IsNullOrEmpty(d.system)).Select(d => new RightObjectRetrieved {UserIdentifier = userIdentifier, SystemIdentifer = d.system, Username = d.useraname, RightType = rightsFileType, Status = ParseStatus(status)}).ToList();
我认为您需要稍微拆分一下您的实施。
让我们声明一个 class 来保存数据:
/// <summary>
/// One parsed record of a rights line: a system, the username on that system,
/// and an optional status value.
/// </summary>
class Data
{
    /// <summary>System token of the record.</summary>
    public string System
    {
        get;
        set;
    }

    /// <summary>Username token of the record.</summary>
    public string Username
    {
        get;
        set;
    }

    /// <summary>Status token of the record; stays unset unless a parser assigns it.</summary>
    public string Status
    {
        get;
        set;
    }
}
现在,让我们定义几个解析函数来解析一行。 第一个将解析包含状态的行:
// Parses the value tokens of one line laid out as repeating
// (System, Username, Status) triples and returns one Data per triple.
var withStatus = (IEnumerable<string> line) =>
{
    var records = new List<Data>();
    foreach (var (value, position) in line.Select((token, idx) => (token, idx)))
    {
        switch (position % 3)
        {
            case 0:
                // First token of a triple opens a new record.
                records.Add(new Data { System = value });
                break;
            case 1:
                // Second token belongs to the record opened just before it.
                records[records.Count - 1].Username = value;
                break;
            default:
                // Third token completes the current record.
                records[records.Count - 1].Status = value;
                break;
        }
    }
    return records;
};
第二个将解析不包含状态的行:
// Parses the value tokens of one line laid out as repeating
// (System, Username) pairs and returns one Data per pair; Status is never set.
var withoutStatus = (IEnumerable<string> line) =>
{
    var records = new List<Data>();
    foreach (var (value, position) in line.Select((token, idx) => (token, idx)))
    {
        var opensRecord = position % 2 == 0;
        if (opensRecord)
        {
            // Even positions start a new record with its system token.
            records.Add(new Data { System = value });
        }
        else
        {
            // Odd positions supply the username of the record just opened.
            records[records.Count - 1].Username = value;
        }
    }
    return records;
};
准备好所有这些后,您将需要以下内容:
- 确定模数
- 迭代文件的行并解析每一行
- 分组并汇总结果
剩余的代码如下所示:
// Collect every line from the reader. StreamReader has no ReadAllLines method,
// so lines are pulled with ReadLine until it returns null (end of stream).
// Mind the system resources here: the whole file is held in memory.
var lines = new List<string>();
string currentLine;
while ((currentLine = streamReader.ReadLine()) != null)
{
    lines.Add(currentLine);
}
// Choose the parser from the number of value columns, i.e. without the leading
// user id. Counting the id as well would classify a 4-column
// "id;system;user;status" row as the two-column layout.
var valueColumnCount = lines.First().Split(';').Length - 1;
var parser = valueColumnCount % 2 == 0 ? withoutStatus : withStatus;
// NOTE(review): the sample file in the question starts directly with a data
// row; drop Skip(1) if the input has no header line - confirm with the caller.
var data = lines.Skip(1) // skip the header
    .Select(line =>
    {
        var parts = line.Split(';');
        return new
        {
            // First column identifies the user; the rest are that user's records.
            UserId = parts.First(),
            Data = parser(parts.Skip(1))
        };
    })
    // Merge all rows that share a user id into one entry per user.
    .GroupBy(x => x.UserId)
    .ToDictionary(g => g.Key, g => g.SelectMany(x => x.Data));
现在您有一个 Dictionary<string, IEnumerable<Data>>
,其中包含每个用户 ID 及其对应的信息。
当然,更优雅的解决方案是将每个解析函数分离到各自的 class 中,再让这些 class 实现一个通用接口,以便将来需要添加更多信息时进行扩展;不过上面的代码应该可以工作,并能让您了解应该怎么做。
如果您想使用联接:
// Rebuild (system, username, status) records by joining the three lookup groups
// on the absolute index of the system token: the username sits one position
// after it and the status two positions after it.
var result =
    (from sys in lookup[0]
     join user in lookup[1] on sys.index equals user.index - 1
     join state in lookup[2] on sys.index equals state.index - 2
     select new { system = sys.data, username = user.data, status = state.data })
    .ToList();
另一种选择是按记录分组,然后从每个分组中选取数据(在我看来这样可读性更好):
// Skip the user identifier, number the remaining values, and group every three
// consecutive values (index / 3) into one record of system, username and status.
var result = dataRow
    .Skip(1)
    .Select((data, index) => new { data, record = index / 3 })
    .GroupBy(entry => entry.record, entry => entry.data)
    .Select(group => group.ToArray())
    .Select(fields => new
    {
        system = fields[0],
        username = fields[1],
        status = fields[2]
    })
    .ToList();