Pig 性能问题
Pig Performance Issues
我有以下 Pig 脚本,它处理 342 个文件(分片大小为 256 MB,仅用于测试)时耗时很长。有人可以提出改进建议吗:
-- Counts records per full key in two branches, unions the branch results,
-- maps them through a UDF and sums the counts per report key.
-- Route each record by whether the character at index 2 of aparty is '-'.
SPLIT filteredalnumcdrs into splitalnumcdrs_1 IF (
(SUBSTRING(aparty,2,3) == '-')),
splitalnumcdrs_2 OTHERWISE;
-- Branch 1: splitaparty is the first two characters of aparty.
tmpsplitalnumcdrs_1 = FOREACH splitalnumcdrs_1 GENERATE aparty,srcgt,destgt,SUBSTRING(aparty,0,2) as splitaparty,bparty,smscgt,status,prepost;
groupsplitalnumcdrs_1 = GROUP tmpsplitalnumcdrs_1 BY (aparty,srcgt,destgt,splitaparty,bparty,smscgt,status,prepost);
distinctsplitalnumcdrs_1 = FOREACH groupsplitalnumcdrs_1 {
-- NOTE(review): uniqsplitalnumcdrs_1 is not referenced by the GENERATE below.
uniqsplitalnumcdrs_1 = DISTINCT tmpsplitalnumcdrs_1.(aparty,srcgt,destgt,splitaparty,bparty,smscgt,status,prepost);
-- One row per group key plus the number of records in that group.
GENERATE FLATTEN(group),COUNT(tmpsplitalnumcdrs_1) as countalnumcdrs;
};
-- Branch 2: aparty is carried through unchanged under the name splitaparty_2.
tmpsplitalnumcdrs_2 = FOREACH splitalnumcdrs_2 GENERATE aparty,srcgt,destgt,aparty as splitaparty_2,bparty,smscgt,status,prepost;
groupsplitalnumcdrs_2 = GROUP tmpsplitalnumcdrs_2 BY (aparty,srcgt,destgt,splitaparty_2,bparty,smscgt,status,prepost);
distinctsplitalnumcdrs_2 = FOREACH groupsplitalnumcdrs_2 {
-- NOTE(review): uniqsplitalnumcdrs_2 is not referenced by the GENERATE below.
uniqsplitalnumcdrs_2 = DISTINCT tmpsplitalnumcdrs_2.(aparty,srcgt,destgt,splitaparty_2,bparty,smscgt,status,prepost);
GENERATE FLATTEN(group),COUNT(tmpsplitalnumcdrs_2) as countsplitalnumcdrs_2;
};
-- NOTE(review): branch 2 names its fields splitaparty_2 / countsplitalnumcdrs_2, while the
-- statement after the UNION reads splitaparty / countalnumcdrs -- TODO confirm the UNION
-- result schema exposes the names used below.
distinctalnumcdrs = UNION distinctsplitalnumcdrs_1,distinctsplitalnumcdrs_2;
-- Apply the project UDF smsiuc_udfs.mapgtabparty to (srcgt,destgt,splitaparty,bparty).
-- NOTE(review): the UDF result has no AS alias here, yet the GROUP below keys on
-- mappedreport -- presumably supplied by the UDF's output schema; TODO confirm.
alnumreportmap = FOREACH distinctalnumcdrs GENERATE aparty,smsiuc_udfs.mapgtabparty(srcgt,destgt,splitaparty,bparty),smscgt,status,prepost,countalnumcdrs PARALLEL 20;
alnumreportmapgroup = GROUP alnumreportmap BY (aparty,mappedreport,smscgt,status,prepost);
-- Sum the per-key counts into one total per report key.
alnumreportmaprecord = FOREACH alnumreportmapgroup GENERATE FLATTEN(group),SUM(alnumreportmap.countalnumcdrs) as alnumsmscount;
你可以避免使用 UNION(合并):
-- Derive splitaparty in a single pass with a bincond, so no SPLIT/UNION is needed:
-- the first two characters of aparty when its character at index 2 is '-', otherwise aparty itself.
tmpsplitalnumcdrs = foreach filteredalnumcdrs generate aparty,srcgt,destgt,(SUBSTRING(aparty,2,3) == '-' ? SUBSTRING(aparty,0,2) : aparty) as splitaparty,bparty,smscgt,status,prepost;
-- Group on the full key first: the nested FOREACH below reads `group` and the
-- per-group bag tmpsplitalnumcdrs, which exist only on a grouped relation.
groupsplitalnumcdrs = GROUP tmpsplitalnumcdrs BY (aparty,srcgt,destgt,splitaparty,bparty,smscgt,status,prepost);
distinctsplitalnumcdrs = FOREACH groupsplitalnumcdrs {
-- Kept from the original script; its result is not referenced by the GENERATE below.
uniqsplitalnumcdrs = DISTINCT tmpsplitalnumcdrs.(aparty,srcgt,destgt,splitaparty,bparty,smscgt,status,prepost);
-- One row per group key plus the number of records in that group.
GENERATE FLATTEN(group),COUNT(tmpsplitalnumcdrs) as countsplitalnumcdrs;
};
为什么需要下面这一行?
-- NOTE(review): in the scripts shown, the GENERATE clause does not reference uniqsplitalnumcdrs.
uniqsplitalnumcdrs = DISTINCT tmpsplitalnumcdrs.(aparty,srcgt,destgt,splitaparty,bparty,smscgt,status,prepost);
我有以下 Pig 脚本,它处理 342 个文件(分片大小为 256 MB,仅用于测试)时耗时很长。有人可以提出改进建议吗:
-- Counts records per full key in two branches, unions the branch results,
-- maps them through a UDF and sums the counts per report key.
-- Route each record by whether the character at index 2 of aparty is '-'.
SPLIT filteredalnumcdrs into splitalnumcdrs_1 IF (
(SUBSTRING(aparty,2,3) == '-')),
splitalnumcdrs_2 OTHERWISE;
-- Branch 1: splitaparty is the first two characters of aparty.
tmpsplitalnumcdrs_1 = FOREACH splitalnumcdrs_1 GENERATE aparty,srcgt,destgt,SUBSTRING(aparty,0,2) as splitaparty,bparty,smscgt,status,prepost;
groupsplitalnumcdrs_1 = GROUP tmpsplitalnumcdrs_1 BY (aparty,srcgt,destgt,splitaparty,bparty,smscgt,status,prepost);
distinctsplitalnumcdrs_1 = FOREACH groupsplitalnumcdrs_1 {
-- NOTE(review): uniqsplitalnumcdrs_1 is not referenced by the GENERATE below.
uniqsplitalnumcdrs_1 = DISTINCT tmpsplitalnumcdrs_1.(aparty,srcgt,destgt,splitaparty,bparty,smscgt,status,prepost);
-- One row per group key plus the number of records in that group.
GENERATE FLATTEN(group),COUNT(tmpsplitalnumcdrs_1) as countalnumcdrs;
};
-- Branch 2: aparty is carried through unchanged under the name splitaparty_2.
tmpsplitalnumcdrs_2 = FOREACH splitalnumcdrs_2 GENERATE aparty,srcgt,destgt,aparty as splitaparty_2,bparty,smscgt,status,prepost;
groupsplitalnumcdrs_2 = GROUP tmpsplitalnumcdrs_2 BY (aparty,srcgt,destgt,splitaparty_2,bparty,smscgt,status,prepost);
distinctsplitalnumcdrs_2 = FOREACH groupsplitalnumcdrs_2 {
-- NOTE(review): uniqsplitalnumcdrs_2 is not referenced by the GENERATE below.
uniqsplitalnumcdrs_2 = DISTINCT tmpsplitalnumcdrs_2.(aparty,srcgt,destgt,splitaparty_2,bparty,smscgt,status,prepost);
GENERATE FLATTEN(group),COUNT(tmpsplitalnumcdrs_2) as countsplitalnumcdrs_2;
};
-- NOTE(review): branch 2 names its fields splitaparty_2 / countsplitalnumcdrs_2, while the
-- statement after the UNION reads splitaparty / countalnumcdrs -- TODO confirm the UNION
-- result schema exposes the names used below.
distinctalnumcdrs = UNION distinctsplitalnumcdrs_1,distinctsplitalnumcdrs_2;
-- Apply the project UDF smsiuc_udfs.mapgtabparty to (srcgt,destgt,splitaparty,bparty).
-- NOTE(review): the UDF result has no AS alias here, yet the GROUP below keys on
-- mappedreport -- presumably supplied by the UDF's output schema; TODO confirm.
alnumreportmap = FOREACH distinctalnumcdrs GENERATE aparty,smsiuc_udfs.mapgtabparty(srcgt,destgt,splitaparty,bparty),smscgt,status,prepost,countalnumcdrs PARALLEL 20;
alnumreportmapgroup = GROUP alnumreportmap BY (aparty,mappedreport,smscgt,status,prepost);
-- Sum the per-key counts into one total per report key.
alnumreportmaprecord = FOREACH alnumreportmapgroup GENERATE FLATTEN(group),SUM(alnumreportmap.countalnumcdrs) as alnumsmscount;
你可以避免使用 UNION(合并):
-- Derive splitaparty in a single pass with a bincond, so no SPLIT/UNION is needed:
-- the first two characters of aparty when its character at index 2 is '-', otherwise aparty itself.
tmpsplitalnumcdrs = foreach filteredalnumcdrs generate aparty,srcgt,destgt,(SUBSTRING(aparty,2,3) == '-' ? SUBSTRING(aparty,0,2) : aparty) as splitaparty,bparty,smscgt,status,prepost;
-- Group on the full key first: the nested FOREACH below reads `group` and the
-- per-group bag tmpsplitalnumcdrs, which exist only on a grouped relation.
groupsplitalnumcdrs = GROUP tmpsplitalnumcdrs BY (aparty,srcgt,destgt,splitaparty,bparty,smscgt,status,prepost);
distinctsplitalnumcdrs = FOREACH groupsplitalnumcdrs {
-- Kept from the original script; its result is not referenced by the GENERATE below.
uniqsplitalnumcdrs = DISTINCT tmpsplitalnumcdrs.(aparty,srcgt,destgt,splitaparty,bparty,smscgt,status,prepost);
-- One row per group key plus the number of records in that group.
GENERATE FLATTEN(group),COUNT(tmpsplitalnumcdrs) as countsplitalnumcdrs;
};
为什么需要下面这一行?
-- NOTE(review): in the scripts shown, the GENERATE clause does not reference uniqsplitalnumcdrs.
uniqsplitalnumcdrs = DISTINCT tmpsplitalnumcdrs.(aparty,srcgt,destgt,splitaparty,bparty,smscgt,status,prepost);