Sum column in stream using node
My goal is to calculate the total of one column grouped by the values of another column in a csv.
For example, I have an input csv that looks like this:
"500","my.jpg"
"500","my.jpg"
"200","another.jpg"
I would like the output to be:
[{ bytes: 1000, uri: "my.jpg" }, { bytes: 200, uri: "another.jpg" }]
Note: I need to do this as a stream, as there can be over 3 million records in a given csv and looping is just too slow.
I have managed to do this using awk, but I'm struggling to implement it in node.
Here is the bash script with the awk command:
awk -F, 'BEGIN { print "[" }
{
  gsub(/"/, "");          # Remove all quotation marks from the csv
  uri = $2;               # Put the current uri in the key
  a[uri]++;               # Increment the count of uris
  b[uri] = b[uri] + $1;   # Total up the bytes
}
END {
  for (i in a) {
    printf "%s{\"uri\":\"%s\",\"count\":\"%s\",\"bytes\":\"%s\"}",
      separator, i, a[i], b[i]
    separator = ", "
  }
  print "]"
}
' ./res.csv
Any pointers in the right direction would be much appreciated.
You could try creating a read stream for your csv file and piping it into the csv-streamify parser.
const csv = require('csv-streamify')
const fs = require('fs')

const parser = csv()
const sum = {};

// emits each line as a buffer or as a string representing an array of fields
parser.on('data', function (line) {
  let key = line[1];   // uri column
  let val = line[0];   // bytes column
  if (!sum[key]) {
    sum[key] = 0;
  }
  sum[key] = sum[key] + parseInt(val, 10);
  console.log("Current sum for " + key + ": " + sum[key])
})

parser.on('end', function () {
  let results = Object.keys(sum)
    .map(key => ({ bytes: sum[key], uri: key }))
  console.log(results);
})

// now pipe some data into it
fs.createReadStream('./test.csv').pipe(parser)
With your example data, this should print:
[ { bytes: 1000, uri: 'my.jpg' },
{ bytes: 200, uri: 'another.jpg' } ]
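If you would rather avoid an extra dependency, a similar line-by-line approach is possible with Node's built-in readline module. The following is only a minimal sketch: it assumes every row has exactly the simple form "bytes","uri" with no embedded commas or quotes in the fields, and ./test.csv is just a placeholder path.

const fs = require('fs')
const readline = require('readline')

const sum = {}

const rl = readline.createInterface({
  input: fs.createReadStream('./test.csv'),
  crlfDelay: Infinity   // treat \r\n as a single line break
})

rl.on('line', (line) => {
  // naive split; only safe for simple "value","value" rows
  const [bytes, uri] = line.split(',').map(field => field.replace(/"/g, ''))
  sum[uri] = (sum[uri] || 0) + parseInt(bytes, 10)
})

rl.on('close', () => {
  const results = Object.keys(sum).map(uri => ({ bytes: sum[uri], uri }))
  console.log(results)
})

Like the csv-streamify version, this reads the file as a stream, so memory use stays flat even with millions of rows.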
You could also try the Perl solution below.
$ cat url.txt
"500","my.jpg"
"500","my.jpg"
"200","another.jpg"
"600","more.jpg"
$ perl -lne ' if(/\"(\d+)\",\"(.+?)\"/g) { $kv{$2}+=$1 } ; END { print "["; for(keys %kv) { print "$s { bytes:$kv{$_} uri:\"$_\" } ";$s="," } print "]" } ' url.txt
[
{ bytes:600 uri:"more.jpg" }
, { bytes:200 uri:"another.jpg" }
, { bytes:1000 uri:"my.jpg" }
]
$