Reading large files from cloud storage and writing to datastore
I have this cloud function that is triggered by a bucket in Cloud Storage. It reads the file, converts each line into RDF triples using N3, and writes the resulting triples to Datastore.
Since it downloads the entire file into memory, it doesn't work for large files. How should I change this function so that it processes one line at a time?
const storage = require('@google-cloud/storage')();
const Datastore = require('@google-cloud/datastore');
const N3 = require('n3');

exports.helloGCS = (event, callback) => {
    const file = event.data;
    if (file.resourceState === 'not_exists') {
        console.log(`File ${file.name} deleted.`);
        callback(null, 'ok');
    } else if (file.metageneration === '1') {
        // The metageneration attribute is updated on metadata changes;
        // on create its value is 1.
        console.log(`File ${file.name} uploaded.`);
        let parser = N3.Parser();
        const bucket = storage.bucket('woburn-advisory-ttl');
        const remoteFile = bucket.file(file.name);
        const datastore = new Datastore({});
        remoteFile.download()
            .then(data => { // data is a Buffer holding the whole file
                if (data) {
                    let lines = data.toString().split('\n');
                    console.log(lines.length);
                    let entities = lines.map(line => {
                        let triple = parser.parse(line)[0];
                        if (triple) {
                            const tripleKey = datastore.key('triple');
                            return {
                                key: tripleKey,
                                data: [
                                    { name: 'subject', value: triple.subject },
                                    { name: 'predicate', value: triple.predicate },
                                    { name: 'object', value: triple.object }
                                ]
                            };
                        }
                        return false;
                    });
                    entities = entities.filter(entity => entity);
                    console.log(entities.length);
                    datastore.save(entities)
                        .then(response => {
                            console.log(`${entities.length} triples created: ${response}`);
                        });
                }
                callback(null, 'ok');
            });
    } else {
        console.log(`File ${file.name} metadata updated.`);
        callback(null, 'ok');
    }
};
Instead of calling download(), use createReadStream(). This allows you to loop over the whole file without holding it all in memory. You can use something like byline or readline to get individual lines from that stream.

Overall it looks something like this:
const byline = require('byline');

const gcsStream = remoteFile.createReadStream();
const lineStream = byline.createStream(gcsStream);
lineStream.on('data', function(line) {
    // byline emits Buffer chunks unless an encoding was set on the stream
    let triple = parser.parse(line.toString())[0];
    //...
});
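For completeness, here is a fuller sketch of what the whole function could look like once it streams. It is only a sketch under a few assumptions: the file holds one complete triple per line (N-Triples style, so each line parses on its own), the byline package is installed, and entities are written in batches because Datastore caps a single save() call at 500 entities. A fresh N3.Parser is created per line rather than reusing one instance across calls.

const storage = require('@google-cloud/storage')();
const Datastore = require('@google-cloud/datastore');
const byline = require('byline');
const N3 = require('n3');

exports.helloGCS = (event, callback) => {
    const file = event.data;
    if (file.resourceState === 'not_exists' || file.metageneration !== '1') {
        // Deletions and metadata updates need no processing.
        callback(null, 'ok');
        return;
    }

    const datastore = new Datastore({});
    const remoteFile = storage.bucket('woburn-advisory-ttl').file(file.name);

    let batch = [];         // entities waiting to be written
    const saves = [];       // pending datastore.save() promises
    const BATCH_SIZE = 500; // Datastore's limit per save() call

    const gcsStream = remoteFile.createReadStream();
    const lineStream = byline.createStream(gcsStream);

    lineStream.on('data', (line) => {
        // Assumes one triple per line; a fresh parser per line avoids
        // carrying parser state between calls.
        const triple = N3.Parser().parse(line.toString())[0];
        if (!triple) return;
        batch.push({
            key: datastore.key('triple'),
            data: [
                { name: 'subject', value: triple.subject },
                { name: 'predicate', value: triple.predicate },
                { name: 'object', value: triple.object }
            ]
        });
        if (batch.length === BATCH_SIZE) {
            saves.push(datastore.save(batch));
            batch = [];
        }
    });

    lineStream.on('end', () => {
        if (batch.length) {
            saves.push(datastore.save(batch));
        }
        // Only signal completion after every write settles; otherwise the
        // function instance may be torn down while saves are in flight.
        Promise.all(saves)
            .then(() => callback(null, 'ok'))
            .catch(callback);
    });

    gcsStream.on('error', callback);
    lineStream.on('error', callback);
};

Memory now stays bounded by the batch size rather than the file size. Note that the line-at-a-time approach only works if the serialization really is line-oriented; a full Turtle document can spread one statement across several lines, in which case N3's StreamParser, which consumes a stream directly, would be a better fit.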