将网络数据抓取到多个 HTML 表中?
Scraping web data into multiple HTML tables?
*更多详情见评论
我正在使用 cheerio.js 抓取数据,并将其显示在 3 个 HTML table 中。它运行正常,只是第二个和第三个 table 在显示抓取到的数据之前多出了一些空白行(第一个 table 有 25 行数据;第二个 table 先有 25 个空白行,然后是 25 行数据;第三个 table 先有 50 个空白行,然后是 25 行数据)。
我认为这是因为所有数据是通过一个请求抓取的,而不是每个 table 各抓取一次。
这是我第一次做抓取,所以我想知道是否有办法修改我的代码来解决这个问题,还是必须为每个 table 单独发起一次抓取请求(但这似乎效率低下)。
如果您想查看我的 mongoose 架构,请告诉我。
HTML table 显示数据(使用ejs):
<table>
<tr>
<th>Environment</th>
<th>Store Name</th>
<th>Code Version</th>
</tr>
<% stores.forEach(function(store){ %>
<tr>
<td> <%= store.environment.QA1 %> </td>
<td> <%= store.store.QA1 %> </td>
<td> <%= store.version.QA1 %> </td>
</tr>
<% }); %>
</table>
cheerio.js 抓取信息:
// GET /scrape: downloads a page with `request`, parses it with cheerio and
// stores one Store document per matching table row.
// NOTE(review): this snippet ends before the `request` and `router.get`
// callbacks are closed, and no response is sent on `res` in the visible part.
router.get("/scrape", function(req, res){
// NOTE(review): `error` is never checked before `html` is parsed.
request("http://link-goes-here", function(error, response, html){
var $ = cheerio.load(html);
// Per-environment accumulators; they are only pushed to in the visible code.
var QAStoreInfo = [];
var QA1StoreInfo = [];
// Tag the nested tables with class "row" so the selectors below can target them.
$("body > table > tbody > tr > td:nth-child(1) > table").addClass("row");
/* QA */
// NOTE(review): :contains("QA") is a substring match, so it also selects rows
// whose text contains "QA1" or "QA2" — confirm this is intended.
$('.row > tbody > tr:contains("QA")').each(function(i, element){
// Each value is the cheerio selection returned by the chained .next() calls;
// .text() is not called in this snippet.
var QAResult = {
"environment.QA" : $(this).children().next(),
"store.QA" : $(this).children().next().next(),
"version.QA" : $(this).children().next().next().next()
};
QAStoreInfo.push(QAResult);
// Save the row only when the Store collection reports zero documents.
// NOTE(review): the count runs asynchronously once per row and its `err`
// argument is ignored — presumably meant to seed an empty collection; confirm.
Store.count({}, function(err, test){
if (test === 0) {
var QAEntry = new Store(QAResult);
QAEntry.save(function(err, doc){
if(err){
console.log(err);
}
});
}
});
});
/* QA1: same procedure, writing to the "*.QA1" keys */
$('.row > tbody > tr:contains("QA1")').each(function(i, element){
var QA1Result = {
"environment.QA1" : $(this).children().next(),
"store.QA1" : $(this).children().next().next(),
"version.QA1" : $(this).children().next().next().next()
};
QA1StoreInfo.push(QA1Result);
// Same "only when the collection is empty" check as in the QA loop above.
Store.count({}, function(err, test){
if (test === 0) {
var QA1Entry = new Store(QA1Result);
QA1Entry.save(function(err, doc){
if(err){
console.log(err);
}
});
}
});
});
Mongoose 模式(schema):
// Mongoose schema for a Store document. It has three sub-objects
// (environment, store, version), each with one String field per
// environment key: QA, QA1 and QA2.
var storeSchema = new mongoose.Schema({
// Environment label, one slot per environment key.
environment: {
QA: String,
QA1: String,
QA2: String
},
// Store name, one slot per environment key.
store: {
QA: String,
QA1: String,
QA2: String
},
// Code version string, one slot per environment key.
version: {
QA: String,
QA1: String,
QA2: String
}
});
调用 /scrape 之后 MongoDB 集合(collection)中的内容:
{ "_id" : ObjectId("5f0610fee9b4c35d74a83600"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Omega QA " }, "version" : { "QA1" : " _2019.11_2019_11.2 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83601"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Crescent Centre " }, "version" : { "QA1" : " _2020.06_2020_06.6 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83602"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Cumberland " }, "version" : { "QA1" : " _2019.11_2019_11.2 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83603"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " East Denver " }, "version" : { "QA1" : " _2019.11_2019_11.2 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83604"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Brookwood " }, "version" : { "QA1" : " _2019.11_2019_11.2 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83605"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " WheatRidge " }, "version" : { "QA1" : " _2019.11_2019_11.2 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83606"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Aurora Centerpoint " }, "version" : { "QA1" : " _2020.06_2020_06.6 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83607"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Castle Rock " }, "version" : { "QA1" : " _2020.06_2020_06.6 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83608"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Conyers " }, "version" : { "QA1" : " _2020.06_2020_06.6 " }, "__v" : 0 }
截图:(第 2 个 table 的显示方式)
Store
集合包含具有以下架构的抓取文档:
{
"environment" : { "QA": <string> } ,
"store" : { "QA": <string> },
"version" : { "QA": <string> }
}
和
{
"environment" : { "QA1": <string> } ,
"store" : { "QA1": <string> },
"version" : { "QA1": <string> }
}
The empty rows in the HTML table are for records with the first schema where QA1
is undefined
in environment
, store
and version
fields of the document.
我建议将 Store 文档的架构统一为如下形式:
{
"kind": "QA|QA1",
"environment" : <string> ,
"store" : <string> ,
"version" : <string>
}
然后更新你的抓取脚本,按如下方式存储文档:
//...
var QAResult = {
"kind": "QA",
"environment" : $(this).children().next(),
"store" : $(this).children().next().next(),
"version" : $(this).children().next().next().next()
};
QAStoreInfo.push(QAResult);
//...
var QA1Result = {
"kind": "QA1",
"environment" : $(this).children().next(),
"store" : $(this).children().next().next(),
"version" : $(this).children().next().next().next()
};
QA1StoreInfo.push(QA1Result);
ejs
模板中的 forEach
循环可以更新如下:
<% stores.forEach(function(store){ %>
<tr>
<td> <%= store.environment %> </td>
<td> <%= store.store %> </td>
<td> <%= store.version %> </td>
</tr>
<% }); %>
*更多详情见评论
我正在使用 cheerio.js 抓取数据,并将其显示在 3 个 HTML table 中。它运行正常,只是第二个和第三个 table 在显示抓取到的数据之前多出了一些空白行(第一个 table 有 25 行数据;第二个 table 先有 25 个空白行,然后是 25 行数据;第三个 table 先有 50 个空白行,然后是 25 行数据)。
我认为这是因为所有数据是通过一个请求抓取的,而不是每个 table 各抓取一次。
这是我第一次做抓取,所以我想知道是否有办法修改我的代码来解决这个问题,还是必须为每个 table 单独发起一次抓取请求(但这似乎效率低下)。
如果您想查看我的 mongoose 架构,请告诉我。
HTML table 显示数据(使用ejs):
<table>
<tr>
<th>Environment</th>
<th>Store Name</th>
<th>Code Version</th>
</tr>
<% stores.forEach(function(store){ %>
<tr>
<td> <%= store.environment.QA1 %> </td>
<td> <%= store.store.QA1 %> </td>
<td> <%= store.version.QA1 %> </td>
</tr>
<% }); %>
</table>
cheerio.js 抓取信息:
// GET /scrape: downloads a page with `request`, parses it with cheerio and
// stores one Store document per matching table row.
// NOTE(review): this snippet ends before the `request` and `router.get`
// callbacks are closed, and no response is sent on `res` in the visible part.
router.get("/scrape", function(req, res){
// NOTE(review): `error` is never checked before `html` is parsed.
request("http://link-goes-here", function(error, response, html){
var $ = cheerio.load(html);
// Per-environment accumulators; they are only pushed to in the visible code.
var QAStoreInfo = [];
var QA1StoreInfo = [];
// Tag the nested tables with class "row" so the selectors below can target them.
$("body > table > tbody > tr > td:nth-child(1) > table").addClass("row");
/* QA */
// NOTE(review): :contains("QA") is a substring match, so it also selects rows
// whose text contains "QA1" or "QA2" — confirm this is intended.
$('.row > tbody > tr:contains("QA")').each(function(i, element){
// Each value is the cheerio selection returned by the chained .next() calls;
// .text() is not called in this snippet.
var QAResult = {
"environment.QA" : $(this).children().next(),
"store.QA" : $(this).children().next().next(),
"version.QA" : $(this).children().next().next().next()
};
QAStoreInfo.push(QAResult);
// Save the row only when the Store collection reports zero documents.
// NOTE(review): the count runs asynchronously once per row and its `err`
// argument is ignored — presumably meant to seed an empty collection; confirm.
Store.count({}, function(err, test){
if (test === 0) {
var QAEntry = new Store(QAResult);
QAEntry.save(function(err, doc){
if(err){
console.log(err);
}
});
}
});
});
/* QA1: same procedure, writing to the "*.QA1" keys */
$('.row > tbody > tr:contains("QA1")').each(function(i, element){
var QA1Result = {
"environment.QA1" : $(this).children().next(),
"store.QA1" : $(this).children().next().next(),
"version.QA1" : $(this).children().next().next().next()
};
QA1StoreInfo.push(QA1Result);
// Same "only when the collection is empty" check as in the QA loop above.
Store.count({}, function(err, test){
if (test === 0) {
var QA1Entry = new Store(QA1Result);
QA1Entry.save(function(err, doc){
if(err){
console.log(err);
}
});
}
});
});
Mongoose 模式(schema):
// Mongoose schema for a Store document. It has three sub-objects
// (environment, store, version), each with one String field per
// environment key: QA, QA1 and QA2.
var storeSchema = new mongoose.Schema({
// Environment label, one slot per environment key.
environment: {
QA: String,
QA1: String,
QA2: String
},
// Store name, one slot per environment key.
store: {
QA: String,
QA1: String,
QA2: String
},
// Code version string, one slot per environment key.
version: {
QA: String,
QA1: String,
QA2: String
}
});
调用 /scrape 之后 MongoDB 集合(collection)中的内容:
{ "_id" : ObjectId("5f0610fee9b4c35d74a83600"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Omega QA " }, "version" : { "QA1" : " _2019.11_2019_11.2 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83601"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Crescent Centre " }, "version" : { "QA1" : " _2020.06_2020_06.6 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83602"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Cumberland " }, "version" : { "QA1" : " _2019.11_2019_11.2 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83603"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " East Denver " }, "version" : { "QA1" : " _2019.11_2019_11.2 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83604"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Brookwood " }, "version" : { "QA1" : " _2019.11_2019_11.2 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83605"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " WheatRidge " }, "version" : { "QA1" : " _2019.11_2019_11.2 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83606"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Aurora Centerpoint " }, "version" : { "QA1" : " _2020.06_2020_06.6 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83607"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Castle Rock " }, "version" : { "QA1" : " _2020.06_2020_06.6 " }, "__v" : 0 }
{ "_id" : ObjectId("5f0610fee9b4c35d74a83608"), "environment" : { "QA1" : " QA1 " }, "store" : { "QA1" : " Conyers " }, "version" : { "QA1" : " _2020.06_2020_06.6 " }, "__v" : 0 }
截图:(第 2 个 table 的显示方式)
Store
集合包含具有以下架构的抓取文档:
{
"environment" : { "QA": <string> } ,
"store" : { "QA": <string> },
"version" : { "QA": <string> }
}
和
{
"environment" : { "QA1": <string> } ,
"store" : { "QA1": <string> },
"version" : { "QA1": <string> }
}
The empty rows in the HTML table are for records with the first schema, where QA1 is undefined in the environment, store and version fields of the document.
我建议将 Store 文档的架构统一为如下形式:
{
"kind": "QA|QA1",
"environment" : <string> ,
"store" : <string> ,
"version" : <string>
}
然后更新你的抓取脚本,按如下方式存储文档:
//...
var QAResult = {
"kind": "QA",
"environment" : $(this).children().next(),
"store" : $(this).children().next().next(),
"version" : $(this).children().next().next().next()
};
QAStoreInfo.push(QAResult);
//...
var QA1Result = {
"kind": "QA1",
"environment" : $(this).children().next(),
"store" : $(this).children().next().next(),
"version" : $(this).children().next().next().next()
};
QA1StoreInfo.push(QA1Result);
ejs
模板中的 forEach
循环可以更新如下:
<% stores.forEach(function(store){ %>
<tr>
<td> <%= store.environment %> </td>
<td> <%= store.store %> </td>
<td> <%= store.version %> </td>
</tr>
<% }); %>