在我当前的实现中应用从下一页抓取链接的逻辑时遇到问题
Trouble applying the logic of grabbing links from next pages within my current implementation
我用 Node.js 编写了一个脚本,用来从网页中抓取各个帖子的链接。该脚本看起来工作正常。现在,我还希望抓取后续页面(下一页)中各个帖子的链接。
由于我不熟悉用 Node.js 编写代码,我完全不知道如何在当前的实现中加入从下一页抓取链接的逻辑。
const request = require('request');
const cheerio = require('cheerio');
// Listing page that getLinks requests and scrapes for question links.
const link = 'https://whosebug.com/questions/tagged/web-scraping';
// Shared accumulator: getLinks pushes each scraped URL here and resolves with it.
const items = [];
/**
 * Fetches the listing page at `link` and collects the absolute URL of every
 * question anchor found on it.
 *
 * Each href is resolved against `link`, so site-relative paths become
 * absolute URLs. Results are appended to the shared `items` array, which is
 * also the resolution value.
 *
 * @returns {Promise<string[]>} Resolves with `items`; rejects with the
 *   request error, or with any error thrown while parsing the page.
 */
const getLinks = () => {
  return new Promise((resolve, reject) => {
    request(link, (error, response, html) => {
      // Bail out before parsing: `html` is not usable when the request failed.
      if (error) {
        reject(error);
        return;
      }
      try {
        const $ = cheerio.load(html);
        $('.summary > h3 > a.question-hyperlink').each((index, anchor) => {
          const href = $(anchor).attr('href');
          // Anchors without an href have nothing to resolve; skip them.
          if (href) {
            items.push(new URL(href, link).href);
          }
        });
        resolve(items);
      } catch (e) {
        reject(e);
      }
    });
  });
};
// Print every collected link, one per line. A failed scrape is reported via
// console.error instead of being left as an unhandled rejection.
getLinks()
  .then((resultList) => {
    for (const url of resultList) {
      console.log(url);
    }
  })
  .catch(console.error);
是这样的吗?
const request = require('request');
const cheerio = require('cheerio');
// Prefix prepended to the hrefs scraped from each page.
const base_url = 'https://whosebug.com';
// URL of the page to fetch next; main() overwrites it with each result's nextPageURL.
let requestURL = 'https://whosebug.com/questions/tagged/web-scraping';
// Remaining page budget; main() decrements it on every loop check.
let pageLimit = 5;
// Walk the listing one page at a time, printing every link, until the page
// budget reaches zero or requestURL becomes falsy.
(async function main() {
  for (;;) {
    // Mirror post-decrement: test the old budget, but always consume one unit.
    const budgetBefore = pageLimit;
    pageLimit -= 1;
    if (!budgetBefore || !requestURL) {
      break;
    }
    console.log(`----- current: ${requestURL}, remains: ${pageLimit}`);
    const page = await getLinks(requestURL);
    page.links.forEach((url) => {
      console.log(url);
    });
    requestURL = page.nextPageURL;
  }
})().catch(console.error);
/**
 * Fetches one listing page and extracts its question links plus the URL of
 * the following page.
 *
 * @param {string} link - URL of the listing page to request.
 * @returns {Promise<{links: string[], nextPageURL: (string|null)}>} Resolves
 *   with the question URLs (each href prefixed with `base_url`) and the next
 *   page URL, or `null` for `nextPageURL` when the page has no `rel="next"`
 *   anchor. Rejects with the request error or any error thrown while parsing.
 */
function getLinks(link) {
  return new Promise((resolve, reject) => {
    request(link, (error, response, html) => {
      if (error) {
        reject(error);
        return;
      }
      try {
        const $ = cheerio.load(html);
        const links = [];
        $('.summary > h3 > a.question-hyperlink').each((index, anchor) => {
          links.push(`${base_url}${$(anchor).attr('href')}`);
        });
        // Without this guard a missing href would yield `base_url + 'undefined'`,
        // a truthy string that keeps callers paginating past the last page.
        const nextHref = $('a[rel="next"]').attr('href');
        const nextPageURL = nextHref ? `${base_url}${nextHref}` : null;
        resolve({ links, nextPageURL });
      } catch (e) {
        reject(e);
      }
    });
  });
}
我用 Node.js 编写了一个脚本,用来从网页中抓取各个帖子的链接。该脚本看起来工作正常。现在,我还希望抓取后续页面(下一页)中各个帖子的链接。
由于我不熟悉用 Node.js 编写代码,我完全不知道如何在当前的实现中加入从下一页抓取链接的逻辑。
const request = require('request');
const cheerio = require('cheerio');
// Listing page that getLinks requests and scrapes for question links.
const link = 'https://whosebug.com/questions/tagged/web-scraping';
// Shared accumulator: getLinks pushes each scraped URL here and resolves with it.
const items = [];
/**
 * Fetches the listing page at `link` and collects the absolute URL of every
 * question anchor found on it.
 *
 * Each href is resolved against `link`, so site-relative paths become
 * absolute URLs. Results are appended to the shared `items` array, which is
 * also the resolution value.
 *
 * @returns {Promise<string[]>} Resolves with `items`; rejects with the
 *   request error, or with any error thrown while parsing the page.
 */
const getLinks = () => {
  return new Promise((resolve, reject) => {
    request(link, (error, response, html) => {
      // Bail out before parsing: `html` is not usable when the request failed.
      if (error) {
        reject(error);
        return;
      }
      try {
        const $ = cheerio.load(html);
        $('.summary > h3 > a.question-hyperlink').each((index, anchor) => {
          const href = $(anchor).attr('href');
          // Anchors without an href have nothing to resolve; skip them.
          if (href) {
            items.push(new URL(href, link).href);
          }
        });
        resolve(items);
      } catch (e) {
        reject(e);
      }
    });
  });
};
// Print every collected link, one per line. A failed scrape is reported via
// console.error instead of being left as an unhandled rejection.
getLinks()
  .then((resultList) => {
    for (const url of resultList) {
      console.log(url);
    }
  })
  .catch(console.error);
是这样的吗?
const request = require('request');
const cheerio = require('cheerio');
// Prefix prepended to the hrefs scraped from each page.
const base_url = 'https://whosebug.com';
// URL of the page to fetch next; main() overwrites it with each result's nextPageURL.
let requestURL = 'https://whosebug.com/questions/tagged/web-scraping';
// Remaining page budget; main() decrements it on every loop check.
let pageLimit = 5;
// Walk the listing one page at a time, printing every link, until the page
// budget reaches zero or requestURL becomes falsy.
(async function main() {
  for (;;) {
    // Mirror post-decrement: test the old budget, but always consume one unit.
    const budgetBefore = pageLimit;
    pageLimit -= 1;
    if (!budgetBefore || !requestURL) {
      break;
    }
    console.log(`----- current: ${requestURL}, remains: ${pageLimit}`);
    const page = await getLinks(requestURL);
    page.links.forEach((url) => {
      console.log(url);
    });
    requestURL = page.nextPageURL;
  }
})().catch(console.error);
/**
 * Fetches one listing page and extracts its question links plus the URL of
 * the following page.
 *
 * @param {string} link - URL of the listing page to request.
 * @returns {Promise<{links: string[], nextPageURL: (string|null)}>} Resolves
 *   with the question URLs (each href prefixed with `base_url`) and the next
 *   page URL, or `null` for `nextPageURL` when the page has no `rel="next"`
 *   anchor. Rejects with the request error or any error thrown while parsing.
 */
function getLinks(link) {
  return new Promise((resolve, reject) => {
    request(link, (error, response, html) => {
      if (error) {
        reject(error);
        return;
      }
      try {
        const $ = cheerio.load(html);
        const links = [];
        $('.summary > h3 > a.question-hyperlink').each((index, anchor) => {
          links.push(`${base_url}${$(anchor).attr('href')}`);
        });
        // Without this guard a missing href would yield `base_url + 'undefined'`,
        // a truthy string that keeps callers paginating past the last page.
        const nextHref = $('a[rel="next"]').attr('href');
        const nextPageURL = nextHref ? `${base_url}${nextHref}` : null;
        resolve({ links, nextPageURL });
      } catch (e) {
        reject(e);
      }
    });
  });
}