根据结果日期进行动态分页
Apify dynamic pagination based on date from results
我为一个网站写了一个勉强能用的抓取脚本:
/**
 * Apify page function: clicks the "Show more" button at most five times and
 * then exports the title and date of every `.thing-card` found on the page.
 *
 * NOTE: the export runs right after the last click. Nothing in this function
 * waits for the items requested by that click to appear in the DOM.
 *
 * @param {object} context - Page function context; only `log`, `jQuery` and
 *   `waitFor` are used here.
 * @returns {Promise<Array<{title: (string|undefined), date: string}>>} One
 *   entry per card; `date` is the card's date text with all whitespace removed.
 */
async function pageFunction(context) {
    const log = context.log;
    const $ = context.jQuery;
    const waitFor = context.waitFor;

    const MAX_CLICKS = 5;
    const FOLLOW_UP_TIMEOUT_MS = 5000;
    const showMoreSelector = 'div.pagination-view-more';

    log.info('Pagination');

    let clicksDone = 0;
    while (clicksDone < MAX_CLICKS) {
        log.info('Waiting for the "Show more" button.');

        // The first wait passes an undefined timeout (presumably so waitFor uses
        // its own default); every later wait is capped at 5000 ms.
        const waitOptions = {
            timeoutMillis: clicksDone === 0 ? undefined : FOLLOW_UP_TIMEOUT_MS,
        };

        let buttonFound = true;
        try {
            await waitFor(showMoreSelector, waitOptions);
        } catch (err) {
            // Any rejection from waitFor (a timeout is the expected one) is
            // treated as "no more pages" rather than as a failure.
            buttonFound = false;
        }

        if (!buttonFound) {
            log.info('Could not find the "Show more button", we\'ve reached the end.');
            break;
        }

        log.info('Clicking the "Show more" button.');
        $(showMoreSelector).click();
        clicksDone += 1;
    }

    // Collect whatever cards are in the DOM at this moment.
    const exported = [];
    $('.thing-card').each((index, element) => {
        const $card = $(element);
        const title = $card.attr('title');
        // Source text looks like "Dec 15, 2019"; whitespace is stripped entirely.
        const date = $card.find('.item-header .item-date').text().replace(/\s/g, '');
        exported.push({ title, date });
    });
    return exported;
}
在上面的示例中,我单击 "Show more" 按钮 5 次,然后尝试导出标题和日期作为结果。问题是,我没有得到所有的结果,我认为脚本结束得比预期要早。
在最终的脚本中,我想去掉固定次数的 for 循环,改为一直循环,直到结果的日期达到距今天最多 7 天(即 1 周)为止。Apify 能做到这一点吗?
我想您差不多已经做到了。Apify 在这方面没有任何限制,因为您可以编写任何想要的代码 :) 所以这更像是一个通用的 JS 问题,而不是 Apify 特有的问题。
您可以检查最后一项的日期而不是固定循环(我假设这些项是从最近的一项开始排序的)。
您可以再做一些微调,但类似下面这样的代码应该就能实现。
/**
 * Apify page function: keeps clicking the "Show more" button until the last
 * loaded item is dated more than seven days ago, then exports every item that
 * is on the page.
 *
 * Items are assumed to be sorted newest-first, so only the last card's date is
 * inspected after each click. Pagination also stops when the button cannot be
 * found or when a click does not add any new cards.
 *
 * The exported list contains everything that was loaded, so the final batch
 * may include items older than the cut-off.
 *
 * @param {object} context - Page function context; only `log`, `jQuery` and
 *   `waitFor` are used here.
 * @returns {Promise<Array<{title: (string|undefined), date: string}>>} One
 *   entry per `.thing-card`; `date` is the card's date text with all
 *   whitespace removed (e.g. "Dec15,2019").
 */
async function pageFunction(context) {
    const { log, jQuery: $, waitFor } = context;

    const buttonSelector = 'div.pagination-view-more';
    const itemSelector = '.thing-card';
    const dateSelector = '.item-header .item-date';
    const loadDelayMillis = 5000;

    // Raw text of a card's date element, e.g. "Dec 15, 2019" plus layout whitespace.
    const readDateText = ($card) => $card.find(dateSelector).text();

    // Date of the last loaded card. Whitespace is collapsed instead of removed:
    // parsing of non-ISO strings is implementation-defined, and "Dec 15, 2019"
    // is far more widely accepted than "Dec15,2019".
    const readLastItemDate = () => {
        const text = readDateText($(itemSelector).last());
        return new Date(text.replace(/\s+/g, ' ').trim());
    };

    // True only when the date could be parsed AND lies before the limit. An
    // unparseable date (e.g. no cards loaded yet) never counts as "too old",
    // so pagination continues until one of the other exit conditions fires.
    const isOlderThan = (date, limit) => !Number.isNaN(date.getTime()) && date < limit;

    // Item dates carry no time of day, so the cut-off is local midnight seven
    // days ago. Comparing against "now minus 7 days" would treat items dated
    // exactly seven days ago as too old and stop before all of them are loaded.
    const weekAgo = new Date();
    weekAgo.setHours(0, 0, 0, 0);
    weekAgo.setDate(weekAgo.getDate() - 7);

    log.info('Pagination');

    // Track the card count so a click that loads nothing cannot loop forever.
    let itemCount = $(itemSelector).length;
    let lastItemsDate = readLastItemDate();

    // Deliberately open-ended loop: every exit path logs why it stopped.
    while (true) {
        if (isOlderThan(lastItemsDate, weekAgo)) {
            log.info(`Last item date is older than a week, exiting the loop: ${lastItemsDate}`);
            break;
        }

        log.info('Waiting for the "Show more" button.');
        try {
            await waitFor(buttonSelector);
        } catch (err) {
            // A rejected wait (a timeout is the expected case) means there is
            // no button left to click.
            log.info('Could not find the "Show more button", we\'ve reached the end.');
            break;
        }

        log.info('Clicking the "Show more" button.');
        $(buttonSelector).click();

        // Give the newly requested items time to render.
        await waitFor(loadDelayMillis);

        const itemCountAfterClick = $(itemSelector).length;
        if (itemCountAfterClick === itemCount) {
            log.info('No new items, exiting the loop...');
            break;
        }
        itemCount = itemCountAfterClick;
        lastItemsDate = readLastItemDate();
    }

    // Export every card currently on the page.
    const result = [];
    $(itemSelector).each((index, element) => {
        const $card = $(element);
        result.push({
            title: $card.attr('title'),
            // Source format is like "Dec 15, 2019"; exported without whitespace.
            date: readDateText($card).replace(/\s/g, ''),
        });
    });
    return result;
}
我为一个网站写了一个勉强能用的抓取脚本:
/**
 * Apify page function: clicks the "Show more" button at most five times and
 * then exports the title and date of every `.thing-card` found on the page.
 *
 * NOTE: the export runs right after the last click. Nothing in this function
 * waits for the items requested by that click to appear in the DOM.
 *
 * @param {object} context - Page function context; only `log`, `jQuery` and
 *   `waitFor` are used here.
 * @returns {Promise<Array<{title: (string|undefined), date: string}>>} One
 *   entry per card; `date` is the card's date text with all whitespace removed.
 */
async function pageFunction(context) {
    const log = context.log;
    const $ = context.jQuery;
    const waitFor = context.waitFor;

    const MAX_CLICKS = 5;
    const FOLLOW_UP_TIMEOUT_MS = 5000;
    const showMoreSelector = 'div.pagination-view-more';

    log.info('Pagination');

    let clicksDone = 0;
    while (clicksDone < MAX_CLICKS) {
        log.info('Waiting for the "Show more" button.');

        // The first wait passes an undefined timeout (presumably so waitFor uses
        // its own default); every later wait is capped at 5000 ms.
        const waitOptions = {
            timeoutMillis: clicksDone === 0 ? undefined : FOLLOW_UP_TIMEOUT_MS,
        };

        let buttonFound = true;
        try {
            await waitFor(showMoreSelector, waitOptions);
        } catch (err) {
            // Any rejection from waitFor (a timeout is the expected one) is
            // treated as "no more pages" rather than as a failure.
            buttonFound = false;
        }

        if (!buttonFound) {
            log.info('Could not find the "Show more button", we\'ve reached the end.');
            break;
        }

        log.info('Clicking the "Show more" button.');
        $(showMoreSelector).click();
        clicksDone += 1;
    }

    // Collect whatever cards are in the DOM at this moment.
    const exported = [];
    $('.thing-card').each((index, element) => {
        const $card = $(element);
        const title = $card.attr('title');
        // Source text looks like "Dec 15, 2019"; whitespace is stripped entirely.
        const date = $card.find('.item-header .item-date').text().replace(/\s/g, '');
        exported.push({ title, date });
    });
    return exported;
}
在上面的示例中,我单击 "Show more" 按钮 5 次,然后尝试导出标题和日期作为结果。问题是,我没有得到所有的结果,我认为脚本结束得比预期要早。
在最终的脚本中,我想去掉固定次数的 for 循环,改为一直循环,直到结果的日期达到距今天最多 7 天(即 1 周)为止。Apify 能做到这一点吗?
我想您差不多已经做到了。Apify 在这方面没有任何限制,因为您可以编写任何想要的代码 :) 所以这更像是一个通用的 JS 问题,而不是 Apify 特有的问题。
您可以检查最后一项的日期而不是固定循环(我假设这些项是从最近的一项开始排序的)。
您可以再做一些微调,但类似下面这样的代码应该就能实现。
/**
 * Apify page function: keeps clicking the "Show more" button until the last
 * loaded item is dated more than seven days ago, then exports every item that
 * is on the page.
 *
 * Items are assumed to be sorted newest-first, so only the last card's date is
 * inspected after each click. Pagination also stops when the button cannot be
 * found or when a click does not add any new cards.
 *
 * The exported list contains everything that was loaded, so the final batch
 * may include items older than the cut-off.
 *
 * @param {object} context - Page function context; only `log`, `jQuery` and
 *   `waitFor` are used here.
 * @returns {Promise<Array<{title: (string|undefined), date: string}>>} One
 *   entry per `.thing-card`; `date` is the card's date text with all
 *   whitespace removed (e.g. "Dec15,2019").
 */
async function pageFunction(context) {
    const { log, jQuery: $, waitFor } = context;

    const buttonSelector = 'div.pagination-view-more';
    const itemSelector = '.thing-card';
    const dateSelector = '.item-header .item-date';
    const loadDelayMillis = 5000;

    // Raw text of a card's date element, e.g. "Dec 15, 2019" plus layout whitespace.
    const readDateText = ($card) => $card.find(dateSelector).text();

    // Date of the last loaded card. Whitespace is collapsed instead of removed:
    // parsing of non-ISO strings is implementation-defined, and "Dec 15, 2019"
    // is far more widely accepted than "Dec15,2019".
    const readLastItemDate = () => {
        const text = readDateText($(itemSelector).last());
        return new Date(text.replace(/\s+/g, ' ').trim());
    };

    // True only when the date could be parsed AND lies before the limit. An
    // unparseable date (e.g. no cards loaded yet) never counts as "too old",
    // so pagination continues until one of the other exit conditions fires.
    const isOlderThan = (date, limit) => !Number.isNaN(date.getTime()) && date < limit;

    // Item dates carry no time of day, so the cut-off is local midnight seven
    // days ago. Comparing against "now minus 7 days" would treat items dated
    // exactly seven days ago as too old and stop before all of them are loaded.
    const weekAgo = new Date();
    weekAgo.setHours(0, 0, 0, 0);
    weekAgo.setDate(weekAgo.getDate() - 7);

    log.info('Pagination');

    // Track the card count so a click that loads nothing cannot loop forever.
    let itemCount = $(itemSelector).length;
    let lastItemsDate = readLastItemDate();

    // Deliberately open-ended loop: every exit path logs why it stopped.
    while (true) {
        if (isOlderThan(lastItemsDate, weekAgo)) {
            log.info(`Last item date is older than a week, exiting the loop: ${lastItemsDate}`);
            break;
        }

        log.info('Waiting for the "Show more" button.');
        try {
            await waitFor(buttonSelector);
        } catch (err) {
            // A rejected wait (a timeout is the expected case) means there is
            // no button left to click.
            log.info('Could not find the "Show more button", we\'ve reached the end.');
            break;
        }

        log.info('Clicking the "Show more" button.');
        $(buttonSelector).click();

        // Give the newly requested items time to render.
        await waitFor(loadDelayMillis);

        const itemCountAfterClick = $(itemSelector).length;
        if (itemCountAfterClick === itemCount) {
            log.info('No new items, exiting the loop...');
            break;
        }
        itemCount = itemCountAfterClick;
        lastItemsDate = readLastItemDate();
    }

    // Export every card currently on the page.
    const result = [];
    $(itemSelector).each((index, element) => {
        const $card = $(element);
        result.push({
            title: $card.attr('title'),
            // Source format is like "Dec 15, 2019"; exported without whitespace.
            date: readDateText($card).replace(/\s/g, ''),
        });
    });
    return result;
}