使用 Puppeteer 从 React 网站抓取数据
Scraping data with Puppeteer from React website
我正在尝试从 https://invictusdao.fi/#/dashboard 中提取数据
但是,我被困在了这里。
HTML 中没有有用的 class-names。样本在这里:
<div class="route-container jss2 false">
<div id="dashboard-view">
<div class="MuiPaper-root ohm-card main-data-area MuiPaper-elevation0 MuiPaper-rounded" style="
transform: none;
transition: transform 225ms cubic-bezier(0.4, 0, 0.2, 1) 0ms;
">
<div class="MuiGrid-root data-grid MuiGrid-container MuiGrid-spacing-xs-2">
<div class="MuiGrid-root MuiGrid-item MuiGrid-justify-content-xs-flex-end MuiGrid-grid-xs-6">
<a class="MuiTypography-root MuiLink-root MuiLink-underlineNone stat-tile MuiTypography-colorPrimary"
target="_blank" style="cursor: default">
<div class="stat-tile-content">
<h5 class="MuiTypography-root light-tooltip MuiTypography-h5" tooltip="">
$IN Price
</h5>
<h4 class="MuiTypography-root MuiTypography-h4">9.50</h4>
</div>
</a>
</div>
我尝试使用 page.evaluate
来获取页面上元素的标题和值。
这是我的代码:
// Opens the Invictus DAO dashboard in Puppeteer, reads the title/value pair of
// every ".stat-tile-content" tile, and logs each value under a fixed label.
const puppeteer = require("puppeteer");
(async () => {
try {
// headless: false launches a visible browser window.
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto("https://invictusdao.fi/#/dashboard");
// The only wait in this script: it targets the ".data-grid" container.
// Nothing below waits for the text inside the individual tiles.
await page.waitForSelector(".data-grid");
// Runs inside the page: collect { title, value } for each stat tile.
let cards = await page.evaluate(() => {
let cardsElement = document.body.querySelectorAll(".stat-tile-content");
// NOTE(review): `cards` is assigned here without a declaration inside this
// callback; presumably this creates an implicit global in the page context,
// since the callback is evaluated in the browser -- confirm.
cards = Object.values(cardsElement).map((x) => {
return {
// `?? null` only covers a nullish textContent; if querySelector() finds no
// match it returns null and the `.textContent` access throws instead.
title: x.querySelector(".MuiTypography-root.light-tooltip.MuiTypography-h5").textContent ?? null,
value: x.querySelector(".MuiTypography-root.MuiTypography-h4").textContent ?? null,
};
});
return cards;
});
// Values are picked by position, so this relies on the page rendering at
// least 11 tiles in exactly this order -- TODO confirm against the live page.
const inPrice = cards[0].value;
const apy = cards[1].value;
const mCap = cards[2].value;
const supply = cards[3].value;
const tvl = cards[4].value;
const treasury = cards[5].value;
const inStaked = cards[6].value;
const rfv = cards[7].value;
const backedPrice = cards[8].value;
const runway = cards[9].value;
const currentIndex = cards[10].value;
// Print each value with a hard-coded label.
console.log("$IN price", "$" + inPrice);
console.log("APY", apy);
console.log("Market Cap", mCap);
console.log("Supply", supply);
console.log("TVL", tvl);
console.log("Treasury", treasury);
console.log("IN Staked", inStaked);
console.log("Risk Free Value", rfv);
console.log("Backed Price", backedPrice);
console.log("Runway", runway);
console.log("Current Index", currentIndex);
await browser.close();
process.exit(0);
} catch (err) {
// On failure the error is logged and the process exits with status 1;
// browser.close() is not called on this path.
console.error(err);
process.exit(1);
}
})();
这段代码能获取到标题,但获取不到值(我得到的是空字符串)。
我做错了什么?
乍一看,您的选择器似乎不错。问题似乎是元素已呈现但没有数据,因此您正在抓取空文本内容而不等待它们被异步填充。
我尝试使用 waitForFunction
进行轮询,直到您想要的文本内容不为空;一旦不为空,就继续抓取:
const puppeteer = require("puppeteer");

/**
 * Loads the dashboard, waits until the first stat tile has non-empty text,
 * then logs an array of { title, value } objects, one per tile.
 * Errors are logged with console.error; the browser (if it was launched)
 * is closed in every case.
 */
const scrapeDashboard = async () => {
  let browser;

  try {
    browser = await puppeteer.launch({headless: true});
    const openPages = await browser.pages();
    const page = openPages[0];
    await page.goto("https://invictusdao.fi/#/dashboard");

    // Poll in the page until the first tile's <h4> holds visible text.
    await page.waitForFunction(() => {
      const firstValue = document.querySelector(".stat-tile-content h4");
      return firstValue?.textContent.trim();
    });

    const data = await page.$$eval(".stat-tile-content", els =>
      els.map(el => {
        const title = el.querySelector("h5").textContent.trim();
        const value = el.querySelector("h4").textContent.trim();
        return {title, value};
      })
    );
    console.log(data);
  } catch (err) {
    console.error(err);
  } finally {
    // `browser` is still undefined if launch() itself failed.
    await browser?.close();
  }
};

scrapeDashboard();
输出:
[
{ title: '$IN Price', value: '6.73' },
{ title: 'APY', value: '30,718.206%' },
{ title: 'Market Cap', value: '7,479,790' },
{ title: 'Supply', value: '460,658' },
{ title: 'TVD', value: ',727,599' },
{ title: 'Treasury', value: ',801,741' },
{ title: 'IN Staked', value: '77.45%' },
{ title: 'Risk Free Value', value: ',801,741' },
{ title: 'Backed Price', value: '.04' },
{ title: 'Runway', value: '269 Days' },
{ title: 'Current Index', value: '2.2127' }
]
如果您希望得到一个以标题为键的对象,可以使用 reduce
而不是 map
:
// ...
// Folds the tile elements into one object keyed by each tile's <h5> title,
// with the tile's trimmed <h4> text as the value.
els => els.reduce((byTitle, tile) => {
  const title = tile.querySelector("h5").textContent.trim();
  const value = tile.querySelector("h4").textContent.trim();
  byTitle[title] = value;
  return byTitle;
}, {})
输出:
{
'$IN Price': '6.73',
APY: '30,821.15%',
'Market Cap': '7,482,974',
Supply: '460,670',
TVD: ',670,031',
Treasury: ',801,741',
'IN Staked': '77.40%',
'Risk Free Value': ',801,741',
'Backed Price': '.03',
Runway: '269 Days',
'Current Index': '2.2128'
}
请注意,这里有一个奇怪的现象:网站会在数据加载后立刻更改“无风险价值”(Risk Free Value)卡片的值。最初,它显示的值与“金库”(Treasury)卡片相同。
解决这个问题的一种方法是等待一秒钟,但通常最好使用 waitForFunction
来避免竞争条件并保持快速。
一种可行的谓词是检查所有元素的文本内容是否互不相同;不过,如果数据本来就可能出现重复值,这个条件将一直不成立,最终导致超时(您可以设置较短的超时时间并捕获超时错误,如果这更适合您的用例,则照常抓取当时页面上的内容):
// ...
// Resolves once at least one tile value exists and no two tiles show the
// same trimmed text.
await page.waitForFunction(() => {
  const values = Array.from(
    document.querySelectorAll(".stat-tile-content h4"),
    node => node.textContent.trim()
  );
  return values.length && new Set(values).size === values.length;
});
// ...
此代码将替换原来的 waitForFunction
。
我正在尝试从 https://invictusdao.fi/#/dashboard 中提取数据 但是,我被困在了这里。
HTML 中没有有用的 class-names。样本在这里:
<div class="route-container jss2 false">
<div id="dashboard-view">
<div class="MuiPaper-root ohm-card main-data-area MuiPaper-elevation0 MuiPaper-rounded" style="
transform: none;
transition: transform 225ms cubic-bezier(0.4, 0, 0.2, 1) 0ms;
">
<div class="MuiGrid-root data-grid MuiGrid-container MuiGrid-spacing-xs-2">
<div class="MuiGrid-root MuiGrid-item MuiGrid-justify-content-xs-flex-end MuiGrid-grid-xs-6">
<a class="MuiTypography-root MuiLink-root MuiLink-underlineNone stat-tile MuiTypography-colorPrimary"
target="_blank" style="cursor: default">
<div class="stat-tile-content">
<h5 class="MuiTypography-root light-tooltip MuiTypography-h5" tooltip="">
$IN Price
</h5>
<h4 class="MuiTypography-root MuiTypography-h4">9.50</h4>
</div>
</a>
</div>
我尝试使用 page.evaluate
来获取页面上元素的标题和值。
这是我的代码:
// Opens the Invictus DAO dashboard in Puppeteer, reads the title/value pair of
// every ".stat-tile-content" tile, and logs each value under a fixed label.
const puppeteer = require("puppeteer");
(async () => {
try {
// headless: false launches a visible browser window.
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto("https://invictusdao.fi/#/dashboard");
// The only wait in this script: it targets the ".data-grid" container.
// Nothing below waits for the text inside the individual tiles.
await page.waitForSelector(".data-grid");
// Runs inside the page: collect { title, value } for each stat tile.
let cards = await page.evaluate(() => {
let cardsElement = document.body.querySelectorAll(".stat-tile-content");
// NOTE(review): `cards` is assigned here without a declaration inside this
// callback; presumably this creates an implicit global in the page context,
// since the callback is evaluated in the browser -- confirm.
cards = Object.values(cardsElement).map((x) => {
return {
// `?? null` only covers a nullish textContent; if querySelector() finds no
// match it returns null and the `.textContent` access throws instead.
title: x.querySelector(".MuiTypography-root.light-tooltip.MuiTypography-h5").textContent ?? null,
value: x.querySelector(".MuiTypography-root.MuiTypography-h4").textContent ?? null,
};
});
return cards;
});
// Values are picked by position, so this relies on the page rendering at
// least 11 tiles in exactly this order -- TODO confirm against the live page.
const inPrice = cards[0].value;
const apy = cards[1].value;
const mCap = cards[2].value;
const supply = cards[3].value;
const tvl = cards[4].value;
const treasury = cards[5].value;
const inStaked = cards[6].value;
const rfv = cards[7].value;
const backedPrice = cards[8].value;
const runway = cards[9].value;
const currentIndex = cards[10].value;
// Print each value with a hard-coded label.
console.log("$IN price", "$" + inPrice);
console.log("APY", apy);
console.log("Market Cap", mCap);
console.log("Supply", supply);
console.log("TVL", tvl);
console.log("Treasury", treasury);
console.log("IN Staked", inStaked);
console.log("Risk Free Value", rfv);
console.log("Backed Price", backedPrice);
console.log("Runway", runway);
console.log("Current Index", currentIndex);
await browser.close();
process.exit(0);
} catch (err) {
// On failure the error is logged and the process exits with status 1;
// browser.close() is not called on this path.
console.error(err);
process.exit(1);
}
})();
这段代码能获取到标题,但获取不到值(我得到的是空字符串)。
我做错了什么?
乍一看,您的选择器似乎不错。问题似乎是元素已呈现但没有数据,因此您正在抓取空文本内容而不等待它们被异步填充。
我尝试使用 waitForFunction
进行轮询,直到您想要的文本内容不为空;一旦不为空,就继续抓取:
const puppeteer = require("puppeteer");

/**
 * Loads the dashboard, waits until the first stat tile has non-empty text,
 * then logs an array of { title, value } objects, one per tile.
 * Errors are logged with console.error; the browser (if it was launched)
 * is closed in every case.
 */
const scrapeDashboard = async () => {
  let browser;

  try {
    browser = await puppeteer.launch({headless: true});
    const openPages = await browser.pages();
    const page = openPages[0];
    await page.goto("https://invictusdao.fi/#/dashboard");

    // Poll in the page until the first tile's <h4> holds visible text.
    await page.waitForFunction(() => {
      const firstValue = document.querySelector(".stat-tile-content h4");
      return firstValue?.textContent.trim();
    });

    const data = await page.$$eval(".stat-tile-content", els =>
      els.map(el => {
        const title = el.querySelector("h5").textContent.trim();
        const value = el.querySelector("h4").textContent.trim();
        return {title, value};
      })
    );
    console.log(data);
  } catch (err) {
    console.error(err);
  } finally {
    // `browser` is still undefined if launch() itself failed.
    await browser?.close();
  }
};

scrapeDashboard();
输出:
[
{ title: '$IN Price', value: '6.73' },
{ title: 'APY', value: '30,718.206%' },
{ title: 'Market Cap', value: '7,479,790' },
{ title: 'Supply', value: '460,658' },
{ title: 'TVD', value: ',727,599' },
{ title: 'Treasury', value: ',801,741' },
{ title: 'IN Staked', value: '77.45%' },
{ title: 'Risk Free Value', value: ',801,741' },
{ title: 'Backed Price', value: '.04' },
{ title: 'Runway', value: '269 Days' },
{ title: 'Current Index', value: '2.2127' }
]
如果您希望得到一个以标题为键的对象,可以使用 reduce
而不是 map
:
// ...
// Folds the tile elements into one object keyed by each tile's <h5> title,
// with the tile's trimmed <h4> text as the value.
els => els.reduce((byTitle, tile) => {
  const title = tile.querySelector("h5").textContent.trim();
  const value = tile.querySelector("h4").textContent.trim();
  byTitle[title] = value;
  return byTitle;
}, {})
输出:
{
'$IN Price': '6.73',
APY: '30,821.15%',
'Market Cap': '7,482,974',
Supply: '460,670',
TVD: ',670,031',
Treasury: ',801,741',
'IN Staked': '77.40%',
'Risk Free Value': ',801,741',
'Backed Price': '.03',
Runway: '269 Days',
'Current Index': '2.2128'
}
请注意,这里有一个奇怪的现象:网站会在数据加载后立刻更改“无风险价值”(Risk Free Value)卡片的值。最初,它显示的值与“金库”(Treasury)卡片相同。
解决这个问题的一种方法是等待一秒钟,但通常最好使用 waitForFunction
来避免竞争条件并保持快速。
一种可行的谓词是检查所有元素的文本内容是否互不相同;不过,如果数据本来就可能出现重复值,这个条件将一直不成立,最终导致超时(您可以设置较短的超时时间并捕获超时错误,如果这更适合您的用例,则照常抓取当时页面上的内容):
// ...
// Resolves once at least one tile value exists and no two tiles show the
// same trimmed text.
await page.waitForFunction(() => {
  const values = Array.from(
    document.querySelectorAll(".stat-tile-content h4"),
    node => node.textContent.trim()
  );
  return values.length && new Set(values).size === values.length;
});
// ...
此代码将替换原来的 waitForFunction
。