使用 puppeteer 单击页面内的 JavaScript 链接并返回 URL

Clicking on internal javascript links and returning urls using puppeteer

我的目标是单击此页面上的每个链接(称为脚注),然后返回脚注的链接、文本,以及侧边栏中显示的所有 URL。我卡在了侧边栏内容出现后如何访问它们这一步;在失败了几周之后,我想请教一下我哪里做错了(我对 JavaScript 和 puppeteer 都是新手)。

const puppeteer = require('puppeteer');
const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';

/**
 * Opens `url`, activates every element matching `selector` one at a time and
 * prints an array of `{ ref, txt, links }` records, where `links` holds the
 * hrefs of the `.scripture-ref` anchors found in the side panel for that
 * footnote.
 *
 * Changes from the first draft of this script:
 *  - the footnote is clicked through its own DOM node; the old code passed a
 *    URL fragment to `page.click`, which expects a CSS selector, and did not
 *    await the call;
 *  - the links are stored on each footnote record instead of in a loop-local
 *    `const` that was out of scope when it was logged;
 *  - the script waits for the side panel to show the clicked footnote before
 *    reading links from it;
 *  - the browser is closed even when a step throws.
 */
(async function () {
    const browser = await puppeteer.launch({ headless: true });
    try {
        const page = await browser.newPage();
        await page.goto(url);

        const footnotes = [];
        for (const handle of await page.$$(selector)) {
            // Click and read the anchor in one round trip to the page.
            const { ref, txt, label } = await page.evaluate((node) => {
                node.click();
                return {
                    ref: node.href.replace('https://www.churchofjesuschrist.org', ''),
                    txt: node.text,
                    // NOTE(review): assumes the panel header reads
                    // "<note id> <word>" (e.g. "1a pondering") and that the
                    // anchor text starts with a one-character marker —
                    // confirm against the live page.
                    label: `${node.getAttribute('href').replace('/#note', '')} ${node.innerText.slice(1)}`,
                };
            }, handle);

            // Poll until a header span for this footnote exists, so links
            // left over from the previously clicked footnote are not read.
            const header = await page.waitForFunction(
                (wanted) => [...document.querySelectorAll('aside header span')]
                    .find((span) => span.textContent === wanted),
                {},
                label
            );

            // Restrict the query to the panel that owns the matched header.
            const links = await page.evaluate(
                (span) => [...span.closest('aside').querySelectorAll('.scripture-ref')]
                    .map((a) => a.href),
                header
            );

            footnotes.push({ ref, txt, links });
        }

        console.log(footnotes);
        // const fs = require('fs');
        // fs.writeFile('./footnotes.json', JSON.stringify(footnotes), err => err ? console.log(err) : null);
    } finally {
        await browser.close();
    }
})().catch((err) => {
    // Report the failure instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});

也许是这样的:

const puppeteer = require('puppeteer');

const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';

/**
 * Renders `value` as an XPath 1.0 string literal.
 *
 * XPath 1.0 has no escape character, so a value containing both quote kinds
 * is assembled with `concat()` from double-quoted pieces joined by `'"'`.
 *
 * @param {string} value - Raw text to embed in an XPath expression.
 * @returns {string} A quoted literal or a `concat(...)` call.
 */
function toXPathLiteral(value) {
  if (!value.includes('"')) {
    return `"${value}"`;
  }
  if (!value.includes("'")) {
    return `'${value}'`;
  }
  const pieces = value.split('"').map((piece) => `"${piece}"`);
  return `concat(${pieces.join(`, '"', `)})`;
}

/**
 * Clicks every footnote marker on the page and prints, as JSON, an object
 * keyed by footnote id: `{ [id]: { text, links: [{ [linkText]: href }] } }`.
 *
 * The browser is closed in a `finally` block so that a timeout while waiting
 * for a side-panel header does not leave the browser process running.
 */
(async function main() {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const [page] = await browser.pages();
    await page.goto(url);

    const data = {};

    for (const footnote of await page.$$(selector)) {
      // Click inside the page and read the id and word in the same round
      // trip: the id is the href attribute with '/#note' removed, the word is
      // the anchor text without its first character.
      const [href, text] = await page.evaluate(
        (a) => {
          a.click();
          return [a.getAttribute('href').replace('/#note', ''), a.innerText.slice(1)];
        },
        footnote
      );
      data[href] = { text };

      // Wait until the side panel shows the header for this footnote. The
      // label goes through toXPathLiteral so quotes in the footnote text
      // cannot break the expression.
      const label = toXPathLiteral(`${href} ${text}`);
      const header = await page.waitForXPath(`//aside/div/header/span[text()=${label}]`);

      // Collect every anchor with an href inside the panel owning the header.
      data[href].links = await page.evaluate(
        (span) => {
          const aside = span.closest('aside');
          return [...aside.querySelectorAll('a[href]')].map(
            a => ({ [a.innerText]: a.href })
          );
        },
        header
      );

      console.log(`Done: ${href} ${text}`);
    }
    console.log(JSON.stringify(data, null, 2));
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

部分输出:

{
  "1a": {
    "text": "pondering",
    "links": [
      {
        "D&C 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19"
      },
      {
        "TG Meditation": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
      },
      {
        "Doctrine and Covenants 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19#19"
      },
      {
        "Meditation, Meditate": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
      }
    ]
  },
}