Web 实时抓取流式聊天 (puppeteer.js)

Web scraping a stream chat in real-time (puppeteer.js)

我想通过网页抓取,实时获取直播流中的聊天内容。

尝试在 puppeteer 的 .then() 回调函数内部创建一个 while 循环似乎并不奏效,在某些实现中甚至会让整个程序无法运行。

我能够完成最初的一次抓取,但在所有情况下程序随后都会直接结束,并不会执行我实现的 while 循环。

没有 while 循环的工作代码

const puppeteer = require ('puppeteer');

//initiating Puppeteer
// Launch a browser, scrape the chat comments currently rendered on the
// stream page once, print them, and shut the browser down again.
puppeteer
  .launch()
  .then(async (browser) => {
    try {
      // Open a new page and navigate to the live stream.
      const page = await browser.newPage();
      await page.goto('https://www.younow.com/Ken_Nara24');
      await page.waitForSelector('body');

      // The callback below is executed inside the page, so it may only use
      // what it defines itself plus the page's DOM.
      const getComments = await page.evaluate(() => {
        // querySelector returns null when nothing matches; fall back to ''
        // so a single incomplete comment does not abort the whole scrape.
        const textOf = (root, selector) => {
          const node = root.querySelector(selector);
          return node ? node.innerText : '';
        };

        const scrapeItems = [];
        document.body.querySelectorAll('.comment').forEach((item) => {
          scrapeItems.push({
            commentAuthor: textOf(item, 'div.user-card__header.mini-profile-launcher'),
            commentContent: textOf(item, 'div.user-card__body.ng-star-inserted'),
          });
        });

        return { userComments: scrapeItems };
      });

      // Output the scraped data.
      console.log(getComments);
    } finally {
      // Close the browser on failure as well as success, so an error during
      // navigation or scraping does not leave it open.
      await browser.close();
    }
  })
  // Report any error from launching, navigating, scraping or closing.
  .catch((err) => {
    console.error(err);
  });

所有让这段逻辑循环执行的尝试都徒劳无功。我找不到明确说明如何实现、或者是否能够实现这类需求的方法,也没有找到相关的过往 issue 或示例。我已经几次尝试自己实现,但没有一个版本能够正常运行。

我是不是漏掉了什么重要的东西?我只想监听一个网页,每 3-5 秒重新抓取一次。

如果您仍然需要帮助,可以试试这个方法。

const puppeteer = require("puppeteer");
let pageScraping = false; /* re-entrancy guard: true while a scrape pass is running, so an overlapping call to scraper returns early */

/**
 * Runs one scrape pass and then schedules the next one.
 *
 * A pass launches a headless browser, opens the stream page, waits for the
 * chat to become visible, logs every comment currently in the DOM as
 * `{ userComments: [{ commentAuthor, commentContent }, ...] }`, and closes
 * the browser. Errors are logged rather than thrown, and the next pass is
 * scheduled 5 seconds after this one finishes, whether it succeeded or not.
 *
 * @returns {Promise<void>} Resolves once the pass is done and the next one
 *   has been scheduled; resolves immediately if a pass is already running.
 */
const scraper = async () => {
  /* a pass is already in flight; it will schedule the next one itself */
  if (pageScraping) return;

  const pageUrl = 'https://www.younow.com/Ken_Nara24';
  let browser;

  pageScraping = true;
  try {
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto(pageUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

    /* wait for chat to be visible */
    await page.waitForSelector('.chat', { visible: true, timeout: 60000 });

    /* the callback runs inside the page, so it may only use what it defines itself plus the DOM */
    const getComments = await page.evaluate(() => {
      /* querySelector returns null when nothing matches; fall back to ''
         so one incomplete comment does not fail the whole pass */
      const textOf = (root, selector) => {
        const node = root.querySelector(selector);
        return node ? node.innerText : '';
      };

      const scrapeComments = [];
      document.querySelectorAll('.comment').forEach((comment) => {
        scrapeComments.push({
          'commentAuthor': textOf(comment, 'div[class="user-card__header mini-profile-launcher"]'),
          'commentContent': textOf(comment, 'div[class="user-card__body ng-star-inserted"]'),
        });
      });

      return { 'userComments': scrapeComments };
    });

    console.log(getComments); /* log comments */
  } catch (err) {
    console.log(err.message);
  } finally {
    if (browser) { /* only close a browser that was actually launched */
      try {
        await browser.close();
        console.log('closing browser');
      } catch (closeErr) {
        /* a failed close must not skip the flag reset and reschedule below */
        console.log(closeErr.message);
      }
    }
    pageScraping = false;
    /* setTimeout returns a timer handle, not a promise, so it is not awaited */
    setTimeout(scraper, 5000); /* wait 5 seconds before re-scraping */
  }
};

setTimeout(scraper, 5000); /* schedule the first scrape pass 5 seconds from now; scraper re-schedules itself after every pass */