使用 Selenium 点击 YouTube 评论区的所有“回复”按钮并获取频道链接

Use Selenium to click all YouTube comment 'reply' buttons and get channel links

目标是从 YouTube 视频的评论区抓取所有 YouTube 频道链接。当前代码只能获取用户名而不是频道链接,并且不会展开查看每条评论下面的回复。我不知道该怎么做,也不知道我的 XPath 为什么是错误的。

代码:

from selenium import webdriver
import time

driver = webdriver.Chrome()

driver.get('https://www.youtube.com/watch?v=_p2NvO6KrBs')
time.sleep(5)  # fixed pause after navigation, before scrolling starts

# Scroll down four times, 1000 px per step, pausing after each step
# to give the page time to load.
for _ in range(4):
    driver.execute_script('window.scrollTo(0,(window.pageYOffset+1000))')
    time.sleep(1.5)

# Expand replies. find_element (singular) yields one element, so only a
# single "more-replies" toggle is clicked here.
more_replies_toggle = driver.find_element_by_xpath('//*[@id="more-replies"]')
time.sleep(1)
more_replies_toggle.click()

# Print the visible text of every element whose id is "author-text".
# NOTE(review): an XPath starting with '//' is evaluated against the whole
# document even when called on an element; './/' would scope the search to
# comments_root.
comments_root = driver.find_element_by_xpath('//*[@id="contents"]')
comments = comments_root.find_elements_by_xpath('//*[@id="author-text"]')
for author in comments:
    print(author.text)

如果你想要频道链接,需要读取元素的 href 属性(即链接的 URL):

# The channel URL is the element's href attribute rather than its visible text.
for author_link in comments:
    channel_url = author_link.get_attribute('href')
    print(channel_url)

如果你还想获取每条评论下每个回复的频道链接,可以尝试以下方法。我在部分代码行上添加了说明性注释...

# Collect every element matching '#contents #comment' (the main comments).
main_comments = driver.find_elements_by_css_selector('#contents #comment')

for comment_el in main_comments:
    # The author link's href is the commenter's channel URL.
    author_href = comment_el.find_element_by_id('author-text').get_attribute('href')
    print('The commenters channel is: ' + author_href)

    # The replies section is located relative to the comment's parent ('..').
    replies = comment_el.find_element_by_xpath('..//*[@id="replies"]')
    if not replies.text.startswith('View'):
        continue  # section text does not start with 'View': no replies to open

    replies.find_element_by_css_selector('a').click()  # open the replies
    time.sleep(3)  # fixed wait for the replies to load (a better strategy should be used here)

    # Each reply author link carries the replier's channel URL in its href.
    for reply_author in replies.find_elements_by_id('author-text'):
        print('Reply channel: ' + reply_author.get_attribute('href'))

包括写入 .txt 文件的完整解决方案

# Complete scraper: writes the channel URL of every commenter and of every
# replier to output.txt, one entry per line.
# NOTE(review): the find_element(s)_by_* helpers used below are the Selenium 3
# style API; confirm the installed Selenium version still provides them.
#
# The file is managed by a `with` block so the handle is closed even when a
# Selenium call raises part-way through the scrape.
with open("output.txt", "w+", encoding="utf-8") as file:
    driver.get('https://www.youtube.com/watch?v=_p2NvO6KrBs')
    time.sleep(5)  # fixed pause after navigation

    # Keep scrolling (1000 px per step) for as long as the continuation
    # spinner element is present in the page.
    while driver.find_elements_by_css_selector('#sections>#continuations #spinner'):
        driver.execute_script('window.scrollTo(0,(window.pageYOffset+1000))')
        time.sleep(1.5)  # wait for the page to load

    # Collect every element matching '#contents #comment' (the main comments).
    main_comments = driver.find_elements_by_css_selector('#contents #comment')

    for mc in main_comments:
        # The author link's href is the commenter's channel URL.
        main_comment_channel = mc.find_element_by_id('author-text').get_attribute('href')
        # f-string formatting: a missing href is written as 'None' instead of
        # aborting the whole run with a TypeError.
        file.write(f'The commenters channel is: {main_comment_channel}\n')

        # The replies section is located relative to the comment's parent ('..').
        replies = mc.find_element_by_xpath('..//*[@id="replies"]')
        if not replies.text.startswith('View'):
            continue  # section text does not start with 'View': no replies to open

        view_replies_link = replies.find_element_by_css_selector('a')
        # Bring the link into view, then scroll back up 150 px so the YouTube
        # header does not cover it when it is clicked.
        driver.execute_script("arguments[0].scrollIntoView();", view_replies_link)
        driver.execute_script('window.scrollTo(0,(window.pageYOffset-150))')
        view_replies_link.click()
        time.sleep(3)  # fixed wait for the replies to load (a better strategy should be used here)

        # Write the channel URL of each reply author.
        for reply in replies.find_elements_by_id('author-text'):
            reply_channel = reply.get_attribute('href')
            file.write(f'Reply channel: {reply_channel}\n')