如何使用 Google Chrome Headless 和 Selenium 提取 youtube 视频的评论数?
How to extract the number of comments of a youtube video using Google Chrome Headless and Selenium?
每个 YouTube 视频页面中都有一个元素用来显示该视频的评论数。
它的 HTML 结构如下:
<yt-formatted-string class="count-text style-scope ytd-comments-header-renderer">xx Comments</yt-formatted-string>
我想用 Selenium 获取其中的文本 xx Comments。
code1-带头浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
# code1: headed Chrome session that prints the comment-count header of one video.
options = webdriver.ChromeOptions()
# All browser traffic goes through a local SOCKS5 proxy.
proxy = '127.0.0.1:1080'
options.add_argument('--proxy-server=socks5://' + proxy)
# Request a maximized browser window.
options.add_argument("start-maximized")
# Drop the "enable-automation" switch and turn off the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)
# NOTE: this 30 s wait object is created but never used below; the wait
# further down builds its own WebDriverWait with a 60 s timeout.
wait = WebDriverWait(driver,30)
url='https://www.youtube.com/watch?v=N0lxfilGfak'
driver.get(url)
# Scroll the page down by 1000px (presumably so the comments section starts
# loading -- TODO confirm against the page's lazy-loading behaviour).
driver.execute_script("return scrollBy(0, 1000);")
# Wait up to 60 s for a visible <yt-formatted-string> whose text contains "Comments".
comment = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[contains(., 'Comments')]")))
# Bring that element into the viewport, then print the text of <h2 id="count">.
driver.execute_script("arguments[0].scrollIntoView(true);",comment)
print(driver.find_element_by_xpath("//h2[@id='count']").text)
使用上面的 Python 代码,我可以针对 https://www.youtube.com/watch?v=N0lxfilGfak 得到 717 Comments。
现在我想在 selenium 中使用无头浏览器获得相同的数字。
code2-带无头浏览器。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
# code2: the same flow as code1, but with Chrome started in headless mode.
# This is the variant the question reports as failing with TimeoutException.
options = webdriver.ChromeOptions()
# All browser traffic goes through a local SOCKS5 proxy.
proxy = '127.0.0.1:1080'
options.add_argument('--proxy-server=socks5://' + proxy)
# The three extra switches compared with code1:
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--headless")
# Request a maximized browser window.
options.add_argument("start-maximized")
# Drop the "enable-automation" switch and turn off the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)
# NOTE: this 30 s wait object is created but never used below.
wait = WebDriverWait(driver,30)
url='https://www.youtube.com/watch?v=N0lxfilGfak'
driver.get(url)
# Scroll the page down by 1000px.
driver.execute_script("return scrollBy(0, 1000);")
# Wait up to 20 s for a visible <yt-formatted-string> whose text contains
# "Comments"; this is the statement that times out in headless mode.
comment = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[contains(., 'Comments')]")))
# Bring that element into the viewport, then print the text of <h2 id="count">.
driver.execute_script("arguments[0].scrollIntoView(true);",comment)
print(driver.find_element_by_xpath("//h2[@id='count']").text)
注意:code2比code1多了三行
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--headless")
code2和code1其他行相同
执行 code2 时,程序卡在给 comment 赋值的语句上,最终抛出超时异常:
>>> comment = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[contains(., 'Comments')]")))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/support/wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
为什么在selenium中无法使用headless browser获取元素?
你已经很接近了。要在由 Selenium 驱动、ChromeDriver 启动的 google-chrome 浏览上下文中提取评论数,需要使用 WebDriverWait 等待 visibility_of_element_located() 条件成立,
可以采用以下任一定位方式:
打印文本 xx Comments
使用 XPATH 和 text 属性:
# Open the video page and move the viewport down by 1000px.
driver.get("https://www.youtube.com/watch?v=N0lxfilGfak")
driver.execute_script("return scrollBy(0, 1000);")

# Block for up to 60 s until the "Subscribe" label is visible, then scroll it
# into view (presumably this is what lets the comments header render).
subscribe_locator = (By.XPATH, "//yt-formatted-string[text()='Subscribe']")
subscribe = WebDriverWait(driver, 60).until(
    EC.visibility_of_element_located(subscribe_locator)
)
driver.execute_script("arguments[0].scrollIntoView(true);", subscribe)

# Locate the comment-count element by XPath and read it via the .text property.
count_locator = (By.XPATH, "//h2[@id='count']/yt-formatted-string")
count_element = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located(count_locator)
)
print(count_element.text)
使用 CSS_SELECTOR 和 get_attribute():
# Open the video page and move the viewport down by 1000px.
driver.get("https://www.youtube.com/watch?v=N0lxfilGfak")
driver.execute_script("return scrollBy(0, 1000);")

# Block for up to 60 s until the "Subscribe" label is visible, then scroll it
# into view (presumably this is what lets the comments header render).
subscribe_locator = (By.XPATH, "//yt-formatted-string[text()='Subscribe']")
subscribe = WebDriverWait(driver, 60).until(
    EC.visibility_of_element_located(subscribe_locator)
)
driver.execute_script("arguments[0].scrollIntoView(true);", subscribe)

# Locate the comment-count element by CSS selector and read its innerHTML.
count_locator = (By.CSS_SELECTOR, "h2#count>yt-formatted-string")
count_element = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located(count_locator)
)
print(count_element.get_attribute("innerHTML"))
控制台输出:
717 Comments
注意:您必须添加以下导入:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
使用无头Chrome
使用google-chrome-headless可以使用以下解决方案:
代码块:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Headless Chrome session that prints the comment count of one YouTube video
# twice: once via XPath + .text and once via CSS selector + innerHTML.
options = webdriver.ChromeOptions()
# Drop the "enable-automation" switch and turn off the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--headless')
# Give the headless browser an explicit desktop-sized window.
options.add_argument('--window-size=1920,1080')
# NOTE(review): `executable_path` is the Selenium 3 way of pointing at
# chromedriver; confirm the installed Selenium version still accepts it.
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
try:
    driver.get("https://www.youtube.com/watch?v=N0lxfilGfak")
    # Scroll the page down by 1000px.
    driver.execute_script("return scrollBy(0, 1000);")
    # Wait up to 60 s for the "Subscribe" label, then scroll it into view.
    subscribe = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[text()='Subscribe']")))
    driver.execute_script("arguments[0].scrollIntoView(true);", subscribe)
    # using xpath and text attribute
    print(WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//h2[@id='count']/yt-formatted-string"))).text)
    # using cssSelector and get_attribute()
    print(WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "h2#count>yt-formatted-string"))).get_attribute("innerHTML"))
    print("Exiting")
finally:
    # Always shut the browser down, even when a wait above raises
    # TimeoutException; otherwise the headless Chrome and chromedriver
    # processes are left running.
    driver.quit()
控制台输出:
717 Comments
717 Comments
Exiting
在我的 headless 设置中,添加下面这一行:
options.add_argument('--window-size=1920,1080')
或者在 y 方向上滚动更长的距离:
driver.execute_script("return scrollBy(0, 5000);")
我的xpath表达式比较直接
每个 YouTube 视频页面中都有一个元素用来显示该视频的评论数。它的 HTML 结构如下:
<yt-formatted-string class="count-text style-scope ytd-comments-header-renderer">xx Comments</yt-formatted-string>
我想用 Selenium 获取其中的文本 xx Comments。
code1-带头浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
# code1: headed Chrome session that prints the comment-count header of one video.
options = webdriver.ChromeOptions()
# All browser traffic goes through a local SOCKS5 proxy.
proxy = '127.0.0.1:1080'
options.add_argument('--proxy-server=socks5://' + proxy)
# Request a maximized browser window.
options.add_argument("start-maximized")
# Drop the "enable-automation" switch and turn off the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)
# NOTE: this 30 s wait object is created but never used below; the wait
# further down builds its own WebDriverWait with a 60 s timeout.
wait = WebDriverWait(driver,30)
url='https://www.youtube.com/watch?v=N0lxfilGfak'
driver.get(url)
# Scroll the page down by 1000px (presumably so the comments section starts
# loading -- TODO confirm against the page's lazy-loading behaviour).
driver.execute_script("return scrollBy(0, 1000);")
# Wait up to 60 s for a visible <yt-formatted-string> whose text contains "Comments".
comment = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[contains(., 'Comments')]")))
# Bring that element into the viewport, then print the text of <h2 id="count">.
driver.execute_script("arguments[0].scrollIntoView(true);",comment)
print(driver.find_element_by_xpath("//h2[@id='count']").text)
使用上面的 Python 代码,我可以针对 https://www.youtube.com/watch?v=N0lxfilGfak 得到 717 Comments。
现在我想在 selenium 中使用无头浏览器获得相同的数字。
code2-带无头浏览器。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
# code2: the same flow as code1, but with Chrome started in headless mode.
# This is the variant the question reports as failing with TimeoutException.
options = webdriver.ChromeOptions()
# All browser traffic goes through a local SOCKS5 proxy.
proxy = '127.0.0.1:1080'
options.add_argument('--proxy-server=socks5://' + proxy)
# The three extra switches compared with code1:
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--headless")
# Request a maximized browser window.
options.add_argument("start-maximized")
# Drop the "enable-automation" switch and turn off the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)
# NOTE: this 30 s wait object is created but never used below.
wait = WebDriverWait(driver,30)
url='https://www.youtube.com/watch?v=N0lxfilGfak'
driver.get(url)
# Scroll the page down by 1000px.
driver.execute_script("return scrollBy(0, 1000);")
# Wait up to 20 s for a visible <yt-formatted-string> whose text contains
# "Comments"; this is the statement that times out in headless mode.
comment = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[contains(., 'Comments')]")))
# Bring that element into the viewport, then print the text of <h2 id="count">.
driver.execute_script("arguments[0].scrollIntoView(true);",comment)
print(driver.find_element_by_xpath("//h2[@id='count']").text)
注意:code2比code1多了三行
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--headless")
code2和code1其他行相同
执行 code2 时,程序卡在给 comment 赋值的语句上,最终抛出超时异常:
>>> comment = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[contains(., 'Comments')]")))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/support/wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
为什么在selenium中无法使用headless browser获取元素?
你已经很接近了。需要使用 WebDriverWait 等待 visibility_of_element_located() 条件成立,
可以采用以下任一方式:
使用 XPATH 和 text 属性:
driver.get("https://www.youtube.com/watch?v=N0lxfilGfak")
driver.execute_script("return scrollBy(0, 1000);")
subscribe = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[text()='Subscribe']")))
driver.execute_script("arguments[0].scrollIntoView(true);",subscribe)
print(WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH,"//h2[@id='count']/yt-formatted-string"))).text)
使用 CSS_SELECTOR 和 get_attribute():
driver.get("https://www.youtube.com/watch?v=N0lxfilGfak")
driver.execute_script("return scrollBy(0, 1000);")
subscribe = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[text()='Subscribe']")))
driver.execute_script("arguments[0].scrollIntoView(true);",subscribe)
print(WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR,"h2#count>yt-formatted-string"))).get_attribute("innerHTML"))
控制台输出:
717 Comments
注意:您必须添加以下导入:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
使用无头Chrome
使用google-chrome-headless可以使用以下解决方案:
代码块:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Headless Chrome session that prints the comment count of one YouTube video
# twice: once via XPath + .text and once via CSS selector + innerHTML.
options = webdriver.ChromeOptions()
# Drop the "enable-automation" switch and turn off the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--headless')
# Give the headless browser an explicit desktop-sized window.
options.add_argument('--window-size=1920,1080')
# NOTE(review): `executable_path` is the Selenium 3 way of pointing at
# chromedriver; confirm the installed Selenium version still accepts it.
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
try:
    driver.get("https://www.youtube.com/watch?v=N0lxfilGfak")
    # Scroll the page down by 1000px.
    driver.execute_script("return scrollBy(0, 1000);")
    # Wait up to 60 s for the "Subscribe" label, then scroll it into view.
    subscribe = WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//yt-formatted-string[text()='Subscribe']")))
    driver.execute_script("arguments[0].scrollIntoView(true);", subscribe)
    # using xpath and text attribute
    print(WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//h2[@id='count']/yt-formatted-string"))).text)
    # using cssSelector and get_attribute()
    print(WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "h2#count>yt-formatted-string"))).get_attribute("innerHTML"))
    print("Exiting")
finally:
    # Always shut the browser down, even when a wait above raises
    # TimeoutException; otherwise the headless Chrome and chromedriver
    # processes are left running.
    driver.quit()
控制台输出:
717 Comments
717 Comments
Exiting
在我的 headless 设置中,添加下面这一行:
options.add_argument('--window-size=1920,1080')
或者在 y 方向上滚动更长的距离:
driver.execute_script("return scrollBy(0, 5000);")
我的xpath表达式比较直接