使用 selenium 下载“401 未经授权”视频

Download "401 Unauthorized" video with selenium

我正在尝试创建一个机器人,它将使用 selenium 和 python3 从这个名为“Sdarot”的网站下载视频。

站点中的每个视频(或剧集)都有一个独立的页面和 URL。打开剧集页面后,必须等待约 30 秒让剧集“加载”完成,然后视频标签(video)才会出现在页面中。

问题是视频请求以某种方式加密或受到保护(我真的不明白它是如何工作的)!当我尝试简单地等待视频标签出现然后使用 urllib 库下载视频时(参见下面的代码),我收到以下错误:urllib.error.HTTPError: HTTP Error 401: Unauthorized

请注意,当我在 selenium 驱动的浏览器中打开视频的下载链接时,它可以正常打开,我也可以手动下载该视频。

如何自动下载视频?提前致谢!

代码:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import urllib.request


def load(driver, url, max_attempts=None):
    """Open *url* and block until the element with id "proceed" is clickable.

    If the element does not become clickable within 45 seconds, the page is
    opened again and the wait restarts.

    Args:
        driver: Selenium WebDriver used to open the page.
        url: Address of the episode page.
        max_attempts: Maximum number of page loads to try. ``None`` (the
            default) keeps retrying indefinitely.

    Raises:
        selenium.common.exceptions.WebDriverException: The failure of the
            final attempt, once ``max_attempts`` page loads have been tried.
    """
    # Imported locally so the function does not rely on the file's import
    # section being changed.
    from selenium.common.exceptions import WebDriverException

    attempt = 0
    while True:
        attempt += 1
        driver.get(url)  # open the page in the browser
        try:
            # wait for the episode to "load"
            WebDriverWait(driver, 45).until(
                EC.element_to_be_clickable((By.ID, "proceed"))
            )
        except WebDriverException:
            # Retry in a loop rather than recursively, so repeated timeouts
            # cannot exhaust the recursion limit; only Selenium failures are
            # retried, so Ctrl-C still interrupts the script.
            if max_attempts is not None and attempt >= max_attempts:
                raise
        else:
            return


def save_video(driver, filename):
    """Download the current page's <video> source to *filename* with urllib.

    Args:
        driver: Selenium WebDriver whose current page contains a <video> tag.
        filename: Path of the file the video is written to.

    Returns:
        The URL read from the video element's ``src`` property.

    Raises:
        urllib.error.HTTPError: When the server rejects the request.
    """
    # Locator-based lookup: the find_element_by_* helpers were removed in
    # Selenium 4.3, while find_element(By..., ...) works on old and new
    # versions alike.
    video_element = driver.find_element(By.TAG_NAME, "video")
    video_url = video_element.get_property('src')  # get the video url
    # trying to download the video
    # NOTE: this request is made by urllib, not by the browser, and no
    # cookies are passed along with it. On this site it fails with
    # "urllib.error.HTTPError: HTTP Error 401: Unauthorized"
    urllib.request.urlretrieve(video_url, filename)
    return video_url


def main():
    """Open the hard-coded episode page in Chrome and save it as video.mp4."""
    url = r'https://www.sdarot.dev/watch/339-%D7%94%D7%A4%D7%99%D7%92-%D7%9E%D7%95%D7%AA-ha-pijamot/season/1/episode/23'

    driver = webdriver.Chrome()
    try:
        load(driver, url)
        save_video(driver, "video.mp4")
    finally:
        # Always shut the browser down, even when loading or downloading
        # fails, so no Chrome/chromedriver process is left behind.
        driver.quit()


if __name__ == "__main__":
    main()

您之所以收到 401 未经授权错误,是因为该网站使用 cookie 来存储与您的会话相关的信息,尤其是名为 Sdarot 的 cookie。下面我改用 requests 库来下载并保存视频。

关键在于:用 selenium 打开该 URL 时一切正常,因为请求是由同一个 HTTP 客户端(浏览器)发出的,它已经持有所需的 cookie;而 urllib 是另一个独立的 HTTP 客户端,它发出的请求对服务器来说是一个不带任何会话信息的全新请求。为了克服这个问题,您必须像浏览器一样提供足够的会话信息,在本例中这些信息由 cookie 维护。

请看下面的代码中我是如何提取 Sdarot cookie 的值并将其传给 requests.get 方法的。您也可以使用 urllib 实现同样的效果。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests


def load(driver, url, max_attempts=None):
    """Open *url*, wait for the "proceed" button and click it.

    If the button does not become clickable within 45 seconds, or clicking
    it fails, the page is opened again and the wait restarts.

    Args:
        driver: Selenium WebDriver used to open the page.
        url: Address of the episode page.
        max_attempts: Maximum number of page loads to try. ``None`` (the
            default) keeps retrying indefinitely.

    Raises:
        selenium.common.exceptions.WebDriverException: The failure of the
            final attempt, once ``max_attempts`` page loads have been tried.
    """
    # Imported locally so the function does not rely on the file's import
    # section being changed.
    from selenium.common.exceptions import WebDriverException

    attempt = 0
    while True:
        attempt += 1
        driver.get(url)  # open the page in the browser
        try:
            # wait for the episode to "load"
            continue_btn = WebDriverWait(driver, 45).until(
                EC.element_to_be_clickable((By.ID, "proceed"))
            )
            continue_btn.click()
        except WebDriverException:
            # Retry in a loop rather than recursively, so repeated failures
            # cannot exhaust the recursion limit; only Selenium failures are
            # retried, so Ctrl-C still interrupts the script.
            if max_attempts is not None and attempt >= max_attempts:
                raise
        else:
            return


def save_video(driver, filename, cookie_name="Sdarot"):
    """Download the current page's <video> source to *filename*.

    The request is made with ``requests`` and carries the browser cookie
    named *cookie_name*, copied from the Selenium session.

    Args:
        driver: Selenium WebDriver whose current page contains a <video> tag.
        filename: Path of the file the video is written to.
        cookie_name: Name of the browser cookie to forward with the request.

    Returns:
        The URL read from the video element's ``src`` property.

    Raises:
        RuntimeError: The browser session has no cookie named *cookie_name*.
        requests.HTTPError: The server answered with an error status.
    """
    # Locator-based lookup: the find_element_by_* helpers were removed in
    # Selenium 4.3, while find_element(By..., ...) works on old and new
    # versions alike.
    video_element = driver.find_element(By.TAG_NAME, "video")
    video_url = video_element.get_property('src')  # get the video url

    # Keep only the requested cookie from the browser session.
    session_cookies = {
        entry["name"]: entry["value"]
        for entry in driver.get_cookies()
        if entry["name"] == cookie_name
    }
    if not session_cookies:
        # Fail loudly instead of returning without having downloaded anything.
        raise RuntimeError(
            "cookie %r not found in the browser session" % cookie_name
        )

    # The context manager releases the connection even if writing fails;
    # the timeout keeps a stalled server from hanging the script forever.
    with requests.get(video_url, cookies=session_cookies, stream=True,
                      timeout=60) as response:
        # Do not save an error page (e.g. a 401 body) as if it were the video.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)

    return video_url
                    
def main():
    """Open the hard-coded episode page in Chrome and save it as video.mp4."""
    url = r'https://www.sdarot.dev/watch/339-%D7%94%D7%A4%D7%99%D7%92-%D7%9E%D7%95%D7%AA-ha-pijamot/season/1/episode/23'

    driver = webdriver.Chrome()
    try:
        load(driver, url)
        save_video(driver, "video.mp4")
    finally:
        # Always shut the browser down, even when loading or downloading
        # fails, so no Chrome/chromedriver process is left behind.
        driver.quit()


if __name__ == "__main__":
    main()