How to create a for loop when scraping multiple pages of a url?
I want to create a for loop to scrape a URL that has multiple pages. I found some examples of this, but my code requires authentication, so I am not sharing the actual URL. I have included a sample URL that shows the same key identifier, "currentPage=1".
So for this example, page i would use currentPage=i, where i is 1, 2, 3, 4, ...
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def requests_retry_session(retries=10,
                           backoff_factor=0.3,
                           status_forcelist=(500, 502, 503, 504),
                           session=None):
    # Build (or reuse) a Session with retry/backoff mounted for http and https
    session = session or requests.Session()
    retry = Retry(total=retries,
                  read=retries,
                  connect=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
import io
import urllib3
import pandas as pd
from requests_kerberos import OPTIONAL, HTTPKerberosAuth
import web

a = web.get_mwinit_cookie()
urls = "https://example-url.com/ABCD/customer.currentPage=1&end"

def Scraper(url):
    urllib3.disable_warnings()
    with requests_retry_session() as req:
        resp = req.get(url,
                       timeout=30,
                       verify=False,
                       allow_redirects=True,
                       auth=HTTPKerberosAuth(mutual_authentication=OPTIONAL),
                       cookies=a)
    global df
    # read_html returns one DataFrame per table found in the page
    data = pd.read_html(resp.text, flavor=None, header=0, index_col=0)
    df = pd.concat(data, sort=False)
    print(df)

s = Scraper(urls)
df
pageCount = 4  # for 3 pages: the loop below yields 1, 2, 3
urlsList = []
base = "https://example-url.com/ABCD/customer.currentPage={}&end"  # the {} placeholder is filled in by str.format
for x in range(1, pageCount):
    urlsList.append(base.format(x))
Then you can pass the list to your function.
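For example, a minimal sketch of driving the existing Scraper over that list (assuming you change Scraper to return its DataFrame instead of assigning the global df) might look like this:

frames = []
for url in urlsList:
    # assumes Scraper ends with `return df` rather than the global assignment
    frames.append(Scraper(url))

all_pages = pd.concat(frames, sort=False)  # stack every page into one DataFrame
print(all_pages)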