Python: spider recursive loop
I have a simple BeautifulSoup spider that returns the links on a server down to depth 2 or more, depending on how many functions I add:
import requests
from bs4 import BeautifulSoup

def spider():
    address = "http://dog.carnivore.mammal.xyz"
    pageFull = requests.get(address)
    pageText = pageFull.text
    soup = BeautifulSoup(pageText, "html.parser")
    for link in soup.findAll("a"):
        href = link.get("href")
        print(href)
        depth2(href)

def depth2(address):
    pageFull = requests.get(address)
    pageText = pageFull.text
    soup = BeautifulSoup(pageText, "html.parser")
    for name in soup.findAll("a"):
        href = name.get("href")
        print(href)
        depth3(href)

def depth3(address):
    etc...

spider()
This can of course be extended further, but what I am looking for is a way to extend the spider function so that it keeps working through new links recursively on its own, without needing a new depth function for every level, but only as long as the new links are still on the same server, in this case mammal.xyz. How can such a recursive loop be done?
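One way is to collapse depth2, depth3, and so on into a single recursive helper that keeps a set of already-visited URLs, tracks how deep it has recursed, and only follows links whose hostname matches the page they were found on. The version below does that, using urllib.parse.urljoin to resolve relative hrefs and urlparse to compare hostnames; treat it as a sketch rather than a production crawler (there is no error handling, politeness delay, or robots.txt check):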
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse, urljoin

MAX_DEPTH = 4
already_visited = set()

def _get_uri(base_url, href):
    if not href or href.startswith('#'):
        print('discarding empty href or anchor {}'.format(href))
        return None
    # We use urljoin to deal with relative hrefs (e.g. <a href="../about.html">);
    # urljoin resolves them against the base_url we provide.
    absolute_url = urljoin(base_url, href)
    # Take the hostname and strip www., because we want to treat
    # www.test.com and test.com as the same location.
    base_url_host = urlparse(base_url).hostname.replace('www.', '')
    absolute_url_host = urlparse(absolute_url).hostname
    if absolute_url_host is None:
        # e.g. mailto: or javascript: links have no hostname
        print('Skipping non-HTTP link {}'.format(href))
        return None
    absolute_url_host = absolute_url_host.replace('www.', '')
    # Skip the href if it is external (outside the starting domain).
    # Since www. was stripped, www.foo.com and foo.com compare as equal.
    if absolute_url_host != base_url_host:
        print('Skipping {}'.format(href))
        return None
    return absolute_url

def spider(start_address):
    _crawl_page(start_address, [])

def _crawl_page(uri, followed_path):
    path = followed_path[:]
    if len(path) > MAX_DEPTH:
        return
    # For easier debugging, we keep track of the current recursion path as a list
    # of the hrefs visited by this branch. Instead of passing a depth counter,
    # we can simply check the length of the list, since we append as we visit.
    path.append(uri)
    print('Visiting: {}, path: {}'.format(uri, path))
    # Record the current uri so we never visit the same page twice.
    # In practice hrefs should be normalised here to be safe; handling all
    # possible edge cases consistently is not trivial.
    already_visited.add(uri)
    pageFull = requests.get(uri)  # fetch the page being visited, not the start page
    pageText = pageFull.text
    soup = BeautifulSoup(pageText, "html.parser")
    for link in soup.findAll("a"):
        # Resolve each href against the page it appears on
        next_uri = _get_uri(uri, link.get("href"))
        if next_uri and next_uri not in already_visited:
            # Recursively visit the link.
            # Note this is a depth-first search (DFS) of the link tree.
            _crawl_page(next_uri, path)

spider("http://www.sheldonbrown.com/index.html")