Python 网络爬虫,我可以从 for 循环调用函数吗?
Python Web Crawler, Can i do function calls from for loops?
所以我试着编写这个网络爬虫,让它获取所有标题 URL 链接,然后去寻找所有章节 URL 链接,然后从章节链接找到所有章节链接等。
问题是,我在这个 https://github.com/buckyroberts/Source-Code-from-Tutorials/blob/master/Python/27_workingsolution_python.py 教程中看到,作者能够在定义第二个函数之前调用它。这真的很令人困惑。
我尝试了类似的方法,但正如预期的那样,得到了 NameError: name "leveltwo" is not defined。
我的问题是:如何把前一个函数中获得的链接用作下一个函数的参数,依此类推?
我的代码:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
######################################Titles###############################
def levelone(url):
    """Fetch the year index page and hand every title link to leveltwo()."""
    response = requests.get(url)
    anchors = BeautifulSoup(response.content, "html.parser", parse_only=SoupStrainer('a'))
    for anchor in anchors:
        if not anchor.has_attr('href'):
            continue
        if 'title' in anchor['href']:
            # Turn the relative href into an absolute URL and descend one level.
            leveltwo("http://law.justia.com" + anchor.get('href'))
# NOTE(review): this call executes at import time, BEFORE leveltwo/levelthree
# are defined below, so levelone() raises NameError: name 'leveltwo' is not
# defined. Move this call after all three function definitions.
base_url = "http://law.justia.com/codes/alabama/2015/"
levelone(base_url)
########################################Chapters##########################
def leveltwo(item_url):
    """Fetch a title page and follow each chapter link down to levelthree()."""
    page = requests.get(item_url)
    tags = BeautifulSoup(page.content, "html.parser", parse_only=SoupStrainer('a'))
    hrefs = (t['href'] for t in tags if t.has_attr('href'))
    for href in hrefs:
        if 'chapt' in href:
            # Absolute chapter URL; recurse into the section level.
            levelthree("http://law.justia.com" + href)
########################################Sections##########################
def levelthree(item2_url):
    """Fetch a chapter page and print the absolute URL of every section link."""
    reply = requests.get(item2_url)
    for tag in BeautifulSoup(reply.content, "html.parser", parse_only=SoupStrainer('a')):
        if tag.has_attr('href') and 'section' in tag['href']:
            # Leading newline keeps each printed URL visually separated.
            print("\n" + "http://law.justia.com" + tag.get('href'))
解决方法:先定义函数,再调用——把 levelone(base_url) 这两行移到所有函数定义之后即可(Python 按顺序执行模块代码,调用发生时名字必须已经定义):
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
########################################Sections##########################
def levelthree(item2_url):
    """Fetch a chapter page and print the absolute URL of every section link."""
    resp = requests.get(item2_url)
    anchors = BeautifulSoup(resp.content, "html.parser", parse_only=SoupStrainer('a'))
    for a in anchors:
        if not a.has_attr('href'):
            continue
        if 'section' in a['href']:
            # Newline prefix separates successive URLs in the output.
            print("\n" + "http://law.justia.com" + a.get('href'))
########################################Chapters##########################
def leveltwo(item_url):
    """Fetch a title page and pass every chapter link on to levelthree()."""
    resp = requests.get(item_url)
    for a in BeautifulSoup(resp.content, "html.parser", parse_only=SoupStrainer('a')):
        if a.has_attr('href') and 'chapt' in a['href']:
            # levelthree is defined above, so this call resolves fine.
            levelthree("http://law.justia.com" + a.get('href'))
######################################Titles###############################
def levelone(url):
    """Fetch the year index page and pass every title link on to leveltwo()."""
    resp = requests.get(url)
    title_anchors = BeautifulSoup(resp.content, "html.parser", parse_only=SoupStrainer('a'))
    for a in title_anchors:
        if a.has_attr('href') and 'title' in a['href']:
            # leveltwo is defined above, so this call resolves fine.
            leveltwo("http://law.justia.com" + a.get('href'))
###########################################################################
# Entry point: runs only after all three functions above are defined,
# so every name resolves when the crawl actually executes.
base_url = "http://law.justia.com/codes/alabama/2015/"
levelone(base_url)
所以我试着编写这个网络爬虫,让它获取所有标题 URL 链接,然后去寻找所有章节 URL 链接,然后从章节链接找到所有章节链接等。
问题是,我在这个 https://github.com/buckyroberts/Source-Code-from-Tutorials/blob/master/Python/27_workingsolution_python.py 教程中看到,作者能够在定义第二个函数之前调用它。这真的很令人困惑。
我尝试了类似的方法,但正如预期的那样,得到了 NameError: name "leveltwo" is not defined。 我的问题是:如何把前一个函数中获得的链接用作下一个函数的参数,依此类推?
我的代码:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
######################################Titles###############################
def levelone(url):
    """Fetch the year index page and hand every title link to leveltwo()."""
    response = requests.get(url)
    anchors = BeautifulSoup(response.content, "html.parser", parse_only=SoupStrainer('a'))
    for anchor in anchors:
        if not anchor.has_attr('href'):
            continue
        if 'title' in anchor['href']:
            # Turn the relative href into an absolute URL and descend one level.
            leveltwo("http://law.justia.com" + anchor.get('href'))
# NOTE(review): this call executes at import time, BEFORE leveltwo/levelthree
# are defined below, so levelone() raises NameError: name 'leveltwo' is not
# defined. Move this call after all three function definitions.
base_url = "http://law.justia.com/codes/alabama/2015/"
levelone(base_url)
########################################Chapters##########################
def leveltwo(item_url):
    """Fetch a title page and follow each chapter link down to levelthree()."""
    page = requests.get(item_url)
    tags = BeautifulSoup(page.content, "html.parser", parse_only=SoupStrainer('a'))
    hrefs = (t['href'] for t in tags if t.has_attr('href'))
    for href in hrefs:
        if 'chapt' in href:
            # Absolute chapter URL; recurse into the section level.
            levelthree("http://law.justia.com" + href)
########################################Sections##########################
def levelthree(item2_url):
    """Fetch a chapter page and print the absolute URL of every section link."""
    reply = requests.get(item2_url)
    for tag in BeautifulSoup(reply.content, "html.parser", parse_only=SoupStrainer('a')):
        if tag.has_attr('href') and 'section' in tag['href']:
            # Leading newline keeps each printed URL visually separated.
            print("\n" + "http://law.justia.com" + tag.get('href'))
解决方法:先定义函数,再调用——把 levelone(base_url) 这两行移到所有函数定义之后即可(Python 按顺序执行模块代码,调用发生时名字必须已经定义):
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
########################################Sections##########################
def levelthree(item2_url):
    """Fetch a chapter page and print the absolute URL of every section link."""
    resp = requests.get(item2_url)
    anchors = BeautifulSoup(resp.content, "html.parser", parse_only=SoupStrainer('a'))
    for a in anchors:
        if not a.has_attr('href'):
            continue
        if 'section' in a['href']:
            # Newline prefix separates successive URLs in the output.
            print("\n" + "http://law.justia.com" + a.get('href'))
########################################Chapters##########################
def leveltwo(item_url):
    """Fetch a title page and pass every chapter link on to levelthree()."""
    resp = requests.get(item_url)
    for a in BeautifulSoup(resp.content, "html.parser", parse_only=SoupStrainer('a')):
        if a.has_attr('href') and 'chapt' in a['href']:
            # levelthree is defined above, so this call resolves fine.
            levelthree("http://law.justia.com" + a.get('href'))
######################################Titles###############################
def levelone(url):
    """Fetch the year index page and pass every title link on to leveltwo()."""
    resp = requests.get(url)
    title_anchors = BeautifulSoup(resp.content, "html.parser", parse_only=SoupStrainer('a'))
    for a in title_anchors:
        if a.has_attr('href') and 'title' in a['href']:
            # leveltwo is defined above, so this call resolves fine.
            leveltwo("http://law.justia.com" + a.get('href'))
###########################################################################
# Entry point: runs only after all three functions above are defined,
# so every name resolves when the crawl actually executes.
base_url = "http://law.justia.com/codes/alabama/2015/"
levelone(base_url)