Python BeautifulSoup web crawling: Appending a piece of data to a list
The site I'm scraping is http://www.boxofficemojo.com/yearly/chart/?yr=2013&p=.htm. The specific page I'm focusing on right now is http://www.boxofficemojo.com/movies/?id=catchingfire.htm.
I need to get the "Foreign gross" amount (under Total Lifetime Grosses), but for some reason I can't get it from the loop that goes over all the movies, even though it works with a single link that I type in myself.
This is my function for getting that amount for each movie:
def getForeign(item_url):
    s = urlopen(item_url).read()
    soup = BeautifulSoup(s)
    return soup.find(text="Foreign:").find_parent("td").find_next_sibling("td").get_text(strip = True)
This is the function that goes over each link:
def spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.boxofficemojo.com/yearly/chart/?page=' + str(page) + '&view=releasedate&view2=domestic&yr=2013&p=.htm'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.select('td > b > font > a[href^=/movies/?]'):
            href = 'http://www.boxofficemojo.com' + link.get('href')
            details(href)
            listOfDirectors.append(getDirectors(href))
            str(listOfDirectors).replace('[','').replace(']','')
            #getActors(href)
            title = link.string
            listOfTitles.append(title)
        page += 1
I have a list called listOfForeign = [], and I want to append each movie's foreign gross to it.
The problem is that if I call getForeign(item_url) with a single full link that I type in, for example:
print listOfForeign.append(getForeign("http://www.boxofficemojo.com/movies/?id=catchingfire.htm"))
and then
print listOfForeign
it prints out the correct amount.
But when I run the spider(max_pages) function and add:
listOfForeign.append(getForeign(href))
inside the for loop, and then later try to print listOfForeign, I get the error
AttributeError: 'NoneType' object has no attribute 'find_parent'
Why can't I successfully append this amount for each movie inside the spider function? In the spider(max_pages) function I get each movie's link in the variable "href", and I'm essentially doing the same thing as when I append each individual link separately.
Full code:
import requests
from bs4 import BeautifulSoup
from urllib import urlopen
import xlwt
import csv
from tempfile import TemporaryFile
listOfTitles = []
listOfGenre = []
listOfRuntime = []
listOfRatings = []
listOfBudget = []
listOfDirectors = []
listOfActors = []
listOfForeign = []
resultFile = open("movies.csv",'wb')
wr = csv.writer(resultFile, dialect='excel')
def spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.boxofficemojo.com/yearly/chart/?page=' + str(page) + '&view=releasedate&view2=domestic&yr=2013&p=.htm'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.select('td > b > font > a[href^=/movies/?]'):
            href = 'http://www.boxofficemojo.com' + link.get('href')
            details(href)
            listOfForeign.append(getForeign(href))
            listOfDirectors.append(getDirectors(href))
            str(listOfDirectors).replace('[','').replace(']','')
            #getActors(href)
            title = link.string
            listOfTitles.append(title)
        page += 1
def getDirectors(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    tempDirector = []
    for director in soup.select('td > font > a[href^=/people/chart/?view=Director]'):
        tempDirector.append(str(director.string))
    return tempDirector
def getActors(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    tempActors = []
    print soup.find(text="Actors:").find_parent("tr").text[7:]
def details(href):
    response = requests.get(href)
    soup = BeautifulSoup(response.content)
    genre = soup.find(text="Genre: ").next_sibling.text
    rating = soup.find(text='MPAA Rating: ').next_sibling.text
    runtime = soup.find(text='Runtime: ').next_sibling.text
    budget = soup.find(text='Production Budget: ').next_sibling.text
    listOfGenre.append(genre)
    listOfRuntime.append(runtime)
    listOfRatings.append(rating)
    listOfBudget.append(budget)
def getForeign(item_url):
    s = urlopen(item_url).read()
    soup = BeautifulSoup(s)
    try:
        return soup.find(text="Foreign:").find_parent("td").find_next_sibling("td").get_text(strip = True)
    except AttributeError:
        return "$0"
spider(1)
print listOfForeign
wr.writerow(listOfTitles)
wr.writerow(listOfGenre)
wr.writerow(listOfRuntime)
wr.writerow(listOfRatings)
wr.writerow(listOfBudget)
for item in listOfDirectors:
    wr.writerow(item)
The code fails as soon as it reaches a movie page that has no foreign gross, e.g. #42. You should handle cases like this, for example by catching the exception and setting the value to "$0".
You may also be running into differences between parsers - specify the lxml or html5lib parser explicitly (you would need to have lxml or html5lib installed).
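For example (assuming lxml and/or html5lib have been installed, e.g. via pip):

soup = BeautifulSoup(plain_text, "lxml")      # fast, C-based parser
# or
soup = BeautifulSoup(plain_text, "html5lib")  # most lenient, parses the way a browser does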
Also, why not use requests to fetch and parse the movie pages as well:
def getForeign(item_url):
    response = requests.get(item_url)
    soup = BeautifulSoup(response.content, "lxml")  # or BeautifulSoup(response.content, "html5lib")
    try:
        return soup.find(text="Foreign:").find_parent("td").find_next_sibling("td").get_text(strip = True)
    except AttributeError:
        return "$0"
As a side note, your code has overall become rather complicated and slow because of the blocking nature of the script - the requests are sent sequentially, one after another. Switching to the Scrapy web-scraping framework might be a good idea; besides making the code faster, it would help organize it into logical groups - you would have a spider containing the crawling logic, an Item class defining the model for the extracted data, a pipeline for writing the extracted data to a database if needed, and so on.
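For a rough idea only, here is a minimal sketch of what such a spider could look like; the item fields and selectors are illustrative and have not been tested against the actual Box Office Mojo markup:

import scrapy

class MovieItem(scrapy.Item):
    title = scrapy.Field()
    foreign = scrapy.Field()

class BoxOfficeSpider(scrapy.Spider):
    name = "boxofficemojo"
    start_urls = [
        "http://www.boxofficemojo.com/yearly/chart/?page=1&view=releasedate&view2=domestic&yr=2013&p=.htm",
    ]

    def parse(self, response):
        # Follow every movie link on the yearly chart page.
        for href in response.css('td > b > font > a::attr(href)').extract():
            if href.startswith("/movies/?"):
                yield scrapy.Request(response.urljoin(href), callback=self.parse_movie)

    def parse_movie(self, response):
        item = MovieItem()
        item["title"] = response.css("title::text").extract_first()
        # extract_first() returns None when the "Foreign:" cell is missing,
        # so no try/except is needed here.
        item["foreign"] = response.xpath(
            '//td[contains(., "Foreign:")]/following-sibling::td[1]//text()'
        ).extract_first()
        yield item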