BeautifulSoup 没有获取所有数据,只有一些
BeautifulSoup is not getting all data, only some
import requests
from bs4 import BeautifulSoup
def trade_spider(max_pages):
page = 0
while page <= max_pages:
url = 'http://orangecounty.craigslist.org/search/foa?s=' + str(page * 100)
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for link in soup.findAll('a', {'class':'hdrlnk'}):
href = 'http://orangecounty.craigslist.org/' + link.get('href')
title = link.string
print title
#print href
get_single_item_data(href)
page += 1
def get_single_item_data(item_url):
source_code = requests.get(item_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for item_name in soup.findAll('section', {'id':'postingbody'}):
print item_name.string
trade_spider(1)
我正在尝试抓取 craigslist(练习),尤其是 http://orangecounty.craigslist.org/search/foa?s=0。我现在将其设置为打印条目的标题和条目的描述。问题是,虽然标题为列出的每个 object 都正确打印,但其中大多数的描述被列为 "None",即使有明确的描述。任何帮助,将不胜感激。谢谢
而不是获取发帖正文的 .string
、get the text(为我工作):
item_name.get_text(strip=True)
附带说明一下,您的脚本有一个阻塞 "nature",您可以通过切换到 Scrapy
web-scraping framework 来显着加快速度。
你快到了。只需将 item_name.string
更改为 item_name.text
import requests
from bs4 import BeautifulSoup
def trade_spider(max_pages):
page = 0
while page <= max_pages:
url = 'http://orangecounty.craigslist.org/search/foa?s=' + str(page * 100)
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for link in soup.findAll('a', {'class':'hdrlnk'}):
href = 'http://orangecounty.craigslist.org/' + link.get('href')
title = link.string
print title
#print href
get_single_item_data(href)
page += 1
def get_single_item_data(item_url):
source_code = requests.get(item_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
for item_name in soup.findAll('section', {'id':'postingbody'}):
print item_name.string
trade_spider(1)
我正在尝试抓取 craigslist(练习),尤其是 http://orangecounty.craigslist.org/search/foa?s=0。我现在将其设置为打印条目的标题和条目的描述。问题是,虽然标题为列出的每个 object 都正确打印,但其中大多数的描述被列为 "None",即使有明确的描述。任何帮助,将不胜感激。谢谢
而不是获取发帖正文的 .string
、get the text(为我工作):
item_name.get_text(strip=True)
附带说明一下,您的脚本有一个阻塞 "nature",您可以通过切换到 Scrapy
web-scraping framework 来显着加快速度。
你快到了。只需将 item_name.string
更改为 item_name.text