AttributeError: 'NoneType' object has no attribute 'tbody' - Spyder 3.3.1 / beautifulsoup4 / python 3.6
Hey, here's my setup: Spyder 3.3.1 / beautifulsoup4 / Python 3.6

The code below comes from an article on Medium (here) about web scraping with Python and BeautifulSoup. It was supposed to be a quick read, but two days later I still can't get the code to run in Spyder and keep getting:
File "/Users/xxxxxxx/Documents/testdir/swiftScrape.py", line 9, in table_to_df
return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
AttributeError: 'NoneType' object has no attribute 'tbody'
Not sure what's going wrong; it looks like an implementation error. Can anyone help shed some light on this issue? Thanks in advance.
import os
import bs4
import requests
import pandas as pd

PATH = os.path.join("C:\\", "Users", "xxxxx", "Documents", "tesdir")

def table_to_df(table):
    return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])

def next_page(soup):
    return "http:" + soup.find('a', attrs={'rel': 'next'}).get('href')

res = pd.DataFrame()
url = "http://bank-code.net/country/FRANCE-%28FR%29/"
counter = 0
while True:
    print(counter)
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.content, 'lxml')
    table = soup.find(name='table', attrs={'id': 'tableID'})
    res = res.append(table_to_df(table))
    res.to_csv(os.path.join(PATH, "BIC", "table.csv"), index=None, sep=';', encoding='iso-8859-1')
    url = next_page(soup)
    counter += 1
Like most example code found on the web, this code is not production-grade: it blindly assumes that the HTTP request always succeeds and returns the expected content. The truth is that this is frequently not the case (network errors, a proxy or firewall blocking you, the site being down temporarily or for good, a site update changing the URL and/or the page markup, etc.).
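To make that point concrete, here is a minimal sketch of defensive fetching with requests; the fetch() helper name and the timeout value are illustrative assumptions, not from the original post:

import requests

def fetch(url):
    # Hypothetical helper: fail loudly instead of handing an error
    # page (or nothing at all) to the parser.
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()  # raises on 4xx/5xx status codes
    except requests.RequestException as exc:
        print("request for {} failed: {}".format(url, exc))
        return None
    return page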
Your problem shows up here:
def table_to_df(table):
    return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
It comes from table actually being None, which means that this line in your loop:
table = soup.find(name='table', attrs={'id':'tableID'})
did not find any "table" tag with id "tableID" in the html document. You can check this by printing the actual html content:
while True:
    print(counter)
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.content, 'lxml')
    table = soup.find(name='table', attrs={'id': 'tableID'})
    if table is None:
        print("no table 'tableID' found for url {}".format(url))
        print("html content:\n{}\n".format(page.content))
        continue
    # etc
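For context (my illustration, not part of the original answer): BeautifulSoup's find() is documented to return None when nothing matches, and that None is exactly what turns into the AttributeError from the question. A minimal repro:

import bs4

# A document with no matching table: find() returns None, and
# accessing .tbody on that None raises the question's AttributeError.
soup = bs4.BeautifulSoup("<html><body><p>no table here</p></body></html>", "lxml")
table = soup.find(name='table', attrs={'id': 'tableID'})
print(table)  # None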
Thanks to @bruno desthuilliers for the pointers. Much appreciated.
Here is the rewritten code that worked for me, using Selenium and webdriver instead of import requests:
import os
import bs4
import pandas as pd
from selenium import webdriver

PATH = os.path.join('/', 'Users', 'benmorris', 'documents', 'testdir')

def table_to_df(table):
    # Build rows from the table that was passed in, not from a global soup.
    return pd.DataFrame([[td.text for td in row.find_all('td')] for row in table.find_all('tr')])

def next_page(soup):
    return "http:" + soup.find('a', attrs={'rel': 'next'}).get('href')

res = pd.DataFrame()
url = "http://bank-code.net/country/FRANCE-%28FR%29/"
counter = 0
driver = webdriver.Chrome()

while True:
    print(counter)
    driver.get(url)  # driver.get() navigates and returns None, so there is nothing to assign
    soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find(name='table', attrs={'id': 'tableID'})
    if table is None:
        print("no table 'tableID' found for url {}".format(url))
        print("html content:\n{}\n".format(driver.page_source))
        continue
    res = res.append(table_to_df(table))
    res.to_csv(os.path.join(PATH, "BIC", "table.csv"), index=False, sep=',', encoding='iso-8859-1')
    url = next_page(soup)
    counter += 1
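One remaining sharp edge worth noting (my observation, not part of the original answers): next_page() can hit the very same NoneType pitfall, because on the last page there is no a tag with rel='next', so soup.find() returns None and .get('href') raises AttributeError. A guarded sketch:

def next_page(soup):
    # The 'next' link is absent on the last page; return None so the
    # caller can stop cleanly instead of raising AttributeError again.
    link = soup.find('a', attrs={'rel': 'next'})
    if link is None:
        return None
    return "http:" + link.get('href')

The end of the loop can then check for it and exit instead of running forever:

    url = next_page(soup)
    if url is None:
        break  # no more pages
    counter += 1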