使用 BeautifulSoup 从 URL 查询中解析列
parsing the columns from a URL query using BeautifulSoup
我通过使用 requests
模块发送信息获取 URL 的查询结果,从而在 html
中获得了 table。现在我想使用 BeautifulSoup
从输出中得到一个 table
import urllib, requests, re
from bs4 import BeautifulSoup
def find_between(s, first, last):
    """Return the part of ``s`` between ``first`` and the next ``last``.

    The search for ``last`` starts right after the end of the first
    occurrence of ``first``. An empty string is returned when either
    delimiter cannot be found.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
    except ValueError:
        # str.index raises ValueError when the substring is absent.
        return ""
    return s[start:end]
# Query parameters sent to the NED object-search CGI endpoint.
payload = {
    'lon': '1:35:00',
    'lat': '-10:13:00',
    'radius': '18.0',
    'hconst': '73',
    'omegam': '0.27',
    'omegav': '0.73',
    'search_type': 'Near Position Search',
    'in_equinox': 'J2000.0',
    'ot_include': 'ANY',
    'in_csys': 'Equatorial',
    'in_objtypes1': ['GClusters', 'GGroups'],
}
# NOTE(review): verify=False disables TLS certificate verification. It is kept
# from the original request; confirm it is still needed before relying on it.
r = requests.get('https://ned.ipac.caltech.edu/cgi-bin/objsearch', params=payload, verify=False)
print(r.url)
soup = BeautifulSoup(r.text, 'html.parser')
tables = soup.table

# Find the column names: take the last <strong> element and use the
# second-to-last newline-separated chunk of its text as the header line.
header = soup.find_all('strong')[-1]
# Use "+" instead of "*": a pattern that can match the empty string makes
# re.split cut between every character on Python 3.7+.
columns = re.split(r'\n+', header.text)[-2]
mylist = re.split(r'\s+', columns)

# Store the column names in a list. Tokens from one containing "(" up to the
# one containing ")" are concatenated (without separators) into one name.
mycolumns = []
flag = 0  # 1 while inside a parenthesised group, 0 otherwise
grouped_name = u''  # accumulator; avoids shadowing the builtin ``object``
for element in mylist:
    if element != u'' and flag < 1 and '(' not in element:
        mycolumns.append(element)
    if '(' in element and flag < 1:
        grouped_name = element
        flag = 1
    if '(' not in element and flag > 0:
        grouped_name += element
        if ')' in element:
            # Closing token reached: the grouped name is complete.
            flag = 0
            mycolumns.append(grouped_name)
上面的代码大致能得到我想要的结果,但我觉得这不是最好的方法。网页查询结果如下:
Row Object Name EquJ2000.0 Object Velocity/Redshift Mag./ Separ. Number of Row
No. (* => Essential Note) RA DEC Type km/s z Qual Filter arcmin Refs Notes Phot Posn Vel/z Diam Assoc Images Spectra No.
1 GMBCG J023.72560-10.18783 01h34m54.1s -10d11m16s GClstr >30000 0.346000 PHOT ... 2.252 1 0 0 0 1 0 0 Retrieve Retrieve 1
2 SDSSCGB 21433 01h34m40.5s -10d14m14s GGroup ... ... ... 4.956 1 0 0 0 0 0 0 Retrieve Retrieve 2
3 WHL J013438.0-101743 01h34m38.0s -10d17m43s GClstr >30000 0.372800 PHOT ... 7.179 2 0 0 0 2 0 0 Retrieve Retrieve 3
4 SDSSCGB 18836 01h34m37.4s -10d20m25s GGroup ... ... ... 9.272 1 0 0 0 0 0 0 Retrieve Retrieve 4
5 GMBCG J023.65477-10.06935 01h34m37.1s -10d04m10s GClstr >30000 0.336000 PHOT ... 10.477 1 0 0 0 1 0 0 Retrieve Retrieve 5
6 GMBCG J023.95379-10.20892 01h35m48.9s -10d12m32s GClstr >30000 0.179000 PHOT ... 12.043 1 0 0 0 1 0 0 Retrieve Retrieve 6
7 SDSSCGB 11439 01h34m07.6s -10d12m01s GGroup ... ... ... 12.930 1 0 0 0 0 0 0 Retrieve Retrieve 7
8 GMBCG J023.53330-10.16959 01h34m08.0s -10d10m11s GClstr >30000 0.438000 PHOT ... 13.105 1 0 0 0 1 0 0 Retrieve Retrieve 8
9 WHL J013404.8-101438 01h34m04.8s -10d14m38s GClstr >30000 0.321800 PHOT ... 13.678 2 0 0 0 2 0 0 Retrieve Retrieve 9
10 GMBCG J023.90759-10.03946 01h35m37.8s -10d02m22s GClstr >30000 0.298000 PHOT ... 14.131 1 0 0 0 1 0 0 Retrieve Retrieve 10
11 SDSSCGB 20022 01h36m00.4s -10d09m21s GGroup ... ... ... 15.302 1 0 0 0 0 0 0 Retrieve Retrieve 11
12 GMBCG J024.00318-10.15744 01h36m00.7s -10d09m27s GClstr >30000 0.385000 PHOT ... 15.368 1 0 0 0 1 0 0 Retrieve Retrieve 12
13 MaxBCG J023.98788-10.04339 01h35m57.1s -10d02m36s GClstr >30000 0.297050 PHOT ... 17.479 2 0 0 0 1 0 0 Retrieve Retrieve 13
我只想从查询的前八列中提取信息,但使用 BeautifulSoup
完成它并不是很简单。我将不胜感激任何建议。
你无法直接用 BeautifulSoup 解析出各列,因为网站返回的结果不是格式良好的 HTML(至少你想要提取的那部分不是)。
整个 table 的内容基本上是以纯文本形式放在一个 table/tr/td/pre
元素内。
如果你想使用正则表达式(如果数据格式发生变化,这种方法可能不稳定),可以采用下面的方法(基于你当前的代码):
# coding: utf-8
import requests, re, pprint
from bs4 import BeautifulSoup
# Query parameters sent to the NED object-search CGI endpoint.
payload = {
    'lon': '1:35:00',
    'lat': '-10:13:00',
    'radius': '18.0',
    'hconst': '73',
    'omegam': '0.27',
    'omegav': '0.73',
    'search_type': 'Near Position Search',
    'in_equinox': 'J2000.0',
    'ot_include': 'ANY',
    'in_csys': 'Equatorial',
    'in_objtypes1': ['GClusters', 'GGroups'],
}
# NOTE(review): verify=False disables TLS certificate verification. It is kept
# from the original request; confirm it is still needed before relying on it.
r = requests.get('https://ned.ipac.caltech.edu/cgi-bin/objsearch', params=payload, verify=False)
print(r.url)
soup = BeautifulSoup(r.text, 'html.parser')
tables = soup.table
text = tables.text
rows = text.split("\n")

# Data rows start with a row number followed by whitespace.
row_start = re.compile(r"^\d+\s")
# One capture group per column, up to and including "Filter".
# For all columns, append this part to the regex:
# \s+([-.\d]+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\w+)\s+(\w+)\s+(\d+)
row_pattern = re.compile(
    r"(\d+)\s+(.+?)\s+(\d+h\S+)\s+([-\w]+)\s+(\w+)\s+(\.{3}|[<>\d]+)\s+(\.{3}|[\d.]+)\s+(\w+)?\s+\s+(\.{3}|\w+)"
)

result = []
for row in rows:
    if not row_start.match(row):
        continue
    row = row.replace(u'\xa0', u' ')  # normalize non-breaking spaces
    search = row_pattern.match(row)
    if search is None:
        # The row looks like data but does not fit the expected layout;
        # skip it instead of failing with AttributeError on None.
        continue
    # Map the regex groups to the table column names.
    tmp_result = {
        "row": search.group(1),
        "objectName": search.group(2),
        "EquJ2000_ra": search.group(3),
        "EquJ2000_dec": search.group(4),
        "objectType": search.group(5),
        "Velocity": search.group(6),
        "Redshift": search.group(7),
        "Qual": search.group(8),  # optional group: None when the column is empty
        "Filter": search.group(9),
        # Further columns (require the extended regex noted above):
        # "arcmin": search.group(10),
        # "Refs": search.group(11),
        # "Notes": search.group(12),
        # "Phot": search.group(13),
        # "Posn": search.group(14),
        # "Vel_z": search.group(15),
        # "Diam": search.group(16),
        # "Assoc": search.group(17),
        # "Images": search.group(18),
        # "Spectra": search.group(19),
    }
    # Append the result with the row number as key.
    n = int(search.group(1))
    result.append({n: tmp_result})

# pprint.pprint writes to stdout itself and returns None, so it must not be
# wrapped in another print.
pprint.pprint(result)
结果是:
[{1: {'EquJ2000_dec': u'-10d11m16s',
'EquJ2000_ra': u'01h34m54.1s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.346000',
'Velocity': u'>30000',
'objectName': u'GMBCG J023.72560-10.18783',
'objectType': u'GClstr',
'row': u'1'}},
{2: {'EquJ2000_dec': u'-10d14m14s',
'EquJ2000_ra': u'01h34m40.5s',
'Filter': u'...',
'Qual': None,
'Redshift': u'...',
'Velocity': u'...',
'objectName': u'SDSSCGB 21433',
'objectType': u'GGroup',
'row': u'2'}},
{3: {'EquJ2000_dec': u'-10d17m43s',
'EquJ2000_ra': u'01h34m38.0s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.372800',
'Velocity': u'>30000',
'objectName': u'WHL J013438.0-101743',
'objectType': u'GClstr',
'row': u'3'}},
{4: {'EquJ2000_dec': u'-10d20m25s',
'EquJ2000_ra': u'01h34m37.4s',
'Filter': u'...',
'Qual': None,
'Redshift': u'...',
'Velocity': u'...',
'objectName': u'SDSSCGB 18836',
'objectType': u'GGroup',
'row': u'4'}},
{5: {'EquJ2000_dec': u'-10d04m10s',
'EquJ2000_ra': u'01h34m37.1s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.336000',
'Velocity': u'>30000',
'objectName': u'GMBCG J023.65477-10.06935',
'objectType': u'GClstr',
'row': u'5'}},
{6: {'EquJ2000_dec': u'-10d12m32s',
'EquJ2000_ra': u'01h35m48.9s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.179000',
'Velocity': u'>30000',
'objectName': u'GMBCG J023.95379-10.20892',
'objectType': u'GClstr',
'row': u'6'}},
{7: {'EquJ2000_dec': u'-10d12m01s',
'EquJ2000_ra': u'01h34m07.6s',
'Filter': u'...',
'Qual': None,
'Redshift': u'...',
'Velocity': u'...',
'objectName': u'SDSSCGB 11439',
'objectType': u'GGroup',
'row': u'7'}},
{8: {'EquJ2000_dec': u'-10d10m11s',
'EquJ2000_ra': u'01h34m08.0s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.438000',
'Velocity': u'>30000',
'objectName': u'GMBCG J023.53330-10.16959',
'objectType': u'GClstr',
'row': u'8'}},
{9: {'EquJ2000_dec': u'-10d14m38s',
'EquJ2000_ra': u'01h34m04.8s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.321800',
'Velocity': u'>30000',
'objectName': u'WHL J013404.8-101438',
'objectType': u'GClstr',
'row': u'9'}},
{10: {'EquJ2000_dec': u'-10d02m22s',
'EquJ2000_ra': u'01h35m37.8s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.298000',
'Velocity': u'>30000',
'objectName': u'GMBCG J023.90759-10.03946',
'objectType': u'GClstr',
'row': u'10'}},
{11: {'EquJ2000_dec': u'-10d09m21s',
'EquJ2000_ra': u'01h36m00.4s',
'Filter': u'...',
'Qual': None,
'Redshift': u'...',
'Velocity': u'...',
'objectName': u'SDSSCGB 20022',
'objectType': u'GGroup',
'row': u'11'}},
{12: {'EquJ2000_dec': u'-10d09m27s',
'EquJ2000_ra': u'01h36m00.7s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.385000',
'Velocity': u'>30000',
'objectName': u'GMBCG J024.00318-10.15744',
'objectType': u'GClstr',
'row': u'12'}},
{13: {'EquJ2000_dec': u'-10d02m36s',
'EquJ2000_ra': u'01h35m57.1s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.297050',
'Velocity': u'>30000',
'objectName': u'MaxBCG J023.98788-10.04339',
'objectType': u'GClstr',
'row': u'13'}}]
请注意,当该列为空时(例如第 2 行),"Qual" 的值可能是 None。
我通过使用 requests
模块发送信息获取 URL 的查询结果,从而在 html
中获得了 table。现在我想使用 BeautifulSoup
从输出中得到一个 table:
import urllib, requests, re
from bs4 import BeautifulSoup
def find_between(s, first, last):
    """Return the part of ``s`` between ``first`` and the next ``last``.

    The search for ``last`` starts right after the end of the first
    occurrence of ``first``. An empty string is returned when either
    delimiter cannot be found.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
    except ValueError:
        # str.index raises ValueError when the substring is absent.
        return ""
    return s[start:end]
# Query parameters sent to the NED object-search CGI endpoint.
payload = {
    'lon': '1:35:00',
    'lat': '-10:13:00',
    'radius': '18.0',
    'hconst': '73',
    'omegam': '0.27',
    'omegav': '0.73',
    'search_type': 'Near Position Search',
    'in_equinox': 'J2000.0',
    'ot_include': 'ANY',
    'in_csys': 'Equatorial',
    'in_objtypes1': ['GClusters', 'GGroups'],
}
# NOTE(review): verify=False disables TLS certificate verification. It is kept
# from the original request; confirm it is still needed before relying on it.
r = requests.get('https://ned.ipac.caltech.edu/cgi-bin/objsearch', params=payload, verify=False)
print(r.url)
soup = BeautifulSoup(r.text, 'html.parser')
tables = soup.table

# Find the column names: take the last <strong> element and use the
# second-to-last newline-separated chunk of its text as the header line.
header = soup.find_all('strong')[-1]
# Use "+" instead of "*": a pattern that can match the empty string makes
# re.split cut between every character on Python 3.7+.
columns = re.split(r'\n+', header.text)[-2]
mylist = re.split(r'\s+', columns)

# Store the column names in a list. Tokens from one containing "(" up to the
# one containing ")" are concatenated (without separators) into one name.
mycolumns = []
flag = 0  # 1 while inside a parenthesised group, 0 otherwise
grouped_name = u''  # accumulator; avoids shadowing the builtin ``object``
for element in mylist:
    if element != u'' and flag < 1 and '(' not in element:
        mycolumns.append(element)
    if '(' in element and flag < 1:
        grouped_name = element
        flag = 1
    if '(' not in element and flag > 0:
        grouped_name += element
        if ')' in element:
            # Closing token reached: the grouped name is complete.
            flag = 0
            mycolumns.append(grouped_name)
上面的代码大致能得到我想要的结果,但我觉得这不是最好的方法。网页查询结果如下:
Row Object Name EquJ2000.0 Object Velocity/Redshift Mag./ Separ. Number of Row
No. (* => Essential Note) RA DEC Type km/s z Qual Filter arcmin Refs Notes Phot Posn Vel/z Diam Assoc Images Spectra No.
1 GMBCG J023.72560-10.18783 01h34m54.1s -10d11m16s GClstr >30000 0.346000 PHOT ... 2.252 1 0 0 0 1 0 0 Retrieve Retrieve 1
2 SDSSCGB 21433 01h34m40.5s -10d14m14s GGroup ... ... ... 4.956 1 0 0 0 0 0 0 Retrieve Retrieve 2
3 WHL J013438.0-101743 01h34m38.0s -10d17m43s GClstr >30000 0.372800 PHOT ... 7.179 2 0 0 0 2 0 0 Retrieve Retrieve 3
4 SDSSCGB 18836 01h34m37.4s -10d20m25s GGroup ... ... ... 9.272 1 0 0 0 0 0 0 Retrieve Retrieve 4
5 GMBCG J023.65477-10.06935 01h34m37.1s -10d04m10s GClstr >30000 0.336000 PHOT ... 10.477 1 0 0 0 1 0 0 Retrieve Retrieve 5
6 GMBCG J023.95379-10.20892 01h35m48.9s -10d12m32s GClstr >30000 0.179000 PHOT ... 12.043 1 0 0 0 1 0 0 Retrieve Retrieve 6
7 SDSSCGB 11439 01h34m07.6s -10d12m01s GGroup ... ... ... 12.930 1 0 0 0 0 0 0 Retrieve Retrieve 7
8 GMBCG J023.53330-10.16959 01h34m08.0s -10d10m11s GClstr >30000 0.438000 PHOT ... 13.105 1 0 0 0 1 0 0 Retrieve Retrieve 8
9 WHL J013404.8-101438 01h34m04.8s -10d14m38s GClstr >30000 0.321800 PHOT ... 13.678 2 0 0 0 2 0 0 Retrieve Retrieve 9
10 GMBCG J023.90759-10.03946 01h35m37.8s -10d02m22s GClstr >30000 0.298000 PHOT ... 14.131 1 0 0 0 1 0 0 Retrieve Retrieve 10
11 SDSSCGB 20022 01h36m00.4s -10d09m21s GGroup ... ... ... 15.302 1 0 0 0 0 0 0 Retrieve Retrieve 11
12 GMBCG J024.00318-10.15744 01h36m00.7s -10d09m27s GClstr >30000 0.385000 PHOT ... 15.368 1 0 0 0 1 0 0 Retrieve Retrieve 12
13 MaxBCG J023.98788-10.04339 01h35m57.1s -10d02m36s GClstr >30000 0.297050 PHOT ... 17.479 2 0 0 0 1 0 0 Retrieve Retrieve 13
我只想从查询的前八列中提取信息,但使用 BeautifulSoup
完成它并不是很简单。我将不胜感激任何建议。
你无法直接用 BeautifulSoup 解析出各列,因为网站返回的结果不是格式良好的 HTML(至少你想要提取的那部分不是)。
整个 table 的内容基本上是以纯文本形式放在一个 table/tr/td/pre
元素内。
如果你想使用正则表达式(如果数据格式发生变化,这种方法可能不稳定),可以采用下面的方法(基于你当前的代码):
# coding: utf-8
import requests, re, pprint
from bs4 import BeautifulSoup
# Query parameters sent to the NED object-search CGI endpoint.
payload = {
    'lon': '1:35:00',
    'lat': '-10:13:00',
    'radius': '18.0',
    'hconst': '73',
    'omegam': '0.27',
    'omegav': '0.73',
    'search_type': 'Near Position Search',
    'in_equinox': 'J2000.0',
    'ot_include': 'ANY',
    'in_csys': 'Equatorial',
    'in_objtypes1': ['GClusters', 'GGroups'],
}
# NOTE(review): verify=False disables TLS certificate verification. It is kept
# from the original request; confirm it is still needed before relying on it.
r = requests.get('https://ned.ipac.caltech.edu/cgi-bin/objsearch', params=payload, verify=False)
print(r.url)
soup = BeautifulSoup(r.text, 'html.parser')
tables = soup.table
text = tables.text
rows = text.split("\n")

# Data rows start with a row number followed by whitespace.
row_start = re.compile(r"^\d+\s")
# One capture group per column, up to and including "Filter".
# For all columns, append this part to the regex:
# \s+([-.\d]+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\w+)\s+(\w+)\s+(\d+)
row_pattern = re.compile(
    r"(\d+)\s+(.+?)\s+(\d+h\S+)\s+([-\w]+)\s+(\w+)\s+(\.{3}|[<>\d]+)\s+(\.{3}|[\d.]+)\s+(\w+)?\s+\s+(\.{3}|\w+)"
)

result = []
for row in rows:
    if not row_start.match(row):
        continue
    row = row.replace(u'\xa0', u' ')  # normalize non-breaking spaces
    search = row_pattern.match(row)
    if search is None:
        # The row looks like data but does not fit the expected layout;
        # skip it instead of failing with AttributeError on None.
        continue
    # Map the regex groups to the table column names.
    tmp_result = {
        "row": search.group(1),
        "objectName": search.group(2),
        "EquJ2000_ra": search.group(3),
        "EquJ2000_dec": search.group(4),
        "objectType": search.group(5),
        "Velocity": search.group(6),
        "Redshift": search.group(7),
        "Qual": search.group(8),  # optional group: None when the column is empty
        "Filter": search.group(9),
        # Further columns (require the extended regex noted above):
        # "arcmin": search.group(10),
        # "Refs": search.group(11),
        # "Notes": search.group(12),
        # "Phot": search.group(13),
        # "Posn": search.group(14),
        # "Vel_z": search.group(15),
        # "Diam": search.group(16),
        # "Assoc": search.group(17),
        # "Images": search.group(18),
        # "Spectra": search.group(19),
    }
    # Append the result with the row number as key.
    n = int(search.group(1))
    result.append({n: tmp_result})

# pprint.pprint writes to stdout itself and returns None, so it must not be
# wrapped in another print.
pprint.pprint(result)
结果是:
[{1: {'EquJ2000_dec': u'-10d11m16s',
'EquJ2000_ra': u'01h34m54.1s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.346000',
'Velocity': u'>30000',
'objectName': u'GMBCG J023.72560-10.18783',
'objectType': u'GClstr',
'row': u'1'}},
{2: {'EquJ2000_dec': u'-10d14m14s',
'EquJ2000_ra': u'01h34m40.5s',
'Filter': u'...',
'Qual': None,
'Redshift': u'...',
'Velocity': u'...',
'objectName': u'SDSSCGB 21433',
'objectType': u'GGroup',
'row': u'2'}},
{3: {'EquJ2000_dec': u'-10d17m43s',
'EquJ2000_ra': u'01h34m38.0s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.372800',
'Velocity': u'>30000',
'objectName': u'WHL J013438.0-101743',
'objectType': u'GClstr',
'row': u'3'}},
{4: {'EquJ2000_dec': u'-10d20m25s',
'EquJ2000_ra': u'01h34m37.4s',
'Filter': u'...',
'Qual': None,
'Redshift': u'...',
'Velocity': u'...',
'objectName': u'SDSSCGB 18836',
'objectType': u'GGroup',
'row': u'4'}},
{5: {'EquJ2000_dec': u'-10d04m10s',
'EquJ2000_ra': u'01h34m37.1s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.336000',
'Velocity': u'>30000',
'objectName': u'GMBCG J023.65477-10.06935',
'objectType': u'GClstr',
'row': u'5'}},
{6: {'EquJ2000_dec': u'-10d12m32s',
'EquJ2000_ra': u'01h35m48.9s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.179000',
'Velocity': u'>30000',
'objectName': u'GMBCG J023.95379-10.20892',
'objectType': u'GClstr',
'row': u'6'}},
{7: {'EquJ2000_dec': u'-10d12m01s',
'EquJ2000_ra': u'01h34m07.6s',
'Filter': u'...',
'Qual': None,
'Redshift': u'...',
'Velocity': u'...',
'objectName': u'SDSSCGB 11439',
'objectType': u'GGroup',
'row': u'7'}},
{8: {'EquJ2000_dec': u'-10d10m11s',
'EquJ2000_ra': u'01h34m08.0s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.438000',
'Velocity': u'>30000',
'objectName': u'GMBCG J023.53330-10.16959',
'objectType': u'GClstr',
'row': u'8'}},
{9: {'EquJ2000_dec': u'-10d14m38s',
'EquJ2000_ra': u'01h34m04.8s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.321800',
'Velocity': u'>30000',
'objectName': u'WHL J013404.8-101438',
'objectType': u'GClstr',
'row': u'9'}},
{10: {'EquJ2000_dec': u'-10d02m22s',
'EquJ2000_ra': u'01h35m37.8s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.298000',
'Velocity': u'>30000',
'objectName': u'GMBCG J023.90759-10.03946',
'objectType': u'GClstr',
'row': u'10'}},
{11: {'EquJ2000_dec': u'-10d09m21s',
'EquJ2000_ra': u'01h36m00.4s',
'Filter': u'...',
'Qual': None,
'Redshift': u'...',
'Velocity': u'...',
'objectName': u'SDSSCGB 20022',
'objectType': u'GGroup',
'row': u'11'}},
{12: {'EquJ2000_dec': u'-10d09m27s',
'EquJ2000_ra': u'01h36m00.7s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.385000',
'Velocity': u'>30000',
'objectName': u'GMBCG J024.00318-10.15744',
'objectType': u'GClstr',
'row': u'12'}},
{13: {'EquJ2000_dec': u'-10d02m36s',
'EquJ2000_ra': u'01h35m57.1s',
'Filter': u'...',
'Qual': u'PHOT',
'Redshift': u'0.297050',
'Velocity': u'>30000',
'objectName': u'MaxBCG J023.98788-10.04339',
'objectType': u'GClstr',
'row': u'13'}}]
请注意,当该列为空时(例如第 2 行),"Qual" 的值可能是 None。