URL extracted with Beautiful Soup from an href tag is not recognized by urllib2
I am learning Python and Beautiful Soup, and as an exercise I scraped a test web page. My objective is to extract a url from the page, then follow that url and extract another url.
My code is as follows:
Step 1:
import re
import urllib2
from bs4 import BeautifulSoup as bs

path = "http://python-data.dr-chuck.net/known_by_Fikret.html"
pattern = re.compile(r'"(.+)"')
page = urllib2.urlopen(path)
soup = bs(page, 'lxml')
a = soup.find_all("a")
path = re.search(pattern, str(a[2])).group(0)
path
Output:
'"http://python-data.dr-chuck.net/known_by_Montgomery.html"'
Step 2:
page = urllib2.urlopen(path)
soup = bs(page, 'lxml')
a = soup.find_all("a")
path = re.search(pattern, str(a[2])).group(0)
path
Output:
---------------------------------------------------------------------------
URLError Traceback (most recent call last)
<ipython-input-33-14ad9508aea0> in <module>()
----> 1 page = urllib2.urlopen(path)
2 soup = bs(page, 'lxml')
3 a = soup.find_all("a")
4 path = re.search(pattern, str(a[2])).group(0)
5 path
C:\users\alex\Anaconda2\lib\urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
152 else:
153 opener = _opener
--> 154 return opener.open(url, data, timeout)
155
156 def install_opener(opener):
C:\users\alex\Anaconda2\lib\urllib2.pyc in open(self, fullurl, data, timeout)
427 req = meth(req)
428
--> 429 response = self._open(req, data)
430
431 # post-process response
C:\users\alex\Anaconda2\lib\urllib2.pyc in _open(self, req, data)
450
451 return self._call_chain(self.handle_open, 'unknown',
--> 452 'unknown_open', req)
453
454 def error(self, proto, *args):
C:\users\alex\Anaconda2\lib\urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
405 func = getattr(handler, meth_name)
406
--> 407 result = func(*args)
408 if result is not None:
409 return result
C:\users\alex\Anaconda2\lib\urllib2.pyc in unknown_open(self, req)
1264 def unknown_open(self, req):
1265 type = req.get_type()
-> 1266 raise URLError('unknown url type: %s' % type)
1267
1268 def parse_keqv_list(l):
URLError: <urlopen error unknown url type: "http>
Why doesn't urlopen recognize the url?
Any advice would be appreciated.
Use .group(1) when retrieving the regex match. .group(0) returns the entire matched string, including the quotes.
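A minimal sketch of the difference, run against a made-up anchor tag (the example.com tag is hypothetical; the pattern is the one from the question):
import re

pattern = re.compile(r'"(.+)"')
m = re.search(pattern, '<a href="http://example.com/page.html">x</a>')
print m.group(0)  # '"http://example.com/page.html"' - whole match, quotes included
print m.group(1)  # 'http://example.com/page.html'  - just the captured group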
Alternatively, strip the quotes after the fact:
path.strip('"')
Output:
'http://python-data.dr-chuck.net/known_by_Montgomery.html'
The url is incorrect: just remove the " characters from it, or adjust your regular expression. My guess is that the problem is the extra quotes in path:
'"http://python-data.dr-chuck.net/known_by_Montgomery.html"'
Trim them off with strip(), like so:
path = path.strip('"')
page = urllib2.urlopen(path)
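If you would rather adjust the regular expression instead, one option (a sketch, assuming the same a[2] tag from the question) is to match only the href value and take the capture group, which excludes the quotes:
import re

# capture everything between the quotes of the href attribute
pattern = re.compile(r'href="([^"]+)"')
path = re.search(pattern, str(a[2])).group(1)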
You can use BeautifulSoup to extract the href from the anchor tag directly; you don't need a regular expression for this.
Example:
>>> from bs4 import BeautifulSoup as bs
>>> html = """<a href="http://www.google.com">"""
>>> soup = bs(html, 'lxml')
>>> soup.find_all('a')[0]['href']
'http://www.google.com'
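One design note: indexing the tag like a dict (tag['href']) raises a KeyError if the attribute is missing, while tag.get('href') returns None instead, which is safer when looping over every anchor on a page.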
Your problem is caused by the " inside the url. Remove it.
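To see where the error message comes from: urllib2 picks off the scheme with urllib's splittype helper, and the leading quote becomes part of the scheme, which no handler recognizes. A quick illustration (Python 2):
from urllib import splittype  # the helper urllib2 uses to find the scheme

print splittype('"http://python-data.dr-chuck.net/known_by_Montgomery.html"')
# ('"http', '//python-data.dr-chuck.net/known_by_Montgomery.html"')
# the scheme is '"http', hence the error: unknown url type: "http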
But BeautifulSoup has its own way to get the url: a[2]['href'].
from bs4 import BeautifulSoup as bs
import urllib2
# - first page -
path = "http://python-data.dr-chuck.net/known_by_Fikret.html"
page = urllib2.urlopen(path)
soup = bs(page, 'lxml')
all_links = soup.find_all("a")
#for link in all_links:
# print link['href']
print all_links[2]['href']
# - second page -
path = all_links[2]['href']
page = urllib2.urlopen(path)
soup = bs(page, 'lxml')
all_links = soup.find_all("a")
#for link in all_links:
# print link['href']
print all_links[2]['href']
Or, shorter:
from bs4 import BeautifulSoup as bs
import urllib2
def get_url(path):
    page = urllib2.urlopen(path)
    soup = bs(page, 'lxml')
    all_links = soup.find_all("a")
    #for link in all_links:
    #    print link['href']
    return all_links[2]['href']
# - first page -
path = "http://python-data.dr-chuck.net/known_by_Fikret.html"
path = get_url(path)
print path
# - second page -
path = get_url(path)
print path
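If the exercise calls for following the link more than twice, the same helper can simply be looped (a sketch; the hop count of 4 is arbitrary):
# follow the 3rd link on each page for several hops
path = "http://python-data.dr-chuck.net/known_by_Fikret.html"
for _ in range(4):
    path = get_url(path)
    print path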