我应该如何从 data-* 属性中抓取数据?
How should I scrape data from data-* attributes?
我想编写一个抓取工具,能够从任意 HTML 标签的任意自定义 data 属性中提取磁力链接(magnet link)。例如,在 kickassto.cc 的网页上,磁力链接并没有放在锚标签(<a>)的 href 属性中,而是放在 div 标签的 data-sc-params 属性中,例如:
<a data-download rel="nofollow" class="kaGiantButton siteButton iconButton" title="Download verified torrent file" target="_blank"
href="/torrents/Download Something in the Woods 2016 HDRip XviD AC3-EVO Torrent">
<i class="ka ka-verify"></i>
<span>Download torrent</span></a>
<div data-sc-replace data-sc-slot="_b6f619f42a2411c6688f2273fa3f628a" class="inlineblock"
data-sc-params="{ 'magnet': 'magnet:?xt=urn:btih:CC75C59E9FE0E8689DFD21558C02E9C9F92AE714&dn=something+in+the+woods+2016+hdrip+xvid+ac3+evo&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Fglotorrents.pw%3A6969%2Fannounce&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce', 'extension': 'avi', 'stream': '' }"></div>
为了获取磁力链接,我编写了以下代码:
import requests
from bs4 import BeautifulSoup
import re
# Scrape magnet links from a user-supplied page, trying progressively
# broader BeautifulSoup searches until one of them returns something.
url = input("What is the address of the web page in question?")
# Here you would enter: https://kickassto.cc/something-in-the-woods-2016-hdrip-xvid-ac3-evo-t12972573.html
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# RE patterns ("?" is a regex metacharacter, therefore it has to be escaped):
magnet1 = re.compile(r"^magnet:\?xt=urn:btih:")  # attribute value starts with a magnet URI
magnet2 = re.compile(r"magnet:\?xt=urn:btih:")  # magnet URI anywhere inside the attribute value
whateverTagOrAttribute = re.compile(r".{1,40}")  # intended to match any tag/attribute name of up to forty characters
kickass = "data-sc-params"
# Intended to match "data-whatever.."; the "whatever" part of an attribute
# name is unlikely to be longer than 30 characters.
dataAttribute = re.compile(r"data.{1,30}")

# 1) Magnet link as the whole href of an <a> tag.
links = soup.find_all("a", attrs={"href": magnet1})

# 2) Magnet link anywhere inside the href of an <a> tag.
if not links:
    links = soup.find_all("a", attrs={"href": magnet2})

# 3) kickassto.cc webpages do not place their magnets in <a> tags, but hide
#    them in the data-sc-params attribute of <div> tags.
if not links:
    links = soup.find_all("div", attrs={"data-sc-params": magnet2})
    # Earlier attempt, kept for reference:
    # links = soup.find_all(whateverTagOrAttribute, attrs={whateverTagOrAttribute: magnet2})

# 4) Any tag with an exactly named data-sc-params attribute -- this works.
if not links:
    links = soup.find_all(whateverTagOrAttribute, attrs={"data-sc-params": magnet2})

# 5) Any tag with any data-* attribute -- this does NOT work.
#    NOTE(review): BeautifulSoup appears to look up each key of `attrs` as a
#    literal attribute name, so a compiled pattern used as a key is never
#    matched against the tag's attribute names -- confirm against the bs4 docs.
if not links:
    links = soup.find_all(whateverTagOrAttribute, attrs={dataAttribute: magnet2})

if links:
    print(f"The magnet links that we managed to scrape: {links}")
正如我在代码注释中所写的,只要指定确切的属性名“data-sc-params”,我就可以获取磁力链接。
我想做的是用正则表达式(RE)模式来推广我的方案,这样就不仅能从 data-sc-params 属性中抓取磁力链接,还能从任意 data-* 属性(最好是任意自定义属性)中抓取。遗憾的是,用 re.compile(r"data.{1,30}") 作为属性名时无法获取它们,我也不知道为什么。我哪里做错了?
您可以使用此脚本解析来自任意 HTML 属性的磁力链接:
import re
from bs4 import BeautifulSoup
# Sample HTML used to exercise the scraper: an <a> tag whose href is not a
# magnet link, a <div> whose data-sc-params attribute embeds a magnet link
# inside a quoted, JSON-like string, and a <div> whose some-attribute value
# is a magnet link on its own.
txt = '''
<a data-download rel="nofollow" class="kaGiantButton siteButton iconButton" title="Download verified torrent file" target="_blank"
href="/torrents/Download Something in the Woods 2016 HDRip XviD AC3-EVO Torrent">
<i class="ka ka-verify"></i>
<span>Download torrent</span></a>
<div data-sc-replace data-sc-slot="_b6f619f42a2411c6688f2273fa3f628a" class="inlineblock"
data-sc-params="{ 'magnet': 'magnet:?xt=urn:btih:CC75C59E9FE0E8689DFD21558C02E9C9F92AE714&dn=something+in+the+woods+2016+hdrip+xvid+ac3+evo&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Fglotorrents.pw%3A6969%2Fannounce&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce', 'extension': 'avi', 'stream': '' }"></div>
<div some-attribute="magnet:?xt=urn:btih:THIS IS OTHER LINK">
</div>
'''
# Parse the sample markup with BeautifulSoup using the 'html.parser' backend.
soup = BeautifulSoup(txt, 'html.parser')
# Captures a magnet URI up to (but not including) the next quote character,
# so a link embedded in a quoted, JSON-like attribute value is cut off at its
# closing quote.
r = re.compile(r'(magnet:\?xt=urn:btih:[^\'"]+)')


def find_magnet_link(t):
    """Return all magnet links found in the attribute values of tag ``t``.

    ``t`` is any object exposing an ``attrs`` mapping of attribute name to
    value (such as a BeautifulSoup tag).  Every string-valued attribute is
    scanned, whatever its name, and every magnet link inside it is collected
    in order of appearance.

    The returned list is empty when nothing matches, so the function can also
    be used directly as a truthy/falsy tag filter for ``soup.find_all``.
    """
    rv = []
    for value in t.attrs.values():
        # Multi-valued attributes (e.g. class, rel) are parsed into lists;
        # those, and any other non-string value, are skipped rather than
        # handed to the regex, which only accepts strings.
        if not isinstance(value, str):
            continue
        # findall (not search) so that several links in one value are all kept.
        rv.extend(r.findall(value))
    return rv
# Walk every tag in the document once (find_all(True) matches all tags) and
# print each magnet link found in its attributes. Tags without a magnet link
# yield an empty list and therefore print nothing, so the output is the same
# as filtering with find_magnet_link first, without scanning each tag twice.
for tag in soup.find_all(True):
    for link in find_magnet_link(tag):
        print(link)
打印:
magnet:?xt=urn:btih:CC75C59E9FE0E8689DFD21558C02E9C9F92AE714&dn=something+in+the+woods+2016+hdrip+xvid+ac3+evo&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Fglotorrents.pw%3A6969%2Fannounce&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce
magnet:?xt=urn:btih:THIS IS OTHER LINK
我想编写一个抓取工具,能够从任意 HTML 标签的任意自定义 data 属性中提取磁力链接(magnet link)。例如,在 kickassto.cc 的网页上,磁力链接并没有放在锚标签(<a>)的 href 属性中,而是放在 div 标签的 data-sc-params 属性中,例如:
<a data-download rel="nofollow" class="kaGiantButton siteButton iconButton" title="Download verified torrent file" target="_blank"
href="/torrents/Download Something in the Woods 2016 HDRip XviD AC3-EVO Torrent">
<i class="ka ka-verify"></i>
<span>Download torrent</span></a>
<div data-sc-replace data-sc-slot="_b6f619f42a2411c6688f2273fa3f628a" class="inlineblock"
data-sc-params="{ 'magnet': 'magnet:?xt=urn:btih:CC75C59E9FE0E8689DFD21558C02E9C9F92AE714&dn=something+in+the+woods+2016+hdrip+xvid+ac3+evo&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Fglotorrents.pw%3A6969%2Fannounce&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce', 'extension': 'avi', 'stream': '' }"></div>
为了获取磁力链接,我编写了以下代码:
import requests
from bs4 import BeautifulSoup
import re
# Scrape magnet links from a user-supplied page, trying progressively
# broader BeautifulSoup searches until one of them returns something.
url = input("What is the address of the web page in question?")
# Here you would enter: https://kickassto.cc/something-in-the-woods-2016-hdrip-xvid-ac3-evo-t12972573.html
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# RE patterns ("?" is a regex metacharacter, therefore it has to be escaped):
magnet1 = re.compile(r"^magnet:\?xt=urn:btih:")  # attribute value starts with a magnet URI
magnet2 = re.compile(r"magnet:\?xt=urn:btih:")  # magnet URI anywhere inside the attribute value
whateverTagOrAttribute = re.compile(r".{1,40}")  # intended to match any tag/attribute name of up to forty characters
kickass = "data-sc-params"
# Intended to match "data-whatever.."; the "whatever" part of an attribute
# name is unlikely to be longer than 30 characters.
dataAttribute = re.compile(r"data.{1,30}")

# 1) Magnet link as the whole href of an <a> tag.
links = soup.find_all("a", attrs={"href": magnet1})

# 2) Magnet link anywhere inside the href of an <a> tag.
if not links:
    links = soup.find_all("a", attrs={"href": magnet2})

# 3) kickassto.cc webpages do not place their magnets in <a> tags, but hide
#    them in the data-sc-params attribute of <div> tags.
if not links:
    links = soup.find_all("div", attrs={"data-sc-params": magnet2})
    # Earlier attempt, kept for reference:
    # links = soup.find_all(whateverTagOrAttribute, attrs={whateverTagOrAttribute: magnet2})

# 4) Any tag with an exactly named data-sc-params attribute -- this works.
if not links:
    links = soup.find_all(whateverTagOrAttribute, attrs={"data-sc-params": magnet2})

# 5) Any tag with any data-* attribute -- this does NOT work.
#    NOTE(review): BeautifulSoup appears to look up each key of `attrs` as a
#    literal attribute name, so a compiled pattern used as a key is never
#    matched against the tag's attribute names -- confirm against the bs4 docs.
if not links:
    links = soup.find_all(whateverTagOrAttribute, attrs={dataAttribute: magnet2})

if links:
    print(f"The magnet links that we managed to scrape: {links}")
正如我在代码注释中所写的,只要指定确切的属性名“data-sc-params”,我就可以获取磁力链接。 我想做的是用正则表达式(RE)模式来推广我的方案,这样就不仅能从 data-sc-params 属性中抓取磁力链接,还能从任意 data-* 属性(最好是任意自定义属性)中抓取。遗憾的是,用 re.compile(r"data.{1,30}") 作为属性名时无法获取它们,我也不知道为什么。我哪里做错了?
您可以使用此脚本解析来自任意 HTML 属性的磁力链接:
import re
from bs4 import BeautifulSoup
# Sample HTML used to exercise the scraper: an <a> tag whose href is not a
# magnet link, a <div> whose data-sc-params attribute embeds a magnet link
# inside a quoted, JSON-like string, and a <div> whose some-attribute value
# is a magnet link on its own.
txt = '''
<a data-download rel="nofollow" class="kaGiantButton siteButton iconButton" title="Download verified torrent file" target="_blank"
href="/torrents/Download Something in the Woods 2016 HDRip XviD AC3-EVO Torrent">
<i class="ka ka-verify"></i>
<span>Download torrent</span></a>
<div data-sc-replace data-sc-slot="_b6f619f42a2411c6688f2273fa3f628a" class="inlineblock"
data-sc-params="{ 'magnet': 'magnet:?xt=urn:btih:CC75C59E9FE0E8689DFD21558C02E9C9F92AE714&dn=something+in+the+woods+2016+hdrip+xvid+ac3+evo&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Fglotorrents.pw%3A6969%2Fannounce&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce', 'extension': 'avi', 'stream': '' }"></div>
<div some-attribute="magnet:?xt=urn:btih:THIS IS OTHER LINK">
</div>
'''
# Parse the sample markup with BeautifulSoup using the 'html.parser' backend.
soup = BeautifulSoup(txt, 'html.parser')
# Captures a magnet URI up to (but not including) the next quote character,
# so a link embedded in a quoted, JSON-like attribute value is cut off at its
# closing quote.
r = re.compile(r'(magnet:\?xt=urn:btih:[^\'"]+)')


def find_magnet_link(t):
    """Return all magnet links found in the attribute values of tag ``t``.

    ``t`` is any object exposing an ``attrs`` mapping of attribute name to
    value (such as a BeautifulSoup tag).  Every string-valued attribute is
    scanned, whatever its name, and every magnet link inside it is collected
    in order of appearance.

    The returned list is empty when nothing matches, so the function can also
    be used directly as a truthy/falsy tag filter for ``soup.find_all``.
    """
    rv = []
    for value in t.attrs.values():
        # Multi-valued attributes (e.g. class, rel) are parsed into lists;
        # those, and any other non-string value, are skipped rather than
        # handed to the regex, which only accepts strings.
        if not isinstance(value, str):
            continue
        # findall (not search) so that several links in one value are all kept.
        rv.extend(r.findall(value))
    return rv
# Walk every tag in the document once (find_all(True) matches all tags) and
# print each magnet link found in its attributes. Tags without a magnet link
# yield an empty list and therefore print nothing, so the output is the same
# as filtering with find_magnet_link first, without scanning each tag twice.
for tag in soup.find_all(True):
    for link in find_magnet_link(tag):
        print(link)
打印:
magnet:?xt=urn:btih:CC75C59E9FE0E8689DFD21558C02E9C9F92AE714&dn=something+in+the+woods+2016+hdrip+xvid+ac3+evo&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Fglotorrents.pw%3A6969%2Fannounce&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce
magnet:?xt=urn:btih:THIS IS OTHER LINK