How to Extract or Remove Multiple Lists from Master List Using Python
import requests
from bs4 import BeautifulSoup

url = 'https://www.isitwp.com/hosting-reviews/'

r = requests.get(url)
soup = BeautifulSoup(r.content, features='lxml')

items = soup.find_all('div', class_='entry-content')
for item in items:
    productlist = []
    all_links = []
    all_coupon = []
    all_review = []

    # FIND ALL LINKS ON PAGE
    all_links_counter = 0
    for link in soup.find_all('a', href=True):
        current_link = link['href']
        all_links.append(current_link)

    # Remove Duplicate Urls
    s = []
    for i in all_links:
        if i not in s:
            all_links_counter = all_links_counter + 1
            s.append(i)
    #print('All Links Found:', all_links_counter, ' times with Urls:', all_links)

    # FIND URL CONTAINING: coupon
    coupon_counter = 0
    for link in soup.find_all('a', href=True):
        if 'coupon' in link.get('href'):
            coupon = link.get('href')
            all_coupon.append(coupon)

    # Remove Duplicate Urls
    s = []
    for i in all_coupon:
        if i not in s:
            coupon_counter = coupon_counter + 1
            s.append(i)
    print('coupon Found:', coupon_counter, ' times with Urls:', all_coupon)

    # FIND URL CONTAINING: review
    review_counter = 0
    for link in soup.find_all('a', href=True):
        if 'review' in link.get('href'):
            review = link.get('href')
            all_review.append(review)

    # Remove Duplicate Urls
    s = []
    for i in all_review:
        if i not in s:
            review_counter = review_counter + 1
            s.append(i)
    print('review Found:', review_counter, ' times with Urls:', all_review)
    print()

    product = {
        'All Links Counter': all_links_counter,
        'All Links': ', '.join(all_links),
        'coupon Counter': coupon_counter,
        'coupon Links': ', '.join(all_coupon),
        'review Counter': review_counter,
        'review Links': ', '.join(all_review),
    }

    productlist.append(product)
    print('Product List:\n', product)
The script above scrapes the URLs embedded in the page and stores the results as follows:
- all_links - all links on the page
- all_coupon - all links containing the word coupon
- all_review - all links containing the word review

I now need a way to work out the balance (leftover) URLs, along the lines of:
balance_urls = all_links - (all_coupon + all_review)
Any help would be appreciated.
Use a set difference:
balance_urls = list(set(all_links).difference(set(all_coupon+all_review)))
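For example, with some made-up lists (hypothetical values, not taken from the page), the set-based version behaves like this:

# Hypothetical data, purely to illustrate the set-based approach.
all_links = ['https://example.com/', 'https://example.com/coupon',
             'https://example.com/review', 'https://example.com/',
             'https://example.com/pricing']
all_coupon = ['https://example.com/coupon']
all_review = ['https://example.com/review']

balance_urls = list(set(all_links).difference(set(all_coupon + all_review)))
print(balance_urls)
# Possible output: ['https://example.com/pricing', 'https://example.com/']
# The duplicate collapses to a single entry and the order is not guaranteed,
# because sets are unordered.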
Or, if you want to keep duplicates, a list comprehension:
balance_urls = [l for l in all_links if l not in (all_coupon+all_review)]
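If you drop this into the loop from your question, it may also be worth building the exclusion set once (the name excluded below is just illustrative), so each membership check is a constant-time set lookup rather than a scan over the concatenated list. A minimal sketch:

# Build the set of URLs to exclude once, outside the membership test.
excluded = set(all_coupon) | set(all_review)

# Keeps duplicates and the original order of all_links.
balance_urls = [link for link in all_links if link not in excluded]

# Mirrors the print style used for coupon/review above; counts unique URLs.
balance_counter = len(set(balance_urls))
print('balance Found:', balance_counter, ' times with Urls:', balance_urls)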