Loop over each line from a file in Python
I made this code that reads UpdatedUrls.tmp, but the file contains only one URL.
The code scrapes websites looking for emails, but if I put 2 or more addresses in this file, the code does not work and emails.txt ends up empty.
I need a loop, or to change something, so that I can get two or more emails from the URLs.
Contents of UpdatedUrls.tmp:
https://mobissom.com.br/contato/
What I need it to work with:
https://mobissom.com.br/contato/
https://www.site2.com
https://www.site3.com
The code is here:
import re
import requests
from urllib.parse import urlsplit
from collections import deque
from bs4 import BeautifulSoup
import pandas as pd
with open("updatedUrls.tmp", "r") as smails:
    original_url = smails.readlines()
    original_url = ''.join(original_url)

# to save urls to be scraped
unscraped = deque([original_url])

# to save scraped urls
scraped = set()

# to save fetched emails
emails = set()

while len(unscraped):
    # move unscraped_url to scraped_urls set
    url = unscraped.popleft()  # popleft(): remove and return an element from the left side of the deque
    scraped.add(url)

    parts = urlsplit(url)
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    if '/' in parts.path:
        path = url[:url.rfind('/') + 1]
    else:
        path = url

    print("Crawling URL %s" % url)
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
        continue

    new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.com", response.text, re.I))
    emails.update(new_emails)

    soup = BeautifulSoup(response.text, 'lxml')
    for anchor in soup.find_all("a"):
        if "href" in anchor.attrs:
            link = anchor.attrs["href"]
        else:
            link = ''

        if link.startswith('/'):
            link = base_url + link
        elif not link.startswith('http'):
            link = path + link

        if not link.endswith(".gz"):
            if link not in unscraped and link not in scraped:
                unscraped.append(link)

df = pd.DataFrame(emails, columns=None)
df.to_csv('email.txt', index=False)

with open('email.txt', 'r') as fin:
    data = fin.read().splitlines(True)
with open('email.txt', 'w') as fout:
    fout.writelines(data[1:])
You are reading the entire contents of the file into original_url as a single string, because you join the lines back together immediately after reading them.
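A quick way to see the difference in an interpreter (the URLs below are just placeholders, not taken from your file):

lines = ["https://site1.example\n", "https://site2.example\n"]   # what readlines() returns
print(''.join(lines))
# one string with embedded newlines: 'https://site1.example\nhttps://site2.example\n'
print("https://site1.example\nhttps://site2.example\n".splitlines())
# a list of separate URLs: ['https://site1.example', 'https://site2.example']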
Change
with open("updatedUrls.tmp", "r") as smails:
    original_url = smails.readlines()
    original_url = ''.join(original_url)  # This joins all lines into one string

unscraped = deque([original_url])  # original_url is a string
# unscraped = deque(['url1\nurl2\nurl3'])
into
with open("updatedUrls.tmp", "r") as smails:
    # Gets rid of trailing newlines
    original_url = smails.read().splitlines()

unscraped = deque(original_url)  # original_url is a list of URLs
# unscraped = deque(['url1', 'url2', 'url3'])
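If the .tmp file might also contain blank lines or stray whitespace (just an assumption; yours may be clean), a slightly more defensive read would be:

from collections import deque

with open("updatedUrls.tmp", "r") as smails:
    # strip surrounding whitespace and skip empty lines
    original_url = [line.strip() for line in smails if line.strip()]

unscraped = deque(original_url)  # e.g. deque(['url1', 'url2', 'url3'])

With either version, the rest of your crawler stays unchanged: popleft() now pulls one real URL at a time, so requests.get(url) is called once per address instead of once on the whole joined string.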