Reading URLs from a file in Python
Unable to read the URLs from a txt file
I want to open the URLs stored in the txt file one by one, and use a regular expression to extract the title from each URL's page source.
Error message:
Traceback (most recent call last):
  File "Mypy.py", line 14, in <module>
    UrlsOpen = urllib2.urlopen(listSplit)
  File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python2.7/urllib2.py", line 420, in open
    req.timeout = timeout
AttributeError: 'list' object has no attribute 'timeout'
Mypy.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import requests
import urllib2
import threading
UrlListFile = open("Url.txt","r")
UrlListRead = UrlListFile.read()
UrlListFile.close()
listSplit = UrlListRead.split('\r\n')
UrlsOpen = urllib2.urlopen(listSplit)
ReadSource = UrlsOpen.read().decode('utf-8')
regex = '<title.*?>(.+?)</title>'
comp = re.compile(regex)
links = re.findall(comp,ReadSource)
for i in links:
    SaveDataFiles = open("SaveDataMyFile.txt","w")
    SaveDataFiles.write(i)
    SaveDataFiles.close()
When you call urllib2.urlopen(listSplit), you are passing a list, but urlopen() expects a string or a Request object. The simple fix is to iterate over listSplit and pass each URL to urlopen() individually.
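A minimal sketch of that fix, assuming Url.txt holds one URL per line:

import urllib2

with open("Url.txt", "r") as f:
    urls = f.read().splitlines()

for url in urls:
    response = urllib2.urlopen(url)  # urlopen() accepts a single string or Request, not a list
    source = response.read().decode('utf-8')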
Also, re.findall() returns a list of matches for each ReadSource it searches. You can handle this in a couple of ways:
I chose to handle it by simply building a list of lists:
websites = [[link, link], [link], [link, link, link]]
and iterating over both lists (the outer one and each inner one). That way you can do something specific with each website's list of links (write them to different files, etc.).
You could also flatten the websites list so that it contains just the links, rather than nested lists of links:
links = [link, link, link, link]
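A short sketch of that flattening step (the sample values here are hypothetical):

from itertools import chain

websites = [["title-a", "title-b"], ["title-c"]]
links = list(chain.from_iterable(websites))
# links is now ['title-a', 'title-b', 'title-c']
# equivalently: links = [link for website in websites for link in website]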
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib2
from pprint import pprint

# Read one URL per line; splitlines() handles both \n and \r\n endings
UrlListFile = open("Url.txt", "r")
UrlListRead = UrlListFile.read()
UrlListFile.close()
listSplit = UrlListRead.splitlines()
pprint(listSplit)

regex = '<title.*?>(.+?)</title>'
comp = re.compile(regex)

# One list of matches per website: [[link, ...], [link, ...], ...]
websites = []
for url in listSplit:
    UrlsOpen = urllib2.urlopen(url)  # open each URL individually
    ReadSource = UrlsOpen.read().decode('utf-8')
    websites.append(re.findall(comp, ReadSource))

# The with statement closes the file automatically; no explicit close() needed
with open("SaveDataMyFile.txt", "w") as SaveDataFiles:
    for website in websites:
        for link in website:
            pprint(link)
            SaveDataFiles.write(link.encode('utf-8'))
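For anyone on Python 3, a rough equivalent as an untested sketch: urllib2 was merged into urllib.request, and strings are unicode by default, so the encode/decode handling changes slightly.

import re
import urllib.request

with open("Url.txt", "r") as f:
    urls = f.read().splitlines()

comp = re.compile(r'<title.*?>(.+?)</title>')

with open("SaveDataMyFile.txt", "w", encoding="utf-8") as out:
    for url in urls:
        source = urllib.request.urlopen(url).read().decode("utf-8")
        for title in comp.findall(source):
            out.write(title + "\n")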