Scrapy IOError: [Errno 22] invalid mode ('wb') or filename
Scrapy IOError: [Errno 22] invalid mode ('wb') or filename
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
import os
from Erowid.items import ErowidItem
import codecs
class ExperiencesSpider(CrawlSpider):
    """Spider that crawls Erowid experience reports and saves each report's text.

    Starting from the experience index page, it follows the per-substance
    listing pages and hands every individual report page to ``parse_item``,
    which writes the report text to a ``.txt`` file under
    ``Erowid/archive/<substance>/``.
    """

    name = "experiences"
    allowed_domains = ["www.erowid.org"]
    start_urls = ['https://www.erowid.org/experiences/exp_list.shtml']
    rules = [
        # Per-substance listing pages: followed, but not parsed themselves.
        Rule(LinkExtractor(allow =('subs/exp_[a-zA-Z]+.shtml')), follow = True),
        # Individual experience reports: parsed by parse_item.
        Rule(LinkExtractor(allow=r'/experiences/exp\.php\?ID=\d+$'),callback='parse_item', follow = True)
    ]

    def parse_item(self, response):
        """Extract one experience report and write its text to a file.

        Yields an ``ErowidItem`` with ``Author``, ``Title``, ``Substance``
        and ``Text`` filled in from the page.
        """
        selectors = response.css('div')
        for selector in selectors:
            experience = ErowidItem()
            # NOTE(review): these XPath expressions start with "//", so they
            # search the whole document rather than only inside `selector`.
            experience['Author'] = selector.xpath('//div[@class="author"]/a/text()').extract()
            experience['Title'] = selector.xpath('//div[@class="title"]/text()').extract()
            experience['Substance'] = selector.xpath('//div[@class="substance"]/text()').extract()
            experience['Text'] = selector.xpath("//div[@class = 'report-text-surround']/text()").extract()
            # A literal backslash has to be written as '\\'; a lone '\'
            # escapes the closing quote and leaves the string unterminated.
            # NOTE(review): str() is applied to the extracted *list*, so the
            # result is the list's printed form and [0] further down is its
            # first character, not the title. This is the behaviour the
            # question is asking about and is intentionally left as posted.
            experience['Title'] = str(experience['Title']).replace('\\' , "")
            experience['Title'] = str(experience['Title']).replace('?' , "")
            directory = os.path.join('Erowid/archive/',experience['Substance'][0].strip().lower())
            filename = os.path.join(directory,experience['Title'][0]+'.txt')
            if not os.path.exists(directory):
                os.makedirs(directory)
            with codecs.open(filename, encoding = 'utf-8', mode= 'wb') as fid:
                for symbols in experience['Text']:
                    fid.write(symbols)
            yield experience
我正在尝试抓取 Erowid。到目前为止,这段代码会创建一个目录,并以体验的标题作为文件名,把体验的正文写入文件。(这里所说的“体验”,指的是我从 Erowid 上抓取下来的体验报告。)
问题在于,某些体验的标题中含有 Windows 文件名不允许使用的保留字符,因此无法用这些标题创建文件。
我正在尝试删除所有这些保留字符
The following reserved characters:
<
(less than)
>
(greater than)
:
(colon)
"
(double quote)
/
(forward slash)
\
(backslash)
|
(vertical bar or pipe)
?
(question mark)
*
(asterisk)
我试着用下面这两行
来处理这些字符:
# A literal backslash must be written as '\\'; a lone '\' escapes the
# closing quote and leaves the string literal unterminated.
experience['Title'] = str(experience['Title']).replace('\\' , "")
experience['Title'] = str(experience['Title']).replace('?' , "")
(大部分错误来自“\”和“?”)但我仍然收到错误
IOError: [Errno 22] invalid mode ('wb') or filename: u'Erowid/archive/syrian rue\Meditative Help?.txt'
或其他一些我知道是错误的文件名,因为不应该有问号或反斜杠。
我做错了什么导致了这些错误?
您对特殊字符的替换没有生效。试试这个:
# Clean the title *before* joining it to the directory. Stripping
# backslashes from the joined path would also remove the separator that
# os.path.join inserts on Windows, and the file would end up outside
# `directory`. Assumes experience['Title'] is still the extracted list.
title = experience['Title'][0].replace('\\' , "").replace('?' , "")
filename = os.path.join(directory, title + '.txt')
更新
既然您只是需要一个合法的文件名,我想到了下面这种做法。
import string  # the snippet relies on string.ascii_letters

directory = os.path.join('Erowid/archive/',experience['Substance'][0].strip().lower())
# Substance is an extracted list, so take its first entry before
# concatenating it with the title string (list + string raises TypeError).
stem = experience['Substance'][0]+experience['Title'][0]
# only use ascii letters as file name
stem = "".join([i for i in stem if i in string.ascii_letters])
# Append the extension after filtering; otherwise the "." of ".txt" is
# filtered out together with the other non-letter characters.
filename = os.path.join(directory, stem + '.txt')
string.ascii_letters
The concatenation of the ascii_lowercase and ascii_uppercase constants described below. This value is not locale-dependent.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
import os
from Erowid.items import ErowidItem
import codecs
class ExperiencesSpider(CrawlSpider):
    """Spider that crawls Erowid experience reports and saves each report's text.

    Starting from the experience index page, it follows the per-substance
    listing pages and hands every individual report page to ``parse_item``,
    which writes the report text to a ``.txt`` file under
    ``Erowid/archive/<substance>/``.
    """

    name = "experiences"
    allowed_domains = ["www.erowid.org"]
    start_urls = ['https://www.erowid.org/experiences/exp_list.shtml']
    rules = [
        # Per-substance listing pages: followed, but not parsed themselves.
        Rule(LinkExtractor(allow =('subs/exp_[a-zA-Z]+.shtml')), follow = True),
        # Individual experience reports: parsed by parse_item.
        Rule(LinkExtractor(allow=r'/experiences/exp\.php\?ID=\d+$'),callback='parse_item', follow = True)
    ]

    def parse_item(self, response):
        """Extract one experience report and write its text to a file.

        Yields an ``ErowidItem`` with ``Author``, ``Title``, ``Substance``
        and ``Text`` filled in from the page.
        """
        selectors = response.css('div')
        for selector in selectors:
            experience = ErowidItem()
            # NOTE(review): these XPath expressions start with "//", so they
            # search the whole document rather than only inside `selector`.
            experience['Author'] = selector.xpath('//div[@class="author"]/a/text()').extract()
            experience['Title'] = selector.xpath('//div[@class="title"]/text()').extract()
            experience['Substance'] = selector.xpath('//div[@class="substance"]/text()').extract()
            experience['Text'] = selector.xpath("//div[@class = 'report-text-surround']/text()").extract()
            # A literal backslash has to be written as '\\'; a lone '\'
            # escapes the closing quote and leaves the string unterminated.
            # NOTE(review): str() is applied to the extracted *list*, so the
            # result is the list's printed form and [0] further down is its
            # first character, not the title. This is the behaviour the
            # question is asking about and is intentionally left as posted.
            experience['Title'] = str(experience['Title']).replace('\\' , "")
            experience['Title'] = str(experience['Title']).replace('?' , "")
            directory = os.path.join('Erowid/archive/',experience['Substance'][0].strip().lower())
            filename = os.path.join(directory,experience['Title'][0]+'.txt')
            if not os.path.exists(directory):
                os.makedirs(directory)
            with codecs.open(filename, encoding = 'utf-8', mode= 'wb') as fid:
                for symbols in experience['Text']:
                    fid.write(symbols)
            yield experience
我正在尝试抓取 Erowid。到目前为止,这段代码会创建一个目录,并以体验的标题作为文件名,把体验的正文写入文件。(这里所说的“体验”,指的是我从 Erowid 上抓取下来的体验报告。)
问题在于,某些体验的标题中含有 Windows 文件名不允许使用的保留字符,因此无法用这些标题创建文件。
我正在尝试删除所有这些保留字符
The following reserved characters:
<
(less than)
>
(greater than)
:
(colon)
"
(double quote)
/
(forward slash)
\
(backslash)
|
(vertical bar or pipe)
?
(question mark)
*
(asterisk)
我试着用下面这两行来处理这些字符:
experience['Title'] = str(experience['Title']).replace('\\' , "")
experience['Title'] = str(experience['Title']).replace('?' , "")
(大部分错误来自“\”和“?”)但我仍然收到错误
IOError: [Errno 22] invalid mode ('wb') or filename: u'Erowid/archive/syrian rue\Meditative Help?.txt'
或其他一些我知道是错误的文件名,因为不应该有问号或反斜杠。
我做错了什么导致了这些错误?
您对特殊字符的替换没有生效。试试这个:
# Clean the title *before* joining it to the directory. Stripping
# backslashes from the joined path would also remove the separator that
# os.path.join inserts on Windows, and the file would end up outside
# `directory`. Assumes experience['Title'] is still the extracted list.
title = experience['Title'][0].replace('\\' , "").replace('?' , "")
filename = os.path.join(directory, title + '.txt')
更新
既然您只是需要一个合法的文件名,我想到了下面这种做法。
import string  # the snippet relies on string.ascii_letters

directory = os.path.join('Erowid/archive/',experience['Substance'][0].strip().lower())
# Substance is an extracted list, so take its first entry before
# concatenating it with the title string (list + string raises TypeError).
stem = experience['Substance'][0]+experience['Title'][0]
# only use ascii letters as file name
stem = "".join([i for i in stem if i in string.ascii_letters])
# Append the extension after filtering; otherwise the "." of ".txt" is
# filtered out together with the other non-letter characters.
filename = os.path.join(directory, stem + '.txt')
string.ascii_letters
The concatenation of the ascii_lowercase and ascii_uppercase constants described below. This value is not locale-dependent.