Scrapy 剥离逗号
Scrapy Stripping Comma
import scrapy
import pandas as pd
from ..items import HomedepotpricespiderItem
from scrapy.http import Request
class HomedepotspiderSpider(scrapy.Spider):
name = 'homeDepotSpider'
allowed_domains = ['homedepot.com']
start_urls = ['https://www.homedepot.com/pep/304660691']#.format(omsID = omsID)
#for omsID in omsList]
def parse(self, response):
#call home depot function
for item in self.parseHomeDepot(response):
yield item
pass
def parseHomeDepot(self, response):
#get top level item
items = response.css('#zone-a-product')
for product in items:
item = HomedepotpricespiderItem()
#get SKU
productSKU = product.css('.product-info-bar__detail:nth-child(2)::text').getall()
#get rid of all the stuff i dont need
#productSKU = [x.strip(' ') for x in productSKU] #whiteSpace
#productSKU = [x.strip(',') for x in productSKU]
#productSKU = [x.strip('\n') for x in productSKU]
#productSKU = [x.strip('\t') for x in productSKU]
#productSKU = [x.strip(' Model# ') for x in productSKU] #gets rid of the model name
所以我的选择器没问题,它们得到了正确的字段。
当那些 strip 行被注释掉后运行程序时,我得到 'Model #,RA30'
然后,当我取消注释这些 strip 命令再运行程序时,我得到 ,RA30
我在终端中使用此命令运行我的程序:scrapy crawl homeDepotSpider -t csv -o - > "/Users/userName/Desktop/homeDepotv2Helpers/homeDepotTest.csv"
我上面的输出是直接从 CSV 文件中复制的
编辑*
我也试过这个
productSKU = [x.replace(' ,', '') for x in productSKU]
但这没有用。下面是终端的直接输出:{'productSKU': ['', 'RA30']}
strip 函数只会删除字符串开头或结尾的字符(其参数被当作一组要删除的字符,而不是一个完整的子串)。如果想删除字符串中任意位置的某个字符,请使用 replace 函数。
但是,如果您只想删除字符串开头或结尾的逗号,则应在 productSKU = [x.strip(' Model# ') for x in productSKU]
之后再次重复行 productSKU = [x.strip(',') for x in productSKU]
您为什么不想使用 XPath + 正则表达式?
product_model = response.xpath('//h2[@class="product-info-bar__detail"][contains(., "Model #")]/text()').re_first(r'#(.+)')
您的选择器为您提供了一个包含两个元素的列表:['Model #', 'RA30']。
要仅获取 SKU,只需使用索引:
productSKU = product.css('.product-info-bar__detail:nth-child(2)::text').getall()[1]
如果产品有可能没有 SKU,请确保正确处理异常。
import scrapy
import pandas as pd
from ..items import HomedepotpricespiderItem
from scrapy.http import Request
class HomedepotspiderSpider(scrapy.Spider):
name = 'homeDepotSpider'
allowed_domains = ['homedepot.com']
start_urls = ['https://www.homedepot.com/pep/304660691']#.format(omsID = omsID)
#for omsID in omsList]
def parse(self, response):
#call home depot function
for item in self.parseHomeDepot(response):
yield item
pass
def parseHomeDepot(self, response):
#get top level item
items = response.css('#zone-a-product')
for product in items:
item = HomedepotpricespiderItem()
#get SKU
productSKU = product.css('.product-info-bar__detail:nth-child(2)::text').getall()
#get rid of all the stuff i dont need
#productSKU = [x.strip(' ') for x in productSKU] #whiteSpace
#productSKU = [x.strip(',') for x in productSKU]
#productSKU = [x.strip('\n') for x in productSKU]
#productSKU = [x.strip('\t') for x in productSKU]
#productSKU = [x.strip(' Model# ') for x in productSKU] #gets rid of the model name
所以我的选择器没问题,它们得到了正确的字段。
当那些 strip 行被注释掉后运行程序时,我得到 'Model #,RA30'
然后,当我取消注释这些 strip 命令再运行程序时,我得到 ,RA30
我在终端中使用此命令运行我的程序:scrapy crawl homeDepotSpider -t csv -o - > "/Users/userName/Desktop/homeDepotv2Helpers/homeDepotTest.csv"
我上面的输出是直接从 CSV 文件中复制的
编辑*
我也试过这个
productSKU = [x.replace(' ,', '') for x in productSKU]
但这没有用。下面是终端的直接输出:{'productSKU': ['', 'RA30']}
strip 函数只会删除字符串开头或结尾的字符(其参数被当作一组要删除的字符,而不是一个完整的子串)。如果想删除字符串中任意位置的某个字符,请使用 replace 函数。
但是,如果您只想删除字符串开头或结尾的逗号,则应在 productSKU = [x.strip(' Model# ') for x in productSKU] 之后再次重复下面这一行:
productSKU = [x.strip(',') for x in productSKU]
您为什么不想使用 XPath + 正则表达式?
product_model = response.xpath('//h2[@class="product-info-bar__detail"][contains(., "Model #")]/text()').re_first(r'#(.+)')
您的选择器为您提供了一个包含两个元素的列表:['Model #', 'RA30']。
要仅获取 SKU,只需使用索引:
productSKU = product.css('.product-info-bar__detail:nth-child(2)::text').getall()[1]
如果产品有可能没有 SKU,请确保正确处理异常。