Scrapy recursive parse: what am I doing wrong here?
I am trying to scrape the list view of an aspx site. The structure of every page is identical, so I call the spider's parse method recursively to walk through the pages.
Error: ERROR: Spider must return Request, BaseItem or None, got 'list'
I am not sure what this error means.
I am doing something wrong, probably something very basic, but I can't spot it... please point me in the right direction. Thanks.
My code:
name = "XYZscraper"
allowed_domains = ["xyz.com"]
def __init__(self):
self.start_urls = [
"xyz.com with aspx list viwe",
]
def parse(self, response):
sel = Selector(response)
if sel.xpath('//table/tr/td/form/table/tr'):
print "xpath is present"
elements = sel.xpath('//table/tr/td/form/table/tr')
else:
print "xpath not present "
print " going in with fallback xpath"
elements = sel.xpath('///table/tr')
counter = 1
nextPageAvailable = False # flat if netx page link is available or not
base_url = "xyz.com/"
try:
items = response.meta['item']
except Exception as e:
items = []
pass
no_of_row = len(elements)
for each_row in elements:
#first two row and last two row does not have data
#first and last row have link to previous and next page ...using first row for navigation
if counter == 1:
if each_row.xpath('td/a[1]/text()').extract()[0] == "Previous":
if each_row.xpath('td/a[2]/text()'):
if each_row.xpath('td/a[2]/text()').extract()[0] == "Next":
nextPageAvailable = True
elif each_row.xpath('td/a[1]/text()').extract()[0] == "Next":
nextPageAvailable = True
if counter > 2:
if counter < (no_of_row - 1):
item = myItem()
item['title'] = each_row.xpath('td/div/a/span/text()').extract()[0].encode('ascii', 'ignore') # Title
items.append(item)
counter += 1
if nextPageAvailable:
yield FormRequest.from_response(
response,
meta={'item': items},
formnumber=1,
formdata={
'__EVENTTARGET': 'ctl00$ctl10$EventsDG$ctl01$ctl01', #for request to navigate to next page in table
},
callback=self.parse # calling recursive function since signature of page will remain same just data is refreshed
)
else:
# when end of the list is arrived ...calling next functin to pop item ..may be !! does not work !!
self.popItems(response)
# does not work
# Error: python < 3.3 does not allow return with argument inside the generator
# return item
def popItems(self, response):
print "i am here"
items = ()
baseitem = response.meta['item']
items = baseitem
return items
Perhaps you meant something like this:
else:
    for item in self.popItems(response):
        yield item
Or the shorter version (note that yield from requires Python 3.3+):
else:
    yield from self.popItems(response)
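For reference, here is a minimal sketch of the overall pattern the answer is pointing at: the callback yields each item (and the next-page request) one at a time, instead of handing Scrapy a bare Python list. The class names, item field, URL, XPaths and __EVENTTARGET value below are placeholders copied from the question, not a tested implementation:

# Minimal sketch, assuming a recent Scrapy; all selectors and form
# values are placeholders taken from the question.
import scrapy
from scrapy.http import FormRequest


class MyItem(scrapy.Item):
    title = scrapy.Field()


class XYZSpider(scrapy.Spider):
    name = "XYZscraper"
    allowed_domains = ["xyz.com"]
    start_urls = ["http://xyz.com/list.aspx"]  # placeholder URL

    def parse(self, response):
        rows = response.xpath('//table/tr/td/form/table/tr')

        # Yield one item per data row -- never a list of items.
        for row in rows[2:-2]:  # skip the navigation/header rows
            item = MyItem()
            item['title'] = row.xpath('td/div/a/span/text()').extract_first()
            yield item

        # If a "Next" link exists, post back to the same page and let
        # parse() handle the next page of the list view as well.
        if response.xpath('//a[text()="Next"]'):
            yield FormRequest.from_response(
                response,
                formnumber=1,
                formdata={'__EVENTTARGET': 'ctl00$ctl10$EventsDG$ctl01$ctl01'},
                callback=self.parse,
            )

With this structure there is no need to accumulate items in response.meta across pages at all: each page yields its own items as soon as they are parsed.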