Need help crawling multiple web pages with Scrapy in Python and following the next page from each of them, in a loop

Currently I am scraping several websites at the same time, and I also need to crawl each site's next page, where the link to that next page is taken from the page that was just scraped. So the crawl has to keep following the next page of every page. Note that the second page of each site has the same div content.

Spider.py

import scrapy

from ..items import MynewsItem  # assuming the standard Scrapy project layout


class UstodaySpider(scrapy.Spider):
    name = 'usatoday'

    start_urls = [
        'https://en.wikipedia.org/wiki/India',
        'https://en.wikipedia.org/wiki/USA',
    ]

    def parse(self, response):
        items = MynewsItem()
        print("**********************************")
        print(type(response))
        print(response.url)

        all_section = response.css('a.gnt_m_flm_a')

        for quote in all_section:
            news_provider_id = '14'
            news_title = quote.css('a.gnt_m_flm_a').xpath("text()").extract()
            news_details = quote.css('a.gnt_m_flm_a').xpath("@data-c-br").extract()
            news_image = quote.css("img.gnt_m_flm_i").xpath("@data-gl-srcset").extract()
            news_page_url = quote.css('a.gnt_m_flm_a').xpath("@href").extract()

            items['news_provider_id'] = news_provider_id
            items['news_title'] = news_title
            items['news_details'] = news_details
            items['news_image'] = news_image
            items['news_page_url'] = news_page_url

        yield items
        next_page = 'https://en.wikipedia.org/wiki/India' + str(news_page_url)
        print(next_page)

Pipeline.py

import mysql.connector


class MynewsPipeline(object):
    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            password='',
            database='mydb',
            port='3306'
        )
        self.curr = self.conn.cursor()

    def create_table(self):
        self.curr.execute("""DROP TABLE IF EXISTS news_crawl_newsdetails""")
        self.curr.execute("""CREATE TABLE news_crawl_newsdetails(
                        news_provider_id text,
                        news_title text,
                        news_details text,
                        news_image text,
                        news_page_url text
                        )""")

    def process_item(self, item, spider):
        self.store_db(item)
        return item

    def store_db(self, item):
        # print(item['news_title'][0])
        self.curr.execute("""INSERT INTO news_crawl_newsdetails
                             (news_provider_id, news_title, news_details, news_image, news_page_url)
                             VALUES (%s, %s, %s, %s, %s)""", (
            item['news_provider_id'],
            item['news_title'][0],
            item['news_details'][0],
            item['news_image'][0],
            item['news_page_url'][0]
        ))
        self.conn.commit()
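One detail worth noting: the MySQL connection opened in __init__ is never closed. A minimal sketch using Scrapy's close_spider pipeline hook (same attribute names as the pipeline above) could be added:

    def close_spider(self, spider):
        # called once when the spider finishes; release the cursor and connection
        self.curr.close()
        self.conn.close()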

Items.py

import scrapy


class MynewsItem(scrapy.Item):
    news_provider_id = scrapy.Field()
    news_title = scrapy.Field()
    news_details = scrapy.Field()
    news_image = scrapy.Field()
    news_page_url = scrapy.Field()
    news_des = scrapy.Field()
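For process_item to be called at all, the pipeline also has to be enabled in settings.py. A minimal sketch, assuming the project package is named mynews (adjust the module path to your actual package name):

# settings.py -- 'mynews.pipelines' is an assumed module path
ITEM_PIPELINES = {
    'mynews.pipelines.MynewsPipeline': 300,
}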

You can try this approach:

You should find the XPath of the next_page element. It can be a link or a button that points to the next page:

next_page = response.selector.xpath("--your xpath expression--").extract_first()

if next_page is not None:
    next_page_link = response.urljoin(next_page)
    yield scrapy.Request(url = next_page_link, callback=self.parse)
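As a side note, on Scrapy 1.4+ the urljoin step can be folded into the request by using response.follow, which accepts relative URLs directly; a minimal equivalent sketch:

if next_page is not None:
    # response.follow() resolves the relative URL against response.url itself
    yield response.follow(next_page, callback=self.parse)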

This is roughly what your parse function would look like:

def parse(self, response):
    print("**********************************")
    print(type(response))
    print(response.url)

    all_section = response.css('a.gnt_m_flm_a')

    for quote in all_section:
        # build a fresh item for every link so each one is yielded separately
        items = MynewsItem()
        news_provider_id = '14'
        news_title = quote.css('a.gnt_m_flm_a').xpath("text()").extract()
        news_details = quote.css('a.gnt_m_flm_a').xpath("@data-c-br").extract()
        news_image = quote.css("img.gnt_m_flm_i").xpath("@data-gl-srcset").extract()
        news_page_url = quote.css('a.gnt_m_flm_a').xpath("@href").extract()

        items['news_provider_id'] = news_provider_id
        items['news_title'] = news_title
        items['news_details'] = news_details
        items['news_image'] = news_image
        items['news_page_url'] = news_page_url

        yield items

    next_page = response.selector.xpath("").extract_first()  # put your next-page xpath here

    if next_page is not None:
        next_page_link = response.urljoin(next_page)
        yield scrapy.Request(url=next_page_link, callback=self.parse)
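If the goal is instead to open each news_page_url found on the listing page and finish filling the item on that second page (rather than only paginating the listing), a common pattern is to pass the partially built item along with the request. A minimal sketch, assuming Scrapy 1.7+ for cb_kwargs and a hypothetical parse_detail callback whose selector you would replace with the real second-page markup:

def parse(self, response):
    for quote in response.css('a.gnt_m_flm_a'):
        items = MynewsItem()
        items['news_provider_id'] = '14'
        items['news_title'] = quote.xpath("text()").extract()
        items['news_page_url'] = quote.xpath("@href").extract()

        link = quote.xpath("@href").extract_first()
        if link:
            # carry the half-filled item to the detail-page callback
            yield response.follow(link, callback=self.parse_detail,
                                  cb_kwargs={'items': items})

def parse_detail(self, response, items):
    # hypothetical selector for the article body -- adjust to the real markup
    items['news_des'] = response.css('p::text').extract()
    yield items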