Writing script for college sports class, keep getting error 'AttributeError: module 'scrapy' has no attribute 'spider''
Writing script for college sports class, keep getting error 'AttributeError: module 'scrapy' has no attribute 'spider''
这是我的代码,不确定我做错了什么。感谢任何帮助。
from selenium import webdriver
from bs4 import BeautifulSoup
import scrapy
from scrapy.spiders import Spider
import requests
import time
import xlsxwriter
import pandas as pd
url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23'
driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver')
driver.get(url)
class WebSpider(scrapy.spider):
name = "Web_Spider"
allowed_domains = ['https://www.ufc.com/athletes']
start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2']
def __init__(self):
self.driver = driver
def parse(self, response):
self.driver.get(response.url)
while True:
next = self.drive.find_element_by_xpath('//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')
try:
next.click()
except:
break
self.driver.close()
我不断收到错误“AttributeError:模块 'scrapy' 没有属性 'spider'”。不确定在这里做什么,Scrapy 已正确安装且是最新的。
它是 scrapy.Spider,大写“s”
Try now:
from selenium import webdriver
from bs4 import BeautifulSoup
import scrapy
from scrapy.spiders import Spider
import requests
import time
import xlsxwriter
import pandas as pd
url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23'
driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver')
driver.get(url)
class WebSpider(scrapy.Spider):
name = "Web_Spider"
allowed_domains = ['https://www.ufc.com/athletes']
start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2']
def __init__(self):
self.driver = driver
def parse(self, response):
self.driver.get(response.url)
while True:
next = self.drive.find_element_by_xpath('//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')
try:
next.click()
except:
break
self.driver.close()
根据您的尝试,我不会在这里使用 Selenium,因为您可以直接通过 ajax 获取数据。 Selenium 仍然可以工作,但它有点矫枉过正且效率较低。
试试这个:
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.ufc.com/views/ajax?_wrapper_format=drupal_ajax'
headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36'}
page = 1
end_of_load = False
while end_of_load == False:
payload = {
'view_name': 'all_athletes',
'view_display_id': 'page',
'view_path': '/athletes/all',
'pager_element': '0',
'gender': 'All',
'page': '%s' %page}
jsonData = requests.post(url, headers=headers, data=payload).json()
print('Page: %s' %page)
page += 1
html = jsonData[-1]['data']
soup = BeautifulSoup(html, 'html.parser')
player_cards = soup.find_all('div',{'class':re.compile('.*view-mode-all-athletes-result.*')})
if not player_cards:
end_of_load = True
else:
for player_card in player_cards:
name = player_card.find('span',{'class':re.compile('.*athlete__name.*')}).text.strip()
try:
weight_class = player_card.find('div',{'class':re.compile('.*weight-class.*')}).text.strip()
except:
weight_class = 'N/A'
try:
record = player_card.find('span',{'class':re.compile('.*athlete__record.*')}).text.strip()
except:
record = 'N/A'
print('\t%s - %s\t%s' %(name,weight_class,record))
这是我的代码,不确定我做错了什么。感谢任何帮助。
from selenium import webdriver
from bs4 import BeautifulSoup
import scrapy
from scrapy.spiders import Spider
import requests
import time
import xlsxwriter
import pandas as pd
url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23'
driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver')
driver.get(url)
class WebSpider(scrapy.spider):
name = "Web_Spider"
allowed_domains = ['https://www.ufc.com/athletes']
start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2']
def __init__(self):
self.driver = driver
def parse(self, response):
self.driver.get(response.url)
while True:
next = self.drive.find_element_by_xpath('//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')
try:
next.click()
except:
break
self.driver.close()
我不断收到错误“AttributeError:模块 'scrapy' 没有属性 'spider'”。不确定在这里做什么,Scrapy 已正确安装且是最新的。
它是 scrapy.Spider,大写“s”
Try now:
from selenium import webdriver
from bs4 import BeautifulSoup
import scrapy
from scrapy.spiders import Spider
import requests
import time
import xlsxwriter
import pandas as pd
url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23'
driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver')
driver.get(url)
class WebSpider(scrapy.Spider):
name = "Web_Spider"
allowed_domains = ['https://www.ufc.com/athletes']
start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2']
def __init__(self):
self.driver = driver
def parse(self, response):
self.driver.get(response.url)
while True:
next = self.drive.find_element_by_xpath('//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')
try:
next.click()
except:
break
self.driver.close()
根据您的尝试,我不会在这里使用 Selenium,因为您可以直接通过 ajax 获取数据。 Selenium 仍然可以工作,但它有点矫枉过正且效率较低。
试试这个:
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.ufc.com/views/ajax?_wrapper_format=drupal_ajax'
headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36'}
page = 1
end_of_load = False
while end_of_load == False:
payload = {
'view_name': 'all_athletes',
'view_display_id': 'page',
'view_path': '/athletes/all',
'pager_element': '0',
'gender': 'All',
'page': '%s' %page}
jsonData = requests.post(url, headers=headers, data=payload).json()
print('Page: %s' %page)
page += 1
html = jsonData[-1]['data']
soup = BeautifulSoup(html, 'html.parser')
player_cards = soup.find_all('div',{'class':re.compile('.*view-mode-all-athletes-result.*')})
if not player_cards:
end_of_load = True
else:
for player_card in player_cards:
name = player_card.find('span',{'class':re.compile('.*athlete__name.*')}).text.strip()
try:
weight_class = player_card.find('div',{'class':re.compile('.*weight-class.*')}).text.strip()
except:
weight_class = 'N/A'
try:
record = player_card.find('span',{'class':re.compile('.*athlete__record.*')}).text.strip()
except:
record = 'N/A'
print('\t%s - %s\t%s' %(name,weight_class,record))