How to get a crawler to run n times, stop and wait 10 minutes, and then run it again n times and repeat the whole process?
I have a web crawler:
import http.client
import time
import threading

class Worker(threading.Thread):  # Crawler worker threads
    def __init__(self, base, ref, conn, urlKey, url):
        threading.Thread.__init__(self)
        self.base = base      # the MIdCrawler that owns this worker
        self.ref = ref        # index of the connection this worker uses
        self.conn = conn
        self.urlKey = urlKey
        self.url = url

    def run_aux(self):  # Get the web response and process it
        self.conn.request('GET', self.url)
        response = self.conn.getresponse()
        status = response.status
        if status == 200:
            data = str(response.read())
            if data.find('No he podido entenderte.') != -1:
                data = 'Err: Ruta no comprendida'
            else:
                pos = data.find('>Id: ') + 5
                if pos == 4:  # find() returned -1: tag not present
                    data = 'Err: Tag >Id: no encontrado'
                else:
                    data = data[pos:pos + 32]
        else:
            data = 'Err: ' + str(status)
        response.close()
        self.base.saveRes(self.urlKey, data, self.ref)

    def run(self):
        # Lets the crawler keep running if any problem is found while
        # connecting to the web site: it keeps retrying until the
        # connection is accepted.
        try:
            self.run_aux()
        except Exception:
            self.run()

class MIdCrawler(object):  # Boot and instantiate the crawler
    def __init__(self, site, nThreads, urlDict):
        self.res = {}
        self.nThreads = nThreads
        self.urlDict = urlDict.copy()
        self.urlKeys = list(self.urlDict.keys())
        self.conns = [http.client.HTTPSConnection(site) for _ in range(self.nThreads)]
        self.threads_launched = 0
        self.launch()

    def newThread(self, ref):  # Launch the next pending URL on connection `ref`
        if self.threads_launched < len(self.urlKeys):
            urlKey = self.urlKeys[self.threads_launched]
            Worker(self, ref, self.conns[ref], urlKey, self.urlDict.get(urlKey)).start()
            self.threads_launched += 1

    def launch(self):  # Create the initial batch of threads
        for i in range(self.nThreads):
            self.newThread(i)

    def saveRes(self, urlKey, data, ref):  # Save the results into a dict
        self.res[urlKey] = data
        print(urlKey)
        self.newThread(ref)  # reuse this connection for the next URL

    def getRes(self):  # Block until all results are in, then return them
        while len(self.res) < len(self.urlKeys):
            time.sleep(0.5)
        return self.res

    def close(self):  # Close the connections
        for i in range(self.nThreads):
            self.conns[i].close()
This crawler retrieves some data from my web server; when my server receives a call, it calls the Google Maps API to fetch some route data. I have to make 3000 calls, and the problem is that I've noticed Google Maps stops answering when a certain number of consecutive calls are made from the same IP in a short period of time. The number of calls that raises this red flag isn't fixed, but it is always somewhere between 240 and 300 consecutive calls.
The crawler lets you specify the number of threads (calls) to launch in parallel; the problem is that once I reach around 200-300 calls the Google Maps server stops responding to my requests.
I'm new to web scraping, and I'd like someone to show me how to modify this crawler so that it runs 200 calls, waits ten minutes, and then continues from where it left off.
Here is some sample data the crawler receives:
['/testchat?text=quiero%20ir%20desde%20carrer%20cervantes%2C%201%2C%20sant%20andreu%20de%20la%20barca%2C%20hasta%20avinguda%20constituci%C3%B3%2C%2024%2C%20sant%20andreu%20de%20la%20barca',
'/testchat?text=quiero%20ir%20desde%20General%20Mitre%20239%2C%20Barcelona%2C%20hasta%20Plaza%20Artos%2C%20Barcelona',
'/testchat?text=quiero%20ir%20desde%20carrer%20Bon%20viatge%2C%20sant%20Joan%20Desp%C3%AD%2C%20hasta%20mare%20de%20deu%20la%20Merc%C3%A8%2C%20sant%20Joan%20Desp%C3%AD',
'/testchat?text=quiero%20ir%20desde%20no%20recuerdo%2C%20hasta%20no%20recuerdo',
'/testchat?text=quiero%20ir%20desde%20travessera%20de%20les%20corts%2C%20barcelona%2C%20hasta%20Avenida%20diagonal%2050%2C%20barcelona',
'/testchat?text=quiero%20ir%20desde%20Confidencial%2C%20hasta%20Confdencial%2C%20Confidencial',
'/testchat?text=quiero%20ir%20desde%20Paseo%20Zona%20Franca%20241%2C%20Barcelona%2C%20hasta%20Plaza%20Espa%C3%B1a%2C%20Barcelona',
'/testchat?text=quiero%20ir%20desde%20Rbla.%20les%20bobiles%2014%2C%20martorell%2C%20hasta%20plaza%20de%20la%20vila%201%2C%20martorell',
'/testchat?text=quiero%20ir%20desde%20Rambla%20de%20Catalu%C3%B1a%2086%2C%20Barcelona%2C%20hasta%20Padilla%20342%2C%20Barcelona',
'/testchat?text=quiero%20ir%20desde%20Concepci%C3%B3n%2C%20Abrera%2C%20hasta%20Major%2C%20Abrera']
Example of calling the crawler:
site = 'sema-dev-backend.mybluemix.net'
urlDict = df.url.reset_index(drop = True).to_dict()
nThreads = 5
ts1 = time.time()
mIdCrawler = MIdCrawler(site, nThreads, urlDict)
print(mIdCrawler.getRes())
t = time.time() - ts1
print('secs: ' + str(t))
print('mean_time: ' + str(t / len(urlDict)))
What should I modify in this code so that it makes 200 calls to my web service, stops and waits 10 minutes, then makes the next 200 calls from where it left off, repeating the whole process?
Thank you very much.
If you create a main() function, which I've built here from the code you provided:
def main():
    site = 'sema-dev-backend.mybluemix.net'
    urlDict = df.url.reset_index(drop=True).to_dict()
    nThreads = 5
    ts1 = time.time()
    mIdCrawler = MIdCrawler(site, nThreads, urlDict)
    print(mIdCrawler.getRes())
    t = time.time() - ts1
    print('secs: ' + str(t))
    print('mean_time: ' + str(t / len(urlDict)))
Then you can use this loop:
import time

try:
    while True:
        for i in range(0, 200):  # execute 200 times
            main()
        time.sleep(60 * 10)  # 10 minute delay
except KeyboardInterrupt:
    print("stopping script")
    exit(0)
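Note that main() as defined above crawls the whole urlDict on every call, so the loop above repeats the full crawl on each pass rather than resuming part-way through. If what you want is exactly 200 calls, a ten-minute pause, and then the next 200, a variant that feeds MIdCrawler one slice of the dict at a time could look like the sketch below (a minimal sketch; run_in_batches, batchSize and pauseSecs are names I've made up for this example, and it assumes the MIdCrawler class from your question is already defined):

import time

def run_in_batches(site, urlDict, nThreads=5, batchSize=200, pauseSecs=60 * 10):
    keys = list(urlDict.keys())
    allRes = {}
    for start in range(0, len(keys), batchSize):
        # Take the next slice of up to batchSize URLs.
        batch = {k: urlDict[k] for k in keys[start:start + batchSize]}
        crawler = MIdCrawler(site, nThreads, batch)
        allRes.update(crawler.getRes())  # getRes() blocks until the batch is done
        crawler.close()
        if start + batchSize < len(keys):  # no need to wait after the last batch
            time.sleep(pauseSecs)  # wait 10 minutes before the next batch
    return allRes

allRes = run_in_batches(site, urlDict)
print(allRes)

Because each pass only hands MIdCrawler its own 200-entry slice, the next pass automatically picks up where the previous one stopped, and the accumulated results end up in a single dict.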