Python 多线程 HTTP 爬虫 - 关闭连接并挂起程序

Question

在 Python 中编写了这个爬虫，它根据域的输入列表将几个参数转储到 JSON 输出文件。

有这个问题：

是否需要在每个线程中关闭 HTTP 连接？输入数据是 ca。 500万件物品。它在开始时以大约 100% 的速度处理。每秒 50 次迭代，但一段时间后它下降到每秒 1-2 次 and/or 挂起（没有内核消息，标准输出也没有错误）？这可能是代码还是与网络限制有关？我怀疑是软件，因为当我重新启动它时，它会以高速率再次启动（每秒约 50 次迭代）

也欢迎任何有关如何改进以下代码的提示，尤其是提高速度和抓取吞吐量。

问题代码：

import urllib2
import pprint
from tqdm import tqdm

import lxml.html

from Queue import Queue

from geoip import geolite2
import pycountry

from tld import get_tld



resfile = open("out.txt",'a')



concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
     try:
       response = urllib2.urlopen("http://"+ourl)
       peer = response.fp._sock.fp._sock.getpeername()
       ip = peer[0]
       header = response.info()
       html = response.read()
       html_element = lxml.html.fromstring(html)
       generator = html_element.xpath("//meta[@name='generator']/@content")
       try:
         match = geolite2.lookup(ip)
         if match is not None:
           country= match.country
           try:

             c=pycountry.countries.lookup(country)
             country=c.name
           except:
             country=""

       except:
         country=""
       try:
         res=get_tld("http://www"+ourl, as_object=True)
         tld=res.suffix
       except:
         tld=""

       try:
         match = re.search(r'[\w\.-]+@[\w\.-]+', html)
         email=match.group(0)
       except:
         email=""

       try:
           item= generator[0]
           val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\""+item+"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
       except:
           val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"

       return val
     except Exception as e:
       #print "error"+str(e)
       pass

def doSomethingWithResult(status):
    if status:
      resfile.write(str(status)+"\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt",'w')
        status.write(str(url.strip()))   
    q.join()
except KeyboardInterrupt:
    sys.exit(1)

更新 1：

关闭 Socket 和 FileDescriptor 使其工作得更好，一段时间后似乎不再挂起。在家用笔记本电脑上性能为 50 reqs/sec，在 VPS

上性能约为 100 req/sec

from threading import Thread
import httplib, sys
import urllib2
import pprint
from tqdm import tqdm

import lxml.html

from Queue import Queue

from geoip import geolite2
import pycountry

from tld import get_tld
import json



resfile = open("out.txt",'a')



concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
     try:
       response = urllib2.urlopen("http://"+ourl)
       realsock = response.fp._sock.fp._sock
       peer = response.fp._sock.fp._sock.getpeername()
       ip = peer[0]
       header = response.info()
       html = response.read()
       realsock.close()
       response.close()

       html_element = lxml.html.fromstring(html)
       generator = html_element.xpath("//meta[@name='generator']/@content")
       try:
         match = geolite2.lookup(ip)
         if match is not None:
           country= match.country
           try:

             c=pycountry.countries.lookup(country)
             country=c.name
           except:
             country=""

       except:
         country=""
       try:
         res=get_tld("http://www"+ourl, as_object=True)
         tld=res.suffix
       except:
         tld=""

       try:
         match = re.search(r'[\w\.-]+@[\w\.-]+', html)
         email=match.group(0)
       except:
         email=""

       try:
           item= generator[0]
           val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\",\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":"+json.dumps(item)+",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
       except:
           val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\"," + "\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":\"\",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"

       return val
     except Exception as e:
       print "error"+str(e)
       pass

def doSomethingWithResult(status):
    if status:
      resfile.write(str(status)+"\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt",'w')
        status.write(str(url.strip()))   
    q.join()
except KeyboardInterrupt:
    sys.exit(1)

Answer 1

句柄将自动进行垃圾回收，但是，您最好自己关闭句柄，尤其是当您在紧密循环中执行此操作时。

您还询问了改进建议。一个重要的是停止使用 urllib2 并开始使用 requests。

Answer 2

有很多可能的选择，为什么您的抓取速度会下降。

1.) 注意不要从同一域中抓取太多数据。一些 Web 服务器配置为允许每个 IP 地址并行连接。

2.) 尝试发送随机 browser-like http headers (user-agent, referrer, ...) 以防止网络服务器抓取保护，如果设置的话。

3.) 使用成熟的http（并行）库，比如pycurl（有MultiCurl) or requests (grequests）。他们肯定表现得更快。

Python 多线程 HTTP 爬虫 - 关闭连接并挂起程序

Python Multithreaded HTTP crawler - Closing connection and hanging of the program

python

multithreading

urllib2