[转]使用python第三方框架requests多线程获取数据
以前一直使用 Python 的 urllib2 来抓取页面、采集内容,偶然发现 Python 有一个封装得非常好用的第三方库 requests,可以省去很多事。具体的安装和使用教程可以直接参考官方文档: http://docs.python-requests.org/en/latest/
简单地写了一段通过 requests 多线程获取页面数据的代码。这里不贴出全部代码,主要部分如下:
# -*- coding:utf-8 -*-
import requests
from time import sleep
from threading import Thread
# Seconds to sleep between successive checks of whether any worker thread
# is still alive (used by the wait loop in multi_get).
UPDATE_INTERVAL = 0.01
class URLThread(Thread):
    """Worker thread that fetches a single URL with ``requests.get``.

    Once the thread has finished, ``response`` holds the object returned by
    ``requests.get``, or ``None`` if the request raised an exception.
    """

    def __init__(self, url, timeout=10, allow_redirects=True):
        """Store the request parameters; the request itself happens in ``run``.

        url: address to fetch.
        timeout: forwarded as ``timeout`` to ``requests.get``.
        allow_redirects: forwarded as ``allow_redirects`` to ``requests.get``.
        """
        # Must be the real constructor (``__init__``): under any other name
        # the arguments would be handed to Thread.__init__ as
        # group/target/name and none of the attributes below would exist.
        super(URLThread, self).__init__()
        self.url = url
        self.timeout = timeout
        self.allow_redirects = allow_redirects
        self.response = None

    def run(self):
        """Perform the GET request and keep the result in ``self.response``."""
        try:
            self.response = requests.get(
                self.url,
                timeout=self.timeout,
                allow_redirects=self.allow_redirects
            )
        except Exception as what:
            # Best-effort fetch: report the failure and leave ``response`` as
            # None, which callers treat as "no data for this URL".
            print(what)
def multi_get(uris, timeout=10, allow_redirects=True):
    """Fetch every URI concurrently, using one ``URLThread`` per URI.

    uris: list of URIs to fetch.
    timeout: timeout applied to each request.
    allow_redirects: whether redirects are followed automatically.

    Returns a list of ``(url, response)`` tuples in the same order as
    ``uris``. ``response`` is whatever the worker stored, which is ``None``
    when the request raised an exception.
    """
    threads = [URLThread(uri, timeout, allow_redirects) for uri in uris]
    for thread in threads:
        thread.start()
    # Poll until every worker has finished. any() is False for an empty
    # list, so an empty ``uris`` falls straight through and returns [].
    while any(thread.is_alive() for thread in threads):
        sleep(UPDATE_INTERVAL)
    return [(thread.url, thread.response) for thread in threads]
# Demo entry point: fetch one page with a 1-second timeout and redirects
# disabled, then print the response headers for every URL that yielded a
# truthy response object.
if __name__ == '__main__':
    r = multi_get(['http://qq.com'], 1, False)
    for url, data in r:
        if data:
            # Single pre-formatted argument keeps the output identical
            # whether print is a statement (Python 2) or a function.
            print("received this data %s from this url %s" % (data.headers, url))
友情链接