[转]使用python第三方框架requests多线程获取数据

作者: aries 分类: Python 发布时间: 2012-10-25 21:41 6494次浏览 0评论

以前一直使用python的urllib2来抓取页面采集内容,偶然发现python有一个非常好用的封装好的库requests可以省去很多事。具体安装和使用教程可以直接参考官方文档: http://docs.python-requests.org/en/latest/

简单地写了一个通过requests多线程获取页面数据的代码。不贴所有代码了,代码如下:
# -*- coding:utf-8 -*-
import requests
from time import sleep
from threading import Thread

# Seconds to sleep between checks of whether the worker threads are still alive.
UPDATE_INTERVAL = 0.01

class URLThread(Thread):
    '''Worker thread that fetches a single URL with ``requests.get``.

    Once the thread has finished, ``response`` holds the object returned by
    ``requests.get``, or ``None`` if the request raised an exception.
    '''

    def __init__(self, url, timeout=10, allow_redirects=True):
        '''Store the request parameters; no network access happens here.

        url -- address to fetch
        timeout -- value forwarded to ``requests.get`` as ``timeout``
        allow_redirects -- value forwarded to ``requests.get`` as
            ``allow_redirects``
        '''
        super(URLThread, self).__init__()
        self.url = url
        self.timeout = timeout
        self.allow_redirects = allow_redirects
        # Stays None until run() completes successfully.
        self.response = None

    def run(self):
        '''Perform the GET request and record the result in ``self.response``.'''
        try:
            self.response = requests.get(
                self.url,
                timeout=self.timeout,
                allow_redirects=self.allow_redirects
            )
        except Exception as what:
            # Thread boundary: report the failure and leave response as None
            # so one bad URL does not take down the other fetches.
            print(what)

def multi_get(uris, timeout=10, allow_redirects=True):
    '''Fetch every URI concurrently, using one URLThread per URI.

    uris -- iterable of URLs to fetch
    timeout -- per-request timeout handed to each URLThread
    allow_redirects -- whether each request follows redirects automatically

    Returns a list of ``(url, response)`` tuples in the same order as
    ``uris``. An empty ``uris`` yields an empty list.
    '''
    threads = [URLThread(uri, timeout, allow_redirects) for uri in uris]
    for thread in threads:
        thread.start()
    # Poll rather than block in an untimed join() so the main thread wakes up
    # regularly. any() is also safe for an empty thread list, unlike a
    # reduce() without an initial value.
    while any(thread.is_alive() for thread in threads):
        sleep(UPDATE_INTERVAL)
    return [(thread.url, thread.response) for thread in threads]

if __name__ == '__main__':
    # Demo: fetch a single page with a 1-second timeout and redirects disabled.
    results = multi_get(['http://qq.com'], 1, False)
    for url, data in results:
        # Only report entries whose response object is truthy; a failed
        # request leaves the response as None.
        if data:
            print("received this data %s from this url %s" % (data.headers, url))

换一个
暂无评论
回顶部