Python+多线程+队列爬虫

来源:互联网 发布:设计软件图标 编辑:程序博客网 时间:2024/05/21 08:02

Python+多线程+队列,爬虫例子

# -*- coding: utf-8 -*-
"""Crawl city records with one producer thread and one consumer thread.

``ThreadCity`` reads city names from the database, looks each one up against
a remote search endpoint and pushes the parsed result onto a shared queue.
``ThreadCityDB`` pops those results and writes them back to the database.
The producer finishes by queueing ``None`` so the consumer knows when to stop.
"""
import datetime
import json
import logging
import re
import sys
import threading
import time
import urllib
from contextlib import closing

try:  # Python 2 module names (what this script was written against)
    import Queue
    import urllib2
    from urllib import urlencode
except ImportError:  # Python 3 equivalents, bound to the same names
    import queue as Queue
    import urllib.request as urllib2
    from urllib.parse import urlencode

try:
    # NOTE(review): the original snippet uses ``DBHelper`` without importing
    # it; a top-level project module of that name is assumed here -- confirm
    # the real import path.
    import DBHelper
except ImportError:
    DBHelper = None

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")

logger = logging.getLogger(__name__)

# Search endpoint; the url-encoded query string is substituted for ``%s``.
# NOTE(review): the host is a placeholder and carries no scheme -- a real
# "http://..." address must be filled in before requests can succeed.
CITY_SEARCH_URL = 'xxx.xxx.com/ajax.do?%s'

# City ids are interpolated unquoted into SQL, so only plain integers pass.
_INT_RE = re.compile(r'^-?\d+$')


def get_response(url, retries=3, delay=2, timeout=30):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to request.
        retries: Maximum number of attempts (the original hard-coded 3).
        delay: Seconds to wait between failed attempts (originally 2).
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the crawler forever.

    Returns:
        The response body as returned by ``read()``, or ``None`` when every
        attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            request = urllib2.Request(url)
            # closing() releases the connection on both Python 2 and 3.
            with closing(urllib2.urlopen(request, timeout=timeout)) as response:
                return response.read()
        except Exception as e:  # best-effort fetch: log, then retry
            logger.warning("request %d/%d for %s failed: %s",
                           attempt, retries, url, e)
            if attempt < retries:
                time.sleep(delay)
    return None


def _sql_helper():
    """Return the shared database helper, or raise if DBHelper is missing."""
    if DBHelper is None:
        raise RuntimeError("DBHelper module is not available")
    return DBHelper.SqlHelper.ms


def _build_insert_sql(city):
    """Build the INSERT statement for one ``{'cityid', 'cityname'}`` dict.

    The values come from a remote HTTP response, so the id must look like an
    integer and single quotes in the name are doubled.

    Raises:
        ValueError: if the city id is not a plain integer.
    """
    city_id = str(city['cityid']).strip()
    if not _INT_RE.match(city_id):
        raise ValueError("refusing non-numeric city id: %r" % (city_id,))
    # NOTE(review): quote doubling is standard SQL escaping; switch to a
    # parameterized query if the DB helper offers one.
    city_name = city['cityname'].replace("'", "''")
    return "insert into Table(cityid,cityname) values(%s,'%s')" % (
        city_id, city_name)


class ThreadCity(threading.Thread):
    """Producer: resolve each stored city name remotely and enqueue the result."""

    def __init__(self, queue_zq_citys):
        """Remember the queue that fetched city dicts are pushed onto."""
        threading.Thread.__init__(self)
        self.queue_zq_citys = queue_zq_citys

    def _lookup(self, qm_cname):
        """Search for *qm_cname*; return a city dict, or None if the fetch failed."""
        data = urlencode({'q': qm_cname})
        # The original template had no %s placeholder, so this line raised
        # TypeError for every city.
        url = CITY_SEARCH_URL % data
        result = get_response(url)
        if result is None:
            logger.warning("no response for city %r", qm_cname)
            return None
        # The response body is decoded as GB2312 before JSON parsing.
        cjson = json.loads(result.decode('gb2312', 'ignore'))
        return {'cityid': cjson["id"], 'cityname': cjson["cName"]}

    def run(self):
        """Fetch every city, enqueue each result, then enqueue ``None``.

        The trailing ``None`` is the end-of-stream marker for the consumer.
        It is queued even when the crawl fails so the consumer never waits
        forever.
        """
        try:
            sql = 'select cityid,cname from  table '
            citylist = _sql_helper().ExecQuery(sql)
            for c in citylist or ():
                try:
                    city = self._lookup(c[1])
                except Exception:
                    # One bad city must not abort the whole crawl.
                    logger.exception("lookup failed for row %r", c)
                    continue
                if city is None:
                    continue
                self.queue_zq_citys.put(city)
                time.sleep(1)  # throttle requests to the remote site
        except Exception:
            logger.exception("city crawl aborted")
        finally:
            self.queue_zq_citys.put(None)


class ThreadCityDB(threading.Thread):
    """Consumer: store every queued city dict in the database."""

    def __init__(self, queue_zq_citys):
        """Remember the queue that city dicts are read from."""
        threading.Thread.__init__(self)
        self.queue_zq_citys = queue_zq_citys

    def run(self):
        """Insert queued cities until the ``None`` end marker arrives.

        ``get()`` blocks while the queue is empty (the original spun in a
        busy loop and never exited). ``task_done()`` is called once for
        every item taken so that ``Queue.join()`` stays accurate.
        """
        while True:
            citys = self.queue_zq_citys.get()
            try:
                if citys is None:
                    break
                sql = _build_insert_sql(citys)
                if sys.version_info[0] == 2:
                    # The original passed UTF-8 bytes to the helper.
                    sql = sql.encode('utf-8')
                _sql_helper().ExecNonQuery(sql)
            except Exception:
                # Keep consuming; a single bad row is logged and skipped.
                logger.exception("failed to store city %r", citys)
            finally:
                self.queue_zq_citys.task_done()


def main():
    """Run the producer and the consumer and wait for both to finish."""
    logging.basicConfig(level=logging.INFO)
    queue_zq_citys = Queue.Queue()  # holds the fetched city dicts

    city = ThreadCity(queue_zq_citys)      # producer: fetch and enqueue
    cityDB = ThreadCityDB(queue_zq_citys)  # consumer: dequeue and store

    city.start()
    cityDB.start()

    # Block until both workers are done; the consumer stops on the marker.
    city.join()
    cityDB.join()


if __name__ == '__main__':
    main()


1 0