python爬虫学习(下)——爬虫代码实现

来源:互联网 发布:深入浅出node.js系列 编辑:程序博客网 时间:2024/05/18 19:37

上篇分析出了数据获取的完整路径,下面对应介绍具体的代码实现


注:代码说明、我的一些总结心得都放到了代码注释里


整个程序主要由以下几个类组成:

Class Car:汽车模型,存储每个车的信息

Class CarFactory:传入获取的网络数据,生产出汽车模型

Class CarSpider:爬虫的主体类,串联整个业务

Class DataSaver:负责数据库操作,数据存储

Class RequestThread:后期我把请求改成了多线程操作,引入了这个类


具体代码及注释:

# Driver script: crawl every car series, then persist the result.
# (Restored to one statement per line; collapsed onto a single line the
# leading '#' turned the whole script into a comment.)

# The CarSpider object is responsible for fetching and parsing the data.
spider = CarSpider()
cars = spider.getDatas()

# The DataSaver object is responsible for writing the cars to the database.
dataSaver = DataSaver()
dataSaver.updateCarsData(cars)

# Call form prints the same text under Python 2 and Python 3.
print("finish")

CarSpider:
class CarSpider:
    """Crawler that ties the whole pipeline together.

    The pipeline is: download the per-letter series index pages, extract the
    home page URL of every car series, derive the "config" detail page URL of
    each series, download those pages on worker threads and finally turn the
    embedded JSON into ``Car`` models through ``CarFactory``.
    """

    def __init__(self):
        # Instance attributes are created in __init__ so that every spider
        # owns its own list.
        self.allCars = []

    # Public entry point.
    def getDatas(self):
        """Run the complete crawl and return the list of parsed cars."""
        # Series index pages, one per initial letter.
        carList = self.__getCarSeriesListByInitialChar()
        # Home page URL of every car series found on those pages.
        carUrlList = self.__getAllCarSeriesIndexUrl(carList)
        # Queue of config detail page URLs, one per series.
        urlQueue = self.__getCarSeriesInfoUrls(carUrlList)
        # Download the detail pages and build the car models.
        cars = self.__getCarsBycarSeriesInfoUrls(urlQueue)
        return cars

    # Private helpers: their names start with a double underscore.

    def __gzipDecode(self, response):
        """Return the response body, gunzipped when the server compressed it."""
        if response.info().get('Content-Encoding') == 'gzip':
            compressedstream = StringIO.StringIO(response.read())
            gziper = gzip.GzipFile(fileobj=compressedstream)
            return gziper.read()
        return response.read()

    def __getCarSeriesListByInitialChar(self):
        """Download the series index page of every initial letter A-Z.

        Returns a list of decoded page texts.  A letter whose page cannot be
        fetched is reported and skipped.
        """
        dataList = []
        for i in range(65, 91):  # ord('A') .. ord('Z')
            url = 'http://www.autohome.com.cn/grade/carhtml/%c.html' % (chr(i))
            try:
                response = urllib2.urlopen(url)
            except urllib2.URLError as e:  # HTTPError is a subclass
                print(e.reason)
                # Nothing was downloaded for this letter; without this the
                # code below would use an unbound (or the previous) response.
                continue
            zipDecodeData = self.__gzipDecode(response)
            # The pages contain Chinese text and are decoded as GBK.
            data = zipDecodeData.decode("gbk")
            dataList.append(data)
        return dataList

    def __getAllCarSeriesIndexUrl(self, dataList):
        """Extract the home page URL of every usable car series.

        ``dataList`` is the list of index page texts.  The parenthesised
        group in the pattern is what ``findall`` returns for each match.
        """
        pattern = re.compile(r'<h4.*><a href="(.*)">.*</a></h4>', re.M)
        carInfoUrlList = []
        for data in dataList:
            for carUrl in re.findall(pattern, data):
                # Author's note: "greylink" marks a series whose information
                # has no value (incomplete or outdated), so it is dropped.
                if 'greylink' not in carUrl:
                    carInfoUrlList.append(carUrl)
        return carInfoUrlList

    def __getCarSeriesInfoUrls(self, carUrlList):
        """Build the queue of config detail page URLs.

        The series id is the first run of digits in the series home page
        URL.  URLs without any digits are skipped instead of raising.
        """
        urlQueue = Queue()
        pattern = re.compile(r'[0-9]+', re.M)
        for carUrl in carUrlList:
            match = re.search(pattern, carUrl)
            if match is None:
                continue
            # Detail page URL assembled from the observed URL scheme.
            carSeriesInfoUrl = 'http://car.autohome.com.cn/config/series/%d.html' % (int(match.group()))
            urlQueue.put(carSeriesInfoUrl)
        return urlQueue

    def __getCarsBycarSeriesInfoUrls(self, urlQueue):
        """Download every detail page and convert it into ``Car`` models.

        Four ``RequestThread`` workers drain ``urlQueue`` and push
        ``(data, url)`` tuples onto an output queue.  Parsing happens on the
        calling thread after all workers have finished.  Returns
        ``self.allCars``.
        """
        # Queue is thread-safe and supports blocking.
        ouputDataQueue = Queue()
        threads = []
        for i in range(0, 4):
            # Author's note: Python threads do not speed up computation, so
            # only the network requests are moved onto threads.
            requestThread = RequestThread(i, urlQueue, ouputDataQueue)
            threads.append(requestThread)
            # Do not join here: that would block this loop and the workers
            # would run one after another.
            requestThread.start()
        for requestThread in threads:
            requestThread.join()

        pattern = re.compile(r'var config = ({.*};)', re.M)
        while ouputDataQueue.qsize() != 0:
            # Take the tuple once and unpack it; two separate get() calls
            # would pair one page's data with another page's URL and block
            # forever on an odd number of items.
            outputData, seriesInfoUrl = ouputDataQueue.get()
            # Every get() is matched by task_done(), including skipped pages.
            ouputDataQueue.task_done()
            data = outputData.decode("gbk")
            result = re.findall(pattern, data)
            if len(result) == 0:
                continue
            # Drop the trailing ';' captured by the pattern.
            infoJsonStr = result[0][0:-1]
            infoDict = json.loads(infoJsonStr)
            # CarFactory builds the car models from the decoded dict.
            carFactory = CarFactory(infoDict, seriesInfoUrl)
            self.allCars.extend(carFactory.analysisData())
        return self.allCars


CarFactory:
class CarFactory:
    """Build ``Car`` models from the decoded config data of one car series.

    ``carsData`` is the dict decoded from the page's ``config`` JSON and
    ``carSeriesUrl`` is the URL the data came from; it is stored on every
    car as ``infoUrl``.
    """

    def __init__(self, carsData, carSeriesUrl):
        self.carsData = carsData
        self.carSeriesUrl = carSeriesUrl
        self.cars = []
        self.carsNum = 0

    def __setCarsParam(self, param, values):
        """Assign attribute ``param`` on every car from the parallel ``values``.

        ``values[i]["value"]`` belongs to ``self.cars[i]``.  Cars for which
        no entry exists keep their current value instead of raising
        ``IndexError``.
        """
        for car, item in zip(self.cars, values):
            # Reflection: the attribute name is only known at run time.
            setattr(car, param, item["value"])

    def analysisData(self):
        """Create one ``Car`` per spec and fill in its parameters.

        Returns ``self.cars``.
        """
        # One series contains the information of several car models (specs).
        specsList = self.carsData["result"]["specsList"]
        for spec in specsList:
            car = Car()
            car.specid = spec["specid"]
            car.infoUrl = self.carSeriesUrl
            self.cars.append(car)
        self.carsNum = len(self.cars)

        paramTypeItems = self.carsData["result"]["paramtypeitems"]
        for paramTypeItem in paramTypeItems:
            paramTypeName = paramTypeItem["name"]
            paramItems = paramTypeItem["paramitems"]
            if paramTypeName == u"基本参数":
                for param in paramItems:
                    paramName = param["name"]
                    values = param["valueitems"]
                    if paramName == u"厂商":
                        self.__setCarsParam("family", values)
                    # ... (further parameter mappings omitted in the
                    # original article)
            elif paramTypeName == u"发动机":
                for param in paramItems:
                    paramName = param["name"]
                    values = param["valueitems"]
                    if paramName == u"排量(L)":
                        self.__setCarsParam("sv", values)
                    elif paramName == u"最大马力(Ps)":
                        self.__setCarsParam("hpower", values)
                    # ... (further parameter mappings omitted in the
                    # original article)
            # ... (further parameter groups omitted in the original article)
        return self.cars

Car:

# One way to emulate an enum in Python: a class holding integer constants.
class FuelType:
    Gasoline = 0
    Diesel = 1


class GearType:
    MT = 0
    AUTO = 1
    DCT = 2
    CVT = 3

# ... (further definitions omitted in the original article)


class Car:
    """Model of one car; raw scraped text is normalised on assignment.

    ``CarFactory`` assigns attributes by name through ``setattr``.  Python
    has no per-attribute default setter, so all conversion logic lives in
    ``__setattr__``.
    """

    # Patterns are compiled once instead of on every assignment.
    # Non-greedy group: everything before the first "万".
    _PRICE_PATTERN = re.compile(u'(.*?)万.*', re.M)
    _MT_PATTERN = re.compile(u'手动|MT', re.M)
    _DCT_PATTERN = re.compile(u'双离合|DSG|DCT|PDK|tronic|MDKG|power shift', re.M)
    # "无极" is kept for compatibility; "无级" is the usual spelling of
    # "continuously variable".
    _CVT_PATTERN = re.compile(u'无极|无级|CVT', re.M)

    def __init__(self):
        # Instance attributes are defined in __init__.  Every assignment
        # below also goes through __setattr__.
        self.specid = 0  # id (stored as an integer)
        self.family = None  # manufacturer
        self.name = None  # model name
        self.price = u"0万"  # price
        self.level = None  # class / segment
        self.maxSpeed = 0  # top speed
        # ... (remaining attributes omitted in the original article)

    def __setattr__(self, key, value):
        """Convert ``value`` according to ``key`` and store it.

        Values are written straight into ``self.__dict__``.  Keys without a
        dedicated rule are stored as ``float`` when the value converts and
        unchanged otherwise.
        """
        if key == 'specid':
            self.__dict__[key] = int(value)
        elif key == 'isTurbo':
            # Anything not described as naturally aspirated counts as turbo.
            self.__dict__[key] = value.find(u'自然') == -1
        elif key == 'price':
            found = self._PRICE_PATTERN.findall(value)
            try:
                price = float(found[0]) if found else float(0)
            except ValueError:
                # Text before "万" is not a plain number (e.g. a range).
                price = float(0)
            self.__dict__[key] = price
        elif key == 'fuleType':
            if value.find(u'汽油') != -1:
                self.__dict__[key] = FuelType.Gasoline
            else:
                self.__dict__[key] = FuelType.Diesel
        elif key == 'gearType':
            # Checked in this order: manual, dual clutch, CVT; anything
            # else is treated as a conventional automatic.
            if self._MT_PATTERN.search(value) is not None:
                self.__dict__[key] = GearType.MT
            elif self._DCT_PATTERN.search(value) is not None:
                self.__dict__[key] = GearType.DCT
            elif self._CVT_PATTERN.search(value) is not None:
                self.__dict__[key] = GearType.CVT
            else:
                self.__dict__[key] = GearType.AUTO
        # ... (rules for further attributes omitted in the original article)
        else:
            try:
                self.__dict__[key] = float(value)
            except (TypeError, ValueError, OverflowError):
                # Not numeric: keep the raw value.
                self.__dict__[key] = value

RequestThread:

class RequestThread(threading.Thread):
    """Worker thread that downloads URLs taken from a shared queue.

    ``queue`` holds the URLs still to be fetched.  Every successful download
    is put on ``outputQueue`` as a ``(data, url)`` tuple.  ``threadId`` is
    only used in the progress output.
    """

    def __init__(self, threadId, queue, outputQueue):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.queue = queue
        self.outputQueue = outputQueue

    def gzipDecode(self, response):
        """Return the response body, gunzipped when the server compressed it.

        ``run`` calls this method, so it has to exist on the thread class
        itself.
        """
        # Local imports keep the class independent of the file's import block.
        import gzip
        import io

        body = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            return gzip.GzipFile(fileobj=io.BytesIO(body)).read()
        return body

    def run(self):
        """Fetch URLs until the input queue is empty."""
        try:
            from queue import Empty  # Python 3 module name
        except ImportError:
            from Queue import Empty  # Python 2 module name

        while True:
            # A non-blocking get avoids hanging forever when another worker
            # takes the last URL between a size check and the get.
            try:
                url = self.queue.get_nowait()
            except Empty:
                break
            # Pair the get with task_done to tell the queue the item was taken.
            self.queue.task_done()
            print("%d: url:%s %d\n" % (self.threadId, url, self.queue.qsize()))
            try:
                response = urllib2.urlopen(url)
                # The series URL is returned together with the data because
                # it is stored along with the cars later on.
                data = self.gzipDecode(response)
                self.outputQueue.put((data, url))
            except urllib2.URLError as e:
                print(e.reason)


DataSaver:

class DataSaver:
    """Persist cars into the ``Cars`` table of the ``CarDB.sqlite`` database."""

    def __init__(self):
        self.db = sqlite3.connect("CarDB.sqlite")
        print(self.db)

    def updateCarsData(self, cars):
        """Replace the content of the ``Cars`` table with ``cars``.

        The delete and all inserts run in a single transaction: either the
        table ends up holding exactly ``cars`` (an empty list leaves it
        empty), or the previous content is kept when a statement fails.
        The connection is closed afterwards in every case, so the saver
        cannot be reused after this call.
        """
        # Rows are collected first so that a car lacking an attribute fails
        # before the database is touched.
        rows = [
            [car.specid, car.name, car.family, car.price, car.level,
             car.maxSpeed, car.accelerate, car.sv, car.hpower, car.mpower]
            for car in cars
        ]
        try:
            # As a context manager the connection commits on success and
            # rolls back on an exception.
            with self.db:
                # Simple approach: drop all previous rows first.
                self.db.execute("delete from Cars")
                self.db.executemany("insert into Cars (specid,name,family,price,level,maxSpeed,accelerate,sv,hpower,mpower) VALUES (?,?,?,?,?,?,?,?,?,?)", rows)
        finally:
            self.db.close()

以上便是爬虫的主要代码,本人刚刚接触python,文中若有错误或不妥之处,望大家多多指教,谢谢。









0 0
原创粉丝点击