Housing Price Analysis with Big Data - Part 1: Data Scraping


The data was scraped with Python 2.6 and the Scrapy framework. I originally wrote a whole-site crawler that could start from a single seed URL and scrape housing listings for every city on 58.com (58同城), but it had to run behind proxy IPs, otherwise the crawler was banned very quickly. So I took a different approach: on Linux the crawler runs once every five minutes, and each run scrapes listings for just one city. The code is below.
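
The post does not show the scheduler itself. Assuming cron is used and the project lives at a hypothetical path such as /home/user/ershoufang, a crontab entry like `*/5 * * * * cd /home/user/ershoufang && scrapy crawl ershoufang >> /tmp/ershoufang.log 2>&1` would start one single-city run every five minutes.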

1.spiders

#encoding=utf-8
import sys
sys.path.append("..")

import re
import time

import pymongo
import scrapy
from lxml import html
from scrapy.spiders import Spider
from scrapy.utils.project import get_project_settings

from plug.utils import StringUtil, NumberUtil
from ershoufang.items import HouseItem


class erShouSpider(Spider):
    name = "ershoufang"
    allowed_domains = ["58.com"]

    def __init__(self):
        super(erShouSpider, self).__init__()
        self.settings = get_project_settings()
        self.client = pymongo.MongoClient(
            self.settings['MONGO_IP'],
            self.settings['MONGO_PORT'])
        self.cities_db = self.client[self.settings['CITY_DB']]
        self.cities_Col = self.cities_db[self.settings['CITY_COL']]
        self.fillUrl = ""
        self.cityhost = ""
        self.city = ""

    def get_specify_request(self):
        # Build the start request for the single city named in the settings.
        condition = {"city": self.settings['CITY']}
        if self.settings['PROVIENCE'] and self.settings['PROVIENCE'] != "":
            condition = {"city": self.settings['CITY'],
                         "provience": self.settings['PROVIENCE']}
        content = self.cities_Col.find_one(condition)
        self.cityhost = content['cityhost']
        self.fillUrl = "http://%s.58.com/ershoufang/" % self.cityhost
        self.city = content["_id"]
        return [scrapy.Request(self.fillUrl)]

    def get_sequence_request(self):
        # Walk the city list one entry per run: take the first city whose
        # status is False, mark it True, and reset all flags once every
        # city has been crawled.
        requests = []
        if self.cities_Col.count({"status": False}) <= 0:
            print("resetting every city's status to False")
            self.cities_Col.update({}, {"$set": {"status": False}},
                                   upsert=True, multi=True)
        content = self.cities_Col.find_one({"status": False})
        self.cities_Col.update({"_id": content["_id"]},
                               {"$set": {"status": True}})
        self.client.close()
        self.cityhost = content['cityhost']
        self.fillUrl = "http://%s.58.com/ershoufang/" % self.cityhost
        self.city = content["_id"]
        requests.append(scrapy.Request(self.fillUrl))
        return requests

    def start_requests(self):
        if self.settings['CITY'] and self.settings['CITY'] != '':
            return self.get_specify_request()
        else:
            return self.get_sequence_request()

    def parseUrls(self, html):
        # Keep only the pagination links (".../ershoufang/pnN/").
        links = html.xpath(".//a/@href")
        urls = []
        for link in links:
            if StringUtil.filtString(self.fillUrl + r"pn\d+?/", link):
                urls.append(link)
        return urls

    def parseItems(self, html, url):
        # Each listing is a div.list-info; its price block is the sibling div.price.
        houselist = html.xpath(".//ul[@class='house-list-wrap']//div[@class='list-info']")
        items = []
        for houseinfo in houselist:
            detailurl = houseinfo.xpath(".//h2[1]/a/@href")
            title = "".join(houseinfo.xpath(".//h2[1]/a/text()"))
            roomNum = "".join(houseinfo.xpath(".//p[1]/span[1]/text()")[0].split())
            size = "".join(houseinfo.xpath(".//p[1]/span[2]/text()"))
            orient = "".join(houseinfo.xpath(".//p[1]/span[3]/text()"))
            floor = "".join(houseinfo.xpath(".//p[1]/span[4]/text()"))
            address = "".join(("".join(houseinfo.xpath(".//p[2]/span[1]//a/text()"))).split())
            sumprice = "".join(houseinfo.xpath("./following-sibling::div[1]//p[@class='sum']/b/text()"))
            unitprice = "".join(houseinfo.xpath("./following-sibling::div[@class='price']//p[@class='unit']/text()"))
            items.append(HouseItem(
                _id="".join(detailurl),
                title=title,
                roomNum=roomNum,
                size=NumberUtil.fromString(size),
                orient=orient,
                floor=floor,
                address=address,
                sumPrice=NumberUtil.fromString(sumprice),
                unitPrice=NumberUtil.fromString(unitprice),
                city=self.city,
                fromUrl=url,
                nowTime=time.time(),
                status="SUBSPENDING"))
        return items

    def parse(self, response):
        if not response.body or response.body == 'None':
            return
        doc = html.fromstring(response.body.decode("utf-8"))
        urls = self.parseUrls(doc)
        items = self.parseItems(doc, response.url)
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)
        for item in items:
            yield item
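
The spider also relies on StringUtil.filtString and NumberUtil.fromString from a private plug.utils module that is not included in the post. Judging only from how they are called, minimal stand-ins could look like the sketch below; the regex-match and number-extraction behaviour are assumptions, not the author's code.

# plug/utils.py -- hypothetical stand-ins for the helpers the spider imports;
# the real module is not shown in the post.
import re


class StringUtil(object):
    @staticmethod
    def filtString(pattern, text):
        # True if text matches the regex pattern (used to keep pagination links).
        return re.match(pattern, text) is not None


class NumberUtil(object):
    @staticmethod
    def fromString(text):
        # Pull the first number out of strings like "89.5平米" or "120万"; 0 if none.
        match = re.search(r"\d+(\.\d+)?", text)
        return float(match.group()) if match else 0.0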

2.items

import scrapy


class HouseItem(scrapy.Item):
    # One scraped second-hand listing; _id is the detail-page URL.
    title = scrapy.Field()
    roomNum = scrapy.Field()
    size = scrapy.Field()
    orient = scrapy.Field()
    floor = scrapy.Field()
    address = scrapy.Field()
    sumPrice = scrapy.Field()
    unitPrice = scrapy.Field()
    _id = scrapy.Field()
    imageurl = scrapy.Field()
    fromUrl = scrapy.Field()
    city = scrapy.Field()
    nowTime = scrapy.Field()
    status = scrapy.Field()
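
The pipeline in the next section also imports a ProxyItem from ershoufang.items, which is missing from the listing above. Its fields are not given anywhere in the post; a minimal guess, assuming the proxy pool only needs an address, a port and a check time, would be:

class ProxyItem(scrapy.Item):
    # Hypothetical fields; the post only shows that ProxyItem instances are
    # inserted into the proxy-pool collection as plain dicts.
    ip = scrapy.Field()
    port = scrapy.Field()
    checkTime = scrapy.Field()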

3.pipelines

#coding: utf-8
# Define your item pipelines here.
# Don't forget to add your pipeline to the ITEM_PIPELINES setting.
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.utils.project import get_project_settings

from ershoufang.items import ProxyItem


class ErshoufangPipeline(object):

    def __init__(self):
        self.settings = get_project_settings()
        self.client = pymongo.MongoClient(
            host=self.settings['MONGO_IP'],
            port=self.settings['MONGO_PORT'])
        self.db = self.client[self.settings['MONGO_DB']]
        self.proxyclient = self.client[self.settings['PROXY_DB']][self.settings['POOL_NAME']]
        self.itemNumber = 0

    def process_proxy(self, item):
        # Proxy items harvested by the spider go into the proxy-pool collection.
        self.proxyclient.insert(dict(item))

    def process_item(self, item, spider):
        if isinstance(item, ProxyItem):
            self.process_proxy(item)
            return item
        try:
            if not item['address']:
                print(item["fromUrl"] + " looks like a broken page, skipping")
                return item
            # Disabled earlier attempt at de-duplicating by _id:
            # if self.db.ershoufang.count({"_id": item["_id"], "city": item['city']}) <= 0:
            #     print("removing")
            #     self.db.ershoufang.remove({"_id": item["_id"]})
            coll = self.db[self.settings['ALL']]
            coll.insert(dict(item))
            self.itemNumber += 1
            print("scraped house no. %s, address %s" % (self.itemNumber, item['address']))
        except Exception:
            # _id is the detail URL, so inserting a duplicate raises here.
            print("house already exists: " + item['address'])
        return item

    def close_spider(self, spider):
        self.client.close()
        print("this run scraped %s house records in total" % self.itemNumber)
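
Both the spider and the pipeline read a number of custom keys from the Scrapy settings (MONGO_IP, MONGO_PORT, CITY_DB, CITY_COL, MONGO_DB, ALL, PROXY_DB, POOL_NAME, CITY, PROVIENCE). settings.py is not part of the post; a sketch of the entries the code expects, with placeholder values only, might look like this:

# settings.py (sketch) -- keys inferred from the spider and pipeline above;
# every value here is a placeholder, not the author's configuration.
BOT_NAME = 'ershoufang'

ITEM_PIPELINES = {
    'ershoufang.pipelines.ErshoufangPipeline': 300,
}

MONGO_IP = '127.0.0.1'
MONGO_PORT = 27017
MONGO_DB = 'house'        # database holding the scraped listings
ALL = 'ershoufang'        # collection for all listings
CITY_DB = 'cities'        # database with one document per 58.com city
CITY_COL = 'citylist'     # city documents: {_id, cityhost, provience, status}
PROXY_DB = 'proxy'        # proxy-pool database
POOL_NAME = 'pool'        # proxy-pool collection
CITY = ''                 # set to a city _id to scrape only that city
PROVIENCE = ''            # optional province filter used together with CITY

With CITY left empty the spider walks the city list one entry per run, which is what the five-minute cron setup relies on; a single city can also be forced from the command line with Scrapy's settings override, e.g. scrapy crawl ershoufang -s CITY=<city id>.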

After letting it run for three days I had scraped a bit over two million records; the result looks like this:
(screenshot of the scraped results)
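
A quick sanity check on the result set can be done directly against MongoDB with pymongo; the snippet below counts the listings overall and per city. The database and collection names follow the settings sketch above, so they are assumptions.

# count_results.py -- rough check of how much was scraped;
# the 'house'/'ershoufang' names are placeholders from the settings sketch.
import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
coll = client['house']['ershoufang']

print("total listings: %s" % coll.count())
# top ten cities by number of listings
pipeline = [{"$group": {"_id": "$city", "n": {"$sum": 1}}},
            {"$sort": {"n": -1}},
            {"$limit": 10}]
for row in coll.aggregate(pipeline):
    print("%s: %s" % (row["_id"], row["n"]))
client.close()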
