House Price Analysis with Big Data -- 1. Data Crawling
The data was crawled with Python 2.6 and the Scrapy framework. I originally wrote a whole-site crawler that, starting from a single seed URL, could crawl the housing-price listings for every city on 58.com. The problem is that this approach needs proxy IPs; without them the crawler gets banned very quickly. My workaround was to run the crawler on Linux once every five minutes, crawling only one city's listings per run. The code is as follows.
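One way to implement that five-minute schedule is a crontab entry along the following lines; the project path and log file are placeholders I made up, not details from the original post:

```
# run the spider every five minutes, crawling one city per invocation
*/5 * * * * cd /path/to/ershoufang && scrapy crawl ershoufang >> /tmp/ershoufang.log 2>&1
```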
1.spiders
```python
#encoding=utf-8
import sys
sys.path.append("..")
from scrapy.spiders import Spider
from lxml import html
import plug
from plug.utils import StringUtil, NumberUtil
from ershoufang.items import HouseItem
import re
import scrapy
import time
import pymongo
from scrapy.utils.project import get_project_settings


class erShouSpider(Spider):
    name = "ershoufang"
    allowed_domains = ["58.com"]

    def __init__(self):
        super(erShouSpider, self).__init__()
        self.settings = get_project_settings()
        self.client = pymongo.MongoClient(
            self.settings['MONGO_IP'],
            self.settings['MONGO_PORT'])
        # collection holding one document per city, with a "status" flag
        # marking whether that city has been crawled in the current round
        self.cities_db = self.client[self.settings['CITY_DB']]
        self.cities_Col = self.cities_db[self.settings['CITY_COL']]
        self.fillUrl = ""
        self.cityhost = ""
        self.city = ""

    def get_specify_request(self):
        # Build the start request for the single city named in the settings
        condition = {"city": self.settings['CITY']}
        if self.settings['PROVIENCE'] and self.settings['PROVIENCE'] != "":
            condition = {"city": self.settings['CITY'],
                         "provience": self.settings['PROVIENCE']}
        content = self.cities_Col.find_one(condition)
        self.cityhost = content['cityhost']
        self.fillUrl = "http://%s.58.com/ershoufang/" % self.cityhost
        self.city = content["_id"]
        return [scrapy.Request(self.fillUrl)]

    def get_sequence_request(self):
        # Crawl the cities in sequence: take the first city not yet crawled,
        # mark it done, and reset all flags once every city has been covered
        requests = []
        if self.cities_Col.count({"status": False}) <= 0:
            print("Resetting all city flags to False")
            self.cities_Col.update({}, {"$set": {"status": False}}, multi=True)
        content = self.cities_Col.find_one({"status": False})
        self.cities_Col.update({"_id": content["_id"]}, {"$set": {"status": True}})
        self.client.close()  # the spider itself no longer needs MongoDB
        self.cityhost = content['cityhost']
        self.fillUrl = "http://%s.58.com/ershoufang/" % self.cityhost
        self.city = content["_id"]
        requests.append(scrapy.Request(self.fillUrl))
        return requests

    def start_requests(self):
        if self.settings['CITY'] and self.settings['CITY'] != '':
            return self.get_specify_request()
        else:
            return self.get_sequence_request()

    def parseUrls(self, html):
        # Collect this city's pagination links (".../ershoufang/pnN/");
        # filtString is a regex helper from the author's plug.utils module
        links = html.xpath(".//a/@href")
        urls = []
        for link in links:
            if StringUtil.filtString(self.fillUrl + r"pn\d+?/", link):
                urls.append(link)
        return urls

    def parseItems(self, html, url):
        # Extract one HouseItem per listing block on the page
        houselist = html.xpath(".//ul[@class='house-list-wrap']//div[@class='list-info']")
        items = []
        for houseinfo in houselist:
            detailurl = houseinfo.xpath(".//h2[1]/a/@href")
            title = "".join(houseinfo.xpath(".//h2[1]/a/text()"))
            roomNum = "".join("".join(houseinfo.xpath(".//p[1]/span[1]/text()")).split())
            size = "".join(houseinfo.xpath(".//p[1]/span[2]/text()"))
            orient = "".join(houseinfo.xpath(".//p[1]/span[3]/text()"))
            floor = "".join(houseinfo.xpath(".//p[1]/span[4]/text()"))
            address = "".join(("".join(houseinfo.xpath(".//p[2]/span[1]//a/text()"))).split())
            sumprice = "".join(houseinfo.xpath("./following-sibling::div[1]//p[@class='sum']/b/text()"))
            unitprice = "".join(houseinfo.xpath("./following-sibling::div[@class='price']//p[@class='unit']/text()"))
            items.append(HouseItem(
                _id="".join(detailurl),  # the detail-page URL doubles as the primary key
                title=title,
                roomNum=roomNum,
                size=NumberUtil.fromString(size),
                orient=orient,
                floor=floor,
                address=address,
                sumPrice=NumberUtil.fromString(sumprice),
                unitPrice=NumberUtil.fromString(unitprice),
                city=self.city,
                fromUrl=url,
                nowTime=time.time(),
                status="SUBSPENDING"))
        return items

    def parse(self, response):
        if not response.body:
            return
        doc = html.fromstring(response.body.decode("utf-8"))
        urls = self.parseUrls(doc)
        items = self.parseItems(doc, response.url)
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)
        for item in items:
            yield item
```
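The spider pulls all of its configuration from the Scrapy settings. The post does not show settings.py, but from the keys the code reads, it would need entries along these lines; every value below is an illustrative assumption, not from the original:

```python
# settings.py -- illustrative values only; the original post does not show them
MONGO_IP = "127.0.0.1"   # MongoDB host
MONGO_PORT = 27017       # MongoDB port
CITY_DB = "cities"       # database holding the list of cities to crawl
CITY_COL = "citylist"    # collection with one document per city
CITY = ""                # set to a city name to crawl just that city
PROVIENCE = ""           # optional province filter (spelling as in the code)
MONGO_DB = "house"       # database the pipeline writes items into
ALL = "ershoufang"       # collection for the crawled listings
PROXY_DB = "proxy"       # database holding the proxy pool
POOL_NAME = "pool"       # proxy-pool collection
```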
2.items
```python
# -*- coding: utf-8 -*-
import scrapy


class HouseItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()      # listing title
    roomNum = scrapy.Field()    # room layout
    size = scrapy.Field()       # floor area
    orient = scrapy.Field()     # orientation
    floor = scrapy.Field()      # floor information
    address = scrapy.Field()    # address / neighbourhood
    sumPrice = scrapy.Field()   # total price
    unitPrice = scrapy.Field()  # price per square metre
    _id = scrapy.Field()        # detail-page URL, used as the primary key
    imageurl = scrapy.Field()
    fromUrl = scrapy.Field()    # listing page the item was parsed from
    city = scrapy.Field()
    nowTime = scrapy.Field()    # crawl timestamp
    status = scrapy.Field()     # processing status, e.g. "SUBSPENDING"
```
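The pipeline below also imports a ProxyItem from the same module. Its definition is not shown in the post, so the following is only a minimal sketch with assumed fields:

```python
class ProxyItem(scrapy.Item):
    # minimal sketch -- the original post does not show this class,
    # so these fields are assumptions about what a proxy record holds
    ip = scrapy.Field()
    port = scrapy.Field()
```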
3.pipelines
```python
#coding: utf-8
import codecs
import json
import pymongo
from scrapy.utils.project import get_project_settings

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from ershoufang.items import ProxyItem


class ErshoufangPipeline(object):
    def __init__(self):
        self.settings = get_project_settings()
        self.client = pymongo.MongoClient(
            host=self.settings['MONGO_IP'],
            port=self.settings['MONGO_PORT'])
        self.db = self.client[self.settings['MONGO_DB']]
        # collection used as the proxy pool
        self.proxyclient = self.client[self.settings['PROXY_DB']][self.settings['POOL_NAME']]
        self.itemNumber = 0

    def process_proxy(self, item):
        self.proxyclient.insert(dict(item))

    def process_item(self, item, spider):
        # proxy records go to the proxy pool, house records to the listings collection
        if isinstance(item, ProxyItem):
            self.process_proxy(item)
            return item
        try:
            if not item['address']:
                print(item["fromUrl"] + " page looks abnormal")
                return item
            '''
            if self.db.ershoufang.count({"_id": item["_id"], "city": item['city']}) <= 0:
                print("deleting")
                self.db.ershoufang.remove({"_id": item["_id"]})
            '''
            coll = self.db[self.settings['ALL']]
            coll.insert(dict(item))
            self.itemNumber += 1
            print("Crawled house No.%s, address: %s" % (self.itemNumber, item['address']))
        except Exception as e:
            # _id is the listing URL, so inserting a duplicate raises an error
            print("House already exists: " + item['address'])
        return item

    def close_spider(self, spider):
        self.client.close()
        print("This run collected %s house records in total" % self.itemNumber)
```
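For the pipeline to receive any items it has to be enabled in the project settings. A typical registration looks like the following; the priority value 300 is an arbitrary conventional choice:

```python
# settings.py
ITEM_PIPELINES = {
    "ershoufang.pipelines.ErshoufangPipeline": 300,
}
```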
After three days of crawling, I had collected more than two million records.