使用bs4爬取链家网的二手房信息

来源:互联网 发布:生产者消费者模式java 编辑:程序博客网 时间:2024/04/27 19:22
# coding=utf-8
"""Scrape second-hand housing listings from bj.lianjia.com with bs4.

For each listing page: download the HTML, extract title / subtitle /
prices / cover image for every house, download the cover image, and
append a text record to a local info file.
"""
import codecs
import os

import requests
from bs4 import BeautifulSoup


class HouseInfo:
    """One listing: cover image URL, title, subtitle, total and unit price."""

    def __init__(self, imageUrl, title, subTitle, totalPrice, unitPrice):
        self.imageUrl = imageUrl
        self.title = title
        self.subTitle = subTitle
        self.totalPrice = totalPrice
        self.unitPrice = unitPrice
        # Filled in by Spider.saveData once the image has been downloaded.
        self.localImage = ""

    def __str__(self):
        # BUGFIX: the original ran imageUrl and localImage together with no
        # separator; keep every field on its own line in the saved record.
        return ('title:' + self.title +
                '\n subTitle:' + self.subTitle +
                '\n totalPrice:' + self.totalPrice +
                '\n unitPrice:' + self.unitPrice +
                '\n imageUrl:' + self.imageUrl +
                '\n localImage:' + self.localImage)


class Spider:
    """Fetch listing pages, parse them, and persist text + images to disk."""

    # Single source of truth for the output directory (the original
    # repeated this literal in four places).
    BASE_DIR = 'd:/python/pachong/lianjia'

    def __init__(self):
        self.currentPage = -1
        if not os.path.exists(self.BASE_DIR):
            os.makedirs(self.BASE_DIR)

    def setCurrentPage(self, page):
        """Record the page being scraped and create its image directory."""
        self.currentPage = page
        path = '{}/{}'.format(self.BASE_DIR, page)
        if not os.path.exists(path):
            os.makedirs(path)

    def getHtmlData(self, page):
        """Download the listing HTML for *page* and hand it to the parser."""
        self.setCurrentPage(page)
        headers = {
            # A desktop browser UA keeps the site from serving a bot page.
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        url = 'https://bj.lianjia.com/ershoufang/pg{}/'.format(page)
        response = requests.get(url, headers=headers, allow_redirects=False)
        # Fail loudly on a bad response instead of parsing an error page.
        response.raise_for_status()
        html = response.content.decode('utf8')
        self.getDataFromHtml(html)

    def getDataFromHtml(self, html):
        """Parse one listing page and save every house found on it."""
        soup = BeautifulSoup(html, 'html.parser')
        houseList = []
        for li in soup.select(".sellListContent li"):
            imgs = li.select("img")
            titles = li.select(".title a")
            info = li.select(".address .houseInfo")
            # Skip ad/placeholder <li> entries that carry no listing data
            # (the original crashed with IndexError/KeyError on these).
            if not imgs or not titles or not info:
                continue
            imageUrl = imgs[0].get('data-original', '')
            print(imageUrl)
            title = titles[0].string
            tag = info[0]
            subTitle = tag.contents[1].string + tag.contents[2]
            totalPrice = li.select(".priceInfo > .totalPrice > span")[0].string
            unitPrice = li.select(".priceInfo > .unitPrice > span")[0].string
            print(unitPrice)
            houseList.append(
                HouseInfo(imageUrl, title, subTitle, totalPrice, unitPrice))
        self.saveData(houseList)

    def saveData(self, houseList):
        """Download each house's image, then append its text record."""
        with codecs.open("d:/python/pachong/info.txt", "a+",
                         encoding="utf8") as f:
            for house in houseList:
                house.localImage = self.saveImageData(house.imageUrl)
                f.write(str(house))
                f.write('\n-------------------------\n')
                f.flush()

    def beginSpider(self, beginPage, size):
        """Scrape *size* consecutive pages starting at *beginPage*."""
        for page in range(beginPage, beginPage + size):
            self.getHtmlData(page)

    def saveImageData(self, imageUrl):
        """Download one listing image; return the local file path."""
        response = requests.get(imageUrl)
        name = imageUrl.split('/')[-1]
        path = '{}/{}/{}'.format(self.BASE_DIR, self.currentPage, name)
        # BUGFIX: binary data must go through plain open(), not
        # codecs.open() — the latter exists to apply a text codec.
        with open(path, 'wb') as f:
            f.write(response.content)
        return path


if __name__ == "__main__":
    spider = Spider()
    spider.beginSpider(1, 1)
原创粉丝点击