Python data crawler: scraping second-hand housing listings from Lianjia
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib
import math
import codecs

domain = 'http://bj.lianjia.com'
url = domain + '/ershoufang/'

def get_all_xiaoqu_info():
    html_doc = urllib.urlopen(url).read()
    soup = BeautifulSoup(html_doc, 'html.parser')

    def has_href_but_no_title(tag):
        return tag.has_attr('href') and not tag.has_attr('title')

    # get all districts, then every sub-area link under each district
    all_xiao_list = []
    for item in soup.find(attrs={"data-role": "ershoufang"}).find_all('a'):
        quyu_url = item.get('href')
        quyu_html_doc = urllib.urlopen(domain + quyu_url).read()
        quyu_soup = BeautifulSoup(quyu_html_doc, 'html.parser')
        for item02 in quyu_soup.find(attrs={"data-role": "ershoufang"}).find_all(has_href_but_no_title):
            # [sub-area URL, district name, sub-area name]
            xiaoqu_info = [domain + item02.get('href'), item.string, item02.string]
            all_xiao_list.append(xiaoqu_info)
    return all_xiao_list

# request headers; note that urllib.urlopen() cannot send these --
# see the urllib2 sketch after the script
headers = {
    'Accept': r'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': r'gzip, deflate',
    'Accept-Language': r'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Connection': r'keep-alive',
    'Host': r'bj.lianjia.com',
    'Referer': r'http://bj.lianjia.com/ershoufang/beiqijia',
    'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
    # 'Cookie': r'...',  # session-specific cookie string trimmed
    'DN': '1',
    'Cache-Control': 'max-age=0',
}

def getHouseInfoForOneURL(quyuInfo):
    house_list = []
    xiaoqu_url = quyuInfo[0]    # e.g. http://bj.lianjia.com/ershoufang/changying/
    xiao_html_doc = urllib.urlopen(xiaoqu_url).read()
    xiaoqu_soup = BeautifulSoup(xiao_html_doc, 'html.parser')
    total_house = int(xiaoqu_soup.find("h2", class_="total fl").find('span').string)
    if total_house == 0:
        print 'no house found'
        return house_list
    if total_house > 3000:
        print '3000+ houses found, need to split into smaller queries'
        return house_list
    total_page = int(math.ceil(total_house / 30.0))   # 30 listings per page
    for page_index in range(1, total_page + 1):
        # paginate the current sub-area (the original hard-coded /changying/ here)
        page_url = '%sp%d/' % (xiaoqu_url, page_index)
        page_html_doc = urllib.urlopen(page_url).read()
        page_soup = BeautifulSoup(page_html_doc, 'html.parser')
        for info_tag in page_soup.find('ul', class_='listContent').find_all('div', class_="info"):
            # the original reused the name house_info for both the loop tag and
            # the dict, so item assignments went to the BS4 tag; build a fresh
            # dict per listing instead
            house_info = dict()
            house_info['href'] = info_tag.find('a').get('href')
            house_info['title'] = info_tag.find('a').string
            house_info['address_xiaoqu'] = info_tag.find('div', class_="houseInfo").a.string
            house_info['address_info'] = info_tag.find('div', class_="houseInfo").a.next_sibling
            house_info['flood'] = info_tag.find('div', class_="flood").find('div', class_="positionInfo").span.next_sibling
            # originally stored under 'area', which quyuInfo[1] then overwrote
            house_info['position'] = info_tag.find('div', class_="flood").find('div', class_="positionInfo").a.string
            house_info['totalPrice'] = info_tag.find('div', class_="totalPrice").span.string + \
                info_tag.find('div', class_="totalPrice").span.next_sibling
            house_info['unitPrice'] = info_tag.find('div', class_="unitPrice").span.string
            house_info['area'] = quyuInfo[1]      # district name
            house_info['area02'] = quyuInfo[2]    # sub-area name
            house_list.append(house_info)
    return house_list

out_file = codecs.open('lianjia.txt', 'wb', 'utf-8')
quyu_list = get_all_xiaoqu_info()
for quyu in quyu_list:
    print '%s %s :(%s)' % (quyu[0], quyu[1], quyu[2])
    house_list = getHouseInfoForOneURL(quyu)
    for house in house_list:
        house_str = '%s|%s|%s|%s|%s|%s|%s|%s|%s\n' % (
            house['area'], house['area02'], house['title'],
            house['address_xiaoqu'], house['address_info'], house['flood'],
            house['position'], house['totalPrice'], house['unitPrice'])
        out_file.write(house_str)
out_file.close()
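Note that the headers dict above is assembled but never actually sent: urllib.urlopen() has no way to attach custom headers, so every request goes out without the User-Agent, Referer, or cookies. Below is a minimal sketch of how the same fetches could be routed through urllib2 instead; the fetch() helper is a name introduced here, not part of the original script, and the gzip handling is only needed because the headers advertise 'Accept-Encoding: gzip, deflate'.

import gzip
import urllib2
from StringIO import StringIO

def fetch(page_url):
    # hypothetical helper: send the custom headers defined above
    request = urllib2.Request(page_url, headers=headers)
    response = urllib2.urlopen(request)
    data = response.read()
    # the headers ask for gzip, so the body may come back compressed
    if response.info().get('Content-Encoding') == 'gzip':
        data = gzip.GzipFile(fileobj=StringIO(data)).read()
    return data

Each urllib.urlopen(...).read() call in the script could then be replaced by fetch(...); alternatively, dropping the Accept-Encoding entry from headers avoids the decompression step entirely.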