【python 爬虫】链家天津租房在售房源数据爬虫
来源:互联网 发布:中国网络墙 编辑:程序博客网 时间:2024/04/27 16:50
爬取字段:
户型、面积、朝向、小区、价格、url
# -*- coding: utf-8 -*-
"""Scraper for Lianjia Tianjin rental listings (m.lianjia.com mobile site).

Walks pages 1..109 of the Tianjin /zufang/ listing, extracting per listing:
  - data1: layout / area / orientation text (item_minor em nodes)
  - data2: community / price text (item_other text nodes)
  - url_list: absolute detail-page URLs
and writes the collected rows to an Excel workbook.

Fixes over the original:
  - bare ``except: pass`` replaced with narrow, logged per-page handling
    (best-effort semantics kept: a failed page is skipped, not fatal);
  - the three result lists are truncated to a common length before building
    the DataFrame (the original raised ValueError on unequal lengths,
    discarding everything already scraped);
  - the User-Agent pool, headers and the requests session are created once
    instead of on every iteration;
  - Python 3 syntax; the ``reload(sys)``/``setdefaultencoding`` hack dropped.
"""
import random
import time

import pandas as pd
import requests
from lxml import etree

CITY = 'tj'

# One UA is drawn at random per request to reduce the chance of blocking.
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
    'Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Mozilla/5.0 (iPad; U; CPU OS 3_2_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B500 Safari/531.21.10',
    'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
    'Nokia5700AP23.01/SymbianOS/9.1 Series60/3.0',
    'UCWEB7.0.2.37/28/999',
    'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999',
    'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.10.229 Version/11.61',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.52.7 (KHTML, like Gecko) Version/5.1.2',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.10.229 Version/11.61',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
    'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543a Safari/419.3',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
    'Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
    'Opera/9.64 (Windows NT 5.1; U; en) Presto/2.1.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X; U; en) Presto/2.2.15 Version/10.10',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
]

# Fixed headers; NOTE(review): the Cookie is a captured session from the
# original author and has almost certainly expired — refresh before use.
BASE_HEADERS = {
    "Cookie": "select_nation=1; lj-ss=9fc6cee08e4d99ced4584517044e1242; lianjia_uuid=cc14aa71-496b-40db-80fd-d136e55a4f81; UM_distinctid=15f22e787c4232-0d1b04a52ef6d2-3a3e5e06-1fa400-15f22e787c5c79; select_city=120000; lianjia_token=2.0079fcccf103161e086851e5c0a1095042; _ga=GA1.2.1281750422.1508119120; _gid=GA1.2.11799009.1508119120; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1508119120; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1508119221; _gat=1; _gat_past=1; _gat_new=1; _gat_global=1; _gat_new_global=1; CNZZDATA1254525948=274494434-1508117917-%7C1508117917; CNZZDATA1253491255=227527654-1508117030-%7C1508117030; lianjia_ssid=403ab941-dd7a-4efc-ac03-52079a8c9b93",
    "Host": "m.lianjia.com",
    "Referer": "https://m.lianjia.com/tj/zufang/pg3/",
}


def _scrape_page(session, page):
    """Fetch one listing page and return (attrs, others, urls) lists.

    attrs  -- text of //div[@class="item_minor"]//span//em nodes
    others -- text of //div[@class="item_other text_cut"] nodes
    urls   -- absolute detail URLs (hrefs containing "html")

    Raises requests.RequestException on network failure; lxml may raise on
    an unparseable body.
    """
    url = "https://m.lianjia.com/tj/zufang/pg" + str(page) + "/?_t=1"
    headers = dict(BASE_HEADERS, **{'User-Agent': random.choice(USER_AGENT_LIST)})
    html = session.get(url, headers=headers).content
    selector = etree.HTML(html)
    attrs = selector.xpath('//div[@class="item_minor"]//span//em/text()')
    others = selector.xpath('//div[@class="item_other text_cut"]/text()')
    urls = [
        "https://m.lianjia.com" + str(href)
        for href in selector.xpath('//li[@class="pictext"]//a/@href')
        if "html" in href
    ]
    return attrs, others, urls


def main():
    """Scrape all pages and write the result to c:\\lianjia_new.xlsx."""
    start = time.time()
    host_url = 'http://' + CITY + '.lianjia.com/zufang/'
    print(host_url)

    data1, data2, url_list = [], [], []
    session = requests.session()  # one session: reuses the TCP connection
    for page in range(1, 110):
        print("正在抓取第" + str(page) + "页......")
        try:
            attrs, others, urls = _scrape_page(session, page)
        except Exception as exc:  # best-effort: skip the page, but say why
            print("page %d failed: %s" % (page, exc))
            continue
        data1.extend(attrs)
        data2.extend(others)
        url_list.extend(urls)

    print(len(data1), len(data2), len(url_list))

    # The three lists can diverge in length when a page renders incomplete
    # rows; truncate to the common length so DataFrame construction cannot
    # raise ValueError and discard everything scraped so far.
    n = min(len(data1), len(data2), len(url_list))
    data = pd.DataFrame({
        "data1": data1[:n],
        "data2": data2[:n],
        "url_list": url_list[:n],
    })
    print(len(data))

    # strings_to_urls=False: keep URL strings as plain text so xlsxwriter
    # does not fail on >65k hyperlinks or mangle long URLs.
    writer = pd.ExcelWriter(r'c:\\lianjia_new.xlsx', engine='xlsxwriter',
                            options={'strings_to_urls': False})
    data.to_excel(writer, index=False)
    writer.close()

    print('ok,爬虫结束!')
    print('总共耗时:' + str(time.time() - start) + 's')


if __name__ == "__main__":
    main()
阅读全文
1 0
- 【python 爬虫】链家天津租房在售房源数据爬虫
- python爬虫租房信息在地图上显示
- python 爬虫抓取19楼租房信息
- python写爬虫3-MongoDB数据缓存(采集58出租房信息)
- python 数据爬虫 爬取糗百
- python 爬虫数据清洗
- Python爬虫爬数据
- python爬虫-->获取数据
- Python 爬虫学习2爬取租房网站信息
- Python爬虫初体验之赶集网租房信息获取
- Python爬虫--爬取赶集网的租房信息
- Python爬虫:获取链家,搜房,大众点评的数据
- python写爬虫4-多线程爬虫(采集58出租房信息)
- python写爬虫5-多进程爬虫(采集58出租房信息)
- Python网络爬虫,在网站上扒数据
- 聚合数据API爬虫:Python
- Python爬虫抓取动态数据
- python数据爬虫示例一
- arm-arago-linux-gnueabi-gcc没有那个文件或目录
- 刷新页面,js实现文章浏览量自动更新
- 【服务端知识点】MAC OSX 安装MongoDB
- 长期:如何找到适合自己的节奏?
- 一文说尽HMM(隐马尔科夫链)
- 【python 爬虫】链家天津租房在售房源数据爬虫
- 随机抽奖代码片段--Java
- 模仿京东首页DIV懒加载,根据滚动条来加载div
- java 从数据库表反射出实体类,自动生成实体类
- 关于python3的分号:
- 复习_日志审计
- 电信各种视频免流卡申请地址合集附地址失效解决方法
- Spring配置文件,引入dubbo框架,解决dubbo标签报错问题
- 网络爬虫初探