R语言爬虫实践一

来源:互联网 发布:漳州网络安全教育平台 编辑:程序博客网 时间:2024/05/16 06:34

R语言爬虫-上海市二手房信息

由于最近一直在关注房产信息,心血来潮想看一下最近上海市二手房的信息,所以利用R语言爬了1万+套二手房信息供参考,下面是抓取网页信息的代码,还请多多指教。

参考代码如下:

# Scrape Shanghai second-hand housing listings from sh.centanet.com.
# Flow: fetch page 1 -> read total page count from the pager widget ->
# parse every page's listings -> bind all rows -> write a CSV.
library(RCurl)
library(XML)
library(raster)   # NOTE(review): was only used for trim(); base trimws() is used below
library(stringr)

# Entry URL: page 1 of the second-hand-housing index.
start_url <- "http://sh.centanet.com/ershoufang/g1/"

# Browser-like request headers so the server returns the normal HTML page.
cust_header <- c(
  "User-Agent"      = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0",
  "Accept"          = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection"      = "keep-alive"
)

# Download the raw page source of page 1.
pagesource <- getURL(start_url, httpheader = cust_header, .encoding = "utf-8")

# Parse the total number of result pages.
#
# The pager <span class="mr_10"> holds text of the form "current/total";
# the number after "/" is the total page count. Returns NA if the pager
# cannot be found (site layout changed), which the caller guards against.
parseTotalPage <- function(pagesource) {
  doc <- htmlParse(pagesource)
  pager_text <- sapply(
    getNodeSet(doc, '//div[contains(@class,"result-lists")]//div[contains(@class,"select-bar clearfix")]//p[contains(@class,"pagerNum fr")]//span[@class="mr_10"]//text()'),
    xmlValue
  )
  as.numeric(str_split(pager_text, "/")[[1]][2])
}

# Parse one listing page into a data.frame with: estate name, room type,
# floor area, direction/floor/decoration/year string, location split into
# region / town / road, total price, and unit price.
parseContent <- function(pagesource) {
  doc <- htmlParse(pagesource)

  # Estate (residential complex) name.
  district_name <- sapply(getNodeSet(doc, '//div[contains(@class,"item-info fl")]//p[contains(@class,"f14 f000 mb_10")]//a[@class="f000 mr_10"]//text()'), xmlValue)
  # Room layout, e.g. "2室1厅".
  room_type <- sapply(getNodeSet(doc, '//div[contains(@class,"item-info fl")]//p[contains(@class,"f14 f000 mb_10")]//span[@class="f000 mr_10"]//text()'), xmlValue)
  # Floor area.
  room_area <- sapply(getNodeSet(doc, '//div[contains(@class,"item-info fl")]//p[contains(@class,"f14 f000 mb_10")]//span[@class="f000"]//text()'), xmlValue)

  # Direction / floor / decoration / year: the text nodes are scattered
  # across line breaks, so join, re-split on "\n", trim, and drop empties.
  room_direction <- sapply(getNodeSet(doc, '//div[contains(@class,"item-info fl")]//p[contains(@class,"f7b mb_10")]//text()'), xmlValue)
  room_direction <- trimws(str_split(trimws(paste(room_direction, collapse = "")), "\n")[[1]])
  room_direction <- room_direction[nchar(room_direction) != 0]

  # Location text, e.g. "region-town road"; drop whitespace-only nodes.
  room_location <- trimws(sapply(getNodeSet(doc, '//div[contains(@class,"item-info fl")]//p[@class="f7b mb_15"]//text()'), xmlValue))
  room_location <- room_location[nchar(room_location) != 0]

  # Split each location once into region / town / road, building the rows
  # as a list and binding once (no rbind-in-loop growth).
  location_rows <- lapply(str_split(room_location, " "), function(parts) {
    region_town <- str_split(parts[1], "-")[[1]]
    data.frame(
      room_region = region_town[1],
      room_town   = region_town[2],
      room_road   = parts[2],
      stringsAsFactors = FALSE
    )
  })
  room_info <- do.call(rbind, location_rows)

  # Total price and price per square meter.
  room_price <- sapply(getNodeSet(doc, '//div[contains(@class,"item-pricearea fr")]//p[@class="price-nub cRed"]//text()'), xmlValue)
  room_avgprice <- sapply(getNodeSet(doc, '//div[contains(@class,"item-pricearea fr")]//p[@class="f14 f000 mb_15 fsm"]//text()'), xmlValue)

  # One row per listing on this page.
  data.frame(
    district_name, room_type, room_area, room_direction,
    room_info$room_region, room_info$room_town, room_info$room_road,
    room_price, room_avgprice,
    stringsAsFactors = FALSE
  )
}

# Page count from page 1; fall back to a single page if the pager was
# missing or unparseable (original code would have built a bad 1:0 range).
total_page <- parseTotalPage(pagesource)
if (is.na(total_page) || total_page < 1) {
  total_page <- 1
}

# Parse page 1, then build the URLs for pages 2..total_page (empty when
# there is only one page) and parse each, accumulating in a list.
page_results <- vector("list", total_page)
page_results[[1]] <- parseContent(pagesource)
if (total_page >= 2) {
  url_list <- paste0("http://sh.centanet.com/ershoufang/g", seq(2, total_page), "/")
  for (i in seq_along(url_list)) {
    pagesource <- getURL(url_list[i], httpheader = cust_header, .encoding = "utf-8")
    page_results[[i + 1]] <- parseContent(pagesource)
  }
}
pageresults <- do.call(rbind, page_results)

# Persist as a real comma-separated file (the original used write.table's
# space separator despite the .csv name); create the target dir if absent.
dir.create("./pachong", showWarnings = FALSE, recursive = TRUE)
write.csv(pageresults, "./pachong/pa_house.csv", row.names = TRUE)
原创粉丝点击