RCurl抓取团购信息

来源:互联网 发布:网络小鲜肉图片大全 编辑:程序博客网 时间:2024/04/29 20:15

抓取团购信息

# Spoofed HTTP request headers: pretend to be a mobile browser so the
# site does not reject the scraper's requests.
myheader <- c(
  "User-Agent" = "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2 ",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)

# Download the first deal-listing page (Nanjing, category 1) with the
# spoofed headers, then dump the raw HTML to a file for manual inspection.
temp <- getURL(
  "http://t.dianping.com/list/nanjing-category_1",
  httpheader = myheader,
  encoding = "UTF-8"
)
write.table(temp, "temp.txt")

 

# Download the second listing page to confirm that paginated pages can be
# fetched the same way.
temp1 <- getURL(
  "http://t.dianping.com/list/nanjing-category_1?pageIndex=1",
  httpheader = myheader,
  encoding = "UTF-8"
)
# BUG FIX: the original wrote `temp` here, so temp1.txt just duplicated
# page 1 instead of showing the paginated result being checked.
write.table(temp1, "temp1.txt")

 

# Parse the downloaded HTML into an XML document tree.
k <- htmlParse(temp)
k

# Extract the text of every <a class="tg-floor-title"> node (the deal
# titles).  vapply() replaces sapply() so the result is guaranteed to be
# a character vector even when zero or one node matches.
youhui <- vapply(
  getNodeSet(k, '//a [@class=\"tg-floor-title\"]'),
  xmlValue,
  character(1)
)

 <a class=\"tg-floor-title\" target=\"_blank\" href=\"/deal/21919506\"

data-hippo-track=\"|1|module#5_con_list,action#click,index#2,dealgrp_id#21919506,query_id#c682fa87-6ceb-4423-8d9e-b65720eac1de\"

 >

            <h3>大渝火锅</h3>

            <h4>        [16店通用]

 仅售90元!价值100元的代金券1张,除酒水饮料、调料外全场通用,可叠加使用,可免费使用包间,提供免费WiFi</h4>

        </a>

可以看到标题大概都保存在 <a class=...></a> 中，所以用 '//a [@class=\"tg-floor-title\"]' 提取；其中 @ 代表 class 是一个属性，并且要用中括号括起来，XPath 规定就是这样写。

# Spot-check one extracted title, then dump the whole list to a file
# for inspection.
youhui[30]
write.table(youhui, "meishiyouhui.txt")

# Build the URLs of the first 7 result pages.  The site paginates via a
# `pageIndex` query parameter, so the links differ only in that number.
page <- 1:7
# paste0() vectorizes over `page`, replacing the original pattern of
# initializing `urllist` to 0 and growing it by indexed assignment.
urllist <- paste0(
  "http://t.dianping.com/list/nanjing-category_1?pageIndex=",
  page
)

# For each of the 7 page URLs: download, parse, extract the deal titles,
# and append them to meishi.txt.  append = TRUE keeps each page's titles
# from overwriting the previous page's output.
for (url in urllist) {
  temp <- getURL(url, httpheader = myheader, encoding = "UTF-8")
  k <- htmlParse(temp)
  # vapply() instead of sapply() guarantees a character result.
  youhui <- vapply(
    getNodeSet(k, '//a [@class=\"tg-floor-title\"]'),
    xmlValue,
    character(1)
  )
  cat(url, "\n")  # progress indicator
  # TRUE spelled out: `T` is a reassignable variable, not a keyword.
  write.table(youhui, "meishi.txt", append = TRUE)
}

#随机设定伪装头的方法,定义一个列表,从里面随机选取一个作为user-agent

#大众点评美食团购页面,测试另外一种伪装头的方法
library(RCurl)
library(XML)

# Pool of browser User-Agent strings.  One is drawn at random per run so
# successive scrapes do not all carry the same browser fingerprint.
UserAgent <- c(
  "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
  "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
  "Mozilla/5.0 (Windows NT 6.1; Intel Mac OS X 10.6; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
  "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36 OPR/18.0.1284.68",
  "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
  "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
  "Opera/9.80 (Macintosh; Intel Mac OS X 10.9.1) Presto/2.12.388 Version/12.16",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36 OPR/18.0.1284.68",
  "Mozilla/5.0 (iPad; CPU OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) CriOS/30.0.1599.12 Mobile/11A465 Safari/8536.25",
  "Mozilla/5.0 (iPad; CPU OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4",
  "Mozilla/5.0 (iPad; CPU OS 7_0_2 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A501 Safari/9537.53",
  "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
  "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
  # FIX: this entry was truncated — the closing parenthesis after
  # "Trident/5.0" was missing (compare the identical entry above).
  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
)
n <- length(UserAgent)


# Build the request headers with a randomly chosen User-Agent from the pool.
# FIX: sample(n, 1) draws uniformly from all n agents; the original
# sample(n - 1, 1) could never select the last entry of UserAgent.
myheader <- c(
  "User-Agent" = UserAgent[sample(n, 1)],
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)


# Re-scrape the first 3 listing pages using the randomized header and
# append all deal titles to dazhongdianping.txt.
page <- 1:3
# paste0() vectorizes over `page`, replacing grow-by-indexed-assignment.
urllist <- paste0(
  "http://t.dianping.com/list/nanjing-category_1?pageIndex=",
  page
)
for (url in urllist) {
  temp <- getURL(url, encoding = "UTF-8", httpheader = myheader)
  k <- htmlParse(temp)
  # vapply() instead of sapply() guarantees a character result.
  review <- vapply(
    getNodeSet(k, '//a [@class=\"tg-floor-title\"]'),
    xmlValue,
    character(1)
  )
  # (removed dead statement `review[page]` — its value was discarded.)
  cat(url, "\n")  # progress indicator
  write.table(review, "dazhongdianping.txt",
              quote = FALSE, row.names = TRUE,
              col.names = FALSE, append = TRUE)
  # Sys.sleep(5)  # uncomment to throttle requests between pages
}

0 0