之前用 R 写的爬虫与数据处理代码整理

来源:互联网 发布:流量测试软件 编辑:程序博客网 时间:2024/06/05 23:40

数据简单爬取:

library(RCurl)

# Walkthrough: download one page and pull the `title=...` fragments out of it.

# Address of the page to download. This example follows the URL pattern used
# by the full air-quality script further down (city, date range, page number).
url <- "http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp?city=广州市&startdate=2016-01-01&enddate=2016-11-30&page=1"

# Request headers that make the request look like it comes from a desktop
# browser.
myHttpheader <- c(
  "User-Agent"="Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0",
  "Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language"="zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
  "Connection"="keep-alive",
  "Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7"
)

# Collector for curl debug output; its `update` callback is passed to getURL().
d <- debugGatherer()

# Fetch the page source as one string.
tmp <- getURL(url = url, httpheader = myHttpheader, debugfunction = d$update, verbose = TRUE)

# Break the source into pieces at a delimiter. The result has to be assigned,
# otherwise the following steps have nothing to work on. " coords=" is the
# delimiter the full script uses; an empty delimiter would split the page into
# single characters and the pattern below could never match.
pol <- strsplit(tmp, split = " coords=")

# strsplit() returns a list; flatten it into one character vector.
pol <- unlist(pol)

# Drop the first piece (the text before the first delimiter).
pol1 <- pol[-1]

# Locate the first `title=...<newline><` fragment in every piece (regular
# expression) and extract the matched text. Pieces without a match are dropped
# by regmatches().
where <- regexpr("title=(.*?)\n<", pol1)
pol3 <- regmatches(pol1, where)

# Remove the literal "title=" prefix from the extracted fragments. Unlike
# strsplit(), gsub() deletes the matched text instead of splitting on it.
pol4 <- gsub("title=", "", pol3)

# Escaping reminder for patterns written as R strings: a newline is written
# "\\n" and a double quote is written "\\\"" (both forms are used in the full
# script).

# Take the first element of every entry. Each entry of pol4 is a single string
# at this point, so this returns that string; after a strsplit() on pol4 it
# would return the first field of each record.
lapply(X = pol4, function(file) file[1])

#返回列表第一个字符



进度条:

library(tcltk)

# Walkthrough: a Tk progress bar that is advanced once per loop iteration.
plot.new()
pb <- tkProgressBar("进度", "已完成 %", 0, 100)

# Text shown in front of the percentage (the full script uses the city name).
city <- "广州市"
# Number of steps; the percentage is computed against this same value so the
# bar reaches 100 on the last iteration.
n_steps <- 12

for (i in seq_len(n_steps)) {
  # The progress update must live inside the loop body to be refreshed on
  # every step.
  info <- sprintf("%s已完成 %d%%", city, round(i * 100 / n_steps))
  setTkProgressBar(pb, i * 100 / n_steps, sprintf("进度 (%s)", info), info)
}

# Close the progress window once the work is done.
close(pb)


全代码:空气质量爬取
library(tcltk)
library(stringr)
library(RCurl)
library(ggplot2) 
# Air-quality scraper: for every city in `cityweath`, download the result
# pages for 2016-01-01..2016-11-30, extract two fields from each record and
# write everything to "all.csv" with two columns per city.

plot.new()
pb <- tkProgressBar("进度", "已完成 %", 0, 100)

# Request headers that make the request look like it comes from a desktop
# browser.
myHttpheader <- c(
  "User-Agent"="Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0",
  "Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language"="zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
  "Connection"="keep-alive",
  "Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7"
)
cityweath <- list("广州市","深圳市","珠海市","汕头市","佛山市","韶关市","中山市","江门市","湛江市","肇庆市")

# Collector for curl debug output; its `update` callback is passed to getURL().
d <- debugGatherer()

# Number of result pages requested per city (also the progress denominator).
pages_per_city <- 12

# Extract the records contained in one downloaded page.
#
# @param html Page source as a single string.
# @return A list with two character vectors of equal length: `thetime`
#   (first space-separated field of each record) and `quanty` (third field;
#   NA when a record has fewer than three fields).
parse_air_page <- function(html) {
  # The page is split twice, once per delimiter, and the pieces of both splits
  # are concatenated; the very first piece is discarded.
  pieces <- unlist(c(
    strsplit(html, split = " coords="),
    strsplit(html, split = " </map></td>")
  ))
  pieces <- pieces[-1]

  # First `title=...<newline><` fragment of every piece; the first match is
  # discarded as well.
  records <- regmatches(pieces, regexpr("title=(.*?)\n<", pieces))
  records <- records[-1]

  # Strip the prefix, double quotes, newlines and the trailing "><".
  records <- gsub("title=", "", records)
  records <- gsub("\\\"", "", records)
  records <- gsub("\\n", "", records)
  records <- gsub("><", "", records)

  fields <- strsplit(records, split = " ")

  # Move the last record to the front. Written with seq_len() so that a page
  # with a single record is not duplicated and an empty page does not fail.
  n <- length(fields)
  if (n > 1) {
    fields <- c(fields[n], fields[seq_len(n - 1)])
  }

  list(
    thetime = vapply(fields, function(f) f[1], character(1)),
    quanty = vapply(fields, function(f) f[3], character(1))
  )
}

city_frames <- vector("list", length(cityweath))

# tryCatch(..., finally =) guarantees the progress window is closed even when
# a request or the parsing fails half-way through.
tryCatch({
  for (j in seq_along(cityweath)) {
    # `[[` yields the city name itself rather than a one-element list.
    city <- cityweath[[j]]
    thetime <- character(0)
    quanty <- character(0)

    for (i in seq_len(pages_per_city)) {
      url <- "http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp?city="
      url <- str_c(url, city, "&startdate=2016-01-01&enddate=2016-11-30&page=", i)
      tmp <- getURL(url = url, httpheader = myHttpheader, debugfunction = d$update, verbose = TRUE)

      page <- parse_air_page(tmp)
      thetime <- c(thetime, page$thetime)
      quanty <- c(quanty, page$quanty)

      info <- sprintf("%s已完成 %d%%", city, round(i * 100 / pages_per_city))
      setTkProgressBar(pb, i * 100 / pages_per_city, sprintf("进度 (%s)", info), info)
    }

    # data.frame() has no `col.names` argument, so the city label is applied
    # through the column names instead; this also keeps the names unique after
    # the per-city frames are bound together.
    city_frame <- data.frame(thetime, quanty, stringsAsFactors = FALSE)
    names(city_frame) <- str_c(city, "_", c("thetime", "quanty"))
    city_frames[[j]] <- city_frame
  }
}, finally = close(pb))  # close the progress bar

# Binding side by side requires the same number of records for every city;
# fail with an explicit message instead of cbind's generic one.
row_counts <- vapply(city_frames, nrow, integer(1))
if (length(unique(row_counts)) > 1) {
  stop(
    "Cities returned different numbers of records: ",
    paste(row_counts, collapse = ", "),
    call. = FALSE
  )
}

all <- do.call(cbind, city_frames)
write.csv(all, "all.csv")



0 0