Counting the articles on a personal CSDN blog




Version 1

The original version is fairly simple.

It only counts the first list page and does not sort the results.

# coding:utf-8
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():  # pretend to be a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    totalList = []
    contentList = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
    fd = open('counter.txt', 'w')
    page = BeautifulSoup(html, 'lxml')
    mytimes = page.find(id='blog_rank')  # the "blog rank" block (visits, points)
    i = 1
    for aa in mytimes.find_all('li'):
        if i < 3:  # only keep the first two entries
            print aa.text
            fd.write(aa.text)
            fd.write('\n')
            totalList.append(aa.text)
        i += 1
    items = page.find_all('div', class_='list_item list_view')  # one entry per article
    print '总共有文章%d 篇' % len(items)
    for item in items:
        content = item.find('a')  # article title
        read_time = item.find('span', class_='link_view')  # view count
        comments_time = item.find('span', class_='link_comments')  # comment count
        totalcontent = content.text.strip() + read_time.text.strip() + comments_time.text.strip()
        print totalcontent
        contentList.append(totalcontent)
        fd.write(totalcontent)
        fd.write('\n')
    fd.close()
    return totalList, contentList

urls = getPage()

Version 2

Here is another version.

This one sorts the articles directly by view count.

The code was updated again on 2017-04-11; this update:

creates a folder under the script's directory and writes the statistics into a text file named after the current time,

so that each run no longer overwrites the data from the previous run.
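The core of that change is just a timestamped output path; a minimal sketch of the idea (using the same count folder as the full code below):

import os, datetime

# Write each run's statistics into ./count/<timestamp>.txt so a new run
# never overwrites the results of the previous one.
out_dir = 'count'
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
fname = out_dir + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
fd = open(fname, 'w')
fd.write('statistics for this run\n')
fd.close()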

The code for version 2:

# coding:utf-8
import urllib2, re, datetime, os
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():  # pretend to be a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    totalList = []
    contentList = []
    sortlist = []
    sortlist1 = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
    path = 'count'
    if not os.path.exists(path):  # put every run's output into ./count
        os.makedirs(path)
    fname = path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'  # one file per run
    fd = open(fname, 'w')
    page = BeautifulSoup(html, 'lxml')
    mytimes = page.find(id='blog_rank')
    i = 1
    for aa in mytimes.find_all('li'):
        if i < 3:
            print aa.text
            fd.write(aa.text)
            fd.write('\n')
            totalList.append(aa.text)
        i += 1
    items = page.find_all('div', class_='list_item list_view')
    print '总共有文章%d 篇' % len(items)
    fd.write('总共有文章%d 篇' % len(items))
    fd.write('\n')
    for item in items:
        aa = {}
        content = item.find('a')
        contemtUrl = baseurl + content.get('href')
        read_time = item.find('span', class_='link_view')
        tmp = str(read_time.text.strip())
        number = int(filter(str.isdigit, tmp))  # pull the numeric view count out of the text
        sortlist1.append(number)
        comments_time = item.find('span', class_='link_comments')
        aa['indexs'] = number
        aa['content'] = content.text.strip()
        aa['read_time'] = tmp
        aa['comments_time'] = comments_time.text.strip()
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    sortlist1.sort()  # ascending view counts, used to emit articles in sorted order
    print sortlist1
    for i in sortlist1:
        for a in sortlist:
            if int(i) == int(a['indexs']):
                totalcontent = a['content'] + '\t' + a['read_time'] + '\t' + a['comments_time'] + '\t' + a['contemtUrl']
                print totalcontent
                fd.write(totalcontent)
                fd.write('\n')
                contentList.append(totalcontent)
    fd.close()
    return contentList

urls = getPage()

Version 3

This version is a bit more interesting: after collecting the statistics, it repeatedly opens article links in new browser tabs with the webbrowser module and then closes Chrome, over a random number of rounds.

# coding:utf-8
import urllib2, re, time, random, os, datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():  # pretend to be a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    contentList = []
    sortlist = []
    sortlist1 = []
    urlList = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
    path = u'count'
    if not os.path.exists(path):
        os.makedirs(path)
    fname = path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
    print fname
    fd = open(fname, 'w')
    page = BeautifulSoup(html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')
    print u'总共有文章%d 篇' % len(items)
    fd.write('总共有文章%d 篇' % len(items))
    fd.write('\n')
    for item in items:
        aa = {}
        content = item.find('a')
        contemtUrl = baseurl + content.get('href')
        # print contemtUrl
        read_time = item.find('span', class_='link_view')
        readtime = str(read_time.text.strip())
        # print readtime
        readtimeNumber = int(filter(str.isdigit, readtime))
        # print readtimeNumber
        sortlist1.append(readtimeNumber)
        # time.sleep(2)
        aa['indexs'] = readtimeNumber
        aa['content'] = content.text.strip()
        aa['read_time'] = readtime
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    sortlist1.sort()
    print sortlist1
    for i in sortlist1:
        for a in sortlist:
            if int(i) == int(a['indexs']):
                totalcontent = a['content'] + '\t' + a['read_time'] + '\t' + a['contemtUrl']
                print totalcontent
                fd.write(totalcontent)
                fd.write('\n')
                urlList.append(a['contemtUrl'])
                contentList.append(totalcontent)
    fd.close()
    return urlList

urls = getPage()
count = random.randint(10, 50)
print u'将要打开关闭浏览器次数为:', count
for i in range(5):
    print urls[i]
j = 0
while j < count:  # repeatedly open article URLs in browser tabs, then kill Chrome
    if j == 15:
        j = 0
    for i in range(5):
        web.open_new_tab(urls[i + 38])  # assumes the blog has enough articles for these indexes
        time.sleep(1)
        web.open_new_tab(urls[random.randint(1, 44)])
        time.sleep(1)
    web.open_new_tab('http://blog.csdn.net/qiqiyingse/article/details/51801918')
    time.sleep(3)
    os.system('taskkill /f /IM Chrome.exe')  # Windows-only: close Chrome before the next round
    j = j + 1

Version 4

This update handles the case where the blog has more than 50 articles and the list is displayed over two pages; it can, however, only count two pages.

Hence the code was updated again.
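The essence of the change is to request both list pages and merge their article entries before sorting; a rough sketch of just that part (same URLs and CSS classes as the full code below):

import urllib2
from bs4 import BeautifulSoup

# Fetch list page 1 and list page 2 and collect the article entries from both.
headers = {'User-Agent': 'Mozilla/5.0'}
items = []
for page_no in (1, 2):
    url = 'http://blog.csdn.net/qiqiyingse/article/list/%d?viewmode=contents' % page_no
    html = urllib2.urlopen(urllib2.Request(url=url, headers=headers)).read()
    items += BeautifulSoup(html, 'lxml').find_all('div', class_='list_item list_view')
print '总共有文章%d 篇' % len(items)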

# coding:utf-8
import urllib2, re, time, random, os, datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def getPage():  # pretend to be a browser and fetch both list pages
    url1 = 'http://blog.csdn.net/qiqiyingse/article/list/1?viewmode=contents'
    url2 = 'http://blog.csdn.net/qiqiyingse/article/list/2?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    contentList = []
    sortlist = []
    sortlist1 = []
    urlList = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req1 = urllib2.Request(url=url1, headers=headers)
    req2 = urllib2.Request(url=url2, headers=headers)
    try:
        html1 = urllib2.urlopen(req1).read()
        html2 = urllib2.urlopen(req2).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
    path = u'count'
    if not os.path.exists(path):
        os.makedirs(path)
    fname = path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
    print fname
    fd = open(fname, 'w')
    page1 = BeautifulSoup(html1, 'lxml')
    page2 = BeautifulSoup(html2, 'lxml')
    items1 = page1.find_all('div', class_='list_item list_view')
    items2 = page2.find_all('div', class_='list_item list_view')
    cont_print = u'总共有文章%d 篇' % (len(items1) + len(items2))
    print cont_print
    fd.write(cont_print)
    fd.write('\n')
    for item in items1:  # articles from list page 1
        aa = {}
        content = item.find('a')
        contemtUrl = baseurl + content.get('href')
        # print contemtUrl
        read_time = item.find('span', class_='link_view')
        readtime = str(read_time.text.strip())
        readtimeNumber = int(filter(str.isdigit, readtime))
        sortlist1.append(readtimeNumber)
        aa['indexs'] = readtimeNumber
        aa['content'] = content.text.strip()
        aa['read_time'] = readtime
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    for item in items2:  # articles from list page 2, handled the same way
        aa = {}
        content = item.find('a')
        contemtUrl = baseurl + content.get('href')
        # print contemtUrl
        read_time = item.find('span', class_='link_view')
        readtime = str(read_time.text.strip())
        readtimeNumber = int(filter(str.isdigit, readtime))
        sortlist1.append(readtimeNumber)
        aa['indexs'] = readtimeNumber
        aa['content'] = content.text.strip()
        aa['read_time'] = readtime
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    sortlist1.sort()
    print sortlist1
    for i in sortlist1:
        for a in sortlist:
            if int(i) == int(a['indexs']):
                totalcontent = a['content'] + '\t' + a['read_time'] + '\t' + a['contemtUrl']
                print totalcontent
                fd.write(totalcontent)
                fd.write('\n')
                urlList.append(a['contemtUrl'])
                contentList.append(totalcontent)
    fd.close()
    return urlList

urls = getPage()

Version 5

This version restructures the whole program:

1. Each part is easier to read.

2. It can now count every blog post under an account, no matter how many articles or pages there are.

3. The sorting algorithm was updated, fixing a bug in the earlier versions (see the sketch below).
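For context, the earlier versions sorted a separate list of read counts and then matched every count back against the article dicts, which (as the commented-out block in the code notes) has flaws, e.g. duplicate read counts produce repeated lines. Sorting the list of dicts directly avoids that; a minimal sketch with made-up data:

# Sort the per-article dicts directly by the numeric read count stored under 'indexs'.
articles = [
    {'title': 'post A', 'indexs': 120},
    {'title': 'post B', 'indexs': 35},
    {'title': 'post C', 'indexs': 35},
]
articles = sorted(articles, key=lambda a: a['indexs'])  # ascending by view count

The full code below uses the equivalent Python 2 form with cmp=lambda x, y: cmp(x['indexs'], y['indexs']).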


The code is as follows:

# coding:utf-8
import urllib2, re, time, random, os, datetime
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# logging helper: prefix every message with a timestamp
def self_log(msg):
    print u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg)

# fetch a page, pretending to be a browser
def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
    return html

# get the total number of blog list pages
def get_last_page(html, fd):
    if not html:
        self_log(u'页面错误,停止运行')
        return
    page = BeautifulSoup(html, 'lxml')
    if page.find('div', class_='pagelist').find_all('a'):
        last_page = page.find('div', class_='pagelist').find_all('a')
        last_page = last_page[len(last_page) - 1].get('href')[-1:]  # last character of the last paging link
        self_log('总共有%s 页博客' % last_page)
        fd.write('总共有%s 页博客\n' % last_page)
        return last_page
    else:
        return 1

# get the points/rank block
def get_rank(html, fd):
    if not html:
        self_log(u'页面错误,停止运行')
        return
    page = BeautifulSoup(html, 'lxml')
    rank_list = []
    if page.find(id='blog_rank'):
        rank_content = page.find(id='blog_rank')
        i = 1
        for rank in rank_content.find_all('li'):
            if i < 3:
                self_log(rank.text)
                fd.write(rank.text)
                fd.write('\n')
                rank_list.append(rank.text)
            i += 1
        return rank_list

# get the article items of one list page
def get_items(url):
    content_html = get_html(url)
    page = BeautifulSoup(content_html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')
    return items

# extract the fields we need from every item in the list
def handle_items(items, content_list, read_num_for_sort):
    for item in items:
        temp = {}  # temporary dict for this article
        title = item.find('a')  # title
        content_url = 'http://blog.csdn.net' + title.get('href')  # URL of the article
        read_times = item.find('span', class_='link_view').text.strip()  # view count
        comments_time = item.find('span', class_='link_comments')  # comment count
        read_number = int(filter(str.isdigit, str(read_times)))  # numeric view count, used later for sorting
        read_num_for_sort.append(read_number)
        # pack the data
        temp['indexs'] = read_number
        temp['title'] = title.text.strip()
        temp['read_times'] = read_times
        temp['comments_time'] = comments_time.text.strip()
        temp['content_url'] = content_url
        content_list.append(temp)

# create the output folder
def mkdir_folder(path):
    if not os.path.exists(path):
        os.makedirs(path)

# main driver
def run(url):
    read_num_for_sort = []
    content_list = []
    content_totle_list = []
    # define and create the output folder
    dir_path = 'count'
    mkdir_folder(dir_path)
    # output file named after the current time
    count_file_name = dir_path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
    fd = open(count_file_name, 'w')
    # 1. start from the main page and get the number of list pages
    main_html = get_html(url)
    last_page = get_last_page(main_html, fd)
    # 2. get the points/rank info
    rank_list = get_rank(main_html, fd)
    # 3. build the URL of every list page, load it, and extract what we need
    for i in range(1, int(last_page) + 1):
        main_url = url.split('?')[0] + '/article/list/%d?viewmode=contents' % i
        self_log('即将获取第%d页的内容,地址是:%s' % (i, main_url))
        items = get_items(main_url)  # fetch this list page and get its article items
        handle_items(items, content_list, read_num_for_sort)  # process the item list
    # 4. sort by view count
    read_num_for_sort.sort()
    print read_num_for_sort
    '''
    This was another way to sort, but it has some flaws:
    for i in read_num_for_sort:
        for a in content_list:
            if int(i) == int(a['indexs']):
                totalcontent=a['content']+'\t|'+a['read_time']+'\t|'+a['comments_time']+'\t|'+a['contemtUrl']
    '''
    self_log('总共有%d 篇文章' % len(content_list))  # report the total article count
    content_list = sorted(content_list, cmp=lambda x, y: cmp(x['indexs'], y['indexs']), reverse=0)  # sort by the 'indexs' (view count) key
    article_index = 1
    for a in content_list:
        # assemble the output line
        totalcontent = '第' + str(article_index) + '篇  |' + a['title'] + '\t|' + a['read_times'] + '\t|' + a['comments_time'] + '\t|' + a['content_url']
        self_log(totalcontent)
        # save it locally
        fd.write(totalcontent)
        fd.write('\n')
        article_index += 1
        content_totle_list.append(totalcontent)
    fd.close()
    return content_totle_list

if __name__ == '__main__':
    print '''
             *****************************************
             **    Welcome to Spider of Count CSDN  **
             **      Created on 2017-04-12          **
             **      @author: Jimy_Fengqi           **
             *****************************************'''
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    run(url)

Version 6

This update adds a few small improvements:
mainly, it can now save the text of each article to a local file,
and it also writes the statistics into an Excel file.
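The Excel export itself boils down to a few pyExcelerator calls; a trimmed sketch of the pattern used below (header labels and file name are the ones from this post, the data cell is made up):

from pyExcelerator import *  # provides Workbook, as in the full code below
import time

# One sheet per run: write a header row, then one row per article record.
wb = Workbook()
sheet = wb.add_sheet(time.strftime('%Y-%m-%d_%H-%M-%S'))
header = [u'编号', u'标题', u'阅读次数', u'评论次数', u'文章地址']
for col, text in enumerate(header):
    sheet.write(0, col, text)
sheet.write(1, 0, u'第1篇')  # example data cell
wb.save('count/count' + time.strftime('%Y-%m-%d') + '.xls')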
The code is as follows:

# coding:utf-8
import urllib2, re, time, random, os, datetime
from bs4 import BeautifulSoup
from pyExcelerator import *  # excel writing package
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# logging helper: prefix every message with a timestamp
def self_log(msg):
    print u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg)

# fetch a page, pretending to be a browser
def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
    return html

# get the total number of blog list pages
def get_last_page(html, fd):
    if not html:
        self_log(u'页面错误,停止运行')
        return
    page = BeautifulSoup(html, 'lxml')
    if page.find('div', class_='pagelist').find_all('a'):
        last_page = page.find('div', class_='pagelist').find_all('a')
        last_page = last_page[len(last_page) - 1].get('href')[-1:]
        self_log('总共有%s 页博客' % last_page)
        fd.write('总共有%s 页博客\n' % last_page)
        return last_page
    else:
        return 1

# get the points/rank block
def get_rank(html, fd):
    if not html:
        self_log(u'页面错误,停止运行')
        return
    page = BeautifulSoup(html, 'lxml')
    rank_list = []
    if page.find(id='blog_rank'):
        rank_content = page.find(id='blog_rank')
        i = 1
        for rank in rank_content.find_all('li'):
            if i < 3:
                self_log(rank.text)
                fd.write(rank.text)
                fd.write('\n')
                rank_list.append(rank.text)
            i += 1
        return rank_list

# get the article items of one list page
def get_items(url):
    content_html = get_html(url)
    page = BeautifulSoup(content_html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')
    return items

# extract the fields we need from every item in the list
def handle_items(items, content_list, read_num_for_sort):
    for item in items:
        temp = {}  # temporary dict for this article
        title = item.find('a')  # title
        content_url = 'http://blog.csdn.net' + title.get('href')  # URL of the article
        read_times = item.find('span', class_='link_view').text.strip()  # view count
        comments_time = item.find('span', class_='link_comments')  # comment count
        read_number = int(filter(str.isdigit, str(read_times)))  # numeric view count, used later for sorting
        read_num_for_sort.append(read_number)
        # pack the data
        temp['indexs'] = read_number
        temp['title'] = title.text.strip()
        temp['read_times'] = read_times
        temp['comments_time'] = comments_time.text.strip()
        temp['content_url'] = content_url
        content_list.append(temp)

# create the output folder
def mkdir_folder(path):
    if not os.path.exists(path):
        os.makedirs(path)

# parse one article page and save its body text to a local file
def getContent(html):
    page = BeautifulSoup(html, 'lxml')
    try:
        title = page.find('div', class_='article_title').find('a').text
        title = title.strip()
    except Exception, e:
        print e
    try:
        content = page.find('div', class_='article_content')
        dir_path = 'count'
        artitle_name_path = dir_path + '/' + title + '.txt'
        with open(artitle_name_path, 'w') as f:
            f.write(content.text)
        self_log(u'存贮文章:%s 完毕' % title)
    except Exception, e:
        print e

# save every article to a local file
def run_to_get_article(content_total_list):
    self_log('start save every article  ')
    for article_content in content_total_list:
        article_url = article_content.split('|')[4]  # the URL is the last field of the assembled line
        self_log('将要存贮的地址是: %s ...' % article_url)
        artitle_html = get_html(article_url)
        getContent(artitle_html)

# save the statistics into an excel file
def run_to_save_info_in_excel(content_total_list):
    self_log('start save info into excel')
    excel_w = Workbook()
    excel_sheet_name = time.strftime('%Y-%m-%d_%H-%M-%S')
    excel_content_handler = excel_w.add_sheet(excel_sheet_name)
    first_line = [u'编号', u'标题', u'阅读次数', u'评论次数', u'文章地址']
    cols = 0
    for content in first_line:  # header row
        excel_content_handler.write(0, cols, content)
        cols += 1
    index = 1
    for article_content in content_total_list:  # one row per article
        cols = 0
        for a in article_content.split('|'):
            excel_content_handler.write(index, cols, a)
            cols += 1
        index += 1
    excel_w.save('count/' + 'count' + time.strftime('%Y-%m-%d') + '.xls')

# main driver
def run(url):
    read_num_for_sort = []
    content_list = []
    content_total_list = []
    # define and create the output folder
    dir_path = 'count'
    mkdir_folder(dir_path)
    # output file named after the current time
    count_file_name = dir_path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
    fd = open(count_file_name, 'w')
    # 1. start from the main page and get the number of list pages
    main_html = get_html(url)
    last_page = get_last_page(main_html, fd)
    # 2. get the points/rank info
    rank_list = get_rank(main_html, fd)
    # 3. build the URL of every list page, load it, and extract what we need
    for i in range(1, int(last_page) + 1):
        main_url = url.split('?')[0] + '/article/list/%d?viewmode=contents' % i
        self_log('即将获取第%d页的内容,地址是:%s' % (i, main_url))
        items = get_items(main_url)  # fetch this list page and get its article items
        handle_items(items, content_list, read_num_for_sort)  # process the item list
    # 4. sort by view count
    read_num_for_sort.sort()
    print read_num_for_sort
    '''
    This was another way to sort, but it has some flaws:
    for i in read_num_for_sort:
        for a in content_list:
            if int(i) == int(a['indexs']):
                totalcontent=a['content']+'\t|'+a['read_time']+'\t|'+a['comments_time']+'\t|'+a['contemtUrl']
    '''
    self_log('总共有%d 篇文章' % len(content_list))  # report the total article count
    # sort by the 'indexs' (view count) key -- a handy way to sort a list of dicts
    content_list = sorted(content_list, cmp=lambda x, y: cmp(x['indexs'], y['indexs']), reverse=0)
    article_index = 1
    for a in content_list:
        # assemble the output line
        totalcontent = '第' + str(article_index) + '篇  |' + a['title'] + '\t|' + a['read_times'] + '\t|' + a['comments_time'] + '\t|' + a['content_url']
        # self_log(totalcontent)
        # save it locally
        fd.write(totalcontent)
        fd.write('\n')
        article_index += 1
        content_total_list.append(totalcontent)
    fd.close()
    return content_total_list

if __name__ == '__main__':
    print '''
             *****************************************
             **    Welcome to Spider of Count CSDN  **
             **      Created on 2017-04-12          **
             **      @author: Jimy_Fengqi           **
             *****************************************'''
    url = raw_input(u'请输入将要统计的个人csdn主页地址,类似如下:\n http://blog.csdn.net/qiqiyingse?viewmode=contents')
    if not url:
        url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    content_total_list = run(url)
    run_to_save_info_in_excel(content_total_list)
    run_to_get_article(content_total_list)



Version 7

This update reworks how the Excel file is handled: statistics produced on the same day are appended to the existing workbook as a new sheet instead of always writing a new file, and cell styles and column widths are set explicitly.
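The gist of the new approach: if the day's .xls already exists, re-open it with xlrd (keeping formatting), take a writable copy via xlutils, and append a new sheet; otherwise create a fresh xlwt workbook. A condensed sketch of that pattern (the path is just an example; the full code below adds styling and column widths):

import os, time
import xlrd, xlwt
from xlutils.copy import copy

path = 'count/' + time.strftime('%Y-%m-%d') + '.xls'
if os.path.exists(path):
    rb = xlrd.open_workbook(path, formatting_info=True)  # keep the existing formatting
    wb = copy(rb)                                        # writable copy of the workbook
else:
    wb = xlwt.Workbook()
ws = wb.add_sheet(time.strftime('%Y-%m-%d_%H-%M-%S'), cell_overwrite_ok=True)
ws.write(0, 0, u'编号')  # header cell, as in the full code
wb.save(path)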

# coding:utf-8
import urllib2, re, time, random, os, datetime
from bs4 import BeautifulSoup
from pyExcelerator import *  # excel writing package
import xlrd
import xlwt
from xlutils.copy import copy
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def create_excel(data):
    excle_file_name = str(time.strftime('%Y-%m-%d') + '.xls')  # one excel file per day

    # check whether the file already exists under ./count
    def file_is_exist(file_name):
        path = os.path.join(os.getcwd() + '/count/' + file_name)
        print 'current file [%s] path is [%s]' % (file_name, path)
        is_exists = os.path.exists(path)
        return is_exists

    # re-open the existing workbook, copy it, and append one more sheet
    def read_and_copy_excle(excle_file_name):
        read_excel_flag = xlrd.open_workbook('count/' + excle_file_name, formatting_info=True)  # keep the original formatting
        count = len(read_excel_flag.sheets())  # number of sheets
        for r in read_excel_flag.sheets():
            print r.name  # sheet names
        worksheet_copy = copy(read_excel_flag)  # writable copy of the workbook
        write_excel(worksheet_copy, excle_file_name)  # then append another sheet to it

    # write one sheet of statistics
    def write_excel(excel_flag, excle_file_name):
        sheet_name = str(time.strftime('%Y-%m-%d_%H-%M-%S'))
        sheet_flag = excel_flag.add_sheet(sheet_name, cell_overwrite_ok=True)  # create the sheet
        first_line = [u'编号', u'标题', u'阅读次数', u'评论次数', u'文章地址']
        # header row
        for i in range(0, len(first_line)):
            sheet_flag.write(0, i, first_line[i], set_style('Times New Roman', 220, True, 40))
            if i == 1:  # column widths
                sheet_flag.col(i).width = 256 * 150
            elif i == 4:
                sheet_flag.col(i).width = 256 * 80
            else:
                sheet_flag.col(i).width = 256 * 15
        row_index = 1
        for article_content in data:  # one row per article
            cols_index = 0
            for data_detail in article_content.split('|'):
                sheet_flag.write(row_index, cols_index, data_detail, set_style('Arial', 300, False, cols_index))
                # sheet_flag.col(cols_index).width=sheet_flag.col(cols_index+1).width
                cols_index += 1
            row_index += 1
        style = xlwt.easyxf('font:height 240, color-index red, bold on;align: wrap on, vert centre, horiz center')
        sheet_flag.write(row_index + 1, cols_index + 1, 'hello world', style)
        sheet_flag.write(row_index + 2, cols_index + 2, 'start', set_style(u'宋体', 300, False, 20))
        excel_flag.save('count/' + excle_file_name)  # save the file

    # build a cell style
    def set_style(name, height, bold, color_index):
        style = xlwt.XFStyle()  # initialise the style
        font = xlwt.Font()  # create a font for the style
        font.name = name  # font name
        font.bold = bold  # bold or not
        font.color_index = color_index  # font colour (doesn't seem to take effect)
        font.height = height
        borders = xlwt.Borders()  # cell borders
        borders.left = 6
        borders.right = 6
        borders.top = 6
        borders.bottom = 6
        style.font = font
        if bold:
            style.borders = borders
        return style

    # if the file exists, copy it and append a new sheet; otherwise create a new workbook
    if file_is_exist(excle_file_name):
        print 'file 【%s】 exist ' % excle_file_name
        read_and_copy_excle(excle_file_name)  # copy the workbook and append a sheet
    else:
        print 'file 【%s】is not  exist, will create it ' % excle_file_name
        excel_flag = xlwt.Workbook()  # brand new workbook
        write_excel(excel_flag, excle_file_name)

# logging helper: prefix every message with a timestamp
def self_log(msg):
    print u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg)

# fetch a page, pretending to be a browser
def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
    return html

# get the total number of blog list pages
def get_last_page(html, fd):
    if not html:
        self_log(u'页面错误,停止运行')
        return
    page = BeautifulSoup(html, 'lxml')
    if page.find('div', class_='pagelist').find_all('a'):
        last_page = page.find('div', class_='pagelist').find_all('a')
        last_page = last_page[len(last_page) - 1].get('href')[-1:]
        self_log('总共有%s 页博客' % last_page)
        fd.write('总共有%s 页博客\n' % last_page)
        return last_page
    else:
        return 1

# get the points/rank block
def get_rank(html, fd):
    if not html:
        self_log(u'页面错误,停止运行')
        return
    page = BeautifulSoup(html, 'lxml')
    rank_list = []
    if page.find(id='blog_rank'):
        rank_content = page.find(id='blog_rank')
        i = 1
        for rank in rank_content.find_all('li'):
            if i < 3:
                self_log(rank.text)
                fd.write(rank.text)
                fd.write('\n')
                rank_list.append(rank.text)
            i += 1
        return rank_list

# get the article items of one list page
def get_items(url):
    content_html = get_html(url)
    page = BeautifulSoup(content_html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')
    return items

# extract the fields we need from every item in the list
def handle_items(items, content_list, read_num_for_sort):
    for item in items:
        temp = {}  # temporary dict for this article
        title = item.find('a')  # title
        content_url = 'http://blog.csdn.net' + title.get('href')  # URL of the article
        read_times = item.find('span', class_='link_view').text.strip()  # view count
        comments_time = item.find('span', class_='link_comments')  # comment count
        read_number = int(filter(str.isdigit, str(read_times)))  # numeric view count, used later for sorting
        read_num_for_sort.append(read_number)
        # pack the data
        temp['indexs'] = read_number
        temp['title'] = title.text.strip()
        temp['read_times'] = read_times
        temp['comments_time'] = comments_time.text.strip()
        temp['content_url'] = content_url
        content_list.append(temp)

# create the output folder
def mkdir_folder(path):
    if not os.path.exists(path):
        os.makedirs(path)

# parse one article page and save its body text to a local file
def getContent(html):
    page = BeautifulSoup(html, 'lxml')
    try:
        title = page.find('div', class_='article_title').find('a').text
        title = title.strip()
    except Exception, e:
        print e
    try:
        content = page.find('div', class_='article_content')
        dir_path = 'count'
        artitle_name_path = dir_path + '/' + title + '.txt'
        with open(artitle_name_path, 'w') as f:
            f.write(content.text)
        self_log(u'存贮文章:%s 完毕' % title)
    except Exception, e:
        print e

# save every article to a local file
def run_to_get_article(content_total_list):
    self_log('start save every article  ')
    for article_content in content_total_list:
        article_url = article_content.split('|')[4]
        self_log('将要存贮的地址是: %s ...' % article_url)
        artitle_html = get_html(article_url)
        getContent(artitle_html)

# save the statistics into an excel file (the older, single-shot variant)
def run_to_save_info_in_excel(content_total_list):
    self_log('start save info into excel')
    excel_w = Workbook()
    excel_sheet_name = time.strftime('%Y-%m-%d_%H-%M-%S')
    excel_content_handler = excel_w.add_sheet(excel_sheet_name)
    first_line = [u'编号', u'标题', u'阅读次数', u'评论次数', u'文章地址']
    cols = 0
    for content in first_line:
        excel_content_handler.write(0, cols, content)
        cols += 1
    index = 1
    for article_content in content_total_list:
        cols = 0
        for a in article_content.split('|'):
            excel_content_handler.write(index, cols, a)
            cols += 1
        index += 1
    excel_w.save('count/' + 'count_' + time.strftime('%Y-%m-%d_%H-%M') + '.xls')

# main driver
def run(url):
    read_num_for_sort = []
    content_list = []
    content_total_list = []
    # define and create the output folder
    dir_path = 'count'
    mkdir_folder(dir_path)
    # output file named after the current time
    count_file_name = dir_path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
    fd = open(count_file_name, 'w')
    # 1. start from the main page and get the number of list pages
    main_html = get_html(url)
    last_page = get_last_page(main_html, fd)
    # 2. get the points/rank info
    rank_list = get_rank(main_html, fd)
    # 3. build the URL of every list page, load it, and extract what we need
    for i in range(1, int(last_page) + 1):
        main_url = url.split('?')[0] + '/article/list/%d?viewmode=contents' % i
        self_log('即将获取第%d页的内容,地址是:%s' % (i, main_url))
        items = get_items(main_url)  # fetch this list page and get its article items
        handle_items(items, content_list, read_num_for_sort)  # process the item list
    # 4. sort by view count
    read_num_for_sort.sort()
    print read_num_for_sort
    '''
    This was another way to sort, but it has some flaws:
    for i in read_num_for_sort:
        for a in content_list:
            if int(i) == int(a['indexs']):
                totalcontent=a['content']+'\t|'+a['read_time']+'\t|'+a['comments_time']+'\t|'+a['contemtUrl']
    '''
    self_log('总共有%d 篇文章' % len(content_list))  # report the total article count
    # sort by the 'indexs' (view count) key -- a handy way to sort a list of dicts
    content_list = sorted(content_list, cmp=lambda x, y: cmp(x['indexs'], y['indexs']), reverse=0)
    article_index = 1
    for a in content_list:
        # assemble the output line
        totalcontent = '第' + str(article_index) + '篇|' + a['title'] + '|' + a['read_times'] + '|' + a['comments_time'] + '|' + a['content_url']
        # self_log(totalcontent)
        print totalcontent
        # save it locally
        fd.write(totalcontent)
        fd.write('\n')
        article_index += 1
        content_total_list.append(totalcontent)
    fd.close()
    return content_total_list

if __name__ == '__main__':
    print '''
             *****************************************
             **    Welcome to Spider of Count CSDN  **
             **      Created on 2017-04-12          **
             **      @author: Jimy_Fengqi           **
             *****************************************'''
    url = raw_input(u'请输入将要统计的个人csdn主页地址,类似如下:\n http://blog.csdn.net/qiqiyingse?viewmode=contents')
    if not url:
        url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    content_total_list = run(url)
    # run_to_save_info_in_excel(content_total_list)
    create_excel(content_total_list)
    # run_to_get_article(content_total_list)




