博客迁址

来源:互联网 发布:unity3d角色动画模型 编辑:程序博客网 时间:2024/05/01 18:11

新博客地址 http://4ct10n.cn

近期会将新的文章发表到新博客上,如果有什么问题还请大家指正

QQ:1792034533
Email:act01n@163.com

PS:附上自己写的、将 CSDN 博客导出为 Markdown(md)格式的代码

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Date    : 2017-10-21 23:19:58
# @Author  : 4ct10n (act01n@163.com)
#
# Export CSDN blog articles to local Markdown files.
# Ported to Python 3: print() function, input() instead of raw_input(),
# and the reload(sys)/setdefaultencoding hack dropped (str is Unicode).

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://blog.csdn.net'


def Get_all_page(url):
    """Collect the absolute URL of every article on a CSDN blog.

    Parameters:
        url: the blog's index page, e.g. 'http://blog.csdn.net/<user>/'.
    Returns:
        list of absolute article URLs.
    """
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf-8')
    # The pager (id='papelist') holds links ending in the page number;
    # the last link points at the final page, so its trailing number is
    # the total page count.
    pager_links = soup.find_all(id='papelist')[0].find_all('a')
    last_href = pager_links[-1]['href']
    num = int(last_href.rsplit('/', 1)[-1])
    base = last_href[:-len(str(num))]
    pages = [base + str(i) for i in range(1, num + 1)]

    lists = []
    for ps in pages:
        res = requests.get(BASE_URL + ps)
        soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf-8')
        article_list = soup.find_all('div', attrs={'id': 'article_list'})
        # Each article entry contributes three <a> tags; every third one
        # is the article link itself.
        ls = [a['href'] for a in article_list[0].find_all('a')][::3]
        lists += ls
    return [BASE_URL + href for href in lists]


def get_content(url, path):
    """Download one CSDN article and save it as '<title>.md' under *path*.

    The output starts with Hexo/Jekyll-style front matter (title, tags,
    date), then the description, a '<!-- more -->' marker, a CSDN CSS
    link, and the article's rendered HTML body.
    """
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf-8')
    source = '<link rel="stylesheet" type="text/css" href="http://static.blog.csdn.net/css/csdn_blog_detail.min.css">\n'
    de = soup.find_all(attrs={'name': "description"})
    tit = soup.find_all(attrs={'class': "link_title"})
    tim = soup.find_all(attrs={'class': "link_postdate"})
    cate = soup.find_all('div', attrs={'class': "category_r"})
    con = soup.find_all('div', attrs={'class': 'markdown_views'})

    title = tit[0].get_text().strip()
    description = de[0].attrs['content'].strip(' ')
    time = tim[0].string
    # Category text looks like 'name（count）'; keep only the name by
    # splitting on the full-width parenthesis U+FF08.
    category = cate[0].find_all('span')[0].get_text().split(u'\uff08')[0]

    string = '---\n'
    string += 'title: ' + title + '\n'
    string += 'tags: [' + category + ']' + '\n'
    string += 'date: ' + time + '\n'
    string += '---\n'
    string += description + '\n'
    string += '<!-- more -->' + '\n'
    string += source + str(con[0])

    # A '/' in the title would be taken as a directory separator and make
    # open() fail; replace it in the filename only (front matter keeps the
    # original title).
    safe_title = title.replace('/', '_')
    # The original leaked the file handle; 'with' guarantees it is closed,
    # and an explicit utf-8 encoding avoids locale-dependent writes.
    with open(path + '/' + safe_title + '.md', 'w', encoding='utf-8') as f:
        f.write(string)
    print('export :', title)


if __name__ == '__main__':
    url = input('url:')            # e.g. 'http://blog.csdn.net/<user>/'
    path = input('store_path:')    # e.g. '/tmp/blog/'
    ALL = input('export ALL ?yes/no:')
    if ALL == 'yes':
        for link in Get_all_page(url):
            get_content(link, path)
    elif ALL == 'no':
        get_content(url, path)
原创粉丝点击