Generating a sitemap-format file with Python


Source: https://blog.lqsos.com/archives/32.html

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# author=He
import xml.dom.minidom
import datetime
from urllib import request
from bs4 import BeautifulSoup

# Base URL to crawl
URL = 'https://blog.lqsos.com'

# All collected URLs
URL_LIST = {}

# Request headers that mimic a browser
HEADER = {
    'Cookie': 'AD_RS_COOKIE=20080917',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36'
}


def get_http(url, headers=None, charset='utf8'):
    """
    Send a request and return the decoded response body, or '' on failure.
    :param url:
    :param headers:
    :param charset:
    :return:
    """
    if headers is None:
        headers = {}
    try:
        return request.urlopen(request.Request(url=url, headers=headers)).read().decode(charset)
    except Exception:
        pass
    return ''


def open_url(url):
    """
    Open a page and return all same-site links found on it.
    :param url:
    :return:
    """
    soup = BeautifulSoup(get_http(url=url, headers=HEADER), 'html.parser')
    all_a = soup.find_all('a')
    url_list = {}
    for a_i in all_a:
        href = a_i.get('href')
        if foreign_chain(href) is True:
            url_list[href] = href
            URL_LIST[href] = href
    return url_list


def foreign_chain(url):
    """
    Check whether the link belongs to this site (i.e. starts with URL).
    :param url:
    :return:
    """
    return url is not None and url.find(URL) == 0


# Home page
home_all_url = open_url(URL)

# Walk every link found on the home page
if isinstance(home_all_url, dict):
    for home_url in home_all_url:
        # Only follow links on this domain
        if foreign_chain(home_url) is True:
            open_url(home_url)

# Crawl one more level from every URL collected so far
URL_LIST_COPY = URL_LIST.copy()
for copy_i in URL_LIST_COPY:
    open_url(copy_i)

# Build the sitemap XML document
doc = xml.dom.minidom.Document()
root = doc.createElement('urlset')
# Set the root element's attributes
root.setAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
root.setAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
root.setAttribute('xsi:schemaLocation', 'http://www.sitemaps.org/schemas/sitemap/0.9 '
                                        'http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd')
doc.appendChild(root)

# One <url> entry per collected link
for url_list_i in URL_LIST:
    nodeUrl = doc.createElement('url')
    nodeLoc = doc.createElement('loc')
    nodeLoc.appendChild(doc.createTextNode(str(url_list_i)))
    nodeLastmod = doc.createElement('lastmod')
    nodeLastmod.appendChild(doc.createTextNode(str(datetime.datetime.now().date())))
    nodePriority = doc.createElement('priority')
    nodePriority.appendChild(doc.createTextNode('1.0'))
    nodeUrl.appendChild(nodeLoc)
    nodeUrl.appendChild(nodeLastmod)
    nodeUrl.appendChild(nodePriority)
    root.appendChild(nodeUrl)

# Write the result to sitemap.xml in the current directory
with open('sitemap.xml', 'w', encoding='utf-8') as fp:
    doc.writexml(fp, indent='\t', addindent='\t', newl='\n', encoding='utf-8')
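Running the script writes sitemap.xml to the current directory. As an illustration only (the actual <loc> entries depend on the links the crawler finds, and <lastmod> is simply the date the script ran), the output looks roughly like this:

<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
	<url>
		<loc>https://blog.lqsos.com/archives/32.html</loc>
		<lastmod>2024-04-29</lastmod>
		<priority>1.0</priority>
	</url>
</urlset>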

Use Linux's crontab to run the script on a schedule so the sitemap file stays up to date; a sketch of such an entry follows.
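A minimal sketch, assuming the script is saved as sitemap.py under a hypothetical /path/to/site directory and python3 is at /usr/bin/python3 (adjust both to your setup). Added via crontab -e, this regenerates sitemap.xml every day at 03:00:

# Regenerate sitemap.xml daily at 03:00 (paths are placeholders)
0 3 * * * cd /path/to/site && /usr/bin/python3 sitemap.py

The cd matters because the script writes sitemap.xml relative to the working directory, which for cron jobs is usually the user's home directory.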
