Saving web page content to Word and importing the related content into MySQL
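The script below deletes and recreates a working directory, crawls two listing pages of a news site (yn.yunnan.cn), and extracts each article's title, date, source, images, and body text with XPath. Short text-only articles are inserted straight into a MySQL table; long articles and articles with images are written to a .docx file whose name is recorded in the table's AccessoryName column instead.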

# -*- coding: utf-8 -*-
import re
import os
import pymysql
import requests
import shutil
from lxml import etree
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

national = "ABC"  # name of the output folder

# Delete the old directory if it exists, create a fresh one, and switch into it
if os.path.exists(national):
    shutil.rmtree(national)
os.mkdir(national)
os.chdir(national)
# retval = os.getcwd()
# print("Working directory changed to %s" % retval)  # inspect the new working directory


def linked_database(name):  # create the data table
    db = pymysql.connect(host='your-IP', user='username', password='password',
                         database='database-name', charset='utf8')  # connect to the database
    cursor = db.cursor()  # get a cursor
    cursor.execute("DROP TABLE IF EXISTS %s" % name)  # drop the table named by `name` if it already exists
    sql = """CREATE TABLE %s (
        NewsID        NVARCHAR(20),
        DeclareDate   NVARCHAR(10) NOT NULL,
        ClassifyID    NVARCHAR(12),
        Classify      NVARCHAR(60),
        Title         NVARCHAR(200),
        Autor         NVARCHAR(100),
        NewsSource    NVARCHAR(200),
        Industry      NVARCHAR(100),
        Content       TEXT,
        AccessoryName NVARCHAR(100))""" % name
    cursor.execute(sql)  # run the CREATE TABLE statement
    db.close()  # close the connection


def insert_database(name):  # insert a row whose body fits into the Content column
    # Reads the module-level variables set in the crawl loop below.
    db = pymysql.connect(host='your-IP', user='username', password='password',
                         database='database-name', charset='utf8')
    cursor = db.cursor()
    # The table name must be interpolated, but the values are bound as query
    # parameters, so quotes in the text cannot break the statement (the manual
    # quote-escaping of the original is no longer needed).
    sql1 = ("INSERT INTO %s (DeclareDate, Title, NewsSource, Industry, Content) "
            "VALUES (%%s, %%s, %%s, %%s, %%s)" % name)
    cursor.execute(sql1, (DeclareDate, title, NewsSource, my, word))  # `my` (the article URL) goes into Industry
    db.commit()  # commit the insert
    db.close()


def insert_database1(name):  # insert a row whose body is stored as a Word attachment instead
    db = pymysql.connect(host='your-IP', user='username', password='password',
                         database='database-name', charset='utf8')
    cursor = db.cursor()
    sql2 = ("INSERT INTO %s (DeclareDate, Title, NewsSource, Industry, AccessoryName) "
            "VALUES (%%s, %%s, %%s, %%s, %%s)" % name)
    cursor.execute(sql2, (DeclareDate, title, NewsSource, my, new_title))
    db.commit()
    db.close()


def into_document(new_title, word):  # write the article body into a Word document
    exam_doc = Document()  # create the document
    heading = exam_doc.add_heading(new_title, 0)  # add the title; 0 is the heading level
    heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER  # center it
    exam_doc.add_paragraph(word)  # write the body text
    filename = "%s.docx" % new_title  # name the file after the title
    exam_doc.save(filename)  # save into the current (ABC) directory


def processing_title(title):  # replace characters that are illegal in file names with "_"
    # Several entries of the original list were lost to encoding damage; the
    # surviving ones are kept and the Windows-forbidden set fills the gaps.
    chars = ["/", "\\", "\"", "'", "·", ":", "*", "?", "<", ">", "|",
             "‘", "’", "“", "”", "…", "–"]
    new_title = ""
    for ch in title:         # walk the title character by character
        if ch not in chars:
            new_title += ch
        else:
            new_title += "_"
    return new_title


def get_url2(page_num, url, xpath_url2):  # collect the second-level article links
    scroll_list = []
    try:
        for z in page_num:
            url2 = url + "_" + str(z) + ".htm"  # build the listing-page URL
            content = requests.get(url2).content
            html = etree.HTML(content)
            scroll_list.extend(html.xpath(xpath_url2))  # accumulate links from every page
    except:
        print("Failed to fetch the URL!")
        return None
    return scroll_list


def news_content(my, xpath_title, xpath_DeclareDate, xpath_img):  # extract one article
    content = requests.get(my).content
    html = etree.HTML(content)
    try:
        title = ''.join(html.xpath(xpath_title)).strip()
        DeclareDate1 = ''.join(html.xpath(xpath_DeclareDate)).strip()
        DeclareDate = re.search(r'20\d{2}[-]\d{1,2}[-]\d{1,2}', DeclareDate1).group()  # extract the date
        NewsSource1 = re.search(r'来源:\S*', DeclareDate1).group()  # extract the "来源:" (source) field
        NewsSource = re.sub(r"来源:", "", str(NewsSource1)).strip()
        img_list = html.xpath(xpath_img)  # image src attributes
        word = '\n  '.join(html.xpath(word1_xpath)).strip()  # body text plus image srcs, in document order
        return title, DeclareDate, NewsSource, img_list, word
    except:
        print("Page does not exist!")
        return None


#######################################
name = "news"
page_num = [18, 19]  # listing pages to crawl
url = 'http://*************'
urls = "http://************"  # defined in the original but never used
xpath_url2 = "//div[@class='xlayer02 yh ohd clear']/span[@class='fs1']//a/@href"
xpath_title = "//*[@id='layer213' or @id='title']//text()"
xpath_DeclareDate = "//*[@class='layer31' or @class='xt2 yh fl' or @class='layer31 xt2']//text()"
xpath_img = "//*[@id='imgContent' or @id='layer216']//@src"
word1_xpath = "//*[@id='imgContent' or @id='layer216']//text()|//*[@id='imgContent' or @id='layer216']//@src"

linked_database(name)
scroll_list = get_url2(page_num, url, xpath_url2) or []  # fall back to an empty list if fetching failed
for x in scroll_list:  # walk the collected article links
    my = "http://yn.yunnan.cn/" + x
    print(my)
    result = news_content(my, xpath_title, xpath_DeclareDate, xpath_img)
    if result is None:  # skip pages that failed to parse
        continue
    title, DeclareDate, NewsSource, img_list, word = result  # one request per article instead of five
    if (len(img_list) == 1 and word[-4:] == ".jpg") or len(img_list) == 0:
        # effectively text-only: no images, or a single trailing image reference
        if len(word) < 2000:     # short body: store the text directly
            insert_database(name)
        else:                    # long body: save to Word and store the file name
            new_title = processing_title(title)
            into_document(new_title, word)
            insert_database1(name)
    else:  # article with images: always save to Word
        new_title = processing_title(title)
        into_document(new_title, word)
        insert_database1(name)
print("---- Crawl finished! ----")
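Note that img_list is only used for branching above; the images themselves are never downloaded. If the pictures should end up inside the generated document as well, here is a minimal sketch of a variant of into_document (the function name into_document_with_images, the host prefix for relative src values, and the .jpg extension are all assumptions, not part of the original script):

import requests
from docx import Document
from docx.shared import Inches

def into_document_with_images(new_title, word, img_list):  # hypothetical variant of into_document()
    exam_doc = Document()
    exam_doc.add_heading(new_title, 0)
    exam_doc.add_paragraph(word)
    for i, src in enumerate(img_list):
        # Assumption: relative srcs live on the same host as the article pages.
        img_url = src if src.startswith("http") else "http://yn.yunnan.cn/" + src
        img_file = "%s_%d.jpg" % (new_title, i)  # assumption: the images are JPEGs
        with open(img_file, "wb") as f:
            f.write(requests.get(img_url).content)  # download the image next to the .docx
        exam_doc.add_picture(img_file, width=Inches(5))  # embed it in the document
    exam_doc.save("%s.docx" % new_title)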
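After the run, a quick query confirms what landed in the table. A minimal sketch, using the same placeholder credentials as the script:

import pymysql

db = pymysql.connect(host='your-IP', user='username', password='password',
                     database='database-name', charset='utf8')
cursor = db.cursor()
cursor.execute("SELECT DeclareDate, Title, AccessoryName FROM news LIMIT 5")
for row in cursor.fetchall():
    print(row)  # AccessoryName is NULL for rows whose text went into Content
db.close()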