保存网页内容至word,相关内容导入mysql
来源:互联网 发布:坦赞铁路 知乎 编辑:程序博客网 时间:2024/05/09 14:03
# -*- coding: utf-8 -*-
"""Scrape a news site, archive long articles as .docx files, and record
article metadata in a MySQL table.

Flow: recreate a working directory, recreate the target table, collect
the second-level article links, then for each article either store the
full text directly in MySQL (short articles) or write it to a Word
document and store the document name instead (long / image articles).
"""
import re
import os
import shutil

import pymysql
import requests
from lxml import etree
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

# Working directory for the generated .docx files; wiped and recreated
# on every run, then made the current directory so Document.save() lands here.
national = "ABC"
if os.path.exists(national):
    shutil.rmtree(national)
os.mkdir(national)
os.chdir(national)


def _connect():
    """Open a MySQL connection (fill in real credentials before running)."""
    return pymysql.connect(
        host='自己IP',
        user='用户名',
        password='密码',
        database='数据库名',
        charset='utf8',
    )


def linked_database(name):
    """Drop table *name* if it exists and create it afresh.

    NOTE: the table name is interpolated into the SQL text (identifiers
    cannot be bound as parameters); *name* must come from trusted code,
    never from user input.
    """
    db = _connect()
    try:
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS %s" % name)
        cursor.execute(
            """CREATE TABLE %s (
                NewsID NVARCHAR(20),
                DeclareDate NVARCHAR(10) not null,
                ClassifyID NVARCHAR(12),
                Classify NVARCHAR(60),
                Title NVARCHAR(200),
                Autor NVARCHAR(100),
                NewsSource NVARCHAR(200),
                Industry NVARCHAR(100),
                Content TEXT,
                AccessoryName NVARCHAR(100)
            )""" % name
        )
    finally:
        db.close()  # close even if CREATE fails


def insert_database(name):
    """Insert one article row storing the full text in ``Content``.

    Reads the module-level ``DeclareDate``/``title``/``NewsSource``/``my``/
    ``word`` set by the main loop (signature kept for compatibility with the
    original call style ``insert_database(name)``).  Values are bound as
    query parameters, so no manual quote escaping is needed.
    """
    db = _connect()
    try:
        cursor = db.cursor()
        cursor.execute(
            "INSERT INTO %s(DeclareDate, Title, NewsSource, Industry, Content)"
            " VALUES (%%s, %%s, %%s, %%s, %%s)" % name,
            (DeclareDate, title, NewsSource, my, word),
        )
        db.commit()
    finally:
        db.close()


def insert_database1(name):
    """Insert one article row storing the .docx name in ``AccessoryName``.

    Same global-reading convention as :func:`insert_database`; additionally
    reads ``new_title`` (the sanitized document name).
    """
    db = _connect()
    try:
        cursor = db.cursor()
        cursor.execute(
            "INSERT INTO %s(DeclareDate, Title, NewsSource, Industry, AccessoryName)"
            " VALUES (%%s, %%s, %%s, %%s, %%s)" % name,
            (DeclareDate, title, NewsSource, my, new_title),
        )
        db.commit()
    finally:
        db.close()


def into_document(new_title, word):
    """Write *word* into ``<new_title>.docx`` with a centered level-0 heading."""
    exam_doc = Document()
    heading = exam_doc.add_heading(new_title, 0)
    heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    exam_doc.add_paragraph(word)
    exam_doc.save("%s.docx" % new_title)


def processing_title(title):
    """Return *title* with filesystem-unfriendly characters replaced by '_'.

    Used to turn an article title into a safe .docx filename.
    """
    chars = ["/", "\"", "'", "·", "。", "?", "!", ",", "、", ";", ":", "‘",
             "’", "“", "”", "(", ")", "…", "–", ".", "《", "》"]
    return "".join("_" if c in chars else c for c in title)


def get_url2(page_num, url, xpath_url2):
    """Fetch every listing page in *page_num* and return all article links.

    The original version overwrote the result on each iteration, keeping only
    the last page's links; links are now accumulated across all pages.
    Returns None if any fetch/parse fails.
    """
    scroll_list = []
    try:
        for z in page_num:
            url2 = url + "_" + str(z) + ".htm"  # e.g. <url>_18.htm
            content = requests.get(url2).content
            html = etree.HTML(content)
            scroll_list.extend(html.xpath(xpath_url2))
    except Exception:
        print("未获取到url!")
        return None
    return scroll_list


def news_content(my, xpath_title, xpath_DeclareDate, xpath_img):
    """Fetch one article page and extract its fields.

    Returns ``(title, DeclareDate, NewsSource, img_list, word)`` or None if
    the page cannot be parsed.  Reads the module-level ``word1_xpath`` for
    the body-text XPath (kept global for signature compatibility).
    """
    content = requests.get(my).content
    html = etree.HTML(content)
    try:
        title = ''.join(html.xpath(xpath_title)).strip()
        DeclareDate1 = ''.join(html.xpath(xpath_DeclareDate)).strip()
        # Date like 2024-05-09 embedded in the byline text.
        DeclareDate = re.search(r'20\d{2}[-]\d{1,2}[-]\d{1,2}', DeclareDate1).group()
        # Source follows the "来源:" label in the same byline.
        NewsSource1 = re.search(r'来源:\S*', DeclareDate1).group()
        NewsSource = re.sub(r"来源:", "", str(NewsSource1)).strip()
        img_list = html.xpath(xpath_img)
        word = '\n '.join(html.xpath(word1_xpath)).strip()
        # NOTE: the original escaped single quotes here for string-built SQL;
        # parameterized inserts make that unnecessary (and it corrupted text).
        return title, DeclareDate, NewsSource, img_list, word
    except Exception:
        print("网页不存在!")
        return None


#######################################
name = "news"
page_num = [18, 19]  # listing pages to crawl
url = 'http://*************'
urls = "http://************"
xpath_url2 = "//div[@class='xlayer02 yh ohd clear']/span[@class='fs1']//a/@href"
xpath_title = "//*[@id='layer213' or @id='title']//text()"
xpath_DeclareDate = "//*[@class='layer31' or @class='xt2 yh fl' or @class='layer31 xt2']//text()"
xpath_img = "//*[@id='imgContent' or @id='layer216']//@src"
word1_xpath = ("//*[@id='imgContent' or @id='layer216']//text()"
               "|//*[@id='imgContent' or @id='layer216']//@src")

linked_database(name)
scroll_list = get_url2(page_num, url, xpath_url2)
for x in scroll_list or []:  # tolerate get_url2 returning None
    my = "http://yn.yunnan.cn/" + x
    print(my)
    # One fetch per article (the original fetched the same page five times).
    result = news_content(my, xpath_title, xpath_DeclareDate, xpath_img)
    if result is None:
        continue  # page missing / parse failed; original crashed indexing None
    title, DeclareDate, NewsSource, img_list, word = result
    # Short plain articles (no images, or a single inline .jpg) go straight
    # into MySQL; everything else — including exactly-2000-char articles the
    # original silently dropped — is archived as a Word document.
    plain = (len(img_list) == 1 and word[-4:] == ".jpg") or len(img_list) == 0
    if plain and len(word) < 2000:
        insert_database(name)
    else:
        new_title = processing_title(title)
        into_document(new_title, word)
        insert_database1(name)
print("----抓取完成!----")
阅读全文
0 0
- 保存网页内容至word,相关内容导入mysql
- 把一个网页文件的内容导入到word里
- 把一个网页文件的内容导入到word里
- 网页导入word
- 保存网页数据到WORD
- php读取word\pdf等文档的内容,并将其保存到网页中
- 网页内容写入word文档
- 保存网页内容到txt
- 获取网页内容,并保存
- 网页表格内容导入excel
- mysql相关内容
- 【ZT】Mysql保存word,jpg
- 将WORD内容保存为BMP
- Word是保存网页最好的工具
- PHP 网页保存为Word文档
- word中表格相关内容
- 指定页面区域内容导入Word
- javascript页面内容导入Word和Excel
- 更换Actionbar,Toolbar的使用
- 蓝桥杯 回文数
- Oozie的简介及安装部署
- markdown sample
- SQLContext/HiveContext/SparkSession的使用(二)
- 保存网页内容至word,相关内容导入mysql
- Linux下命令行安装WebLogic 10.3.6
- appstore 上架新版本后有比较严重的问题,可以直接在appstore回滚到老版本吗?
- MySQL中实现连续日期内数据统计,缺省天数0补全
- leetcode 667. Beautiful Arrangement II 双指针遍历
- GreenDao的学习
- 开源软件成熟度评测报告-分布式消息中间件
- Spring Boot
- java线程