python爬虫记录(二)
来源:互联网 发布:清远数据库工程师招聘 编辑:程序博客网 时间:2024/05/29 07:52
#!/user/bin/python# -*- coding: UTF-8 -*-import urllibimport urllib2import lxmlimport MySQLdbfrom bs4 import BeautifulSoupimport httplibhttplib.HTTPConnection._http_vsn = 10httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'user_agent = '''Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36''' hdr = { 'User-Agent' : user_agent }db = MySQLdb.connect(host="localhost", port=3306, user="root", passwd="123456", db="xiaoshuo", charset="utf8")cursor = db.cursor()str_sql2 = '''INSERT INTO `xiaoshuo`.`chapter1` (`bookId`, `chapterNum`, `chapterName`, `chapterUrl`) VALUES '''str_sql3 = '''INSERT INTO `xiaoshuo`.`chapter` (`bookId`, `chapterNum`, `chapterName`, `chapterUrl`) VALUES (%s,%s,%s,%s)'''def getUrlFromDbAndGetChapterInfo(): global str_sql2 str_sql1 = 'select bookId, bookName, url from book1' cursor1 = db.cursor() cursor1.execute(str_sql1) url_list = cursor1.fetchall() cursor1.close() print "get book url list:", url_list for item in url_list: param=[] bookId = item[0] bookName = item[1].encode('utf-8') url = item[2].encode('utf-8') print "bookId:", bookId, "bookName:", bookName, "url:", url getChapterInfoAndSaveInDb(bookId, url, param) try: cursor.executemany(str_sql3,param) db.commit() except MySQLdb.Error, e: sqlError = "Error:%s" % str(e) print "sqlError:", sqlErrordef getChapterInfoAndSaveInDb(bookId, url, param): request = urllib2.Request(url, headers=hdr) response = urllib2.urlopen(request) html_data = response.read().decode('gbk') #f = open('2.html') soup = BeautifulSoup(html_data,'lxml') mylist = soup.find_all('div', id ='list') for item in mylist: section_list = item.find_all('dd') for item in section_list: #print item chapterUrl = "http://www.biquzi.com" + item.find('a').get('href') #print "章节url:", chapterUrl tmp = item.find('a').get_text().split(' ') chapterNum = "" chapterName = "" if len(tmp)>1: chapterNum = tmp[0].encode("utf-8") chapterName = tmp[1].encode("utf-8") else: str1 = item.find('a').get_text().split(u'章',1) if len(str1) == 1: continue chapterNum = str1[0].encode("utf-8") + "章" chapterName = str1[1].encode("utf-8") #temp_str = '("' + str(bookId) + '", "'+ chapterNum + '", "'+ chapterName + '", "'+ chapterUrl + '"),' tup1 = (bookId, chapterNum, chapterName, chapterUrl) param.append(tup1) #print "第几章:", chapterNum, len(chapterNum) #print "章节名:", chapterName, len(chapterName) #print "-----------------------------------------------------------------------------------------" #str_sql2 = str_sql2 + ''.join(str_list) #f.close()if __name__ == "__main__": print ("<<<-----Start Get Book Chapter And Save In Db------>>") getUrlFromDbAndGetChapterInfo() cursor.close() db.close()
阅读全文
0 0
- python爬虫记录(二)
- python 爬虫(二)
- python爬虫(二)
- python爬虫(二)
- Python 爬虫(二)
- 爬虫学习记录(二)
- python爬虫记录(一)
- Python爬虫(二)图片下载爬虫
- Python爬虫整理(二)
- python简单爬虫(二)
- python 爬虫笔记(二)
- python网络爬虫(二)
- Python爬虫实战(二)
- Python爬虫小记(二)
- python网络爬虫(二)
- 《python初级爬虫》(二)
- Python爬虫个人记录(二) 获取fishc 课件下载链接
- python爬虫记录
- 设计模式之装饰器模式
- 堆和栈的区别
- java中的基本数据类型和引用数据类型
- 模拟登录163邮箱
- 数组常用方法
- python爬虫记录(二)
- 2017 Multi-University Training Contest
- maven导入原创jar包
- openwrt资料整理
- 机房重构之EA中的ER图转换为数据库
- iText输出中文的三种字体选择方式
- 阿里云(腾讯云)Linux数据盘挂载
- elasticsearch集群快照使用共享文件系统
- idea类注释