Python Crawler Notes (Part 2)

This script reads the book list gathered in part 1 from the `book1` table, downloads each book's chapter index from biquzi.com, parses out each chapter's number, name, and URL, and batch-inserts the results into the `chapter` table:

```python
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2
import MySQLdb
import httplib
from bs4 import BeautifulSoup

# Force HTTP/1.0; some servers mishandle chunked HTTP/1.1 responses
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/59.0.3071.109 Safari/537.36')
hdr = {'User-Agent': user_agent}

db = MySQLdb.connect(host="localhost", port=3306, user="root",
                     passwd="123456", db="xiaoshuo", charset="utf8")
cursor = db.cursor()

str_sql3 = '''INSERT INTO `xiaoshuo`.`chapter`
              (`bookId`, `chapterNum`, `chapterName`, `chapterUrl`)
              VALUES (%s, %s, %s, %s)'''


def getUrlFromDbAndGetChapterInfo():
    # Fetch every book collected in part 1
    str_sql1 = 'select bookId, bookName, url from book1'
    cursor1 = db.cursor()
    cursor1.execute(str_sql1)
    url_list = cursor1.fetchall()
    cursor1.close()
    print "get book url list:", url_list
    for item in url_list:
        param = []
        bookId = item[0]
        bookName = item[1].encode('utf-8')
        url = item[2].encode('utf-8')
        print "bookId:", bookId, "bookName:", bookName, "url:", url
        getChapterInfoAndSaveInDb(bookId, url, param)
        try:
            # Batch-insert all chapters of this book in one round trip
            cursor.executemany(str_sql3, param)
            db.commit()
        except MySQLdb.Error, e:
            sqlError = "Error:%s" % str(e)
            print "sqlError:", sqlError


def getChapterInfoAndSaveInDb(bookId, url, param):
    request = urllib2.Request(url, headers=hdr)
    response = urllib2.urlopen(request)
    # The site serves GBK-encoded pages; decode before parsing
    html_data = response.read().decode('gbk')
    soup = BeautifulSoup(html_data, 'lxml')
    for div in soup.find_all('div', id='list'):
        for dd in div.find_all('dd'):
            chapterUrl = "http://www.biquzi.com" + dd.find('a').get('href')
            tmp = dd.find('a').get_text().split(' ')
            if len(tmp) > 1:
                # Title like u"第一章 xxx": split on the space
                chapterNum = tmp[0].encode("utf-8")
                chapterName = tmp[1].encode("utf-8")
            else:
                # No space in the title: split on the first u"章" instead
                str1 = dd.find('a').get_text().split(u'章', 1)
                if len(str1) == 1:
                    continue
                chapterNum = str1[0].encode("utf-8") + "章"
                chapterName = str1[1].encode("utf-8")
            param.append((bookId, chapterNum, chapterName, chapterUrl))


if __name__ == "__main__":
    print "<<<-----Start Get Book Chapter And Save In Db------>>"
    getUrlFromDbAndGetChapterInfo()
    cursor.close()
    db.close()
```
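The script assumes the `xiaoshuo` database and the `book1` and `chapter` tables already exist. Here is a minimal sketch of a `chapter` schema that matches the INSERT above; only the column names come from the crawler, so the types, lengths, and the surrogate `id` key are assumptions:

```python
# -*- coding: UTF-8 -*-
import MySQLdb

db = MySQLdb.connect(host="localhost", port=3306, user="root",
                     passwd="123456", db="xiaoshuo", charset="utf8")
cursor = db.cursor()
# Column names match the crawler's INSERT; the types, lengths,
# and the `id` primary key are assumptions, not from the original post.
cursor.execute('''CREATE TABLE IF NOT EXISTS `chapter` (
                      `id` INT AUTO_INCREMENT PRIMARY KEY,
                      `bookId` INT,
                      `chapterNum` VARCHAR(64),
                      `chapterName` VARCHAR(255),
                      `chapterUrl` VARCHAR(255)
                  ) DEFAULT CHARSET=utf8''')
db.commit()
cursor.close()
db.close()
```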
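The fiddliest part is splitting a chapter title into its number and name: some titles have a space after "第X章" and some do not, which is why the parser has two branches. A quick standalone check of both branches; the sample titles here are made up for illustration:

```python
# -*- coding: UTF-8 -*-
# Hypothetical sample titles covering both formats the crawler handles
for title in [u'第一章 初入江湖', u'第二章风云再起']:
    tmp = title.split(' ')
    if len(tmp) > 1:
        chapterNum, chapterName = tmp[0], tmp[1]      # space-separated
    else:
        parts = title.split(u'章', 1)                  # split on first "章"
        chapterNum, chapterName = parts[0] + u'章', parts[1]
    print chapterNum.encode('utf-8'), '->', chapterName.encode('utf-8')
```

Both titles print as "第X章 -> name", so the same `(bookId, chapterNum, chapterName, chapterUrl)` tuple shape works for either format.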
