使用 Python 脚本下载 www.wuxia.net.cn 上的书籍，并将它们合并成一个文件

来源：互联网 | 发布：抽腹水的人活不久知乎 | 编辑：程序博客网 | 时间：2024/06/07 14:20
如题，太累了，直接贴上source，应该能看明白，不明白的话，请给我回复，我会及时回复。
# -*- coding: cp936 -*-
"""Download the books linked from a www.wuxia.net.cn author page.

The author page is scanned for book links; every book gets one UTF-8 text
file named after the link text plus a running index.  The pages linked
from each book are then fetched in document order and their <h1> and <p>
text is appended to that book's file.

Written for Python 2 (it relies on sgmllib, urllib2 and HTMLParser).
"""
import codecs
import contextlib
import HTMLParser
import sgmllib
import sys
import urllib2
import urlparse

# Root used to resolve the site-relative links found in the pages.
SITE_URL = "http://www.wuxia.net.cn"
# Author page used when no URL is given on the command line.
DEFAULT_AUTHOR_URL = "http://www.wuxia.net.cn/author/shiweihan.html"


def _is_book_link(href):
    """Return True if *href* contains both "/book/" and ".html"."""
    return "/book/" in href and ".html" in href


def _read_url(url, require_html=True):
    """Fetch *url* and return the response body as a byte string.

    When *require_html* is true and the response media type is not
    text/html, None is returned instead of the body.  The connection is
    always closed before returning.
    """
    with contextlib.closing(urllib2.urlopen(url)) as sock:
        # gettype() yields the bare lowercase "type/subtype", so a header
        # such as "text/html; charset=utf-8" is still recognised and a
        # missing header does not raise KeyError.
        if require_html and sock.info().gettype() != 'text/html':
            return None
        return sock.read()


class BookParser(sgmllib.SGMLParser):
    """Collect book links and the output file name derived from each one.

    After feeding a page, ``links`` holds the matching href values and
    ``bookNames`` the file names built from the text seen inside those
    links.
    """

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        # href values of the book links found so far
        self.links = []
        # output file names, built from the link text
        self.bookNames = []
        # 1 while the parser is between a book link's start tag and the
        # next end tag
        self.inside_a_element = 0
        # number of book links closed so far; makes file names unique
        self.count = 0

    def unknown_starttag(self, tag, attrs):
        """Record the href of any tag that points at a book page."""
        for key, value in attrs:
            if key.lower() == "href" and _is_book_link(value):
                self.links.append(value)
                self.inside_a_element = 1

    def unknown_endtag(self, tag):
        """Leave the "inside a book link" state on the next end tag."""
        if self.inside_a_element:
            self.inside_a_element = 0
            self.count += 1

    def handle_data(self, text):
        """Turn text found inside a book link into an output file name."""
        if self.inside_a_element:
            # The page bytes are decoded as UTF-8.
            text = unicode(text, 'utf-8')
            text = text + str(self.count) + ".txt"
            self.bookNames.append(text)
            # Create the (possibly empty) book file up front, closing the
            # handle immediately instead of leaking it.
            codecs.open(text, 'a', "utf-8").close()


class PageParser(sgmllib.SGMLParser):
    """Collect the links to the individual pages of one book."""

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        # href values of the page links, in document order
        self.links = []

    def unknown_starttag(self, tag, attrs):
        """Record the href of any tag that points at a book page."""
        for key, value in attrs:
            if key.lower() == "href" and _is_book_link(value):
                self.links.append(value)


class ContentParser(sgmllib.SGMLParser):
    """Append the <h1> and <p> text of one page to a book file.

    The file named by *bookNames* is opened in append mode on construction
    and closed by ``close()``.
    """

    # Values of ``inside_a_element``.
    _IN_TITLE = 1
    _IN_PARAGRAPH = 2

    def __init__(self, bookNames):
        sgmllib.SGMLParser.__init__(self)
        # Kept for compatibility; nothing in this class fills them.
        self.divs = []
        self.headone = []
        print(" in content parser " + bookNames)
        self.logfile = codecs.open(bookNames, 'a', "utf-8")
        # 0 outside any tracked tag, otherwise _IN_TITLE / _IN_PARAGRAPH
        self.inside_a_element = 0
        self.h = HTMLParser.HTMLParser()

    def unknown_starttag(self, tag, attrs):
        """Start capturing text when an <h1> or <p> tag opens."""
        name = tag.lower()
        if name == "h1":
            self.inside_a_element = self._IN_TITLE
        if name == "p":
            self.inside_a_element = self._IN_PARAGRAPH

    def unknown_endtag(self, tag):
        """Stop capturing on the next end tag and end the output line."""
        if self.inside_a_element:
            self.inside_a_element = 0
            self.logfile.write("\n")

    def handle_data(self, text):
        """Write captured text; titles are followed by a blank line."""
        if self.inside_a_element == self._IN_TITLE:
            self.logfile.write(unicode(text, 'utf-8') + "\n\n")
        elif self.inside_a_element == self._IN_PARAGRAPH:
            self.logfile.write(unicode(text, 'utf-8'))

    def handle_charref(self, ref):
        """Write the character named by a numeric reference (&#NNN;)."""
        # Mirrors handle_entityref; the previous code called the
        # HTMLParser instance itself, which raised TypeError.
        self.logfile.write(self.h.unescape("&#" + ref + ";"))

    def handle_entityref(self, ref):
        """Write the character named by an entity reference (&name;)."""
        self.logfile.write(self.h.unescape("&" + ref + ";"))

    def close(self):
        """Flush any buffered markup, then close the output file."""
        try:
            sgmllib.SGMLParser.close(self)
        finally:
            self.logfile.close()


def getBookList(url):
    """Return a dict mapping book link -> output file name for *url*.

    An empty dict is returned when the response is not text/html.
    """
    bookDic = {}
    html = _read_url(url)
    if html is not None:
        parser = BookParser()
        parser.feed(html)
        parser.close()
        bookDic = dict(zip(parser.links, parser.bookNames))
    return bookDic


def getPageList(bookUrl):
    """Return the page links found on the book page *bookUrl*.

    An empty list is returned when the response is not text/html.
    """
    pageList = []
    html = _read_url(bookUrl)
    if html is not None:
        parser = PageParser()
        parser.feed(html)
        parser.close()
        pageList = parser.links
    return pageList


def getPageContent(pageUrl, bookNames):
    """Fetch *pageUrl* and append its text to the file *bookNames*."""
    # The content type is deliberately not checked here, as before.
    html = _read_url(pageUrl, require_html=False)
    # closing() calls ContentParser.close(), which also closes the file.
    with contextlib.closing(ContentParser(bookNames)) as parser:
        parser.feed(html)


def main(wuxiaUrl):
    """Download every book linked from the author page *wuxiaUrl*."""
    bookDic = getBookList(wuxiaUrl)
    print(type(bookDic))
    print(bookDic)
    for link, bookNames in bookDic.iteritems():
        print("link is " + link)
        print("bookNames is " + bookNames)
        # urljoin copes with both site-relative and absolute hrefs.
        pageList = getPageList(urlparse.urljoin(SITE_URL, link))
        for page in pageList:
            getPageContent(urlparse.urljoin(SITE_URL, page), bookNames)


if __name__ == '__main__':
    # Optional first argument: the author page URL.  Without it the
    # script falls back to the author page it was originally written for.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_AUTHOR_URL)