Python中的html.parser

来源:互联网 发布:什么是淘宝店铺首页 编辑:程序博客网 时间:2024/05/22 09:48
import html.parser
import urllib.request


class WebPageParser(html.parser.HTMLParser):
    """Collect link targets and link text from an HTML document.

    After feeding markup with ``feed()``:

    * ``urls`` holds the ``href`` value of every ``<a>`` start tag seen.
    * ``data`` holds every text chunk that appeared inside an ``<a>`` element.

    ``is_href`` is the internal "currently inside an anchor" flag.
    """

    def __init__(self, strict=False):
        """Create the parser.

        Args:
            strict: Accepted only for backward compatibility and ignored.
                The ``strict`` argument of ``HTMLParser`` was removed in
                Python 3.5, so forwarding it would raise ``TypeError``.
        """
        # HTMLParser.__init__ calls self.reset(), which is overridden below
        # and creates urls / data / is_href, so no extra setup is needed here.
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Record the href of an opening ``<a>`` tag and start capturing text."""
        if tag == 'a':
            # A valueless attribute (``<a href>``) is reported with value None;
            # skip it so that urls only ever contains strings.
            self.urls.extend(
                value for name, value in attrs
                if name == 'href' and value is not None
            )
            self.is_href = True

    def handle_endtag(self, tag):
        """Trace the end tag and stop capturing text when an anchor closes."""
        print(tag)
        # Only a closing </a> ends the anchor; closing a nested element such
        # as </b> must not cut off the rest of the link text.
        if tag == 'a':
            self.is_href = False

    def handle_startendtag(self, tag, attrs):
        """Trace self-closing tags (``<br/>``); they are not otherwise processed."""
        print(tag, attrs)

    def handle_data(self, data):
        """Store a text chunk if it occurs inside an anchor element."""
        if self.is_href:
            self.data.append(data)

    def reset(self):
        """Reset the parser state and discard everything collected so far."""
        super().reset()
        self.urls = []
        self.data = []
        self.is_href = False


if __name__ == '__main__':
    web_url = "http://www.baidu.com"
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(web_url) as web_page:
        # Decode the body properly; str(bytes) would give the "b'...'" repr.
        charset = web_page.headers.get_content_charset() or 'utf-8'
        page_text = web_page.read().decode(charset, errors='replace')
    web_page_parser = WebPageParser()
    web_page_parser.feed(page_text)
    web_page_parser.close()  # flush any text still buffered by the parser
    print(web_page_parser.urls)
    print(web_page_parser.data)


该类的使用很简单:主要是继承 html.parser.HTMLParser,然后重写(覆盖)相应的处理方法,例如 handle_starttag、handle_endtag 和 handle_data。

原创粉丝点击