第三个解析HTML文件的Python程序

来源:互联网 发布:研华数据采集卡价格 编辑:程序博客网 时间:2024/05/01 18:25
# -*- coding: utf-8 -*-
"""Demo of an event-driven HTML parser that prints every parse event.

``MyHTMLParser`` overrides the ``HTMLParser`` callbacks and prints one
labelled line per event.  The statements at the bottom of the module feed it
a handful of sample snippets, including markup that is split across several
``feed()`` calls.

The code uses only constructs that are valid on both Python 2 and Python 3.
"""

try:  # Python 3 module names
    from html.parser import HTMLParser
    from html.entities import name2codepoint
except ImportError:  # Python 2 module names
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint

try:
    unichr  # Python 2 built-in
except NameError:
    unichr = chr  # Python 3: chr() already returns a text character


def _report(label, value):
    """Print ``label`` and ``value`` separated by a single space.

    This reproduces the spacing of the Python 2 statement
    ``print label, value`` while using a single-argument ``print(...)``
    call, which behaves the same on Python 2 and Python 3.
    """
    print('%s %s' % (label, value))


class MyHTMLParser(HTMLParser):
    """HTML parser that prints each event it receives."""

    def handle_starttag(self, tag, attrs):
        """Print the start tag name, then each ``(name, value)`` attribute."""
        _report('start tag : ', tag)
        for attr in attrs:
            _report('attrs : ', attr)

    def handle_endtag(self, tag):
        """Print the end tag name."""
        _report('end tag : ', tag)

    def handle_data(self, data):
        """Print a run of text found between tags."""
        _report('data :', data)

    def handle_comment(self, data):
        """Print the body of an HTML comment."""
        _report('Comment :', data)

    def handle_entityref(self, name):
        """Print the character for a named reference such as ``&gt;``.

        A name that is missing from ``name2codepoint`` is printed back as
        ``&name;`` instead of raising ``KeyError``.

        Note: on Python 3.5+ the base parser decodes references itself by
        default and does not invoke this handler unless the parser is
        constructed with ``convert_charrefs=False``.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            text = '&%s;' % name
        else:
            text = unichr(codepoint)
        _report('Named ent :', text)

    def handle_charref(self, name):
        """Print the character for a numeric reference.

        ``name`` is the reference without ``&#`` and ``;``: either decimal
        digits (``62``) or hex digits prefixed by ``x``/``X`` (``x3E``).

        Note: on Python 3.5+ the base parser decodes references itself by
        default and does not invoke this handler unless the parser is
        constructed with ``convert_charrefs=False``.
        """
        # Accept both hex prefixes; HTML allows "&#x3E;" as well as "&#X3E;".
        if name.startswith(('x', 'X')):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        _report('Num ent :', unichr(codepoint))

    def handle_decl(self, data):
        """Print a declaration such as ``DOCTYPE``."""
        _report('Decl :', data)


myParser = MyHTMLParser()

myParser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
              '"http://www.w3.org/TR/html4/strict.dtd">')
print('\n')

myParser.feed('<img src="python-logo.png" alt="The Python logo">')
print('\n')

myParser.feed('<style type="text/css">#python { color: green }</style>')
print('\n')

myParser.feed('<script type="text/javascript">'
              'alert("<strong>hello!</strong>");</script>')
print('\n')

myParser.feed('<!-- a comment -->'
              '<!--[if IE 9]>IE-specific content<![endif]-->')
print('\n')

# Pay special attention to this part: the markup is fed in pieces that cut
# the tags in half.  feed() keeps incomplete input buffered until later
# calls supply the rest.
for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
    myParser.feed(chunk)
print('\n')

myParser.feed('<p><a class=link href=#main>tag soup</p ></a>')