第三个解析HTML文件的Python程序
来源:互联网 发布:研华数据采集卡价格 编辑:程序博客网 时间:2024/05/01 18:25
# -*- coding: utf-8 -*-
# Demo: subclass HTMLParser and print every event reported while parsing a few
# HTML snippets.  Written to run unchanged on Python 2 and Python 3.
try:  # Python 3 module layout
    from html.parser import HTMLParser
    from html.entities import name2codepoint
except ImportError:  # Python 2 module layout
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint

try:
    _to_char = unichr  # Python 2: chr() is bytes-only, unichr() yields text
except NameError:
    _to_char = chr  # Python 3: chr() already yields a text character


class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that prints each event reported during parsing."""

    def __init__(self, *args, **kwargs):
        """Create the parser with the character-reference callbacks enabled.

        All arguments are forwarded to ``HTMLParser.__init__``.
        """
        # Python 3.4+ folds character references into handle_data() unless
        # convert_charrefs is switched off; disable it by default so that
        # handle_entityref() / handle_charref() fire as they do on Python 2.
        kwargs.setdefault('convert_charrefs', False)
        try:
            HTMLParser.__init__(self, *args, **kwargs)
        except TypeError:
            # Older parsers (Python 2, Python 3 < 3.4) reject the keyword.
            kwargs.pop('convert_charrefs')
            HTMLParser.__init__(self, *args, **kwargs)

    def handle_starttag(self, tag, attrs):
        """Print the tag name, then each ``(name, value)`` attribute pair."""
        print('start tag :  %s' % tag)
        for attr in attrs:
            print('attrs :  %s' % (attr,))

    def handle_endtag(self, tag):
        """Print the name of a closing tag."""
        print('end tag :  %s' % tag)

    def handle_data(self, data):
        """Print a run of text found between tags."""
        print('data : %s' % data)

    def handle_comment(self, data):
        """Print the body of an HTML comment."""
        print('Comment : %s' % data)

    def handle_entityref(self, name):
        """Print the character for a named reference such as ``&gt;``.

        ``name`` is the text between ``&`` and ``;``.  Names missing from
        ``name2codepoint`` are echoed back as ``&name;`` instead of raising.
        """
        try:
            c = _to_char(name2codepoint[name])
        except KeyError:
            # Unknown entity: keep parsing rather than aborting the feed().
            c = '&%s;' % name
        print('Named ent : %s' % c)

    def handle_charref(self, name):
        """Print the character for a numeric reference.

        ``name`` is the text between ``&#`` and ``;``: decimal digits
        (``62``) or hex digits prefixed with ``x``/``X`` (``x3E``).
        References that do not map to a valid code point are echoed back
        as ``&#name;`` instead of raising.
        """
        try:
            # The parser accepts both "&#x..;" and "&#X..;", so match either.
            if name.startswith(('x', 'X')):
                c = _to_char(int(name[1:], 16))
            else:
                c = _to_char(int(name))
        except (ValueError, OverflowError):
            c = '&#%s;' % name
        print('Num ent : %s' % c)

    def handle_decl(self, data):
        """Print a declaration such as ``<!DOCTYPE ...>``."""
        print('Decl : %s' % data)


myParser = MyHTMLParser()

myParser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
              '"http://www.w3.org/TR/html4/strict.dtd">')
print('\n')

myParser.feed('<img src="python-logo.png" alt="The Python logo">')
print('\n')

myParser.feed('<style type="text/css">#python { color: green }</style>')
print('\n')

myParser.feed('<script type="text/javascript">'
              'alert("<strong>hello!</strong>");</script>')
print('\n')

myParser.feed('<!-- a comment -->'
              '<!--[if IE 9]>IE-specific content<![endif]-->')
print('\n')

# Key part of the demo: markup may be fed in arbitrary pieces.  feed() buffers
# incomplete tags until the rest arrives, while text may be reported through
# several handle_data() calls.
for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
    myParser.feed(chunk)
print('\n')

myParser.feed('<p><a class=link href=#main>tag soup</p ></a>')