python下用HTMLParser分析网页方法

来源:互联网 发布:mac桌面如何显示图标 编辑:程序博客网 时间:2024/06/06 01:01

http://www.cnzzad.com/outtut/35897.html


HTMLParser是python用来解析html的模块。它可以分析出html里面的标签、数据等等,是一种处理html的简便途径。HTMLParser采用的是一种事件驱动的模式,当HTMLParser找到一个特定的标记时,它会去调用一个用户定义的函数,以此来通知程序处理。它主要的用户回调函数的命名都是以handle_开头的,都是HTMLParser的成员函数。当我们使用时,就从HTMLParser派生出新的类,然后重新定义这几个以handle_开头的函数即可。这几个函数包括:

handle_startendtag  处理自闭合标签(同时是开始标签和结束标签),比如<xx/>
handle_starttag     处理开始标签,比如<xx>
handle_endtag       处理结束标签,比如</xx>
handle_charref      处理特殊字符串,就是以&#开头的,一般是内码表示的字符
handle_entityref    处理一些特殊字符,以&开头的,比如 &nbsp;
handle_data         处理数据,就是<xx>data</xx>中间的那些数据
handle_comment      处理注释
handle_decl         处理<!开头的,比如<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
handle_pi           处理形如<?instruction>的东西

    这里我以从网页中获取到url为例,介绍一下。要想获取到url,肯定是要分析<a>标签,然后取到它的href属性的值。下面是代码:

先来大致看看HTMLParser的源代码:

class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: Human-readable description of the problem.
        lineno: Line number from ``position[0]``, or None when unknown.
        offset: Zero-based column from ``position[1]``, or None when unknown.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whatever position is known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # The stored offset is zero-based; it is reported one-based.
            result = result + ", column %d" % (self.offset + 1)
        return result


class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (the text may include newlines).
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Switch the scanner to the CDATA search pattern."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Switch the scanner back to the normal search pattern."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not complete in the buffer yet.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # Give back the terminator unless it was a ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Give back the terminator unless it was a ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep whatever was not consumed for the next feed()/close().
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without a value.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                    attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching quotes, then resolve entities.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # lineno/offset are computed here as in the original listing,
            # but error() takes its position from self.getpos().
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                    - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            self.error("bad end tag: %r" % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Report an unrecognised declaration as a parse error."""
        self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    entitydefs = None

    def unescape(self, s):
        """Return *s* with character and entity references replaced.

        Numeric references (``&#65;``, ``&#x41;``) become the character
        with that code point; named references are looked up in a table
        built from ``html.entities.name2codepoint`` plus ``apos``.  Unknown
        names are left in place as ``&name;``.
        """
        if '&' not in s:
            return s

        def replaceEntities(s):
            s = s.groups()[0]
            if s[0] == "#":
                s = s[1:]
                if s[0] in ['x', 'X']:
                    c = int(s[1:], 16)
                else:
                    c = int(s)
                return chr(c)
            else:
                # Cannot use name2codepoint directly, because HTMLParser
                # supports apos, which is not part of HTML 4
                import html.entities
                if HTMLParser.entitydefs is None:
                    # Build the table once and cache it on the class.
                    entitydefs = HTMLParser.entitydefs = {'apos': "'"}
                    for k, v in html.entities.name2codepoint.items():
                        entitydefs[k] = chr(v)
                try:
                    return self.entitydefs[s]
                except KeyError:
                    return '&'+s+';'

        # re.ASCII must be passed by keyword: the fourth positional
        # parameter of re.sub() is ``count``, not ``flags``.
        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s, flags=re.ASCII)

 使用示例代码:找链接
# Example 1: print the href of every <a> tag.
from html.parser import HTMLParser


class MyParser(HTMLParser):
    """Parser that prints the ``href`` value of each ``<a>`` start tag."""

    def __init__(self):
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Print every ``href`` attribute value found on an ``<a>`` tag.

        Args:
            tag: Tag name as passed by the parser.
            attrs: Iterable of ``(name, value)`` attribute pairs.
        """
        # Only <a> tags are of interest here.
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href':
                print(value)


if __name__ == '__main__':
    a = '<html><head><title>test</title><body><a href="http://www.163.com">链接到163</a></body></html>'

    my = MyParser()
    # Feed the HTML text to be analysed.
    my.feed(a)


# Second example program: find image links
# file: GetImage.py
import urllib.request
from html.parser import HTMLParser

try:
    import tkinter
except ImportError:
    # Tk is only needed for the GUI; MyHTMLParser works without it.
    tkinter = None


class MyHTMLParser(HTMLParser):
    """HTML parser that collects gif and jpg references from ``<img>`` tags."""

    def __init__(self):
        super().__init__()
        self.gifs = []  # attribute values containing 'gif'
        self.jpgs = []  # attribute values containing 'jpg'

    def handle_starttag(self, tags, attrs):
        """Record ``<img>`` attribute values that mention gif or jpg.

        Args:
            tags: Name of the start tag being handled.
            attrs: Iterable of ``(name, value)`` attribute pairs.
        """
        if tags != 'img':
            return
        for _name, value in attrs:
            # Attributes written without a value (e.g. <img ismap>) carry
            # None, which cannot be searched for a substring.
            if value is None:
                continue
            if 'gif' in value:
                self.gifs.append(value)
            elif 'jpg' in value:
                self.jpgs.append(value)

    def get_gifs(self):
        """Return the list of collected gif references."""
        return self.gifs

    def get_jpgs(self):
        """Return the list of collected jpg references."""
        return self.jpgs


class Window:
    """Small Tk window: enter a URL, list the gif/jpg images it references."""

    def __init__(self, root):
        """Create and place the widgets inside *root*."""
        self.root = root
        self.label = tkinter.Label(root, text='输入URL:')
        self.label.place(x=5, y=15)
        self.entryUrl = tkinter.Entry(root, width=30)
        self.entryUrl.place(x=65, y=15)
        self.get = tkinter.Button(root,
                                  text='获取图片', command=self.Get)
        self.get.place(x=280, y=15)
        self.edit = tkinter.Text(root, width=470, height=600)
        self.edit.place(y=50)

    def Get(self):
        """Fetch the URL in the entry box and list its images in the text box."""
        url = self.entryUrl.get()
        # The context manager closes the response even if reading fails.
        with urllib.request.urlopen(url) as page:
            charset = page.headers.get_content_charset() or 'utf-8'
            # feed() needs text, so decode the raw bytes first.
            data = page.read().decode(charset, errors='replace')
        parser = MyHTMLParser()
        parser.feed(data)
        self.edit.insert(tkinter.END, '====GIF====\n')
        for gif in parser.get_gifs():
            self.edit.insert(tkinter.END, gif + '\n')
        self.edit.insert(tkinter.END, '===========\n')
        self.edit.insert(tkinter.END, '====JPG====\n')
        for jpg in parser.get_jpgs():
            self.edit.insert(tkinter.END, jpg + '\n')
        self.edit.insert(tkinter.END, '===========\n')


def main():
    """Build the window and run the Tk main loop."""
    if tkinter is None:
        raise SystemExit('tkinter is required to run the GUI')
    root = tkinter.Tk()
    window = Window(root)
    root.minsize(600, 480)
    root.mainloop()


if __name__ == '__main__':
    main()

# Example code 2:
http://hi.baidu.com/muinlive/blog/item/ce584ff43c569adaf2d385b8.html

近段时间想用python写一个从网页上抓取股票年报数据的工具,python 自带的Lib中htmlparser有htmllib.HTMLParser、sgmllib.SGMLParser、HTMLParser.HTMLParser,我挑了最后一个HTMLParser.HTMLParser来试试,但可惜的是网页中的table不能解析,script内容也不能过滤。其它两个也就没再试了。后来找了下,发现一个叫Beautiful Soup的东东听说很好用,但还是没有试。下面一段代码是将网页的内容创建成一个对象,这样可以更方便读取它的内容,可以按TAG的分类进行读取,可惜HTMLParser功能太弱,没做成自己想要的结果。等下再试试那可口的浓汤。
#!/usr/bin/env python3
import re
from html.parser import HTMLParser


class HtmlTag:
    """One node of the tag tree built by ``htmlsnif``.

    Attributes:
        tagname: Name of the tag.
        attrs: Dict of attribute name -> value.
        parent: The enclosing ``HtmlTag``, or None for the root.
        childs: List of child ``HtmlTag`` objects, in document order.
        data: Text stored on this tag.
    """

    def __init__(self, parent, tagname):
        self.tagname = tagname
        self.attrs = {}
        self.parent = parent
        self.childs = []
        self.data = ''

    def setattr(self, name, value):
        """Store attribute *name* with *value*."""
        self.attrs[name] = value

    def addchild(self, child):
        """Append *child* to this tag's children."""
        self.childs.append(child)

    def setdata(self, data):
        """Replace this tag's text with *data*."""
        self.data = data


class htmlsnif(HTMLParser):
    """Parser that turns a page into a tree of ``HtmlTag`` objects.

    After feeding, ``_TagTree`` is the root node, ``_TagTile`` lists every
    created tag in creation order, ``_TagCatalog`` maps a tag name to the
    list of tags with that name, and ``_TagTitle`` / ``_TagBody`` point at
    the most recent ``<title>`` / ``<body>`` start tag (or None).
    """

    def __init__(self):
        super().__init__()
        self._ActiveTag = HtmlTag(None, 'Root')
        self._TagTile = []
        self._TagTree = self._ActiveTag
        self._TagCatalog = {}
        self._ParentTag = self._ActiveTag
        self._TagTitle = None
        self._TagBody = None

    def _register(self, parent, tagname, attrs=()):
        """Create a tag, copy *attrs* onto it and index it; return the tag."""
        newtag = HtmlTag(parent, tagname)
        for k, v in attrs:
            newtag.setattr(k, v)
        self._TagTile.append(newtag)
        self._TagCatalog.setdefault(tagname, []).append(newtag)
        return newtag

    def handle_starttag(self, tag, attrs):
        """Open a new child of the active tag and make it the active tag."""
        newtag = self._register(self._ActiveTag, tag, attrs)
        self._ActiveTag.addchild(newtag)
        self._ParentTag = self._ActiveTag
        self._ActiveTag = newtag
        lowered = tag.lower()
        if lowered == 'title':
            self._TagTitle = newtag
        elif lowered == 'body':
            self._TagBody = newtag
        print(tag)

    def handle_endtag(self, tag):
        """Close the active tag by moving back up to its parent."""
        if self._ActiveTag.parent is None:
            # Stray end tag while already at the root: there is nothing to
            # close, and walking further up would leave no active tag.
            return
        self._ActiveTag = self._ActiveTag.parent
        self._ParentTag = self._ActiveTag.parent

    def handle_data(self, data):
        """Append *data* to the text of the active tag."""
        # Text may arrive in several pieces (e.g. before and after a child
        # element), so accumulate instead of overwriting.
        self._ActiveTag.setdata(self._ActiveTag.data + data)

    def handle_startendtag(self, tag, attrs):
        """Add a self-closing tag as a child without changing the active tag."""
        newtag = self._register(self._ActiveTag, tag, attrs)
        self._ActiveTag.addchild(newtag)

    def handle_comment(self, data):
        """Record a comment under the 'comment' catalog entry.

        The node's parent is the root, but it is not added to any tag's
        children.
        """
        newtag = self._register(self._TagTree, 'comment')
        newtag.setdata(data)

另外一个例子:http://crquan.blogbus.com/logs/8269701.html
标签过滤示例:
#!/usr/bin/env python3
import sys
import urllib.parse
import urllib.request
from html.parser import HTMLParser


class CustomParser(HTMLParser):
    """Print the text found at a few fixed positions in the tag hierarchy.

    Only tags listed in ``selected`` are tracked.  Text is printed, together
    with the current tag stack, when the stack joined with '/' is one of
    the paths checked in ``handle_data``.
    """

    selected = ('table', 'h1', 'font', 'ul', 'li', 'tr', 'td', 'a')

    def reset(self):
        """Reset the parser and start with an empty tag stack."""
        HTMLParser.reset(self)
        self._level_stack = []

    def handle_starttag(self, tag, attrs):
        """Push *tag* on the stack when it is one of the selected tags."""
        if tag in CustomParser.selected:
            self._level_stack.append(tag)

    def handle_endtag(self, tag):
        """Pop the stack when *tag* closes the tag currently on top."""
        if not self._level_stack:
            return
        if tag in CustomParser.selected and tag == self._level_stack[-1]:
            self._level_stack.pop()

    def handle_data(self, data):
        """Print the stack and *data* when the current path is wanted."""
        path = "/".join(self._level_stack)
        if path in (
                'table/tr/td',
                'table/tr/td/h1/font',
                'table/tr/td/ul/li'):
            print(self._level_stack, data)


def main(argv=None):
    """Fetch the lookup page and print its interesting text.

    Args:
        argv: Argument list; defaults to ``sys.argv``.  When a second
            element is present it is sent as the ``ip`` form field.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) > 1:
        # urlopen() needs the form body as bytes.
        query = urllib.parse.urlencode({'ip': argv[1], 'action': 2})
        params = query.encode('ascii')
    else:
        params = None

    with urllib.request.urlopen('http://www.ip138.com/ips8.asp',
                                params) as response:
        content = response.read().decode('GB2312')

    parser = CustomParser()
    parser.feed(content)
    parser.close()


if __name__ == '__main__':
    main()