Python 专题七：HTML 与 XML 解析

来源：互联网 | 发布：手游源码和服务端 | 编辑：程序博客网 | 时间：2024/06/03 22:57

使用 HTMLParser 解析 title 和 body

try:  # Python 2 module names
    from htmlentitydefs import entitydefs
    import HTMLParser
except ImportError:  # Python 3: the same APIs live under the html package
    from html.entities import entitydefs
    import html.parser as HTMLParser


class TitleParser(HTMLParser.HTMLParser):
    """Collect and print the text of ``<title>`` and ``<body>`` elements.

    While one of the tags listed in ``handledtags`` is open, all character
    data (including decoded entity and character references) is accumulated
    in ``self.data``.  When the matching end tag arrives, a line of the form
    ``tag:text`` is printed.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.taglevels = []
        self.handledtags = ['title', 'body']
        # Name of the handled tag currently being collected, or None.
        self.processing = None
        # Start empty so gettitle() is safe before any handled tag is seen.
        self.data = ''

    def handle_starttag(self, tag, attrs):
        """Begin collecting text when a handled tag opens."""
        if tag in self.handledtags:
            self.data = ''
            self.processing = tag

    def handle_data(self, data):
        """Append character data while a handled tag is open."""
        if self.processing:
            self.data += data

    def handle_endtag(self, tag):
        """Print the collected text when the tag being processed closes."""
        if tag == self.processing:
            # Use this instance, not a module-level variable, so the class
            # works no matter what the caller names its parser object.
            print(str(tag) + ':' + str(self.gettitle()))
            self.processing = None

    def handle_entityref(self, name):
        """Decode a named entity; unknown names are kept as ``&name;``."""
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Decode a numeric character reference in the range 1..255.

        Both decimal (``65``) and hexadecimal (``x41``) forms are accepted.
        Malformed or out-of-range references are ignored.
        """
        try:
            if name[:1] in ('x', 'X'):
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return
        if charnum < 1 or charnum > 255:
            return
        self.handle_data(chr(charnum))

    def gettitle(self):
        """Return the text collected for the most recent handled tag."""
        return self.data


if __name__ == '__main__':
    # Only parse the sample file when run as a script, and close it reliably.
    with open('test1.html') as fd:
        tp = TitleParser()
        tp.feed(fd.read())

XML解析

"""Parse an XML file and print its node tree.

1. Element is a node of the XML tree.
2. Text represents text, including the line breaks between Elements.
3. scanNode is a recursive function: if the current node has child nodes,
   it calls itself on each of them.
4. Node types:
    ELEMENT_NODE = 1
    ATTRIBUTE_NODE = 2
    TEXT_NODE = 3
    CDATA_SECTION_NODE = 4
    ENTITY_REFERENCE_NODE = 5
    ENTITY_NODE = 6
    PROCESSING_INSTRUCTION_NODE = 7
    COMMENT_NODE = 8
    DOCUMENT_NODE = 9
    DOCUMENT_TYPE_NODE = 10
    DOCUMENT_FRAGMENT_NODE = 11
    NOTATION_NODE = 12
"""
from xml.dom import minidom, Node


def scanNode(node, level=0):
    """Print ``node`` and, recursively, all of its descendants.

    Each node is printed on its own line as its class name, followed by
    ``,tag<tagName>`` for element nodes, indented by ``level * 4`` spaces
    plus one separating space.

    Args:
        node: a DOM node (for example the Document returned by minidom).
        level: nesting depth of ``node``; controls the indentation.
    """
    msg = node.__class__.__name__
    if node.nodeType == Node.ELEMENT_NODE:
        msg += ",tag" + node.tagName
    # Single-argument print() emits the same text on Python 2 and 3 as the
    # original two-item print statement (indent, one space, message).
    print(" " * (level * 4) + " " + msg)
    # hasChildNodes is a method; it must be called to get a real answer.
    if node.hasChildNodes():
        for child in node.childNodes:
            scanNode(child, level + 1)


if __name__ == "__main__":
    doc = minidom.parse("JCSample.xml")
    scanNode(doc)

使用 DOM 解析 XML

from xml.dom import minidom, Node
import re
import textwrap


class SampleScanner:
    """Walk a parsed ``<book>`` document and print its main fields.

    Constructing an instance performs the whole scan.  Every top-level
    ``<book>`` element of the document is handed to ``handle_book``, which
    prints the book title, the author's name and affiliation, and for each
    chapter its number, title and the company mentioned in its paragraphs.
    """

    def __init__(self, doc):
        """Scan ``doc``, which must be a ``minidom.Document``."""
        assert isinstance(doc, minidom.Document)
        for child in doc.childNodes:
            if child.nodeType == Node.ELEMENT_NODE and child.tagName == "book":
                self.handle_book(child)

    def handle_book(self, node):
        """Dispatch the direct element children of a ``<book>`` node."""
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "title":
                print("Book titile is: " + self.gettext(child.childNodes))
            elif child.tagName == "author":
                self.handle_author(child)
            elif child.tagName == "chapter":
                self.handle_chapter(child)

    def handle_chapter(self, node):
        """Print a chapter's number and title, then process its paragraphs."""
        number = node.getAttribute("number")
        print("number: " + number)
        title_node = node.getElementsByTagName("title")
        print("title: " + self.gettext(title_node))

        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "para":
                self.handle_chapter_para(child)

    def handle_chapter_para(self, node):
        """Print the text of the ``<company>`` elements inside a paragraph."""
        company = self.gettext(node.getElementsByTagName("company"))
        print("chapter:para:company " + company)

    def handle_author(self, node):
        """Print the author's name and affiliation."""
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "name":
                self.handle_author_name(child)
            elif child.tagName == "affiliation":
                print("affiliation: " + self.gettext(child.childNodes))

    def handle_author_name(self, node):
        """Print the ``<first>`` and ``<last>`` parts of an author name."""
        first = ""
        last = ""
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "first":
                first = self.gettext(child.childNodes)
            elif child.tagName == "last":
                last = self.gettext(child.childNodes)

        print("firstname:%s,lastname:%s" % (first, last))

    def gettext(self, nodelist):
        """Return the concatenated text found in ``nodelist``.

        Text and CDATA nodes contribute their own data; other nodes
        contribute the text of their descendants.  Every run of whitespace
        in the result is collapsed to a single space.
        """
        parts = []
        for node in nodelist:
            if node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
                # Use this node's own data.  wholeText also includes the
                # logically adjacent text/CDATA siblings, which would be
                # counted again when the loop reaches them.
                parts.append(node.data)
            elif node.hasChildNodes():
                parts.append(self.gettext(node.childNodes))

        return re.sub(r'\s+', " ", ''.join(parts))


if __name__ == "__main__":
    doc = minidom.parse("simple.xml")
    sample = SampleScanner(doc)

XML如下

<?xml version="1.0" ?><!--Simple xml document__chapter 8--><book><title>sample xml thing</title><author><name><first>ma</first><last>xiaoju</last></name><affiliation>Springs Widgets, Inc.</affiliation></author><chapter number="1"><title>First</title><para>I think widgets are greate.You should buy lots of them forom<company>Spirngy Widgts, Inc</company></para></chapter></book>