使用 HTMLParser 解析 title 和 body
try:  # Python 2 module names, as used by the original snippet
    import HTMLParser
    from htmlentitydefs import entitydefs
except ImportError:  # Python 3 moved both modules under the ``html`` package
    import html.parser as HTMLParser
    from html.entities import entitydefs


class TitleParser(HTMLParser.HTMLParser):
    """Print the text content of every <title> and <body> element.

    While one of the tags listed in ``handledtags`` is open, all character
    data is accumulated in ``self.data``.  When that tag closes, a line of
    the form ``tag:text`` is printed.  ``gettitle()`` returns the text
    gathered for the most recently opened handled tag.
    """

    def __init__(self):
        self.taglevels = []
        self.handledtags = ['title', 'body']
        # Name of the handled tag currently being collected, or None.
        self.processing = None
        # Initialised here so gettitle() is safe before any tag was seen.
        self.data = ''
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start a fresh text buffer when a handled tag opens."""
        if tag in self.handledtags:
            self.data = ''
            self.processing = tag

    def handle_data(self, data):
        """Append character data while inside a handled tag."""
        if self.processing:
            self.data += data

    def handle_endtag(self, tag):
        """Print ``tag:text`` when the tag being collected closes."""
        if tag == self.processing:
            # Use this instance, not a module-level parser object.
            print(str(tag) + ':' + str(self.gettitle()))
            self.processing = None

    def handle_entityref(self, name):
        """Translate a named entity; unknown names are kept verbatim."""
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Translate a numeric character reference (decimal or hex).

        References that do not parse as a number, or that fall outside
        1..255, are dropped.  NOTE: recent Python 3 parsers convert
        character references themselves by default, in which case
        ``feed()`` does not call this hook.
        """
        try:
            if name[:1] in ('x', 'X'):
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return
        if charnum < 1 or charnum > 255:
            return
        self.handle_data(chr(charnum))

    def gettitle(self):
        """Return the text collected for the latest handled tag."""
        return self.data


if __name__ == '__main__':
    # Close the file deterministically once its content has been read.
    with open('test1.html') as fd:
        tp = TitleParser()
        tp.feed(fd.read())
XML解析
"""解析XML文件1.Element XML树的节点2.Text代表文本,包括Element的换行符3.scanNode为一递归函数,如果当前的节点有子节点,进行递归调用4.Node的类型 ELEMENT_NODE = 1 ATTRIBUTE_NODE = 2 TEXT_NODE = 3 CDATA_SECTION_NODE = 4 ENTITY_REFERENCE_NODE = 5 ENTITY_NODE = 6 PROCESSING_INSTRUCTION_NODE = 7 COMMENT_NODE = 8 DOCUMENT_NODE = 9 DOCUMENT_TYPE_NODE = 10 DOCUMENT_FRAGMENT_NODE = 11 NOTATION_NODE = 12"""from xml.dom import minidom,NodeNode.TEXT_NODEdef scanNode(node,level = 0): msg = node.__class__.__name__ if node.nodeType == Node.ELEMENT_NODE: msg += ",tag" + node.tagName print " " * level * 4,msg if node.hasChildNodes: for child in node.childNodes: scanNode(child,level + 1)doc = minidom.parse("JCSample.xml")scanNode(doc)
使用DOM解析XML
from xml.dom import minidom, Node
import re
import textwrap


class SampleScanner(object):
    """Walk a parsed <book> document and print selected fields.

    Constructing the scanner does all the work: every top-level <book>
    element of the document is visited, and its title, author name,
    affiliation, chapter number/title and the <company> text inside
    chapter paragraphs are printed.
    """

    def __init__(self, doc):
        """Scan ``doc``, which must be a ``minidom.Document``."""
        assert isinstance(doc, minidom.Document)
        for child in doc.childNodes:
            if child.nodeType == Node.ELEMENT_NODE and child.tagName == "book":
                self.handle_book(child)

    def handle_book(self, node):
        """Print the title and dispatch <author>/<chapter> children."""
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "title":
                print("Book titile is: %s" % self.gettext(child.childNodes))
            elif child.tagName == "author":
                self.handle_author(child)
            elif child.tagName == "chapter":
                self.handle_chapter(child)

    def handle_chapter(self, node):
        """Print a chapter's number and title, then scan its <para>s."""
        number = node.getAttribute("number")
        print("number: %s" % number)
        # getElementsByTagName returns the <title> elements themselves;
        # gettext descends into them to reach their text.
        title_node = node.getElementsByTagName("title")
        print("title: %s" % self.gettext(title_node))
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "para":
                self.handle_chapter_para(child)

    def handle_chapter_para(self, node):
        """Print the text of the <company> elements inside a paragraph."""
        company = self.gettext(node.getElementsByTagName("company"))
        print("chapter:para:company %s" % company)

    def handle_author(self, node):
        """Print the author's name and affiliation."""
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "name":
                self.handle_author_name(child)
            elif child.tagName == "affiliation":
                print("affiliation: %s" % self.gettext(child.childNodes))

    def handle_author_name(self, node):
        """Print the <first>/<last> parts of a <name> element."""
        first = ""
        last = ""
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "first":
                first = self.gettext(child.childNodes)
            elif child.tagName == "last":
                last = self.gettext(child.childNodes)
        print("firstname:%s,lastname:%s" % (first, last))

    def gettext(self, nodelist):
        """Return the text under ``nodelist`` with whitespace collapsed.

        Text and CDATA nodes contribute their own data; any other node
        that has children is searched recursively.  Each run of
        whitespace in the result is replaced by a single space.
        """
        retlist = []
        for node in nodelist:
            if node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
                # Use the node's own data: wholeText also pulls in adjacent
                # text/CDATA siblings, which would be counted twice.
                retlist.append(node.data)
            elif node.hasChildNodes():
                retlist.append(self.gettext(node.childNodes))
        return re.sub(r'\s+', " ", ''.join(retlist))


if __name__ == "__main__":
    doc = minidom.parse("simple.xml")
    sample = SampleScanner(doc)
示例 XML 文件(simple.xml)如下
<?xml version="1.0" ?><!--Simple xml document__chapter 8--><book><title>sample xml thing</title><author><name><first>ma</first><last>xiaoju</last></name><affiliation>Springs Widgets, Inc.</affiliation></author><chapter number="1"><title>First</title><para>I think widgets are greate.You should buy lots of them forom<company>Spirngy Widgts, Inc</company></para></chapter></book>