[Python]HTML转换为TXT的脚本

来源:互联网 发布:深圳壹叁叁网络老挝 编辑:程序博客网 时间:2024/05/01 16:45
# Batch-convert the *.htm files found in INPUTDIR into plain-text files in
# OUTPUTDIR, naming each output after the page's <title>.
#
# Python 2 only: depends on the `htmllib` and `formatter` modules and on the
# `unicode` builtin.
import os
import os.path
import sys
from formatter import AbstractFormatter, NullWriter
from htmllib import HTMLParser

OUTPUTDIR = "./txt"
INPUTDIR = "."


def _(str, in_encoder="gbk", out_encoder="utf8"):
    """Re-encode the byte string `str` from `in_encoder` to `out_encoder`.

    The parameter name shadows the builtin but is kept so that existing
    keyword callers keep working.
    """
    return unicode(str, in_encoder).encode(out_encoder)


class myWriter(NullWriter):
    """Formatter writer that accumulates flowing text in memory.

    Each chunk handed over by the formatter is stored; `bodyText` exposes
    the chunks joined with newlines.
    """

    def __init__(self):
        NullWriter.__init__(self)
        self._bodyText = []

    def send_flowing_data(self, str):
        """Record one chunk of flowing text passed in by the formatter."""
        self._bodyText.append(str)

    def _get_bodyText(self):
        # Join with a real newline; the previous '/n' literal inserted the
        # two characters "/n" between chunks instead of a line break.
        return '\n'.join(self._bodyText)

    bodyText = property(_get_bodyText, None, None, 'plain text from body')


class myHTMLParser(HTMLParser):
    """HTMLParser that additionally remembers <meta> attributes."""

    def do_meta(self, attrs):
        # Stores the attribute list of the most recent <meta> tag only;
        # `metas` does not exist until a <meta> tag has been seen.
        self.metas = attrs


def convertFile(filename):
    """Parse the HTML file `filename` and return ``(title, body_text)``.

    `title` is the page title re-encoded by `_` (gbk -> utf8 by default).
    When the page has no usable <title>, the input file's base name without
    its extension is returned instead so callers still get a file stem.
    `body_text` is the flowing text collected by `myWriter`, in the file's
    original encoding.
    """
    mywriter = myWriter()
    absformatter = AbstractFormatter(mywriter)
    parser = myHTMLParser(absformatter)
    with open(filename) as infile:
        parser.feed(infile.read())
    # Force the parser to process whatever it is still buffering.
    parser.close()

    if parser.title:
        title = _(parser.title)
    else:
        # unicode(None, ...) would raise TypeError for title-less pages.
        title = os.path.splitext(os.path.basename(filename))[0]
    return (title, mywriter.bodyText)


def main():
    """Convert every ``.htm`` file in INPUTDIR, writing results to OUTPUTDIR."""
    if not os.path.exists(OUTPUTDIR):
        os.mkdir(OUTPUTDIR)

    for name in os.listdir(INPUTDIR):
        if not name.endswith('.htm'):
            continue
        # Written and flushed directly so the progress text is visible
        # before the conversion starts; "Done!" finishes the same line.
        sys.stdout.write("Coverting %s " % name)
        sys.stdout.flush()
        # listdir() yields bare names, so anchor them to INPUTDIR.
        outfilename, text = convertFile(os.path.join(INPUTDIR, name))
        outfullname = os.path.join(OUTPUTDIR, outfilename + '.txt')
        with open(outfullname, "wt") as outfile:
            outfile.write(text)
        sys.stdout.write("Done!\n")


if __name__ == "__main__":
    main()

0 0