PDF-TXT-XML

来源：互联网 | 发布：android 电子商城源码 | 编辑：程序博客网 | 时间：2024/06/03 17:23

第一步，从PDF转成TXT

注意：去除空格、空行等

# -*- coding: utf-8 -*-
"""Step 1 of the PDF -> TXT -> XML pipeline: dump a PDF's text boxes to a file.

Reads ``pdf/sln.pdf`` with pdfminer, takes the text of every ``LTTextBox``
found on every page, joins each box onto a single line (inner newlines are
removed, surrounding whitespace is stripped) and writes the non-empty lines
to ``txt/sln.txt`` as UTF-8, one text box per line.

Raises ``PDFTextExtractionNotAllowed`` when the document reports that it is
not extractable.
"""
__author__ = 'wangfei'

import sys
import os

if sys.version_info[0] == 2:
    # Python 2 only: keep the interpreter-wide default encoding at UTF-8 as
    # the script always did.  The names below do not exist on Python 3,
    # hence the version guard.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *


def _as_utf8_bytes(text):
    """Return *text* as bytes; text that is not already bytes is UTF-8 encoded."""
    if isinstance(text, bytes):
        return text
    return text.encode('utf-8')


# Hard-coded for now; TODO: iterate over the files of a folder instead.
PDF_PATH = 'pdf/sln.pdf'
TXT_PATH = 'txt/sln.txt'

text_lines = []

# The PDF must stay open while pages are parsed; ``with`` guarantees the
# handle is released even when parsing raises.
with open(PDF_PATH, 'rb') as fp:
    # Create a PDF parser from the file object.
    parser = PDFParser(fp)
    # Create the PDF document.
    doc = PDFDocument(parser)
    # Refuse documents that do not allow text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    # The resource manager holds resources shared between pages.
    rsrcmgr = PDFResourceManager()
    # Aggregating device: collects the layout objects of each processed page.
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process the pages one at a time.  ``layout`` is the LTPage holding the
    # objects parsed from the page (typically LTTextBox, LTFigure, LTImage,
    # LTTextBoxHorizontal, ...); only text boxes carry the text we want.
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for item in layout:
            if not isinstance(item, LTTextBox):
                continue
            # Comment by dz: delete the "\n" so a box becomes one line.
            line = item.get_text().replace('\n', '').strip()
            # Skip boxes that are empty after stripping: the output must not
            # contain blank lines, because the TXT -> XML step reads the
            # first two lines as title and introduction.
            if line:
                text_lines.append(line)

# Write the result into the txt directory, one text box per line.  The file
# is opened in binary mode, so the platform line separator is written
# verbatim and the text is encoded explicitly.
ls = os.linesep
with open(TXT_PATH, 'wb') as fObj:
    fObj.writelines(_as_utf8_bytes('%s%s' % (line, ls)) for line in text_lines)

第二步，从TXT到XML

首先是XML处理类：

"""Thin convenience wrapper around xml.dom.minidom for building an XML file."""
__author__ = 'wangfei'

import xml.dom.minidom as Dom


class XMLGenerator(object):
    """Build a DOM tree node by node and serialise it to ``xml_name``.

    Typical use: create elements with ``createNode``, attach them with
    ``addNode``, decorate them with ``setNodeAttr`` / ``setNodeValue`` and
    finally call ``genXML`` to write the document to disk.
    """

    def __init__(self, xml_name):
        """Create an empty document that will be written to ``xml_name``."""
        self.doc = Dom.Document()
        self.xml_name = xml_name

    def createNode(self, node_name):
        """Return a new, not yet attached element named ``node_name``."""
        return self.doc.createElement(node_name)

    def addNode(self, node, pre_node=None):
        """Attach ``node`` under ``pre_node`` and return ``node``.

        When ``pre_node`` is None the node is appended to the document
        itself, i.e. it becomes the root element.
        """
        if pre_node is not None:
            pre_node.appendChild(node)
        else:
            self.doc.appendChild(node)
        return node

    def setNodeAttr(self, node, att_name, value):
        """Set attribute ``att_name`` of ``node`` to ``value``."""
        node.setAttribute(att_name, value)

    def setNodeValue(self, cur_node, value):
        """Append ``value`` to ``cur_node`` as a text node."""
        cur_node.appendChild(self.doc.createTextNode(value))

    def genXML(self):
        """Write the document to ``self.xml_name`` as pretty-printed UTF-8.

        Raises whatever ``open`` / ``write`` raise (IOError / OSError) when
        the target file cannot be written.
        """
        # Serialise first: if this fails, an existing file is left untouched
        # instead of being truncated.
        data = self.doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8")
        # With an explicit encoding toprettyxml returns an encoded byte
        # string, so the file has to be opened in binary mode.
        with open(self.xml_name, "wb") as f:
            f.write(data)

根据文本信息处理TXT文档，文本信息包括章节标题标记、图像、公式等

#! /usr/bin/env python
# coding:utf-8
"""Step 2 of the PDF -> TXT -> XML pipeline: turn the text dump into XML.

Reads ``txt/sln.txt`` (one text box per line).  Line 1 is the document title
(its lower-cased form also names the output file, ``<title>.xml``), line 2
is the introduction.  Every later line is classified with the regular
expressions below as a section heading ("2.N"), a subsection heading
("2.N.M"), a body paragraph, or noise that is skipped (figure captions,
chapter headers, lines starting with "H." or with two or more digits, and
lines with fewer than nine space-separated words).
"""
import io
import sys
import linecache
import re

from XMLGenerator import *

if sys.version_info[0] == 2:
    # Python 2 only: keep the interpreter-wide default encoding at UTF-8 as
    # the script always did.  The names below do not exist on Python 3,
    # hence the version guard.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

fileName = "txt/sln.txt"

# Line classifiers, compiled once instead of once per input line.
_RE_SECTION = re.compile(r'^2\.(\d+)')
_RE_SUBSECTION = re.compile(r'^2\.(\d+)\.(\d+)')
_RE_NUMBER = re.compile(r'^\d\d+')
_RE_FIGURE = re.compile(r'^Fig')
_RE_CHAPTER = re.compile(r'^Chapter')
_RE_H = re.compile(r'^H\.')
# A non-heading line needs at least this many space-separated words to be
# kept as a paragraph.
_MIN_PARAGRAPH_WORDS = 9


def _add_element(xml_file, tag, parent=None, node_id=None, css_class=None,
                 text=None):
    """Create a ``tag`` element, attach it under ``parent`` and return it.

    ``node_id`` / ``css_class`` become the ``id`` / ``class`` attributes and
    ``text`` becomes a text child; each is skipped when None.  With
    ``parent`` None the element is attached to the document itself.
    """
    node = xml_file.createNode(tag)
    if node_id is not None:
        xml_file.setNodeAttr(node, "id", node_id)
    if css_class is not None:
        xml_file.setNodeAttr(node, "class", css_class)
    xml_file.addNode(node, parent)
    if text is not None:
        xml_file.setNodeValue(node, text)
    return node


try:
    # Decode explicitly as UTF-8 rather than relying on a platform or
    # interpreter default encoding.
    fobj = io.open(fileName, 'r', encoding='utf-8')
except IOError as e:
    print("*** file open error: %s" % e)
else:
    with fobj:
        tittle = linecache.getline(fileName, 1).lower().strip('\n')
        xmlFile = XMLGenerator(tittle + ".xml")

        # Root node of the document.
        node_article = _add_element(xmlFile, "div", None, "article")
        # Title node; its text is filled in when line 1 is read below.
        node_tittle = _add_element(xmlFile, "div", node_article, "tittle",
                                   "ltx_title ltx_title_document")
        # Introduction node (child of the title node); text comes from line 2.
        node_intru = _add_element(xmlFile, "div", node_tittle, "intru",
                                  "ltx_p")

        # Detached placeholders: until the first real heading is seen,
        # anything appended to them is not part of the output document.
        node_section = xmlFile.createNode("div")
        node_sub = xmlFile.createNode("div")

        # Bookkeeping: (section, subsection) -> running index ``k`` of the
        # heading; subsection 0 stands for the section heading itself.  A
        # dict, so section numbers of any size are accepted.
        n_sec = {}
        section = []      # every kept line, in order
        k = 0             # running index of kept lines
        sec = 0           # value of k at the most recent section heading
        sub = 0           # value of k at the most recent subsection heading
        sec_info = 0      # number N of the most recent section "2.N"
        sub_info_1 = 0    # N of the most recent subsection "2.N.M"
        sub_info_2 = 0    # M of the most recent subsection "2.N.M"

        # Read the paper line by line.
        for (num, eachLine) in enumerate(fobj):
            text = eachLine.strip()
            if num == 0:
                xmlFile.setNodeValue(node_tittle, text)
                continue
            if num == 1:
                xmlFile.setNodeValue(node_intru, text)
                continue
            if eachLine == "\n":
                continue

            m_set = _RE_SECTION.search(eachLine)
            m_sub = _RE_SUBSECTION.search(eachLine)
            words = len(eachLine.split(' '))
            is_noise = (_RE_H.search(eachLine) is not None
                        or _RE_NUMBER.search(eachLine) is not None
                        or _RE_FIGURE.search(eachLine) is not None
                        or _RE_CHAPTER.search(eachLine) is not None)
            is_paragraph = (not is_noise) and words >= _MIN_PARAGRAPH_WORDS
            # Headings are always kept, whatever their length.
            if not (is_paragraph or m_set is not None):
                continue

            k = k + 1
            section.append(text)

            if m_sub is not None:
                # Subsection heading "2.N.M" (also matches the section
                # pattern, so it must be tested first).
                sub_info_1 = int(m_sub.group(1))
                sub_info_2 = int(m_sub.group(2))
                print(sub_info_1)
                print(sub_info_2)
                n_sec[(sub_info_1, sub_info_2)] = k
                sub = k
                # Add the subsection to the current section.
                node_sub = _add_element(
                    xmlFile, "div", node_section,
                    "s" + "2.ss" + m_sub.group(1) + ".sss" + m_sub.group(2),
                    "ltx_subsection")
                _add_element(xmlFile, "h2", node_sub, None,
                             "ltx_title ltx_titleh_subsection", text)
            elif m_set is not None:
                # Section heading "2.N".
                sec_info = int(m_set.group(1))
                n_sec[(sec_info, 0)] = k
                sec = k
                # Add the new section to the article.
                node_section = _add_element(
                    xmlFile, "div", node_article,
                    "s2.ss" + m_set.group(1), "ltx_section")
                _add_element(xmlFile, "h2", node_section, None,
                             "ltx_title ltx_titleh_section", text)
            elif sec > sub:
                # Paragraph directly under a section: the most recent
                # heading was a section.  The class goes on the paragraph
                # itself, not on the previous subsection node.
                _add_element(
                    xmlFile, "div", node_section,
                    "s2.ss" + str(sec_info) + ".p" + str(k - sec),
                    "ltx_para", text)
            else:
                # Paragraph under the most recent subsection; the id
                # carries both the section and the subsection number.
                _add_element(
                    xmlFile, "div", node_sub,
                    "s2." + str(sub_info_1) + "." + str(sub_info_2)
                    + ".p" + str(k - sub),
                    "ltx_para", text)

    # Write the generated document to disk.
    xmlFile.genXML()

0 0