PDF-TXT-XML
来源:互联网 发布:android 电子商城源码 编辑:程序博客网 时间:2024/06/03 17:23
第一步,从PDF转成TXT
注意:去除空格、空行等
__author__ = 'wangfei'
# -*- coding: utf-8 -*-
# Step 1 of the PDF -> TXT -> XML pipeline: extract the text boxes of
# 'pdf/sln.pdf' with pdfminer and write them, one per line, to 'txt/sln.txt'.
import sys
import os

# Python 2 only: make UTF-8 the implicit str/unicode codec so the extracted
# text can be written to the byte-mode output file below.
reload(sys)
sys.setdefaultencoding('utf-8')

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *

# One entry per non-empty text box, in the order the boxes are encountered.
text_lines = []

# The input path is hard-coded for now; the plan is to iterate over the
# files of a folder later.
# The parser is built on fp, so the whole extraction stays inside the
# `with` block and the handle is closed even if parsing fails.
with open('pdf/sln.pdf', 'rb') as fp:
    # Build a PDF parser on top of the file object.
    parser = PDFParser(fp)
    # Create the PDF document.
    doc = PDFDocument(parser)
    # Refuse documents that do not allow text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # The resource manager holds resources shared between pages.
    rsrcmgr = PDFResourceManager()
    # The aggregator device collects the layout objects of each page.
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process the document page by page. `layout` is the per-page result;
    # only its LTTextBox children are kept, other layout objects are ignored.
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for x in layout:
            if isinstance(x, LTTextBox):
                # Join the box into a single line and trim the ends.
                text = x.get_text().replace('\n', '').strip()
                # Skip boxes that contained only whitespace so that no
                # blank lines end up in the output.
                if text:
                    text_lines.append(text)

# Write the collected lines to the txt directory, one text box per line.
ls = os.linesep
with open('txt/sln.txt', 'wb') as fObj:
    fObj.writelines(['%s%s' % (line, ls) for line in text_lines])
第二步,从TXT到XML
首先是XML处理类(保存为 XMLGenerator.py,后面的脚本通过 from XMLGenerator import * 导入它):
__author__ = 'wangfei'
import xml.dom.minidom as Dom


class XMLGenerator:
    """Thin wrapper around ``xml.dom.minidom`` for building and saving an XML file.

    The instance owns one DOM document (``self.doc``) and remembers the path
    (``self.xml_name``) that ``genXML`` writes it to.
    """

    def __init__(self, xml_name):
        """Create an empty document that will be saved to *xml_name*."""
        self.doc = Dom.Document()
        self.xml_name = xml_name

    def createNode(self, node_name):
        """Return a new, not yet attached element named *node_name*."""
        return self.doc.createElement(node_name)

    def addNode(self, node, pre_node=None):
        """Attach *node* under *pre_node* and return *node*.

        When *pre_node* is ``None`` the node is appended to the document
        itself, i.e. it becomes the root element.
        """
        parent = self.doc if pre_node is None else pre_node
        parent.appendChild(node)
        return node

    def setNodeAttr(self, node, att_name, value):
        """Set attribute *att_name* of *node* to *value*."""
        node.setAttribute(att_name, value)

    def setNodeValue(self, cur_node, value):
        """Append a text node holding *value* to *cur_node*."""
        cur_node.appendChild(self.doc.createTextNode(value))

    def genXML(self):
        """Write the document to ``self.xml_name`` as tab-indented UTF-8 XML."""
        # toprettyxml() with an explicit encoding returns encoded bytes, so the
        # file is opened in binary mode; `with` closes it even if the write fails.
        with open(self.xml_name, "wb") as f:
            f.write(self.doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8"))
然后根据文本中的标记信息(章节标题、图像、公式等)处理TXT文档,生成XML:
#! /usr/bin/env python
#coding:utf-8
# Step 2 of the PDF -> TXT -> XML pipeline: read 'txt/sln.txt' line by line
# and build an XML document of nested <div> elements (article, title, intro,
# sections "2.N", subsections "2.N.M" and their paragraphs). The output file
# is named after the lower-cased first line of the text file.
import sys
import linecache
import re
from XMLGenerator import *

# Python 2 only: make UTF-8 the implicit str/unicode codec.
reload(sys)
sys.setdefaultencoding('utf-8')

fileName = "txt/sln.txt"

# Headings of the form "2.N" (section) and "2.N.M" (subsection).
SECTION_RE = re.compile(r'^2\.(\d+)')
SUBSECTION_RE = re.compile(r'^2\.(\d+)\.(\d+)')
# Lines starting with "H.", two or more digits, "Fig" or "Chapter" are never
# treated as paragraphs.
SKIP_RE = re.compile(r'^(?:H\.|\d\d+|Fig|Chapter)')
# A non-heading line needs at least this many space-separated tokens to be
# kept as a paragraph.
MIN_PARAGRAPH_WORDS = 9


def _add_element(xml, tag, parent, attrs, text=None):
    """Create *tag*, set *attrs* in order, attach it under *parent*, return it.

    *attrs* is a sequence of (name, value) pairs. *parent* ``None`` attaches
    the element to the document itself. When *text* is given it is appended
    to the new element as a text node.
    """
    node = xml.createNode(tag)
    for name, value in attrs:
        xml.setNodeAttr(node, name, value)
    xml.addNode(node, parent)
    if text is not None:
        xml.setNodeValue(node, text)
    return node


try:
    fobj = open(fileName, 'r')
except IOError as e:
    print("*** file open error: %s" % e)
else:
    with fobj:
        # The first line of the text file doubles as the output file name.
        tittle = linecache.getline(fileName, 1).lower().strip('\n')
        xmlFile = XMLGenerator(tittle + ".xml")

        # Root element, then the title inside it, then the intro inside the title.
        node_article = _add_element(xmlFile, "div", None, [("id", "article")])
        node_tittle = _add_element(
            xmlFile, "div", node_article,
            [("id", "tittle"), ("class", "ltx_title ltx_title_document")])
        node_intru = _add_element(
            xmlFile, "div", node_tittle,
            [("id", "intru"), ("class", "ltx_p")])

        # Detached placeholders: paragraphs met before the first heading are
        # appended to these and therefore do not appear in the output.
        node_section = xmlFile.createNode("div")
        node_sub = xmlFile.createNode("div")

        k = 0           # running count of accepted lines (headings + paragraphs)
        sec = 0         # value of k at the most recent section heading
        sub = 0         # value of k at the most recent subsection heading
        sec_info = 0    # N of the most recent "2.N" section
        sub_info_1 = 0  # N of the most recent "2.N.M" subsection
        sub_info_2 = 0  # M of the most recent "2.N.M" subsection

        for num, eachLine in enumerate(fobj):
            text = eachLine.strip()
            if num == 0:
                xmlFile.setNodeValue(node_tittle, text)
                continue
            if num == 1:
                xmlFile.setNodeValue(node_intru, text)
                continue
            if eachLine == "\n":
                continue

            m_set = SECTION_RE.search(eachLine)
            m_sub = SUBSECTION_RE.search(eachLine)
            words = len(eachLine.split(' '))
            is_paragraph = (SKIP_RE.search(eachLine) is None
                            and words >= MIN_PARAGRAPH_WORDS)
            # Keep the line when it is a heading, or a long enough line that
            # does not start with one of the skipped prefixes.
            if m_set is None and not is_paragraph:
                continue

            k += 1
            if m_sub is not None:
                # Subsection heading "2.N.M": nested in the current section.
                sub_info_1 = int(m_sub.group(1))
                sub_info_2 = int(m_sub.group(2))
                print(sub_info_1)
                print(sub_info_2)
                sub = k
                node_sub = _add_element(
                    xmlFile, "div", node_section,
                    [("id", "s" + "2.ss" + m_sub.group(1) + ".sss" + m_sub.group(2)),
                     ("class", "ltx_subsection")])
                _add_element(
                    xmlFile, "h2", node_sub,
                    [("class", "ltx_title ltx_titleh_subsection")], text)
            elif m_set is not None:
                # Section heading "2.N": attached directly to the article.
                sec_info = int(m_set.group(1))
                sec = k
                node_section = _add_element(
                    xmlFile, "div", node_article,
                    [("id", "s2.ss" + m_set.group(1)),
                     ("class", "ltx_section")])
                _add_element(
                    xmlFile, "h2", node_section,
                    [("class", "ltx_title ltx_titleh_section")], text)
            elif sec > sub:
                # Paragraph directly under a section: the last heading seen
                # was a section, not a subsection.
                n_para = str(k - sec)
                # The class belongs on the paragraph itself, not on the
                # previous subsection node.
                _add_element(
                    xmlFile, "div", node_section,
                    [("id", "s2.ss" + str(sec_info) + ".p" + n_para),
                     ("class", "ltx_para")], text)
            else:
                # Paragraph under the current subsection; the id uses both
                # the section and the subsection number so that it is unique.
                n_para = str(k - sub)
                _add_element(
                    xmlFile, "div", node_sub,
                    [("id", "s2." + str(sub_info_1) + "." + str(sub_info_2) + ".p" + n_para),
                     ("class", "ltx_para")], text)

    # Serialise the finished document.
    xmlFile.genXML()
0 0
- PDF-TXT-XML
- lucene-使用lius解析pdf、ppt、rtf、txt、xml
- lucene 索引非txt文档 (pdf word rtf html xml)
- lucene 索引非txt文档 (pdf word rtf html xml)
- lucene 索引非txt文档 (pdf word rtf html xml)
- lucene 索引非txt文档 (pdf word rtf html xml)
- PDF转换成TXT
- PDF TO TXT
- PDF转TXT格式
- python3 pdf 转 txt
- 使用Jacob批量转换word为txt、pdf、xps、html、xml等文档
- 使用Jacob批量转换word为txt、pdf、xps、html、xml等文档
- 使用Jacob批量转换word为txt、pdf、xps、html、xml等文档
- JS导出excel、doc、png、pdf 、xml、json、sql、txt、powerpoint、csv
- 使用Jacob批量转换word为txt、pdf、xps、html、xml等文档
- pdf转化为txt (PDFBox)
- PDF转TXT文件源码
- CentOS 5.5 PDF转TXT
- Jquery之Ajax实例应用
- ListFragment简介与用法
- 面向对象设计的六大原则
- java学习笔记(十二) -- Swing先行(1)
- 乌克兰歌曲
- PDF-TXT-XML
- 自学Java系列 笔记2 高级类特性2
- AFNetWorking
- android性能优化
- ascii码表
- Delphi 10 Seattle Update1下载破解激活
- 自学Java系列 笔记2 异常处理1
- POJ 2676 Sudoku
- Java Web 中文乱码问题总结