Python_操作txt、xls、csv、PDF

来源：互联网 | 发布：windows 10 蓝屏重启 | 编辑：程序博客网 | 时间：2024/05/15 09:39
读写TXT文件
读写xls文件
读写csv文件
读写PDF文件
# -*- coding: utf-8 -*-
"""Read/write examples for txt, xls, csv and PDF files.

Created by zwg on 2016-10-22.

Every example below is intentionally left commented out so that importing
or running this file has no side effects; uncomment one section at a time
to try it.  The snippets use Python 2 syntax (``print`` statement, the
``file()`` builtin, ``str.decode``) and rely on the third-party packages
named in each section (xlrd, xlwt, xlutils, pdfminer, pyPdf).
"""

################################ 1. txt file operations (file() and open() are almost identical)
# Open modes:
# file=open('zwgoracle.txt','r+')  # read + write, overwrites in place from the start; the file must already exist
# file=open('zwgoracle.txt','r')   # read only; the file must already exist
# file=open('zwgoracle.txt','w+')  # read + write, truncates; created if missing
# file=open('zwgoracle.txt','w')   # write only, truncates; created if missing
# file=open('zwgoracle.txt','a+')  # read + write, appends; created if missing
# file=open('zwgoracle.txt','a')   # write only, appends; created if missing

######## Reading
# file=open('zwgoracle.txt','r')
# Read the whole content at once; not recommended for large files
# txt_all=file.read()
# print txt_all
# Read line by line; an optional size limits how many bytes of the line are read (default: the whole line)
# txt_oneline=file.readline()
# txt_nextline=file.readline(23)
# print '读取第一行的所有内容      ：%s'%txt_oneline
# print '读取第二行的前23个字节内容：%s'%txt_nextline
# file.seek(len(txt_oneline)+1)
# print '游标向前移动一行的距离后，重新读取一行的内容：%s'%file.readline()
# Read everything into a list, one string per line
# all_lines=file.readlines()
# k=-1
# for line in all_lines:
#     k=k+1
#     if line[0:7]=='笨蛋1号':
#         print line,
#         #print line.replace('\n','')
#         break
# print all_lines[k]

######## Writing
# file=open('zwgoracle.txt','a+')
# Mode a / a+ : writes always go to the end of the file; seek has no effect on them
# Mode w      : existing content is overwritten; seek works
# Mode r+     : existing content is overwritten in place; seek works
# write() writes a single string; writelines() writes a list of strings
# file.write('笨蛋1号\t16212001\t001\t0\t22\n')
# file.write('笨蛋2号\t16212001\t001\t0\t22\n')
# file.writelines(['笨蛋3号\t16212001\t001\t0\t22\n','笨蛋4号\t16212001\t001\t0\t22\n'])
# # print file.tell()
# file.close()

################################ 2. xls file operations
# Reading
# import xlrd,xlwt,xlsxwriter
# data=xlrd.open_workbook('student_id.xls')
# # table=data.sheet_by_name('info')
# table=data.sheets()[0]
# # print table.col_values(0)[1]  # item 1 of column 0
# # print table.row_values(1)[1]  # item 1 of row 1
# print '表格行数：%s；表格列数：%s'%(table.nrows,table.ncols)
# print '获取第10行，第1列的内容：%s'%(table.cell(9,0).value.encode('gbk'))

# Creating and writing
# myxls=xlwt.Workbook()
# mysheet=myxls.add_sheet('The first sheet')
# n1=table.nrows
# n2=table.ncols
# for i in range(n1):
#     for j in range(n2):
#         mysheet.write(i,j,table.cell(i,j).value)
# myxls.save('student_id_copy.xls')

# Modifying is more awkward: bridge the reader and the writer with xlutils.copy,
# or copy the content into a new file yourself
# from xlutils.copy import copy
# import xlrd
# file=xlrd.open_workbook('student_id_copy.xls',formatting_info=True)
# mysheet=file.sheet_by_index(0)
# n1=mysheet.nrows
# n2=mysheet.ncols
# newfile=copy(file)
# newsheet=newfile.get_sheet(0)
# newsheet.write(n1,0,'笨蛋'.decode('gbk'))
# newsheet.write(n1,1,'00001')
# newfile.save('student_id_copy.xls')

################################ 3. csv file operations
# import csv
# f=file('zwg.csv','a+b')
# # a = append, w = overwrite, r = read ('+' means both read and write, 'b' means binary mode)
# writer=csv.writer(f)
# writer.writerow(list('ABCD'))
# data=[[12,3,4,5],[4,3,2,1]]
# writer.writerows(data)
# f.close()
#
# f=file('zwg.csv','rb')
# reader=csv.reader(f)
# for i in reader:
#     print i

################################ 4. PDF file operations
# pdfminer (recommended)
# PDFParser: fetches data from a file
# PDFDocument: stores the fetched data; it is linked with the PDFParser
# PDFPageInterpreter: processes page content
# PDFDevice: translates the content into the format you need
# PDFResourceManager: stores shared resources such as fonts or images
# from pdfminer.converter import PDFPageAggregator
# from pdfminer.pdfparser import PDFParser
# from pdfminer.pdfdocument import PDFDocument
# from pdfminer.pdfpage import PDFPage
# from pdfminer.pdfpage import PDFTextExtractionNotAllowed
# from pdfminer.pdfinterp import PDFResourceManager
# from pdfminer.pdfinterp import PDFPageInterpreter
# from pdfminer.layout import *
# import re
# fp = open('Python.pdf', 'rb')
# # Create a PDF parser from the file object
# parser = PDFParser(fp)
# # Create a PDF document
# doc = PDFDocument(parser)
# # Get the document outline
# # outlines = doc.get_outlines()
# # for (level,title,dest,a,se) in outlines:
# #     print level, title     # outline level and title
# # Check whether the file allows text extraction
# if not doc.is_extractable:
#     raise PDFTextExtractionNotAllowed
# # Create a PDF resource manager to store shared resources
# # caching = False disables caching
# rsrcmgr = PDFResourceManager(caching = False)
# # Layout-analysis parameters passed to the device below
# laparams = LAParams()
# # Create a PDF page aggregator device
# device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# # Create a PDF page interpreter
# interpreter = PDFPageInterpreter(rsrcmgr, device)
# # Process every page of the document
# # doc.get_pages() returns the list of pages
# #for i, page in enumerate(document.get_pages()):
# # PDFPage.create_pages(document) is another way to get the pages
# file1=file('Python.txt','w+')
# # Loop over the pages, handling one page per iteration
# k=1
# for page in PDFPage.create_pages(doc):
#     if k==4:
#         break
#     k=k+1
#     interpreter.process_page(page)
#     # Receive the LTPage object for this page
#     layout=device.get_result()
#     # layout is an LTPage object holding the objects parsed from this page,
#     # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal and so on
#     for x in layout:
#         # Only handle horizontal text boxes
#         if(isinstance(x,LTTextBoxHorizontal)):
#             file1.write(x.get_text().encode('gbk','ignore'))

# pyPdf: mainly used for generating and merging PDFs
# import pyPdf
# file=open('Python.pdf','rb')
# to_doc=pyPdf.PdfFileWriter()
# from_doc=pyPdf.PdfFileReader(file)
# S=from_doc.getPage(1)
# to_doc.addPage(S)  # add the page to the in-memory writer
# outputStream = open("Python_copy.pdf", "wb")
# to_doc.write(outputStream)  # save to the output document
# outputStream.close()
0 0