Python提取数据
来源:互联网 发布:8051 单片机配置 编辑:程序博客网 时间:2024/05/08 01:25
Python提取数据
python提取数据
目前实现的代码:
import re


def read_write_file(in_file_path, out_file_path, reg_expression, line_per_loop):
    """Append every line of a text file that matches a regex to another file.

    Each matching line is also echoed to standard output.

    Args:
        in_file_path: Path of the text file to scan.
        out_file_path: Path of the file that receives the matching lines. It
            is opened in append mode, so existing content is preserved.
        reg_expression: Regular-expression source. A line is kept when the
            pattern is found anywhere in it (``search``, not ``match``).
        line_per_loop: Value handed to ``readlines()`` for every batch.
            Despite the name, ``readlines`` treats it as an approximate size
            budget for the batch, not as a number of lines.

    Raises:
        re.error: If ``reg_expression`` is not a valid pattern.
        IOError/OSError: If either file cannot be opened.
    """
    # Compile first so an invalid pattern fails before any file is opened.
    pattern = re.compile(reg_expression)

    # ``with`` guarantees both handles are closed even if reading, matching
    # or writing raises part-way through.
    with open(in_file_path) as in_file, open(out_file_path, "a") as out_file:
        while True:
            # Read in bounded batches so huge logs are never loaded at once.
            lines = in_file.readlines(line_per_loop)
            if not lines:
                break
            for line in lines:
                if pattern.search(line):
                    # Single-argument print(...) behaves identically on
                    # Python 2 and Python 3.
                    print(line)
                    out_file.write(line)


if __name__ == "__main__":
    read_write_file("log.txt", "out.txt", r'INCLUDE', 100000)
访问目录的部分
import os
import sys


def dir_walk(dir, out_file, depth):
    """Record a directory tree and scan every regular file inside it.

    The path of ``dir`` followed by a newline is written to ``out_file``.
    Each entry of ``dir`` is then visited: sub-directories are walked
    recursively, and regular files are passed to ``read_write_file`` together
    with ``out_file`` and the hard-coded host-name pattern.

    Args:
        dir: Directory to walk.
        out_file: Open, writable file-like object.
        depth: Depth counter; the caller's starting value is decremented by
            one for every level of recursion.
            NOTE(review): the value is only passed along, it never stops the
            recursion. It looks like a remaining-depth budget - confirm the
            intent before enforcing a limit here.

    NOTE(review): ``read_write_file`` is not defined in this snippet. The call
    below passes an open file object as the second argument, so it presumably
    targets the variant that accepts a file object rather than a path.
    """
    out_file.write(dir + '\n')

    for name in os.listdir(dir):
        file_path = os.path.join(dir, name)
        if os.path.isdir(file_path):
            dir_walk(file_path, out_file, depth - 1)
        elif os.path.isfile(file_path):
            # Only regular files are scanned. The previous ``elif os.path:``
            # tested the module object, which is always true, so broken
            # symlinks and other special entries were opened as files too.
            read_write_file(file_path, out_file, r'warn\.mse\.360\.cn', 100000)


if __name__ == "__main__":
    # ``with`` closes path.txt even if the walk raises.
    with open('path.txt', 'w') as path_file:
        dir_walk('E:\\git', path_file, 2)
数据部分在优盘里
最后完善的代码:
import re
import os
import sys


def read_write_file(in_file_path, out_file, reg_expression, line_per_loop):
    """Copy every line of a file that matches a regex to an open output file.

    Each matching line is also echoed to standard output.

    Args:
        in_file_path: Path of the text file to scan.
        out_file: Already opened, writable file-like object that receives the
            matching lines. It is left open for the caller to close.
        reg_expression: Regular-expression source. A line is kept when the
            pattern is found anywhere in it (``search``, not ``match``).
        line_per_loop: Value handed to ``readlines()`` for every batch.
            Despite the name, ``readlines`` treats it as an approximate size
            budget for the batch, not as a number of lines.

    Raises:
        re.error: If ``reg_expression`` is not a valid pattern.
        IOError/OSError: If ``in_file_path`` cannot be opened.
    """
    # Compile first so an invalid pattern fails before the file is opened.
    pattern = re.compile(reg_expression)

    # ``with`` closes the input even if matching or writing raises.
    with open(in_file_path) as in_file:
        while True:
            # Read in bounded batches so huge files are never loaded at once.
            lines = in_file.readlines(line_per_loop)
            if not lines:
                break
            for line in lines:
                if pattern.search(line):
                    # Single-argument print(...) behaves identically on
                    # Python 2 and Python 3.
                    print(line)
                    out_file.write(line)


def dir_walk(dir, out_file, current_depth, max_depth):
    """Walk a directory tree and extract ``INCLUDE`` lines from its files.

    The current depth and every entry name are printed as progress output.
    Sub-directories are walked recursively with ``current_depth + 1``.
    Regular files are scanned with ``read_write_file`` unless they sit
    directly in the starting directory (``current_depth == 0``).

    Args:
        dir: Directory to search.
        out_file: Open, writable file-like object collecting matching lines.
        current_depth: Depth of ``dir``; the initial call passes 0.
        max_depth: Deepest level that is still listed. A directory whose
            depth exceeds this value is not read at all.
    """
    print(current_depth)
    if current_depth > max_depth:
        return

    for name in os.listdir(dir):
        print(name)
        file_path = os.path.join(dir, name)
        if os.path.isdir(file_path):
            dir_walk(file_path, out_file, current_depth + 1, max_depth)
        elif os.path.isfile(file_path) and current_depth != 0:
            # Only regular files are scanned. The previous ``elif os.path:``
            # tested the module object, which is always true, so broken
            # symlinks and other special entries were opened as files too.
            read_write_file(file_path, out_file, r'INCLUDE', 100000)


if __name__ == "__main__":
    current_dir = os.getcwd()
    # ``with`` closes the log even if the walk raises.
    with open("extract_log.txt", "a") as out_file:
        dir_walk(current_dir, out_file, 0, 3)
在第 0 层(即起始目录)中,代码只会进入子文件夹,不会处理该层的文件;在其余各层(直到 max_depth 为止),代码会扫描所有的文件和目录。
0 0
- python 提取sqlit数据
- Python提取数据
- Python:提取网页数据
- Python学习之提取8684公交数据
- python提取百万数据到csv文件
- python爬虫介绍,HTML数据提取
- Python保存/提取数据的方法
- Python使用xslt提取网页数据
- Python使用xslt提取网页数据
- Python爬虫---提取数据(2)--beautifulsoup
- 【Python爬虫2】网页数据提取
- python解析csv文件 提取数据
- 提取数据
- 使用python 提取html文件中的特定数据
- 使用python 提取html文件中的特定数据
- 使用python 提取html文件中的特定数据
- python解析html提取数据,并生成word文档
- python--用linecache模块方便提取文本数据
- 相对布局和线形布局
- 跑偏程序员自学cocos2d-x之三 场景切换
- log4j
- android textView加圆环 只需要加个背景就行了
- C++派生类构造函数调用顺序
- Python提取数据
- junit的Test突然不能用了-Test is not an annotation type
- Aspose.word在asp.net mvc中如何使用的个人总结
- softmax代价函数的导数计算
- JavaScript语法
- 操作文件File/目录Directory的工具类
- 【BZOJ3670】[Noi2014]动物园【KMP】【fail树】
- 安卓取服务器上面的数据
- PAT-B 1009. 说反话