Python提取数据

来源:互联网 发布:8051 单片机配置 编辑:程序博客网 时间:2024/05/08 01:25

Python提取数据

Python提取数据
目前实现的代码:

import re


def read_write_file(in_file_path, out_file_path, reg_expression, line_per_loop):
    """Append every line of a file that matches a regex to another file.

    The input is consumed in batches through ``readlines(line_per_loop)`` so
    the whole file is never loaded at once.  Each matching line is echoed to
    stdout and appended, unmodified, to the output file.

    Args:
        in_file_path: Path of the text file to scan.
        out_file_path: Path of the file that receives the matching lines.
            It is opened in append mode, so existing content is kept.
        reg_expression: Regular expression source.  It is applied with
            ``search``, so it may match anywhere inside a line.
        line_per_loop: Size hint handed to ``readlines`` for each batch.
    """
    pattern = re.compile(reg_expression)
    # The with-statement closes both files even when reading, matching or
    # writing raises, which the manual close() calls did not guarantee.
    with open(in_file_path) as in_file, open(out_file_path, "a") as out_file:
        while True:
            lines = in_file.readlines(line_per_loop)
            if not lines:
                break
            for line in lines:
                if pattern.search(line):
                    # Parenthesised form is valid on Python 2 and Python 3.
                    print(line)
                    out_file.write(line)


if __name__ == "__main__":
    read_write_file("log.txt", "out.txt", r'INCLUDE', 100000)

访问目录的部分

import os
import sys


def dir_walk(dir, out_file, depth):
    """Walk a directory tree, logging directories and scanning regular files.

    The path of ``dir`` and of every directory below it is written to
    ``out_file``, one per line.  Every regular file found is handed to
    ``read_write_file`` together with ``out_file``.

    NOTE(review): ``read_write_file`` is not defined in this snippet; it must
    be provided elsewhere and accept an already-open file object as its
    second argument -- confirm against the version actually in use.

    NOTE(review): ``depth`` is decremented for each level but never compared
    against a limit, so the walk is not depth-bounded.

    Args:
        dir: Directory to walk.
        out_file: Open, writable file-like object.
        depth: Depth counter; 0 is meant to denote the current directory.
    """
    out_file.write(dir + '\n')
    for entry in os.listdir(dir):
        file_path = os.path.join(dir, entry)
        if os.path.isdir(file_path):
            dir_walk(file_path, out_file, depth - 1)
        elif os.path.isfile(file_path):
            # The original tested ``os.path`` itself, which is always truthy,
            # so dangling links and special files were treated as files too.
            read_write_file(file_path, out_file, r'warn\.mse\.360\.cn', 100000)


if __name__ == "__main__":
    # The with-statement makes sure path.txt is flushed and closed.
    with open('path.txt', 'w') as path_file:
        dir_walk('E:\\git', path_file, 2)

数据部分在优盘里

最后完善的代码:

import re
import os
import sys


def read_write_file(in_file_path, out_file, reg_expression, line_per_loop):
    """Copy every line of a file that matches a regex to an open output file.

    The input is consumed in batches through ``readlines(line_per_loop)``.
    Each matching line is echoed to stdout and written, unmodified, to
    ``out_file``.  ``out_file`` is left open for the caller.

    Args:
        in_file_path: Path of the text file to scan.
        out_file: Open, writable file-like object receiving the matches.
        reg_expression: Regular expression source.  It is applied with
            ``search``, so it may match anywhere inside a line.
        line_per_loop: Size hint handed to ``readlines`` for each batch.
    """
    pattern = re.compile(reg_expression)
    # The with-statement closes the input even if matching or writing raises.
    with open(in_file_path) as in_file:
        while True:
            lines = in_file.readlines(line_per_loop)
            if not lines:
                break
            for line in lines:
                if pattern.search(line):
                    # Parenthesised form is valid on Python 2 and Python 3.
                    print(line)
                    out_file.write(line)


def dir_walk(dir, out_file, current_depth, max_depth):
    """Recursively scan the files below ``dir`` for lines containing INCLUDE.

    Files that sit directly in the starting directory (depth 0) are skipped;
    only its sub-directories are entered.  From depth 1 on, every regular
    file is passed to ``read_write_file``.  Recursion stops as soon as
    ``current_depth`` exceeds ``max_depth``.  The current depth and each
    directory entry are printed as progress output.

    Args:
        dir: Directory to scan.
        out_file: Open, writable file-like object receiving the matches.
        current_depth: Depth of ``dir``; the starting directory is 0.
        max_depth: Deepest level whose entries are still examined.
    """
    print(current_depth)
    if current_depth > max_depth:
        return
    for entry in os.listdir(dir):
        print(entry)
        file_path = os.path.join(dir, entry)
        if os.path.isdir(file_path):
            dir_walk(file_path, out_file, current_depth + 1, max_depth)
        elif current_depth != 0 and os.path.isfile(file_path):
            # The original tested ``os.path`` itself, which is always truthy,
            # so dangling links and special files were opened as well.
            read_write_file(file_path, out_file, r'INCLUDE', 100000)


if __name__ == "__main__":
    current_dir = os.getcwd()
    # extract_log.txt lives at depth 0, where files are skipped, so the scan
    # never reads its own output file.
    with open("extract_log.txt", "a") as out_file:
        dir_walk(current_dir, out_file, 0, 3)

在第0层(起始目录)中,代码只会进入子目录,不会处理该层的文件;在其余各层中,代码会处理所有文件并继续递归子目录,直到当前深度超过最大深度为止。

0 0