文本处理脚本

来源：互联网 | 发布：电脑上有趣的软件 | 编辑：程序博客网 | 时间：2024/05/07 03:10
最近要帮师兄处理一些数据，一个个粘贴太麻烦了，于是写了一个文本处理的脚本，只针对特定任务可用。但先留存下来，以后如果需要再进行修改。
# -*- coding: utf-8 -*-
"""Collect x/y coordinates from a directory of text files into one result file.

Expected directory layout (as consumed by the code below):

* one "x" file whose name starts with ``bps_`` and whose stem is
  ``bps_<N>`` where ``<N>`` is the number of "y" files; every line of this
  file becomes one x value;
* ``<N>`` "y" files named ``<prefix>_<i><ext>`` for ``i`` in ``1..N`` with the
  same extension as the x file; the first whitespace-delimited token after
  the word ``total`` in each file becomes one y value (decimal commas are
  turned into dots).

Usage: ``python script.py SOURCE_DIR [TARGET_DIR]``

The syntax is deliberately limited to what both Python 2.6+ and Python 3
accept (parenthesised single-argument ``print``, ``except ... as``).
"""
import os
import re
import sys

__author__ = 'sangoly'


class DataExtractor(object):
    """Scan a source directory and write ``x=[...]`` / ``y=[...]`` lines."""

    def __init__(self, source_path='.', target_path=None):
        """Remember where to read from and where to write to.

        :param source_path: directory holding the ``bps_<N>`` file and the
            y files.
        :param target_path: directory for the result file; falls back to
            ``source_path`` when empty or ``None``.
        """
        self._source_path = source_path
        self._target_path = target_path or self._source_path

    def do_extract_action(self):
        """Discover the input files, then extract and write the result."""
        file_name_info_list = self._file_scanner()
        self._extract_data(file_name_info_list)

    def _file_scanner(self):
        """Locate the x file and validate the number of y files.

        :returns: ``[x_stem, y_prefix, y_count_as_str, extension]`` where
            ``extension`` includes the leading dot (or is empty).
        :raises SystemExit: with status 1 when no ``bps_`` file exists or
            the number of y files does not match the count in its name.
        """
        # Sorted so that the choice of x file and y prefix does not depend on
        # the arbitrary order returned by os.listdir().
        file_list = sorted(os.listdir(self._source_path))

        x_file_name = next(
            (name for name in file_list if name.startswith("bps_")), None)
        if not x_file_name:
            print("Not found x file, please check it and try again.")
            sys.exit(1)

        # splitext copes with names that contain no dot or several dots,
        # where a plain split('.') could not be unpacked into two parts.
        x_source_name, source_suffix = os.path.splitext(x_file_name)
        y_source_number = x_source_name.split('_')[1]
        try:
            expected_count = int(y_source_number)
        except ValueError:
            expected_count = None

        # Only names of the form "<prefix>_<digits><ext>" can ever be opened
        # by _extract_data, so anything else (for example a result file left
        # behind by an earlier run into the same directory) is not counted.
        y_name_pattern = re.compile(
            r'^(.*_)\d+' + re.escape(source_suffix) + r'$')
        y_prefixes = []
        for name in file_list:
            if name == x_file_name:
                continue
            matched = y_name_pattern.match(name)
            if matched:
                y_prefixes.append(matched.group(1))

        # An empty list is rejected as well: there is no y prefix to derive.
        if not y_prefixes or expected_count != len(y_prefixes):
            print("The Y files' number is not correct!")
            sys.exit(1)

        return [x_source_name, y_prefixes[0], y_source_number, source_suffix]

    def _extract_data(self, file_name_info_list, target_file_name="result"):
        """Read the x and y values and write them to the result file.

        :param file_name_info_list: the list returned by ``_file_scanner``.
        :param target_file_name: stem of the output file; the extension of
            the x file is appended to it.
        :raises ValueError: when a y file has no ``total <value>`` entry.

        An ``IOError`` while reading or writing is reported on stdout and
        swallowed.
        """
        x_source_name, y_source_name, y_source_number, source_suffix = (
            file_name_info_list[:4])
        total_pattern = re.compile(r'total\s+(\S+)')

        try:
            # Paths are joined onto the configured directories directly.
            # Prefixing os.sep would re-root relative directories such as
            # '.' at the filesystem root.
            x_path = os.path.join(self._source_path,
                                  x_source_name + source_suffix)
            with open(x_path) as x_file:
                x_coordinate_list = [line.strip() for line in x_file]

            y_coordinate_list = []
            for i in range(1, int(y_source_number) + 1):
                y_path = os.path.join(
                    self._source_path,
                    y_source_name + str(i) + source_suffix)
                with open(y_path) as y_file:
                    matched = total_pattern.search(y_file.read())
                if matched is None:
                    raise ValueError(
                        "No 'total <value>' entry found in " + y_path)
                # The data uses a decimal comma; normalise it to a dot.
                y_coordinate_list.append(matched.group(1).replace(',', '.'))

            target_path = os.path.join(self._target_path,
                                       target_file_name + source_suffix)
            with open(target_path, 'w') as target_file:
                target_file.write(
                    "x=[" + ','.join(x_coordinate_list) + "];" + '\n\n')
                target_file.write(
                    "y=[" + ','.join(y_coordinate_list) + "];")
        except IOError as e:
            print('File not exist.')
            print(e)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("The argument's number must be two at least"
              "(include the script name)")
        sys.exit(1)
    # By default the target path is the same as the source path.
    source_path = sys.argv[1]
    target_path = sys.argv[2] if len(sys.argv) > 2 else None
    data_extractor = DataExtractor(source_path, target_path)
    data_extractor.do_extract_action()
0 0