python查找中文字符

来源:互联网 发布:对网络推广的认识 编辑:程序博客网 时间:2024/06/07 02:59

#filename Seek.py
import unicodedata

import sys
import os
class Seek():
    """Find CJK (Chinese) characters in text files and write them to a report.

    The class only searches; it never modifies the files it scans.

    Command-line options (read from ``sys.argv``):
        -d        : directory to scan, absolute or relative
                    (default: '.', i.e. the current working directory)
        -t        : file-name suffix filter; several suffixes may be joined
                    with ';', e.g. ``.jsp;.txt`` (default: 'ALL', every file)
        -sf       : descend into sub-directories, Y or N, case-insensitive
                    (default: N)
        -r        : report file path (default: 'ChineseCharacter.txt',
                    resolved against the current working directory)
        -encoding : encoding used for the scanned files and for the report
                    (default: utf-8)

    Report format: for every line containing CJK characters, the characters
    are written in order (runs that are not adjacent in the source line are
    separated by a tab), followed by eight tabs and ``line:<number>``.
    After each scanned file a ``------------<path>`` footer is written.
    """

    def __init__(self):
        """Set the defaults, then override them from ``sys.argv``."""
        self.d = '.'
        self.sf = 'N'
        self.t = 'ALL'
        self.r = 'ChineseCharacter.txt'
        self.encoding = 'utf-8'
        # Report file handle; opened by seeking().
        self.rfile = None

        # Every recognised flag takes one value and sets one attribute.
        valueFlags = {
            '-d': 'd',
            '-sf': 'sf',
            '-r': 'r',
            '-t': 't',
            '-encoding': 'encoding',
        }
        args = sys.argv[1:]
        pos = 0
        while pos < len(args):
            flag = args[pos]
            if flag not in valueFlags:
                pos += 1
                continue
            if pos + 1 >= len(args):
                # A trailing flag without a value keeps its default.
                print('missing value for %s parameter' % flag)
                break
            value = args[pos + 1]
            # Consume flag and value together so later flags stay aligned.
            pos += 2
            if flag == '-sf':
                value = value.upper()
                if value not in ('Y', 'N'):
                    print('input error with sf parameter')
                    continue
            setattr(self, valueFlags[flag], value)

    def seeking(self):
        """Run the scan and write the report to ``self.r``.

        Errors are reported on stdout instead of being raised, and the
        report file is always closed.
        """
        try:
            self.rfile = open(self.r, 'w', encoding=self.encoding)
            self.__scanDir(self.d)
        except Exception as error:
            # Top-level boundary of the script: report and stop.
            print('seek error %s' % error)
        finally:
            self.__close()

    def __scanDir(self, directory):
        """Scan the matching files of ``directory``.

        Sub-directories are scanned recursively when ``self.sf`` is 'Y'.
        Entries are visited in sorted order so the report is deterministic.
        """
        suffixes = None
        if self.t != 'ALL':
            suffixes = tuple(s.strip() for s in self.t.split(';') if s.strip())
        reportPath = os.path.abspath(self.r)
        for name in sorted(os.listdir(directory)):
            path = os.path.join(directory, name)
            if self.__isFile(path):
                if os.path.abspath(path) == reportPath:
                    # Never scan the report that is currently being written.
                    continue
                if suffixes is not None and not name.endswith(suffixes):
                    continue
                try:
                    self.__seek(path)
                except (OSError, UnicodeError) as error:
                    # One unreadable or undecodable file must not stop the scan.
                    print('seek error %s in %s' % (error, path))
            elif self.__isDir(path) and self.sf == 'Y':
                self.__scanDir(path)

    def __close(self):
        """Close the report file if it was opened."""
        rfile = getattr(self, 'rfile', None)
        if rfile is not None:
            rfile.close()

    def __isFile(self, file):
        """Return True if ``file`` is an existing regular file."""
        return os.path.isfile(file)

    def __isDir(self, path):
        """Return True if ``path`` is an existing directory."""
        return os.path.isdir(path)

    def __openFile(self, file):
        """Unimplemented placeholder; does nothing."""
        pass

    def __closeFile(self, file):
        """Close the given file object."""
        file.close()

    def __stripLeadingComments(self, text, inBlock):
        """Remove comments from the start of ``text``.

        Only comments that begin the line (after optional whitespace) are
        recognised: ``//`` line comments and ``/* ... */`` block comments.
        ``inBlock`` tells whether the line starts inside a block comment.

        Returns a ``(remaining_text, in_block)`` tuple, where ``in_block``
        tells whether a block comment is still open at the end of the line.
        """
        while True:
            if inBlock:
                end = text.find('*/')
                if end == -1:
                    return '', True
                text = text[end + 2:]
                inBlock = False
            stripped = text.lstrip()
            if stripped.startswith('//'):
                return '', False
            if stripped.startswith('/*'):
                # Drop the opener, then look for the closer on the same line.
                text = stripped[2:]
                inBlock = True
                continue
            return text, False

    def __seek(self, file):
        """Write the CJK characters of ``file`` to the report.

        Raises OSError or UnicodeError if the file cannot be read or decoded
        with ``self.encoding``; nothing is written for the file in that case.
        """
        with open(file, 'r', encoding=self.encoding) as fileObj:
            lineList = fileObj.readlines()

        inBlock = False
        for number, line in enumerate(lineList, 1):
            text, inBlock = self.__stripLeadingComments(line, inBlock)
            previous = None
            for pos, char in enumerate(text):
                # The '' default covers characters without a Unicode name
                # (e.g. newline), for which name() would otherwise raise.
                if not unicodedata.name(char, '').startswith('CJK'):
                    continue
                if previous is not None and pos - previous > 1:
                    # Gap since the last match: start a new run.
                    self.__writeFile('\t')
                self.__writeFile(char)
                previous = pos
            if previous is not None:
                self.__writeFile('\t' * 8)
                self.__writeFile('line:%d\n' % number)

        self.__writeFile('\n------------' + file + '\n')

    def __writeFile(self, content):
        """Append ``content`` to the report file."""
        self.rfile.write(content)
       
       
if __name__ == '__main__':
    # Script entry point: read the options from the command line, then scan.
    scanner = Seek()
    scanner.seeking()