Python 多进程分块读取文件

来源:互联网 发布:js alter 标题 编辑:程序博客网 时间:2024/06/06 04:51
# -*- coding: utf-8 -*-
# Split a file into line-aligned blocks, one block per worker process.
#
# Every worker takes the next unclaimed byte range from a shared offset
# (guarded by a lock), extends the range to the end of the line that
# straddles its upper bound, and copies the range into its own
# ``tmp_pid<N>_jobs`` file in the current directory.
#
# The code is written to run unchanged on Python 2 and Python 3.
import ctypes
import datetime
import os
from multiprocessing import Process, Queue, Array, RLock

try:
    import urlparse  # Python 2 module name
except ImportError:
    import urllib.parse as urlparse  # Python 3 location of the same API

WORKERS = 6
BLOCK_SIZE = 0
FILE_SIZE = 0
FILE_NAME = 'try.log'


def getFilesize(file):
    """Store the size of *file* in the global FILE_SIZE and return it.

    Args:
        file: Path of the file to measure.

    Returns:
        The file size in bytes (the same value written to FILE_SIZE).
    """
    global FILE_SIZE
    # Binary mode so the reported size is a byte count, matching the byte
    # offsets process_found() works with.
    with open(file, 'rb') as fs:
        fs.seek(0, os.SEEK_END)
        FILE_SIZE = fs.tell()
    return FILE_SIZE


def _claim_block(fs, array, file_size, block_size):
    """Reserve the next line-aligned block; the caller must hold the lock.

    Reads the shared start offset from ``array[0]``, computes the block's end
    (pushed forward to the next line boundary) and stores that end back into
    ``array[0]`` so the next claimant starts there.

    Returns:
        ``(begin, end)`` byte offsets, or ``None`` when the shared offset has
        already reached the end of the file.
    """
    begin = array[0]
    end = begin + block_size
    print('%d %d' % (begin, end))
    if begin >= file_size:
        print('begin %d' % begin)
        return None
    if end < file_size:
        # Finish the line that straddles the nominal boundary so that no
        # line is split between two workers.
        fs.seek(end)
        fs.readline()
        end = fs.tell()
    end = min(end, file_size)
    array[0] = end
    print('------------- %d %d' % (begin, end))
    return begin, end


def process_found(pid, array, rlock, file_name=None, file_size=None,
                  block_size=None):
    """Claim one block of the input file and copy it to ``tmp_pid<pid>_jobs``.

    Args:
        pid: Worker index; only used to name the output file.
        array: Shared sequence whose element 0 holds the next unclaimed
            byte offset.
        rlock: Lock (usable as a context manager) guarding ``array``.
        file_name: Input path; defaults to the global FILE_NAME.
        file_size: Input size in bytes; defaults to the global FILE_SIZE.
        block_size: Nominal block length in bytes; defaults to the global
            BLOCK_SIZE.

    The three optional arguments exist so the parent can hand the values to
    the child explicitly: with the ``spawn`` start method a child re-imports
    this module and would otherwise see the initial zeros in the globals.
    """
    if file_name is None:
        file_name = FILE_NAME
    if file_size is None:
        file_size = FILE_SIZE
    if block_size is None:
        block_size = BLOCK_SIZE

    # ``with`` guarantees the input handle is closed on every exit path,
    # including the early returns below.
    with open(file_name, 'rb') as fs:
        with rlock:
            try:
                block = _claim_block(fs, array, file_size, block_size)
            except (IOError, OSError) as exc:
                print('%s %s' % (exc.__class__.__name__, exc))
                return
        if block is None:
            # Same text the previous raise/except pair printed.
            print('Exception end of file')
            return

        begin, end = block
        fs.seek(begin)
        pos = begin
        with open('tmp_pid' + str(pid) + '_jobs', 'wb') as fd:
            while pos < end:
                line = fs.readline()
                if not line:
                    # File shrank underneath us; stop instead of spinning.
                    break
                fd.write(line)
                pos = fs.tell()


def main():
    """Measure FILE_NAME, then run WORKERS processes that each copy a block."""
    global FILE_SIZE, BLOCK_SIZE
    getFilesize(FILE_NAME)
    BLOCK_SIZE = FILE_SIZE // WORKERS
    print('%d %d' % (FILE_SIZE, BLOCK_SIZE))
    rlock = RLock()
    # c_longlong is 64-bit on every platform; typecode 'l' is only 32-bit on
    # Windows and would overflow for offsets past 2 GiB.
    array = Array(ctypes.c_longlong, WORKERS)
    array[0] = 0
    process = [
        Process(target=process_found,
                args=(i, array, rlock, FILE_NAME, FILE_SIZE, BLOCK_SIZE))
        for i in range(WORKERS)
    ]
    for p in process:
        p.start()
    for p in process:
        p.join()


if __name__ == '__main__':
    main()