Python:一种依靠文件来判断 key 是否重复的简单方法

来源:互联网 发布:连通区域图像分割算法 编辑:程序博客网 时间:2024/05/21 06:39

         写此功能的初衷:

 遇到过这样一种情况:向数据库写入数据时,由三个字段共同决定一条数据是否重复。本可以为此创建联合主键,但不希望因此增加内存的使用,于是希望将这三个字段连接成一个 key 写入文件,在不占用内存的情况下,借助该文件判断数据是否重复。


         

#coding=utf-8
'''
Created on 2016-3-8

@author: Administrator

File-backed duplicate-key detection.

Keys are reduced to their md5 hex digest and stored in a memory-mapped
hash table, so membership can be tested without keeping the keys in RAM.
'''
import hashlib
import mmap
import os
import shutil
import struct
import tempfile
from fileinput import filename  # unused here; kept from the original import list


class KeyData(object):
    '''
    Set of keys stored in a memory-mapped file.

    File layout (fixed-size, little-endian fields):

    * bytes 0-3: offset of the next free overflow record;
    * bytes 4 .. 4 + mod*40: ``mod`` bucket records, one per hash value;
    * the rest of the file: overflow records used when buckets collide.

    Every record is 40 bytes: a 1-byte "used" flag (``'1'``), the 32-byte
    md5 hex digest, 3 padding bytes and a 4-byte offset of the next record
    in the same bucket chain (0 terminates the chain).
    '''

    HEADER_SIZE = 4
    RECORD_SIZE = 40
    # Explicit standard sizes: the native 'l' used previously is 8 bytes on
    # 64-bit Unix, which broke the 4-byte header and the 40-byte record.
    # The layout below equals what native '1s32sl' yields where C long is
    # 4 bytes (presumably the x86 Windows machine this was written on).
    HEADER_FORMAT = '<I'
    USED_FLAG = b'1'

    def __init__(self, filename=None, data_size=None, mod=None):
        '''
        Prepare the table and, when `filename` is given, create or open it.

        filename:  path of the data file; created when it does not exist.
        data_size: size in bytes of a newly created file
                   (default 2**32-1, i.e. a 4 GiB sparse file).
        mod:       number of hash buckets (default 1000000).  It is not
                   stored in the file, so an existing file must be opened
                   with the same value it was created with.
        '''
        self.data_size = 2**32 - 1 if data_size is None else data_size
        self.mod = 1000000 if mod is None else mod
        self.m = None
        self.pattern = '<1s32s3xI'
        if filename is not None:
            if not os.path.exists(filename):
                self.create_data(filename)
            else:
                self.open_data(filename)

    def create_data(self, filename):
        '''
        Create and open a new data file.

        Returns (False, 'data has exist') when the path already exists,
        otherwise (True, 'ok').
        '''
        if os.path.exists(filename):
            return False, 'data has exist'

        self.__init_file(filename, self.data_size)
        return True, 'ok'

    def open_data(self, filename, access=mmap.ACCESS_WRITE):
        '''
        Memory-map an existing data file; a no-op when one is already open.

        Always returns (True, 'ok').
        '''
        if self.m is None:
            size = os.path.getsize(filename)
            # mmap keeps its own duplicate of the descriptor, so the file
            # object can be closed as soon as the mapping exists.
            with open(filename, 'r+b') as fd:
                self.m = mmap.mmap(fd.fileno(), size, access=access)
        return True, 'ok'

    def closedata(self):
        '''Close the mapping; safe to call when nothing is open.'''
        if self.m is not None:
            self.m.close()
            self.m = None

    def DJBHash(self, key):
        '''Return the (unbounded) DJB hash of a str or bytes `key`.'''
        value = 5381
        for ch in key:
            # Iterating bytes yields ints on Python 3 and 1-char strings
            # on Python 2; str yields characters on both.
            value = ((value << 5) + value) + (ch if isinstance(ch, int) else ord(ch))
        return value

    def __init_file(self, filename, size):
        '''
        Create a zero-filled (sparse where supported) file of `size` bytes
        and write the initial free-record pointer into its header.

        Raises ValueError when `size` cannot hold the bucket area or
        exceeds what a 4-byte offset can address.
        '''
        # Same value as the historical 44 + mod*40: overflow records start
        # one record past the end of the bucket area.
        first_free = self.HEADER_SIZE + (self.mod + 1) * self.RECORD_SIZE
        if not first_free <= size <= 2**32 - 1:
            raise ValueError('data_size must be between %d and %d bytes'
                             % (first_free, 2**32 - 1))
        with open(filename, 'wb') as f:
            f.seek(size - 1)
            f.write(b'\x00')

        # Drop any mapping that is already open; otherwise open_data() would
        # be a no-op and the header would be written into the wrong file.
        self.closedata()
        self.open_data(filename)
        self.m[:self.HEADER_SIZE] = struct.pack(self.HEADER_FORMAT, first_free)

    def get_md5(self, string):
        '''Return the md5 hex digest of `string` (text is UTF-8 encoded).'''
        if not isinstance(string, bytes):
            string = string.encode('utf-8')
        return hashlib.md5(string).hexdigest()

    def get_md5_hash(self, key):
        '''Return (md5 hex digest of `key`, bucket number in [0, mod)).'''
        md5string = self.get_md5(key)
        bucket = self.DJBHash(md5string) % self.mod
        return md5string, bucket

    def addkey(self, key):
        '''
        Store `key`.

        Returns (True, 'ok') when the key was added and
        (False, 'key has exist') when it was already present.
        Raises ValueError when the overflow area is exhausted.
        '''
        md5string, bucket = self.get_md5_hash(key)
        index = bucket * self.RECORD_SIZE + self.HEADER_SIZE
        if self.exist_link(index, md5string):
            return False, 'key has exist'
        self.addkey2(index, md5string)
        return True, 'ok'

    def get_next_index(self):
        '''Return a 1-tuple holding the offset of the next free overflow record.'''
        return struct.unpack(self.HEADER_FORMAT, self.m[0:self.HEADER_SIZE])

    def addkey2(self, index, md5string, lastindex=None):
        '''
        Write `md5string` into the record at `index`.

        If that record is free it is used directly; when `lastindex` is
        given the record is also linked in after `lastindex` and the free
        pointer is advanced past it.  If the record at `index` is occupied,
        the digest goes into the next free overflow record, linked in right
        after `index`.

        Raises ValueError when the file is full or its free pointer is
        inconsistent.
        '''
        digest = self._to_bytes(md5string)
        if self._read_record(index)[0] != self.USED_FLAG:
            if lastindex is None:
                self._write_record(index, self.USED_FLAG, digest, 0)
            else:
                self._link_after(lastindex, index, digest)
            return

        free_index = self.get_next_index()[0]
        if free_index + self.RECORD_SIZE > len(self.m):
            raise ValueError('data file is full')
        if self._read_record(free_index)[0] == self.USED_FLAG:
            raise ValueError('data file is corrupt: free pointer %d refers '
                             'to a used record' % free_index)
        self._link_after(index, free_index, digest)

    def exist(self, key):
        '''Return True when `key` has been stored.'''
        md5string, bucket = self.get_md5_hash(key)
        return self.exist_link(bucket * self.RECORD_SIZE + self.HEADER_SIZE,
                               md5string)

    def exist_link(self, index, md5string):
        '''Return True when `md5string` is found in the chain starting at `index`.'''
        digest = self._to_bytes(md5string)
        # Iterative walk: long chains cannot hit the recursion limit.
        while True:
            flag, stored, next_index = self._read_record(index)
            if flag != self.USED_FLAG:
                return False
            if stored == digest:
                return True
            if next_index == 0:
                return False
            index = next_index

    @staticmethod
    def _to_bytes(md5string):
        '''Return the hex digest as bytes, the form stored in the file.'''
        if isinstance(md5string, bytes):
            return md5string
        return md5string.encode('ascii')

    def _read_record(self, index):
        '''Return (flag, digest, next offset) of the record at `index`.'''
        end = index + self.RECORD_SIZE
        if index < self.HEADER_SIZE or end > len(self.m):
            raise ValueError('record offset %d is outside the data file' % index)
        return struct.unpack(self.pattern, self.m[index:end])

    def _write_record(self, index, flag, digest, next_index):
        '''Overwrite the record at `index`.'''
        self.m[index:index + self.RECORD_SIZE] = struct.pack(
            self.pattern, flag, digest, next_index)

    def _link_after(self, prev_index, index, digest):
        '''Store `digest` at `index`, splice it in after `prev_index` and advance the free pointer.'''
        prev_flag, prev_digest, prev_next = self._read_record(prev_index)
        # The new record inherits the predecessor's old successor; simply
        # overwriting the predecessor's pointer would orphan the rest of
        # the chain and make previously stored keys invisible.
        self._write_record(index, self.USED_FLAG, digest, prev_next)
        self._write_record(prev_index, prev_flag, prev_digest, index)
        self.m[0:self.HEADER_SIZE] = struct.pack(
            self.HEADER_FORMAT, index + self.RECORD_SIZE)


def test():
    '''
    Demo: add the same key twice; the second attempt must be rejected.

    Uses a small throw-away file so that running the demo does not leave a
    4 GiB data file in the current directory.
    '''
    workdir = tempfile.mkdtemp()
    kd = None
    try:
        kd = KeyData(os.path.join(workdir, 'test'), data_size=64 * 1024, mod=16)
        print(kd.addkey('kwsy'))
        print(kd.addkey('kwsy'))
    finally:
        # Close the mapping before deleting: Windows refuses to remove a
        # file that is still mapped.
        if kd is not None:
            kd.closedata()
        shutil.rmtree(workdir, ignore_errors=True)


if __name__ == '__main__':
    test()
               

0 0
原创粉丝点击