qqzeng-ip.dat IP库读取python版

来源:互联网 发布:英国亚马逊海淘 知乎 编辑:程序博客网 时间:2024/04/30 00:22

qqzeng-ip.dat 是一种特殊格式的 dat 文件,可以快速地查找 IP 对应的地理位置信息。据原作者的测试结果,查找 100 万个 IP 大约耗时 0.5 秒。

当然,这和语言有非常大的关系,Python 的循环性能一直为人所诟病。目前 Python 版本的测试结果是:查找 10 万个 IP 耗时 3.x 秒左右,还算够用——毕竟在真实场景下,30 秒~5 分钟内的一批日志数据中,不重复的 IP 不太可能超过 10 万个。

作者提供了解析dat的java/c/php脚本,但没有提供python版本的。所以我就写了一个,以供需要用python语言读取ip的地理位置信息使用。

代码如下:

# coding: utf-8
"""Reader for the qqzeng-ip ``.dat`` IP location database.

File layout as parsed by this module (all integers are little-endian):

* 16-byte header holding four uint32 file offsets: first index record,
  last index record, first prefix record, last prefix record.
* Index records, 12 bytes each: start IP (uint32), end IP (uint32),
  location offset (3 bytes), location length (1 byte).
* Prefix records, 9 bytes each: first octet of the address (1 byte), then the
  first and last index-record numbers (uint32 each) for that octet.

The code runs unchanged on Python 2.7 and Python 3.
"""
import os
import math
import socket
import struct
import io
from io import SEEK_SET

# Default database location: next to this source file.
path = os.path.normpath(os.path.dirname(os.path.abspath(__file__)) + "/qqzeng-ip-utf8.dat")


class IpSearch(object):
    """Look up the location text stored for an IPv4 address.

    The index stays on disk; every lookup seeks into the open file, so one
    instance must not be shared between threads without external locking.
    """

    # Pre-compiled little-endian record layouts.
    _HEADER = struct.Struct("<4L")
    _INDEX_RECORD = struct.Struct("<3L")
    _PREFIX_RECORD = struct.Struct("<B2L")
    _UINT32 = struct.Struct("<L")

    fp = None
    firstStartIpOffset = None
    lastStartIpOffset = None
    preStartOffset = None
    preEndOffset = None
    ipCount = None
    prefixCount = None
    prefixList = None

    def __init__(self, dat_path=None):
        """Open the database and load its prefix table.

        :param dat_path: database file to open; defaults to the module-level
            ``path``.
        :raises IOError: if the file cannot be opened.
        :raises ValueError: if the header or prefix table is truncated.
        """
        # Per-instance dict: a class-level dict would be shared (and mixed
        # up) between instances opened on different files.
        self.prefixList = {}
        self.fp = io.open(path if dat_path is None else dat_path, "rb")
        try:
            self._loadPrefixTable()
        except Exception:
            # Do not leak the handle when the file turns out to be unusable.
            self.close()
            raise

    def _loadPrefixTable(self):
        """Parse the header and fill ``prefixList`` from the prefix records."""
        header = self.fp.read(self._HEADER.size)
        if len(header) != self._HEADER.size:
            raise ValueError("truncated qqzeng-ip header")
        (self.firstStartIpOffset, self.lastStartIpOffset,
         self.preStartOffset, self.preEndOffset) = self._HEADER.unpack(header)

        # Floor division keeps the counts integral on Python 3 as well.
        indexSize = self._INDEX_RECORD.size
        prefixSize = self._PREFIX_RECORD.size
        self.ipCount = (self.lastStartIpOffset - self.firstStartIpOffset) // indexSize + 1
        self.prefixCount = (self.preEndOffset - self.preStartOffset) // prefixSize + 1

        self.fp.seek(self.preStartOffset, SEEK_SET)
        table = self.fp.read(self.prefixCount * prefixSize)
        if len(table) != self.prefixCount * prefixSize:
            raise ValueError("truncated qqzeng-ip prefix table")
        for k in range(self.prefixCount):
            prefix, startIndex, endIndex = self._PREFIX_RECORD.unpack_from(table, k * prefixSize)
            self.prefixList[prefix] = {
                "start_index": startIndex,
                "end_index": endIndex
            }

    def close(self):
        """Close the underlying file; safe to call more than once."""
        if self.fp is not None:
            self.fp.close()
            self.fp = None

    def __del__(self):
        self.close()

    def get(self, ip):
        """Return the raw location bytes for ``ip``, or ``b""`` if not found.

        :param ip: dotted IPv4 address string; an empty value yields ``b""``.
        :raises ValueError: if the first dotted component is not an integer.
        :raises socket.error: if ``socket.inet_aton`` rejects the address.
        """
        if not ip:
            return b""
        prefix = int(ip.split(".")[0])
        ipnum = self.ip2unit(ip)
        index = self.prefixList.get(prefix)
        if index is None:
            return b""
        low = index["start_index"]
        high = index["end_index"]
        left = low if low == high else self.binarySearch(low, high, ipnum)
        _, startIp, endIp, localOffset, localLength = self.getIndex(left)
        if startIp <= ipnum <= endIp:
            return self.getLocal(localOffset, localLength)
        return b""

    def getLocal(self, localOffset, localLength):
        """Read ``localLength`` bytes of location text at ``localOffset``."""
        self.fp.seek(localOffset, SEEK_SET)
        return self.fp.read(localLength)

    def getIndex(self, left, startIp=0, endIp=0, localOffset=0, localLength=0):
        """Read index record number ``left``.

        The trailing parameters are ignored; they are kept (now optional)
        only so existing five-argument callers keep working.

        :returns: ``[left, startIp, endIp, localOffset, localLength]``.
        """
        self.fp.seek(self.firstStartIpOffset + left * self._INDEX_RECORD.size, SEEK_SET)
        buff = self.fp.read(self._INDEX_RECORD.size)
        startIp, endIp, packed = self._INDEX_RECORD.unpack(buff)
        # The last four bytes are a 3-byte offset followed by a 1-byte
        # length; read as one little-endian uint32 they are its low 24 bits
        # and high 8 bits respectively.
        localOffset = packed & 0xFFFFFF
        localLength = packed >> 24
        return [left, startIp, endIp, localOffset, localLength]

    def binarySearch(self, low, high, k):
        """Return the lowest record number in ``[low, high]`` whose end IP is >= ``k``.

        Returns 0 when no record in the range qualifies; ``get`` then rejects
        the candidate through its start/end range check.
        """
        m = 0
        while low <= high:
            mid = (low + high) // 2
            if self.getEndIpNum(mid) >= k:
                m = mid
                if mid == 0:
                    break
                high = mid - 1
            else:
                low = mid + 1
        return m

    def getEndIpNum(self, left):
        """Return the end IP (as an integer) of index record number ``left``."""
        # The end IP sits 4 bytes into the 12-byte record, after the start IP.
        self.fp.seek(self.firstStartIpOffset + left * self._INDEX_RECORD.size + 4, SEEK_SET)
        return self._UINT32.unpack(self.fp.read(self._UINT32.size))[0]

    def ip2unit(self, ip):
        """Return ``ip`` as an unsigned 32-bit integer."""
        # "!L" already unpacks as unsigned, so no sign correction is needed.
        return self.ip2long(ip)

    def ip2long(self, ip):
        """Convert a dotted IPv4 string to its integer value (network order)."""
        packedIP = socket.inet_aton(ip)
        return struct.unpack("!L", packedIP)[0]

    def bytesToLong(self, a, b, c, d):
        """Combine four bytes, least-significant first, into an unsigned integer.

        Each argument may be an int (what indexing ``bytes`` yields on
        Python 3) or a length-1 byte string (Python 2).
        """
        a, b, c, d = [x if isinstance(x, int) else ord(x) for x in (a, b, c, d)]
        return a | (b << 8) | (c << 16) | (d << 24)


if __name__ == '__main__':
    import time

    ipSearch = IpSearch()
    location = ipSearch.get("210.51.200.123").decode("utf-8")
    if str is bytes:
        # Python 2: keep the original behaviour of emitting GBK-encoded bytes.
        location = location.encode("gbk")
    print(location)

    startTime = time.time()
    for i in range(0, 100000):
        ipSearch.get("210.51.200.123")
    endTime = time.time()
    print("time waste: %s" % (endTime - startTime))


测试结果如下:


与百度查出的IP信息进行对比:


还挺不错的,对吧。

0 0
原创粉丝点击