BT种子爬虫程序和种子解析(Python语言编写)

来源:互联网 发布:2016年淘宝卖什么好 编辑:程序博客网 时间:2024/04/29 09:45

最近亲眼看到快播被大批警察包围,感觉到快播注定要关闭很多东西,很多宅男宅女们又要寂寞了,于是乎,我开始疯狂地研究DHT网络技术。

看到网上也有开源的代码,这不,我拿来进行了二次重写,呵呵,上代码:

# encoding: utf-8
"""Mainline DHT crawler that records infohashes seen in incoming queries.

The node bootstraps through the public routers in BOOTSTRAP_NODES, answers
incoming KRPC queries (ping / find_node / get_peers / announce_peer) and
passes every infohash found in ``get_peers`` and ``announce_peer`` queries to
a ``Master`` object, which stores it in MySQL and in a text log file.

Written for Python 2 (byte strings, ``str.encode('hex')``). Requires the
third-party ``MySQLdb`` and ``bencode`` packages.
"""
import socket
from hashlib import sha1
from random import randint
from struct import unpack, pack
from socket import inet_aton, inet_ntoa
from bisect import bisect_left
from threading import Timer
from time import sleep
import MySQLdb
# NOTE: keep this order. ``from datetime import *`` also binds the name
# ``time`` (the datetime.time class); the ``import time`` that follows rebinds
# it to the time module, which is what the code below relies on.
from datetime import *
import time
from bencode import bencode, bdecode

BOOTSTRAP_NODES = [
    ("router.bittorrent.com", 6881),
    ("dht.transmissionbt.com", 6881),
    ("router.utorrent.com", 6881),
]
TID_LENGTH = 4          # length of the KRPC transaction id, in bytes
KRPC_TIMEOUT = 10       # seconds between "is the table still empty?" checks
REBORN_TIME = 5 * 60    # seconds between node-id / routing-table resets
K = 8                   # maximum number of nodes per bucket


def entropy(bytes):
    """Return a string made of ``bytes`` random characters in chr(0)..chr(255)."""
    return "".join(chr(randint(0, 255)) for _ in range(bytes))


def random_id():
    """Return a random 20-byte id (SHA-1 digest of 20 random bytes)."""
    digest = sha1()
    digest.update(entropy(20))
    return digest.digest()


def decode_nodes(nodes):
    """Decode compact node info into a list of ``(nid, ip, port)`` tuples.

    Each entry is 26 bytes: 20-byte node id, 4-byte IPv4 address and a 2-byte
    big-endian port. An input whose length is not a multiple of 26 yields an
    empty list.
    """
    decoded = []
    length = len(nodes)
    if length % 26 != 0:
        return decoded
    for i in range(0, length, 26):
        nid = nodes[i:i + 20]
        ip = inet_ntoa(nodes[i + 20:i + 24])
        port = unpack("!H", nodes[i + 24:i + 26])[0]
        decoded.append((nid, ip, port))
    return decoded


def encode_nodes(nodes):
    """Encode objects with ``nid``/``ip``/``port`` attributes as compact node info."""
    return "".join(
        node.nid + inet_aton(node.ip) + pack("!H", node.port)
        for node in nodes
    )


def intify(hstr):
    """Interpret a byte string as a big-endian unsigned integer."""
    return int(hstr.encode('hex'), 16)


def timer(t, f):
    """Call ``f`` once after ``t`` seconds on a background thread.

    The thread is a daemon so that pending timers cannot keep the process
    alive after the main loop has been interrupted.
    """
    pending = Timer(t, f)
    pending.daemon = True
    pending.start()


def _print_mysql_error(e):
    """Print a MySQLdb error without assuming it carries ``(code, message)``."""
    if len(e.args) >= 2:
        print("Mysql Error %s: %s" % (e.args[0], e.args[1]))
    else:
        print("Mysql Error: %s" % (e,))


class BucketFull(Exception):
    """Raised by KBucket.append when the bucket already holds K nodes."""
    pass


class KRPC(object):
    """UDP transport plus dispatch of incoming KRPC messages.

    Subclasses must set ``self.port`` before calling ``__init__`` and must
    provide ``find_node_handler`` and the four ``*_received`` query handlers
    referenced in ``self.actions``.
    """

    def __init__(self):
        # Message type ("y" field) -> handler.
        self.types = {
            "r": self.response_received,
            "q": self.query_received,
        }
        # Query name ("q" field) -> handler.
        self.actions = {
            "ping": self.ping_received,
            "find_node": self.find_node_received,
            "get_peers": self.get_peers_received,
            "announce_peer": self.announce_peer_received,
        }
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        self.socket.bind(("0.0.0.0", self.port))

    def response_received(self, msg, address):
        """Handle a response; every response is treated as a find_node reply."""
        self.find_node_handler(msg)

    def query_received(self, msg, address):
        """Dispatch a query to its handler; unknown or malformed queries are ignored."""
        try:
            self.actions[msg["q"]](msg, address)
        except KeyError:
            pass

    def send_krpc(self, msg, address):
        """Bencode ``msg`` and send it to ``address``, best effort.

        Sending is deliberately fire-and-forget: encoding or socket errors
        are dropped. ``Exception`` (not a bare ``except``) is caught so that
        KeyboardInterrupt still reaches the main loop.
        """
        try:
            self.socket.sendto(bencode(msg), address)
        except Exception:
            pass


class Client(KRPC):
    """Active side of the crawler: sends find_node queries and runs the receive loop."""

    def __init__(self, table):
        self.table = table
        # Create the socket first so the timers never fire on a half-built object.
        KRPC.__init__(self)
        timer(KRPC_TIMEOUT, self.timeout)
        timer(REBORN_TIME, self.reborn)

    def find_node(self, address, nid=None):
        """Send a find_node query for a random target to ``address``.

        When the remote node id ``nid`` is given, the query is sent under an
        id that shares its first 10 bytes (see ``get_neighbor``); otherwise
        the table's own id is used.
        """
        nid = self.get_neighbor(nid) if nid else self.table.nid
        tid = entropy(TID_LENGTH)
        msg = {
            "t": tid,
            "y": "q",
            "q": "find_node",
            "a": {"id": nid, "target": random_id()},
        }
        self.send_krpc(msg, address)

    def find_node_handler(self, msg):
        """Send a find_node query to every node listed in a response."""
        try:
            nodes = decode_nodes(msg["r"]["nodes"])
        except KeyError:
            return
        for nid, ip, port in nodes:
            if len(nid) != 20:
                continue
            if nid == self.table.nid:
                continue
            self.find_node((ip, port), nid)

    def joinDHT(self):
        """Send a find_node query to each bootstrap router."""
        for address in BOOTSTRAP_NODES:
            self.find_node(address)

    def timeout(self):
        """Re-bootstrap while the table has fewer than two buckets; reschedules itself."""
        if len(self.table.buckets) < 2:
            self.joinDHT()
        timer(KRPC_TIMEOUT, self.timeout)

    def reborn(self):
        """Pick a new node id and reset the routing table; reschedules itself."""
        self.table.nid = random_id()
        self.table.buckets = [KBucket(0, 2 ** 160)]
        timer(REBORN_TIME, self.reborn)

    def start(self):
        """Bootstrap, then receive and dispatch datagrams forever."""
        self.joinDHT()
        while True:
            try:
                (data, address) = self.socket.recvfrom(65536)
                msg = bdecode(data)
                self.types[msg["y"]](msg, address)
            except Exception:
                # Packets come from arbitrary peers; anything malformed is
                # dropped so one bad datagram cannot stop the crawler.
                pass

    def get_neighbor(self, target):
        """Return an id made of the first 10 bytes of ``target`` plus 10 random bytes."""
        return target[:10] + random_id()[10:]


class Server(Client):
    """Passive side of the crawler: answers queries and reports infohashes.

    ``master`` must provide ``log(infohash)``; ``port`` is the UDP port to bind.
    """

    def __init__(self, master, table, port):
        self.table = table
        self.master = master
        self.port = port  # read by KRPC.__init__ when binding the socket
        Client.__init__(self, table)

    def ping_received(self, msg, address):
        """Answer a ping, then query the sender with find_node."""
        try:
            nid = msg["a"]["id"]
            reply = {
                "t": msg["t"],
                "y": "r",
                "r": {"id": self.get_neighbor(nid)},
            }
            self.send_krpc(reply, address)
            self.find_node(address, nid)
        except KeyError:
            pass

    def find_node_received(self, msg, address):
        """Answer find_node with known neighbours and remember the sender."""
        try:
            target = msg["a"]["target"]
            neighbors = self.table.get_neighbors(target)
            nid = msg["a"]["id"]
            reply = {
                "t": msg["t"],
                "y": "r",
                "r": {
                    "id": self.get_neighbor(target),
                    "nodes": encode_nodes(neighbors),
                },
            }
            self.table.append(KNode(nid, *address))
            self.send_krpc(reply, address)
            self.find_node(address, nid)
        except KeyError:
            pass

    def get_peers_received(self, msg, address):
        """Answer get_peers with neighbours and log the requested infohash."""
        try:
            infohash = msg["a"]["info_hash"]
            neighbors = self.table.get_neighbors(infohash)
            nid = msg["a"]["id"]
            reply = {
                "t": msg["t"],
                "y": "r",
                "r": {
                    "id": self.get_neighbor(infohash),
                    "nodes": encode_nodes(neighbors),
                },
            }
            self.table.append(KNode(nid, *address))
            self.send_krpc(reply, address)
            self.master.log(infohash)
            self.find_node(address, nid)
        except KeyError:
            pass

    def announce_peer_received(self, msg, address):
        """Acknowledge announce_peer and log the announced infohash."""
        try:
            infohash = msg["a"]["info_hash"]
            nid = msg["a"]["id"]
            reply = {
                "t": msg["t"],
                "y": "r",
                "r": {"id": self.get_neighbor(infohash)},
            }
            self.table.append(KNode(nid, *address))
            self.send_krpc(reply, address)
            self.master.log(infohash)
            self.find_node(address, nid)
        except KeyError:
            pass


class KTable(object):
    """Routing table: a sorted list of KBuckets covering [0, 2**160)."""

    def __init__(self, nid):
        self.nid = nid
        self.buckets = [KBucket(0, 2 ** 160)]

    def append(self, node):
        """Insert ``node``; a full bucket is split only if it contains our own id."""
        index = self.bucket_index(node.nid)
        try:
            bucket = self.buckets[index]
            bucket.append(node)
        except IndexError:
            return
        except BucketFull:
            if not bucket.in_range(self.nid):
                return
            self.split_bucket(index)
            self.append(node)

    def get_neighbors(self, target):
        """Return up to K known nodes closest (by XOR distance) to ``target``.

        Returns an empty list when ``target`` is not 20 bytes long.
        """
        nodes = []
        if len(self.buckets) == 0:
            return nodes
        if len(target) != 20:
            return nodes

        index = self.bucket_index(target)
        try:
            # Work on a copy: extending or sorting the bucket's own list
            # would push foreign nodes into it and break the K-node limit.
            nodes = list(self.buckets[index].nodes)
            lower = index - 1
            upper = index + 1
            # Widen the search to neighbouring buckets until K nodes are found.
            while len(nodes) < K and (lower >= 0 or upper < len(self.buckets)):
                if lower >= 0:
                    nodes.extend(self.buckets[lower].nodes)
                if upper < len(self.buckets):
                    nodes.extend(self.buckets[upper].nodes)
                lower -= 1
                upper += 1

            num = intify(target)
            nodes.sort(key=lambda node: num ^ intify(node.nid))
            return nodes[:K]
        except IndexError:
            return nodes

    def bucket_index(self, target):
        """Return the index of the bucket whose range contains ``target``.

        Relies on KBucket.__lt__, which compares a bucket's upper bound
        against an integer.
        """
        return bisect_left(self.buckets, intify(target))

    def split_bucket(self, index):
        """Split bucket ``index`` at its midpoint and redistribute its nodes."""
        old = self.buckets[index]
        # Floor division keeps the midpoint an exact integer.
        point = old.max - (old.max - old.min) // 2
        new = KBucket(point, old.max)
        old.max = point
        self.buckets.insert(index + 1, new)
        for node in old.nodes[:]:  # iterate over a copy; ``old`` is mutated below
            if new.in_range(node.nid):
                new.append(node)
                old.remove(node)

    def __iter__(self):
        for bucket in self.buckets:
            yield bucket


class KBucket(object):
    """Up to K nodes whose ids fall in the half-open range [min, max)."""

    __slots__ = ("min", "max", "nodes")

    def __init__(self, min, max):
        self.min = min
        self.max = max
        self.nodes = []

    def append(self, node):
        """Add ``node``, moving it to the end if already present.

        Raises BucketFull when the node is new and the bucket holds K nodes.
        """
        if node in self:
            self.remove(node)
            self.nodes.append(node)
        elif len(self) < K:
            self.nodes.append(node)
        else:
            raise BucketFull

    def remove(self, node):
        self.nodes.remove(node)

    def in_range(self, target):
        """Return True if the byte-string id ``target`` lies in [min, max)."""
        return self.min <= intify(target) < self.max

    def __len__(self):
        return len(self.nodes)

    def __contains__(self, node):
        return node in self.nodes

    def __iter__(self):
        for node in self.nodes:
            yield node

    def __lt__(self, target):
        # ``target`` is an integer id; used by bisect_left in KTable.bucket_index.
        return self.max <= target


class KNode(object):
    """A remote DHT node; two nodes are equal when their ids are equal."""

    __slots__ = ("nid", "ip", "port")

    def __init__(self, nid, ip, port):
        self.nid = nid
        self.ip = ip
        self.port = port

    def __eq__(self, other):
        return self.nid == other.nid

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Keep hashing consistent with __eq__.
        return hash(self.nid)


# using example
class Master(object):
    """Sink for discovered infohashes: a MySQL table plus a text log file.

    ``f`` is an open, writable file object. If the database cannot be
    reached, the error is printed and only the file is written.
    """

    def __init__(self, f):
        self.f = f
        self.conn = None
        self.cur = None
        try:
            self.conn = MySQLdb.connect(host='localhost', user='root',
                                        passwd='', db='bt', port=3306)
            self.cur = self.conn.cursor()
        except MySQLdb.Error as e:
            _print_mysql_error(e)

    def log(self, infohash):
        """Record one infohash in the database (if connected) and in the log file.

        The connection is kept open across calls; closing it here would make
        every call after the first one fail.
        """
        if self.cur is not None:
            try:
                sql = "insert into bt_main_new(hash,name,length,date) values(%s,%s,%s,%s)"
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                self.cur.execute(sql, (infohash, '', '', now))
                self.conn.commit()
            except MySQLdb.Error as e:
                _print_mysql_error(e)
        self.f.write(infohash.encode("hex") + "\n")
        self.f.flush()

    def close(self):
        """Close the cursor and the connection, if they were opened."""
        for resource in (self.cur, self.conn):
            if resource is None:
                continue
            try:
                resource.close()
            except MySQLdb.Error as e:
                _print_mysql_error(e)
        self.cur = None
        self.conn = None


def main():
    """Run the crawler on UDP port 8006 until interrupted with Ctrl-C."""
    f = open("%s.log" % date.today(), "a")
    m = Master(f)
    s = None
    try:
        s = Server(m, KTable(random_id()), 8006)
        s.start()
    except KeyboardInterrupt:
        pass
    finally:
        if s is not None:
            s.socket.close()
        m.close()
        f.close()


if __name__ == "__main__":
    main()

本爬虫程序会自动爬取网络上分享的bt种子,并写入文件和数据库。爬取到的只是种子的hash码,还需要到网络上下载种子进行分析。


下载种子,相信大家都知道国外有几个免费分享种子的网站,大家可以根据hash码去下载,分析,下面呈上我写的一个分析种子的程序:

#! /usr/bin/python
# -*- coding: utf-8 -*-
"""Fill in name and length for rows of ``bt_main`` that have an empty name.

For every such row the torrent file is fetched by hash from a remote service,
bdecoded, and its name (base64-encoded) and length are written back to the
row. Rows for which the service answers ``error!`` are flagged ``isTrue = 0``.

Written for Python 2 (``urllib2``). Requires the third-party ``MySQLdb`` and
``bencode`` packages.
"""
import MySQLdb
# NOTE: keep this order. ``from datetime import *`` also binds the name
# ``time``; the ``import time`` that follows rebinds it to the time module.
from datetime import *
import time
import re
from time import sleep
import bencode
import urllib2
import base64

FETCH_TIMEOUT = 30  # seconds; without a timeout one stalled download blocks the run


def _print_mysql_error(e):
    """Print a MySQLdb error without assuming it carries ``(code, message)``."""
    if len(e.args) >= 2:
        print("Mysql Error %s: %s" % (e.args[0], e.args[1]))
    else:
        print("Mysql Error: %s" % (e,))


def _run_update(conn, cur, sql, params):
    """Execute one UPDATE and commit; print (do not raise) MySQL errors."""
    try:
        cur.execute(sql, params)
        conn.commit()
    except MySQLdb.Error as e:
        _print_mysql_error(e)


def main():
    """Resolve every pending row once; failures on one row do not stop the others."""
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                               db='bt', port=3306)
    except MySQLdb.Error as e:
        _print_mysql_error(e)
        return

    try:
        cur = conn.cursor()
        cur.execute("select * from bt_main where name = '' order by id desc")
        rows = cur.fetchall()

        for row in rows:
            # NOTE(review): assumes columns are (id, hash, name, ...) - confirm
            # against the table schema.
            if row[2].strip() != '':
                continue

            row_id = row[0]
            info_hash = row[1]
            # NOTE(review): the hash is inserted into the URL as stored; if the
            # column holds raw 20-byte digests rather than hex text it must be
            # hex-encoded first - confirm against the data.
            url = "http://haofuli.duapp.com/go/info.php?hash=%s" % info_hash

            try:
                payload = urllib2.urlopen(url, timeout=FETCH_TIMEOUT).read()
            except (urllib2.URLError, IOError) as e:
                # A network failure affects this row only; it stays pending.
                print("Download failed for %s: %s" % (url, e))
                continue

            if "error!" == payload:
                _run_update(conn, cur,
                            "update bt_main set isTrue = 0 where id = %s ",
                            (row_id,))
                continue

            # Decode. On failure skip the row: continuing would either crash
            # on an undefined variable or write the previous torrent's
            # metadata to this row.
            try:
                torrent = bencode.bdecode(payload)
            except Exception as e:
                print("Cannot decode torrent for id %s: %s" % (row_id, e))
                continue

            info = torrent.get('info') if isinstance(torrent, dict) else None
            if not isinstance(info, dict):
                print("Torrent for id %s has no info dictionary" % (row_id,))
                continue

            if 'name.utf-8' in info:
                filename = info['name.utf-8']
            elif 'name' in info:
                filename = info['name']
            else:
                print("Torrent for id %s has no name" % (row_id,))
                continue

            # Only single-file torrents carry "length" here; others are stored as 0.
            length = info.get('length', 0)

            _run_update(
                conn, cur,
                "update bt_main set name = %s , length = %s , isTrue = 1 where id = %s",
                (base64.b64encode(filename), length, row_id))
    except MySQLdb.Error as e:
        _print_mysql_error(e)
    finally:
        conn.close()


if __name__ == "__main__":
    main()

上面的只是简单的分析,对于多文件的,还没有处理。我最近在解析种子的时候,总是出现莫名的填充文件的问题,可能是版本过低吧,最近仍旧在解决。


BT种子站:本人用PHP做了一个BT种子站,域名在此就不说啦哈,csdn不让写。大家可以回复向我索取域名哈。

bt.dianfenxiang.com

0 0