python2抓取某虚拟币网数据的小程序

来源:互联网 发布:电脑语音变声软件 编辑:程序博客网 时间:2024/04/28 22:40
# -*- coding: utf-8 -*-
"""Fetch a Yunbi market page and extract the trade list embedded in it.

Written for Python 2.7. The imports below fall back to the Python 3 module
names so the same file also loads on Python 3.

Note from the original author on switching between Python 2.x and 3.x: when
changing the code style to 3.x, insert these two lines:

    from __future__ import unicode_literals
    from __future__ import division
"""
import re
import json
import socket  # not used below; kept because the original file imports it

try:  # Python 2 module names
    import urllib2
    import cookielib
except ImportError:  # Python 3 equivalents, bound to the Python 2 names
    import urllib.request as urllib2
    import http.cookiejar as cookielib

try:
    import chardet
except ImportError:  # optional: without it, UTF-8 is probed by strict decoding
    chardet = None

# Byte and text string types on both Python 2 and Python 3.
_TEXT_TYPES = (type(b''), type(u''))


class Yunbi(object):
    """Download a market page and collect the trades found in its markup."""

    __author__ = 'shunzi'
    __dateTime__ = '2017-6-29'

    # Class-level default kept for code that reads ``Yunbi.dataTrades``;
    # every instance replaces it with its own list in ``__init__``.
    dataTrades = []

    # The trade list is the array assigned to ``gon.trades`` and immediately
    # followed by the ``gon.config`` assignment.
    _TRADES_RE = re.compile(r'gon.trades=\[(.*?)\];gon.config', re.M | re.I)

    def __init__(self, **kw):
        self.dataTrades = []

    def getHtml(self, url):
        """Download *url* and return the body as UTF-8 encoded bytes.

        A cookie-aware opener is built and installed as the process-wide
        default opener before the request is made.

        Returns None, after printing an error message, when the request or
        the re-encoding fails.
        """
        cookiejar = cookielib.CookieJar()
        cookie = urllib2.HTTPCookieProcessor(cookiejar)
        opener = urllib2.build_opener(cookie, urllib2.HTTPHandler())
        urllib2.install_opener(opener)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                raw = response.read()
            finally:
                response.close()
            return self._to_utf8(raw)
        except Exception:
            # Deliberately broad: fetching is best-effort and callers get
            # None on any failure. Unlike a bare ``except`` this still lets
            # KeyboardInterrupt and SystemExit through.
            print('url 解析异常,请仔细检查确认后提交!')
            return None

    @staticmethod
    def _to_utf8(raw):
        """Return the page bytes *raw* as UTF-8 encoded bytes."""
        if chardet is not None:
            detected = chardet.detect(raw)['encoding']
            is_utf8 = detected is not None and detected.lower() == 'utf-8'
        else:
            try:
                raw.decode('utf-8')
            except UnicodeDecodeError:
                is_utf8 = False
            else:
                is_utf8 = True
        if is_utf8:
            return raw
        # Anything not detected as UTF-8 is treated as GB2312, dropping
        # undecodable bytes.
        # NOTE(review): presumably chosen for Chinese-language pages; confirm
        # before switching to the detected encoding.
        return raw.decode('gb2312', 'ignore').encode('utf-8')

    @staticmethod
    def _as_text(value):
        """Return a JSON value as text; non-strings are re-serialised."""
        if isinstance(value, _TEXT_TYPES):
            return value
        return json.dumps(value)

    def getTrades(self, html):
        """Parse the trades embedded in *html* and append them to dataTrades.

        *html* may be text or UTF-8 bytes (the form ``getHtml`` returns).
        Each trade is stored as a dict whose values are text: numbers keep
        the exact digits they had in the page.

        Nothing is appended when *html* is empty or None, or when the page
        contains no ``gon.trades`` array. Raises ValueError when the array
        is present but is not valid JSON.
        """
        if not html:
            return
        if isinstance(html, bytes):
            html = html.decode('utf-8', 'ignore')

        found = self._TRADES_RE.search(html)
        if found is None:
            return

        # parse_int/parse_float=str keeps numeric fields as the literal text
        # from the page instead of converting them to int/float.
        records = json.loads('[' + found.group(1) + ']',
                             parse_int=str, parse_float=str)
        for record in records:
            if not isinstance(record, dict):
                continue
            self.dataTrades.append(
                dict((key, self._as_text(value))
                     for key, value in record.items()))


def main():
    """Fetch the BTC/CNY market page and parse its trades (manual check)."""
    url_yunbi = 'https://yunbi.com/markets/btccny#'
    yunbi = Yunbi()
    html = yunbi.getHtml(url_yunbi)
    yunbi.getTrades(html)


if __name__ == '__main__':
    main()