Python 爬虫

来源:互联网 发布:办公软件word视频教程 编辑:程序博客网 时间:2024/05/30 02:25
#-*- coding:utf-8 -*-
# Crawler for the 4399.com flash-game listing pages.
#
# For every game found on the listing pages it resolves the URL of the game's
# SWF file and stores (id, name, image URL, game URL) in the MySQL table
# `game`.  The code runs on both Python 2.7 and Python 3.
import re
import sys
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

try:
    import MySQLdb
except ImportError:
    # Keep the pure parsing helpers importable without the database driver;
    # main() and Insert() refuse to run when it is missing.
    MySQLdb = None

# Site root; game detail links on the listing pages are relative to it.
urlz = 'http://www.4399.com'


def _fetch(url):
    """Download *url* and return its body decoded from GBK to text."""
    # closing() releases the socket on both Python 2 and 3 (the Python 2
    # response object is not a context manager by itself).
    with closing(urlopen(url)) as response:
        return response.read().decode('gbk')


def parseList(html):
    """Extract the game entries from the text of one listing page.

    Two markup variants are recognised: entries whose image uses ``src`` and
    entries whose image uses ``name``/``lz_src``.

    Returns a list of tuples.  Index 0 is the detail-page link, index 1 the
    game name (the ``alt`` text) and index 2 the image URL.  Tuples coming
    from the ``lz_src`` variant carry the ``<b>`` caption as a 4th element.
    All ``src`` entries come first, followed by all ``lz_src`` entries.
    """
    reg = r'<li><a href="(.+)"><img alt="(.*)"  src="(.+)"><b>.+</b>'
    st = r'<li><a href="(.+)"><img alt="(.*)"  name=".+"  lz_src="(.+)"><b>(.+)</b>'
    lists = re.findall(reg, html)
    lists.extend(re.findall(st, html))
    return lists


def getList(url):
    """Download the listing page at *url* and return its game entries.

    See parseList() for the shape of the returned tuples.  Download and
    decoding errors propagate to the caller.
    """
    return parseList(_fetch(url))


def parseFlashUrl(html):
    """Build the SWF URL from the text of a game detail page.

    The host name is taken from the first ``/js/<name>.js`` script reference
    and the path from the ``_strGamePath`` variable.

    Returns the URL, or None when either piece is missing or empty.
    """
    server_match = re.search(r'src="/js/(.*?).js">', html)
    path_match = re.search(r'_strGamePath="(.*?)"', html)
    # Check for a match *before* reading the groups: a page without these
    # markers must yield None instead of raising.
    if server_match is None or path_match is None:
        return None
    server = server_match.group(1)
    address = path_match.group(1)
    if not server or not address:
        return None
    if 'server' in server:
        # Drop the first six characters (the length of "server") to get the
        # host label.  NOTE(review): this assumes "server" is a prefix of the
        # script name -- confirm against real pages.
        server = server[6:]
    return 'http://%s.4399.com/4399swf%s' % (server, address)


def findFlash(url):
    """Return the SWF URL for the game whose detail page is at *url*.

    *url* is a site-relative link and is appended to ``urlz``.  Returns None
    when the page does not contain the expected markers (see
    parseFlashUrl()).  Download and decoding errors propagate to the caller.
    """
    return parseFlashUrl(_fetch(urlz + url))


def Insert(Id, name, imageUrl, gameUrl):
    """Insert one row into the ``game`` table of the local MySQL database.

    A new connection is opened for every call.  A failing INSERT is printed
    and rolled back rather than raised; errors while connecting or while
    configuring the character set propagate.  The cursor and the connection
    are always closed.

    Raises ImportError when the MySQLdb driver is not installed.
    """
    if MySQLdb is None:
        raise ImportError('MySQLdb is not installed')
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration.
    conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="123", db="game")
    with closing(conn):
        with closing(conn.cursor(MySQLdb.cursors.DictCursor)) as cursor:
            conn.set_character_set('utf8')
            cursor.execute('SET NAMES utf8;')
            cursor.execute('SET CHARACTER SET utf8;')
            cursor.execute('SET character_set_connection=utf8;')
            try:
                sql = "insert into game values(%s, %s, %s,%s)"
                cursor.execute(sql, (Id, name, imageUrl, gameUrl))
                conn.commit()
            except Exception as e:
                print(e)
                conn.rollback()
    return


def main():
    """Crawl listing pages 1-9 and store every game whose SWF URL resolves."""
    if MySQLdb is None:
        # Fail before any network traffic instead of once per scraped row.
        sys.exit('MySQLdb is required to store the crawled games')

    pages = ['http://www.4399.com/flash/game100.htm']
    pages.extend('http://www.4399.com/flash/game100_%d.htm' % n
                 for n in range(2, 10))

    index = 0
    for page in pages:
        try:
            games = getList(page)
        except Exception as e:
            # One unreachable or undecodable listing page must not abort the
            # remaining pages.
            print(e)
            continue
        for each in games:
            try:
                print(each)
                print(each[1])
                gameUrl = findFlash(each[0])
                if gameUrl is None:
                    print('no flash url found for %s' % each[0])
                    continue
                # each[1] is already decoded text (the page was decoded from
                # GBK), so it is passed through without another decode().
                Insert(index, each[1], each[2], gameUrl)
                index = index + 1
            except Exception as e:
                print(e)


if __name__ == '__main__':
    main()

0 0
原创粉丝点击