Python 爬虫
来源:互联网 发布:办公软件word视频教程 编辑:程序博客网 时间:2024/05/30 02:25
# -*- coding: utf-8 -*-
"""Crawl the 4399.com flash-game listing pages and store each game in MySQL.

For every listing page the script extracts the game entries (detail-page
link, name, thumbnail URL), opens each detail page to work out the address
of the .swf file, and inserts one row per game into the ``game`` table.
"""
import re
import sys
import urllib

try:
    import MySQLdb
except ImportError:
    # Keep the parsing helpers importable without the database driver;
    # Insert() and main() refuse to run when it is missing.
    MySQLdb = None

try:
    urlopen = urllib.urlopen  # Python 2
except AttributeError:
    from urllib.request import urlopen  # Python 3

urlz = 'http://www.4399.com'

# Listing entries come in two flavours: thumbnails referenced through a plain
# ``src`` attribute and thumbnails referenced through ``lz_src``.
_SRC_ITEM_RE = re.compile(
    r'<li><a href="(.+)"><img alt="(.*)" src="(.+)"><b>.+</b>')
_LZ_SRC_ITEM_RE = re.compile(
    r'<li><a href="(.+)"><img alt="(.*)" name=".+" lz_src="(.+)"><b>(.+)</b>')
_SERVER_RE = re.compile(r'src="/js/(.*?).js">')
_GAME_PATH_RE = re.compile(r'_strGamePath="(.*?)"')


def _fetch(url):
    """Download *url* and return its body decoded as GBK text."""
    response = urlopen(url)
    try:
        return response.read().decode('gbk')
    finally:
        response.close()


def parseList(html):
    """Extract the game entries from the text of a listing page.

    Returns a list of tuples. Entries using ``src`` yield
    ``(link, name, image)``; entries using ``lz_src`` yield
    ``(link, name, image, title)``. Indexes 0-2 mean the same thing in both.
    All ``src`` entries come first, followed by all ``lz_src`` entries.
    """
    entries = _SRC_ITEM_RE.findall(html)
    entries.extend(_LZ_SRC_ITEM_RE.findall(html))
    return entries


def getList(url):
    """Download the listing page at *url* and return its game entries.

    See parseList() for the shape of the returned tuples.
    """
    return parseList(_fetch(url))


def buildFlashUrl(html):
    """Work out the .swf address from the text of a game detail page.

    The host comes from the first ``/js/<name>.js`` script reference and the
    path from the ``_strGamePath`` variable. Returns None when either piece
    is missing or empty.
    """
    server_match = _SERVER_RE.search(html)
    path_match = _GAME_PATH_RE.search(html)
    if server_match is None or path_match is None:
        return None
    server = server_match.group(1)
    address = path_match.group(1)
    if not server or not address:
        return None
    if 'server' in server:
        # Drop the first six characters (the length of "server") to get the
        # host label.
        server = server[6:]
    return 'http://%s.4399.com/4399swf%s' % (server, address)


def findFlash(url):
    """Return the .swf address for the game whose detail page is at *url*.

    *url* is a path relative to ``urlz``. Returns None when the page does not
    contain the information needed to build the address.
    """
    return buildFlashUrl(_fetch(urlz + url))


def Insert(Id, name, imageUrl, gameUrl):
    """Insert one row ``(Id, name, imageUrl, gameUrl)`` into the ``game`` table.

    A new connection is opened for every call and is always closed again.
    Errors raised by the INSERT itself are printed and rolled back rather
    than propagated; connection/setup errors propagate to the caller.

    Raises:
        RuntimeError: if the MySQLdb driver is not installed.
    """
    if MySQLdb is None:
        raise RuntimeError('MySQLdb is not installed; cannot write to the database')
    # NOTE(review): connection settings are hard-coded here.
    conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="123", db="game")
    try:
        cursor = conn.cursor(MySQLdb.cursors.DictCursor)
        try:
            conn.set_character_set('utf8')
            cursor.execute('SET NAMES utf8;')
            cursor.execute('SET CHARACTER SET utf8;')
            cursor.execute('SET character_set_connection=utf8;')
            try:
                sql = "insert into game values(%s, %s, %s,%s)"
                cursor.execute(sql, (Id, name, imageUrl, gameUrl))
                conn.commit()
            except Exception as e:
                print(e)
                conn.rollback()
        finally:
            cursor.close()
    finally:
        conn.close()


def main():
    """Crawl listing pages 1-9 and store every game that has a flash address."""
    if MySQLdb is None:
        sys.exit('MySQLdb is required to store the crawled games')

    pages = ['http://www.4399.com/flash/game100.htm']
    pages.extend('http://www.4399.com/flash/game100_%d.htm' % n
                 for n in range(2, 10))

    index = 0  # used as the row id; advances only after Insert() returns
    for page in pages:
        try:
            games = getList(page)
        except Exception as e:
            # One unreadable listing page should not abort the whole crawl.
            print(e)
            continue
        for each in games:
            try:
                print(each)
                print(each[1])
                game_url = findFlash(each[0])
                if game_url is None:
                    print('no flash address found for %s' % each[0])
                    continue
                # each[1] is already decoded text; decoding it again would
                # fail on non-ASCII names.
                Insert(index, each[1], each[2], game_url)
                index = index + 1
            except Exception as e:
                print(e)


if __name__ == '__main__':
    main()
0 0
- python爬虫-->爬虫基础
- [爬虫] Python爬虫技巧
- Python爬虫
- python 爬虫
- python 爬虫
- python 爬虫
- python爬虫
- Python爬虫
- Python爬虫
- python 爬虫
- Python爬虫
- python爬虫
- python 爬虫
- python 爬虫
- python爬虫
- python爬虫
- python爬虫
- python 爬虫
- OpenStack设计与实现5——RESTful API和WSGI
- 设计模式随笔-工厂模式
- 学习笔记 C++ 封装(下)
- python3爬虫(1)--百度百科的页面爬取
- html 02 css控制之css选择器及其优先级
- Python 爬虫
- 2016CCPC东北-B.Mr. Frog’s Problem
- 14. Longest Common Prefix
- hibrenate @ManyToOne(fetch = FetchType.EAGER) 和 lazy 区别
- Android 自定义图形实例
- MATLAB图形用户界面设计GUI
- 我终于连接access到jsp中啦!
- java.lang.NoSuchMethodException异常解决
- 自定义对话框学习记录01