python 采集调类入库
来源:互联网 发布:淘宝客怎么找ab单 编辑:程序博客网 时间:2024/05/11 20:37
# -*- coding: UTF-8 -*-
"""Scraper page: collect headline links from the Baidu News front page.

Downloads http://news.baidu.com/, extracts the (url, title) pairs found in
the "pane-news" section of the page and passes them to
``MysqldbHelper.gettitle`` for storage.
"""
# Write an empty line to stdout first; kept ahead of the imports so the
# output order matches the original script.
print("")
# MysqldbHelper is not imported anywhere else, so it is supplied by this
# star import.
from conf import *
import urllib
import urllib2
import MySQLdb
import re

# Helper object that stores the scraped rows (see the call at the bottom).
res = MysqldbHelper()


class NewsTitle:
    """Scraper for the headline links on the Baidu News front page."""

    def __init__(self):
        self.url = "http://news.baidu.com/"

    def getPage(self):
        """Download ``self.url`` and return the raw response body."""
        request = urllib2.Request(self.url)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the socket even when reading the body fails.
            response.close()

    def getTitle(self):
        """Return the "pane-news" section of the page with image links removed.

        The section runs from ``<div id="pane-news" `` up to (not including)
        ``<div id="footerwrapper">``.

        Raises:
            ValueError: if those markers are not present in the page, e.g.
                because the site layout changed.
        """
        page = self.getPage()
        pattern = re.compile(
            '(<div id="pane-news" .*?)<div id="footerwrapper">', re.S)
        tit = re.search(pattern, page)
        if tit is None:
            # Without this check the failure surfaced as an AttributeError
            # on None, which says nothing about the real cause.
            raise ValueError(
                'could not find the "pane-news" section in %s' % self.url)
        # Drop anchors that only wrap an image, in both the single-line and
        # the line-broken form; they carry no title text.
        patterncode = re.sub(r'<a .*?><img .*?</a>', '', tit.group(1))
        patterncode = re.sub(r'<a .*?>\n<img .*?\n</a>', '', patterncode)
        return patterncode

    def getHref(self):
        """Return a list of (url, title) tuples for the absolute http links."""
        hrefcode = self.getTitle()
        pattern = re.compile('<a href="(http://.*?)".*?>(.*?)</a>', re.S)
        return re.findall(pattern, hrefcode)


news = NewsTitle()
new = news.getHref()
# Store the links; ``res`` is rebound to gettitle's result, as in the
# original script.
res = res.gettitle(new)
类库
#!D:/Python/python.exe
# -*- coding: UTF-8 -*-
"""MySQL helper class configured from the ``[database]`` section of db.conf."""
import MySQLdb
import re
import ConfigParser

config = ConfigParser.ConfigParser()

# Matches a <div ...>...</div> block on a single line ('.' does not match
# newlines here); compiled once instead of on every getguo() call.
_DIV_BLOCK = re.compile('<div.*?</div>')


class MysqldbHelper:
    """Small wrapper around one MySQLdb connection and cursor.

    Table names are concatenated into the SQL text because they cannot be
    bound as query parameters; callers must only pass trusted table names.
    All values are bound as parameters so the driver escapes them.
    """

    def __init__(self):
        """Read db.conf and open the connection described there."""
        config.read('db.conf')
        dbhost = config.get("database", "dbhost")
        dbport = config.getint("database", "dbport")
        dbname = config.get("database", "dbname")
        dbuser = config.get("database", "dbuser")
        dbpassword = config.get("database", "dbpassword")
        dbcharset = config.get("database", "dbcharset")
        # The configured port and charset used to be read and then ignored.
        # use_unicode=False keeps text columns coming back as byte strings,
        # as they did before charset was passed.
        self.conn = MySQLdb.connect(
            host=dbhost, user=dbuser, passwd=dbpassword, db=dbname,
            port=dbport, charset=dbcharset, use_unicode=False)
        # The connection is kept so that writes can be committed.
        self.cursor = self.conn.cursor()

    def getdel(self, table, id):
        """Delete the row of ``table`` whose id equals ``id``.

        Returns the string 'true' on success; on a MySQLdb error the error
        is printed and None is returned.
        """
        cursor = self.cursor
        try:
            cursor.execute("DELETE FROM " + table + " WHERE id=%s", (id,))
            self.conn.commit()
            return 'true'
        except MySQLdb.Error as e:
            print("getdel Error %d: %s" % (e.args[0], e.args[1]))

    def getguo(self, x):
        """Return ``x`` with every single-line ``<div ...>...</div>`` removed."""
        return _DIV_BLOCK.sub('', x)

    def getselect(self, table):
        """Return all rows of ``table`` as given by ``cursor.fetchall()``.

        On a MySQLdb error the error is printed and None is returned.
        """
        cursor = self.cursor
        try:
            cursor.execute("SELECT * FROM " + table)
            return cursor.fetchall()
        except MySQLdb.Error as e:
            # The label used to be the copy-pasted "getdel".
            print("getselect Error %d: %s" % (e.args[0], e.args[1]))

    def getdell(self, table, id):
        """Delete several rows of ``table`` at once.

        ``id`` is a comma-separated string of id values, e.g. "1,2,3".
        Returns the string 'true' on success. When ``id`` holds no values,
        or on a MySQLdb error, a message is printed and None is returned.
        """
        cursor = self.cursor
        ids = [part.strip() for part in id.split(",") if part.strip()]
        if not ids:
            # An empty list used to produce "id in ()", a SQL syntax error
            # that was printed and turned into a None result; keep the None.
            print("getdell Error: no ids given")
            return None
        placeholders = ",".join(["%s"] * len(ids))
        try:
            cursor.execute(
                "DELETE FROM " + table + " WHERE id in (" + placeholders + ")",
                ids)
            self.conn.commit()
            return 'true'
        except MySQLdb.Error as e:
            print("getdell Error %d: %s" % (e.args[0], e.args[1]))

    def gettitle(self, new):
        """Insert (url, title) pairs into the ``title`` table.

        ``new`` is an iterable of 2-item sequences: item 0 is the url, item 1
        the title markup. Each title has its div blocks stripped and is
        re-encoded from GB2312 to UTF-8 before it is stored. Each pair is
        also printed. MySQLdb errors propagate to the caller.
        """
        cursor = self.cursor
        sql = "INSERT INTO title(title,url) VALUES (%s, %s)"
        for i in new:
            # Use this instance; a second helper (and a second database
            # connection) used to be created only to call getguo().
            val = self.getguo(i[1])
            print("%s %s" % (i[0], val))
            # Titles come from a scraped page, so they are bound as
            # parameters instead of being spliced into the SQL text.
            cursor.execute(
                sql, (val.decode('GB2312', 'ignore').encode('utf8'), i[0]))
        self.conn.commit()
数据库
#配置数据库
[database]
dbhost=localhost
dbport=3306
dbname=month11
dbuser=root
dbpassword=root
dbcharset=utf8
采集页面
# -*- coding: UTF-8 -*-
"""Collect the navigation links of the Baidu News front page into MySQL.

Downloads http://news.baidu.com/, extracts the (url, label) pairs of the
"menu" section and inserts them into the ``aaa`` table.
"""
import urllib
import urllib2
import re
import MySQLdb

# Write an empty line to stdout, as the original bare ``print`` did.
print("")


class News:
    """Scraper for the navigation menu of the Baidu News front page."""

    def __init__(self):
        self.url = "http://news.baidu.com/"

    def tranTags(self, x):
        """Return ``x`` with every single-line ``<div ...>...</div>`` removed."""
        pattern = re.compile('<div.*?</div>')
        return re.sub(pattern, '', x)

    def getPage(self):
        """Download ``self.url`` and return the raw response body."""
        request = urllib2.Request(self.url)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the socket even when reading the body fails.
            response.close()

    def getNavCode(self):
        """Return the markup of the "menu" section of the page.

        The section runs from ``<div id="menu"`` up to (not including)
        ``<i class="slogan"></i>``.

        Raises:
            ValueError: if those markers are not present in the page, e.g.
                because the site layout changed.
        """
        page = self.getPage()
        pattern = re.compile(
            '(<div id="menu".*?)<i class="slogan"></i>', re.S)
        navCode = re.search(pattern, page)
        if navCode is None:
            # Without this check the failure surfaced as an AttributeError
            # on None, which says nothing about the real cause.
            raise ValueError(
                'could not find the "menu" section in %s' % self.url)
        return navCode.group(1)

    def getNav(self):
        """Return a list of (url, label) tuples for the menu links."""
        navCode = self.getNavCode()
        pattern = re.compile('<a href="(http://.*?/).*?>(.*?)</a>', re.S)
        return re.findall(pattern, navCode)


# Store the scraped links.
db = MySQLdb.connect("localhost", "root", "root", "month11", charset="GBK")
try:
    cursor = db.cursor()
    news = News()
    new = news.getNav()
    # Values come from a scraped page, so they are bound as parameters
    # instead of being spliced into the SQL text.
    sql = "INSERT INTO aaa(name,url) VALUES (%s, %s)"
    for i in new:
        vals = news.tranTags(i[1])
        print("%s %s" % (i[0], vals))
        try:
            cursor.execute(sql, (vals, i[0]))
            db.commit()
        except MySQLdb.Error as e:
            # Still best-effort per row, but the failure is reported
            # instead of being silently discarded.
            print("insert Error: %s" % (e,))
            db.rollback()
finally:
    db.close()
0 0
- python 采集调类入库
- 采集入库
- 采集入库
- 入库&采集入库
- python从零写一个采集器:入库MySQL
- php 采集入库
- 图片采集入库
- 图片采集入库
- 网页采集+pdo入库
- yii 采集,入库,展示
- 采集入库 定时执行
- 网页采集+PDO入库
- file_get_contents采集加入库
- 采集数据入库
- 小偷程序和采集入库
- 采集网页内容,pdo入库,定时采集
- 浅谈自动采集程序及入库
- 浅谈自动采集程序及入库
- 【Qt开发】【Gstreamer开发】Qt error: glibconfig.h: No such file or directory #include <glibconfig.h>
- Activity的六种关闭方式
- 学会使用 Gson @SerializedName
- oracle11g broker使用
- final 关键字
- python 采集调类入库
- 【ARM-Linux开发】"libxml/parser.h: 没有那个文件或目录"解决方案
- Python学习笔记(11)--模块的建立以及调用
- javascript中的原型(prototype)及原型链的继承方式
- 时间转换
- 用PHP实现机器学习:朴素贝叶斯算法
- idea 搭建 springMVC+mybatis+maven 项目(一)
- Android视频框架 Vitamio 打造自己的万能播放器
- DBCP连接池出现连接异常解决方案;