python 采集调类入库

来源：互联网 | 发布：淘宝客怎么找ab单 | 编辑：程序博客网 | 时间：2024/05/11 20:37

# -*- coding: UTF-8 -*-
# Scraper page: collect headline links from http://news.baidu.com/ and hand
# them to a MysqldbHelper instance for storage.
# This is a Python 2 script (it relies on urllib2).
print  # bare Python 2 print statement kept from the original (emits a blank line)
# MysqldbHelper is not defined in this script; it is expected to be exported
# by the conf module through the star import below.
from conf import *
import urllib
import urllib2
import MySQLdb
import re

res = MysqldbHelper()


class NewsTitle:
    """Extract (url, title) pairs from the "pane-news" section of the page."""

    def __init__(self):
        # Page that every method of this class scrapes.
        self.url = "http://news.baidu.com/"

    def getPage(self):
        """Download ``self.url`` and return the raw response body.

        Raises whatever urllib2.urlopen raises on network/HTTP failure.
        """
        request = urllib2.Request(self.url)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the socket even when read() fails.
            response.close()

    def getTitle(self):
        """Return the HTML of the news pane with image anchors removed.

        The pane is everything from ``<div id="pane-news" `` up to (not
        including) ``<div id="footerwrapper">``.  Anchors that wrap an
        ``<img`` tag are dropped so that only text links remain.

        Returns an empty string when the pane markers are not found, so a
        layout change yields "no headlines" instead of an AttributeError on
        the failed match.
        """
        page = self.getPage()
        pane = re.search(
            '(<div id="pane-news" .*?)<div id="footerwrapper">', page, re.S)
        if pane is None:
            return ''
        # First pass: anchor and image on the same line.
        html = re.sub(r'<a .*?><img .*?</a>', '', pane.group(1))
        # Second pass: anchor, image and closing tag on separate lines.
        return re.sub(r'<a .*?>\n<img .*?\n</a>', '', html)

    def getHref(self):
        """Return a list of ``(url, inner_html)`` tuples.

        One tuple is produced per ``<a href="http://...">`` anchor found in
        the text returned by getTitle(); the list is empty when there are
        none.
        """
        return re.findall(
            '<a href="(http://.*?)".*?>(.*?)</a>', self.getTitle(), re.S)


news = NewsTitle()
new = news.getHref()
# Storage is delegated to the helper.  Its return value is not kept, so the
# name ``res`` keeps referring to the helper instead of being overwritten.
res.gettitle(new)

类库

#!D:/Python/python.exe
# -*- coding: UTF-8 -*-
# Helper library: a small MySQL wrapper whose connection settings come from
# the [database] section of db.conf.
# This is a Python 2 module (it imports ConfigParser under its Python 2 name).
import MySQLdb
import re
import ConfigParser

config = ConfigParser.ConfigParser()


class MysqldbHelper:
    """Thin wrapper around one MySQLdb connection and one cursor.

    Table names are spliced into the SQL text because they cannot be bound
    as parameters; callers must only pass table names from trusted code.
    All *values* are bound by the driver, so quotes inside a value can
    neither break the statement nor inject SQL.
    """

    # <div ...>...</div> on a single line ('.' does not match newlines here).
    _DIV_RE = re.compile('<div.*?</div>')

    def __init__(self):
        """Read db.conf and open the connection.

        Raises ConfigParser errors when the [database] section or one of its
        options is missing, ValueError when dbport is not an integer, and
        MySQLdb.Error when the connection cannot be established.
        """
        config.read('db.conf')
        dbhost = config.get("database", "dbhost")
        dbport = config.get("database", "dbport")
        dbname = config.get("database", "dbname")
        dbuser = config.get("database", "dbuser")
        dbpassword = config.get("database", "dbpassword")
        dbcharset = config.get("database", "dbcharset")
        # The connection is kept so that writes can be committed, and the
        # configured port and charset are actually applied (they used to be
        # read and then ignored).
        self.conn = MySQLdb.connect(
            host=dbhost, user=dbuser, passwd=dbpassword, db=dbname,
            port=int(dbport), charset=dbcharset)
        self.cursor = self.conn.cursor()

    def getdel(self, table, id):
        """Delete the row of ``table`` whose id equals ``id``.

        Returns the string 'true' on success.  On a MySQLdb error the error
        is printed and None is returned.
        """
        sql = "DELETE FROM " + table + " WHERE id=%s"
        try:
            self.cursor.execute(sql, (id,))
            self.conn.commit()
            return 'true'
        except MySQLdb.Error as e:
            print("getdel Error %d: %s" % (e.args[0], e.args[1]))

    def getguo(self, x):
        """Return ``x`` with every single-line ``<div ...>...</div>`` removed."""
        return self._DIV_RE.sub('', x)

    def getselect(self, table):
        """Return all rows of ``table`` as returned by cursor.fetchall().

        On a MySQLdb error the error is printed and None is returned.
        """
        try:
            self.cursor.execute("SELECT * FROM " + table)
            return self.cursor.fetchall()
        except MySQLdb.Error as e:
            print("getselect Error %d: %s" % (e.args[0], e.args[1]))

    def getdell(self, table, id):
        """Delete every row of ``table`` whose id appears in ``id``.

        ``id`` is a comma-separated string such as "1,2,3"; blank entries
        are ignored.  Returns 'true' on success (an empty list is a no-op
        that also returns 'true').  On a MySQLdb error the error is printed
        and None is returned.
        """
        ids = [part.strip() for part in id.split(',') if part.strip()]
        if not ids:
            # "id in ()" is not valid SQL, and there is nothing to delete.
            return 'true'
        placeholders = ", ".join(["%s"] * len(ids))
        sql = "DELETE FROM " + table + " WHERE id in (" + placeholders + ")"
        try:
            self.cursor.execute(sql, ids)
            self.conn.commit()
            return 'true'
        except MySQLdb.Error as e:
            print("getdell Error %d: %s" % (e.args[0], e.args[1]))

    def gettitle(self, new):
        """Insert scraped links into the ``title`` table.

        ``new`` is an iterable of sequences whose first item is the URL and
        whose second item is the title HTML.  Each title has its <div>
        blocks removed, is echoed together with its URL, and is transcoded
        from GB2312 (undecodable bytes dropped) to UTF-8 before insertion.

        All rows are committed together; when an insert fails the pending
        rows are rolled back and the MySQLdb error propagates to the caller.
        """
        sql = "INSERT INTO title(title,url) VALUES (%s, %s)"
        try:
            for item in new:
                url = item[0]
                # Reuse this instance; creating another helper would open a
                # second database connection just to strip tags.
                title = self.getguo(item[1])
                print("%s %s" % (url, title))
                self.cursor.execute(
                    sql,
                    (title.decode('GB2312', 'ignore').encode('utf8'), url))
        except MySQLdb.Error:
            self.conn.rollback()
            raise
        self.conn.commit()

数据库

#配置数据库
[database]
dbhost=localhost
dbport=3306
dbname=month11
dbuser=root
dbpassword=root
dbcharset=utf8

采集页面

# -*- coding: UTF-8 -*-
# Scraper page: collect the navigation links from http://news.baidu.com/ and
# insert them into the ``aaa`` table.
# This is a Python 2 script (it relies on urllib2).
import urllib
import urllib2
import re
import MySQLdb
print  # bare Python 2 print statement kept from the original (emits a blank line)


class News:
    """Extract (url, label) pairs from the "menu" section of the page."""

    def __init__(self):
        # Page that every method of this class scrapes.
        self.url = "http://news.baidu.com/"

    def tranTags(self, x):
        """Return ``x`` with every single-line ``<div ...>...</div>`` removed."""
        return re.sub('<div.*?</div>', '', x)

    def getPage(self):
        """Download ``self.url`` and return the raw response body.

        Raises whatever urllib2.urlopen raises on network/HTTP failure.
        """
        request = urllib2.Request(self.url)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the socket even when read() fails.
            response.close()

    def getNavCode(self):
        """Return the HTML of the navigation menu.

        The menu is everything from ``<div id="menu"`` up to (not including)
        ``<i class="slogan"></i>``.  Returns an empty string when those
        markers are not found, so a layout change yields "no entries"
        instead of an AttributeError on the failed match.
        """
        page = self.getPage()
        menu = re.search(
            '(<div id="menu".*?)<i class="slogan"></i>', page, re.S)
        if menu is None:
            return ''
        return menu.group(1)

    def getNav(self):
        """Return a list of ``(url, inner_html)`` tuples for the menu anchors.

        The captured URL stops at the first '/' that follows ``http://``.
        """
        return re.findall(
            '<a href="(http://.*?/).*?>(.*?)</a>', self.getNavCode(), re.S)


# Store the navigation entries.
db = MySQLdb.connect("localhost", "root", "root", "month11", charset="GBK")
try:
    cursor = db.cursor()
    news = News()
    new = news.getNav()
    for i in new:
        vals = news.tranTags(i[1])
        print("%s %s" % (i[0], vals))
        try:
            # Values are bound by the driver so a quote inside a label
            # cannot break (or inject into) the statement.
            cursor.execute(
                "INSERT INTO aaa(name,url) VALUES (%s, %s)", (vals, i[0]))
            db.commit()
        except Exception as e:
            # Best effort, as before: skip the failing row and carry on, but
            # report the failure instead of hiding it.
            print("insert Error: %s" % (e,))
            db.rollback()
finally:
    # Always release the connection, including when scraping fails.
    db.close()

0 0