HTMLParser解析网页，提取链接地址、标题名称，并插入数据库

来源：互联网　发布：linux emergency mode　编辑：程序博客网　时间：2024/06/04 19:24
前提：
在 MySQL 中预先创建名为 stu 的数据库（例如执行 `CREATE DATABASE stu;`），脚本只会连接该数据库，不会自动创建它。
# coding: utf-8
"""Scrape the NetEase Finance front page and store its links in MySQL.

Steps:
1. Download the NetEase Finance (money.163.com) page.
2. Parse the page and collect every (link address, link text) pair.
3. Drop the ``my163`` table if it exists, recreate it (id, link address,
   link text, insert time) and insert the collected pairs.

Prerequisite: a MySQL database named ``stu`` must already exist.

The code is written to be valid on both Python 2 and Python 3.
"""
import re
import sys
import time
from contextlib import closing

try:
    import MySQLdb
except ImportError:
    # The driver is only needed once a ``dealdata`` instance connects, so the
    # fetching/parsing helpers stay usable without it.
    MySQLdb = None

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3 equivalent

try:
    from HTMLParser import HTMLParser as hp  # Python 2
except ImportError:
    from html.parser import HTMLParser as hp  # Python 3


def _decode_page(raw):
    """Return *raw* as text; values that are not byte strings pass through.

    UTF-8 is tried first, then GB18030 (a superset of GBK) as a heuristic for
    Chinese pages; if both fail, undecodable bytes are replaced rather than
    aborting the whole run.
    """
    if not isinstance(raw, bytes):
        return raw
    for codec in ("utf-8", "gb18030"):
        try:
            return raw.decode(codec)
        except UnicodeDecodeError:
            continue
    return raw.decode("utf-8", "replace")


class dealdata(object):
    """Database helper: owns one MySQL connection and the ``my163`` table."""

    def __init__(self, host="localhost", user="root", passwd="root",
                 dbname="stu"):
        """Connect to MySQL and select the working database.

        The defaults reproduce the originally hard-coded connection
        (localhost, root/root, database ``stu``).

        Raises:
            ImportError: if the MySQLdb driver is not installed.
        """
        if MySQLdb is None:
            raise ImportError("MySQLdb is required to use dealdata")
        # Declare the connection charset so the parsed (text) links and
        # titles are encoded consistently instead of by the driver default.
        self.db = MySQLdb.connect(host, user, passwd, charset="utf8")
        self.db.select_db(dbname)
        self.cur = self.db.cursor()

    def createtable(self):
        """Drop and recreate ``my163``; exit the process if that fails."""
        deltable = "drop table if exists my163;"
        tablesql = ("create table my163 ("
                    "id int not null auto_increment primary key,"
                    "link  varchar(255) not null,"
                    "content Text,"
                    "time datetime) default charset=utf8;")
        try:
            self.cur.execute(deltable)
            self.db.commit()
            self.cur.execute(tablesql)
            self.db.commit()
            # Actually read the SHOW TABLES result; executing the statement
            # alone does not tell us whether the table exists.
            self.cur.execute("SHOW TABLES LIKE '%my163%'")
            created = self.cur.fetchone() is not None
        except Exception as e:
            # Keep the cause in the exit message instead of discarding it.
            sys.exit(u"创建表失败: %r" % (e,))
        if not created:
            sys.exit(u"创建表失败")
        sys.stdout.write(u"创建表my163成功")

    def insert_data(self, record):
        """Insert every ``(link, content)`` pair of *record* into ``my163``.

        All rows of one call share a single timestamp. Nothing is committed
        here; call ``commitdata`` afterwards.
        """
        insertsql = "insert into my163 (link,content,time) values (%s,%s,%s);"
        now = time.strftime('%Y-%m-%d %H:%M:%S')
        rows = [(link, content, now) for link, content in record]
        if rows:
            self.cur.executemany(insertsql, rows)

    def commitdata(self):
        """Commit the pending transaction."""
        self.db.commit()

    def fetchdata(self):
        """Print the link and text of the first 10 rows (ordered by id)."""
        # Let the server limit the result instead of selecting the whole
        # table. The former scroll(0, mode='absolute') call is dropped: it is
        # redundant right after execute() and fails on an empty result set
        # with MySQLdb's buffered cursor.
        self.cur.execute(
            "select link,content from my163 order by id limit 10")
        records = self.cur.fetchmany(10)
        print("get 10 record")
        for href, content in records:
            print("%s %s" % (href, content))
        print("get 10 records ")

    def closeconnection(self):
        """Close the cursor and then the connection."""
        self.cur.close()
        self.db.close()


def getpage(url="http://money.163.com", timeout=30):
    """Download *url* and return the raw response body (undecoded bytes).

    Args:
        url: page to fetch; defaults to the NetEase Finance front page.
        timeout: socket timeout in seconds, so a stalled server cannot hang
            the script indefinitely.
    """
    req = urllib2.Request(url)
    opener = urllib2.build_opener()
    # closing() releases the response even if read() raises; the Python 2
    # response object is not itself a context manager.
    with closing(opener.open(req, timeout=timeout)) as f:
        return f.read()


class htmlparser(hp):
    """Collect ``(href, text)`` pairs for ``<a>`` tags whose href starts with "http".

    Results accumulate in ``parseresult``, e.g.
    ``("http://money.163.com/blog/", <link text>)``.
    """

    def __init__(self):
        # The Python 2 HTMLParser is an old-style class, so call the base
        # initializer explicitly rather than through super().
        hp.__init__(self)
        # Flag: 'a' while inside an <a> tag with an http(s) href, else None.
        self.href = None
        # Link address of the most recently accepted <a> tag.
        self.content = None
        # Collected (link address, link text) tuples.
        self.parseresult = []

    def feed(self, data):
        """Feed page source to the parser, decoding byte strings first.

        Each call should carry complete characters: a byte string is decoded
        on its own, not joined with earlier chunks.
        """
        hp.feed(self, _decode_page(data))

    def handle_starttag(self, tag, attrs):
        """Start tracking text when an ``<a>`` with an http(s) href opens."""
        if tag != 'a':
            return
        # A new anchor always ends tracking of any previous unclosed one, so
        # its text is never attributed to the wrong link.
        self.href = None
        for name, value in attrs:
            # value is None for a valueless attribute such as <a href>.
            if name == "href" and value and value.startswith("http"):
                self.href = 'a'
                self.content = value

    def handle_data(self, data):
        """Record non-blank text found inside a tracked ``<a>`` tag."""
        if self.href == 'a' and data.strip():
            print("%s %s" % (self.content, data))
            self.parseresult.append((self.content, data))

    def handle_endtag(self, tag):
        """Stop tracking once the ``<a>`` tag closes."""
        if tag == 'a':
            self.href = None


def main():
    """Fetch the page, parse its links, and store and display them."""
    # Download and parse the page source.
    pagesource = getpage()
    html = htmlparser()
    html.feed(pagesource)
    html.close()

    # Store the parsed result; always release the connection, even when one
    # of the database steps fails.
    mydb = dealdata()
    try:
        mydb.createtable()
        mydb.insert_data(html.parseresult)
        mydb.commitdata()
        mydb.fetchdata()
    finally:
        mydb.closeconnection()


if __name__ == '__main__':
    main()

# Sample output (partial):
# http://money.163.com/photoview/50ST0025/12924.html 河南房企狂撒10万现金营销
# http://money.163.com/photoview/50ST0025/12919.html 老农自造“兰博基尼”送孙子
# http://money.163.com/photoview/50ST0025/12916.html 冬奥冠军父母烤串修鞋谋生
# http://money.163.com/special/00253JGA/MoreJigou.html 基金研究
# http://money.163.com/special/00253JGA/MoreJigou.html 阅读全部
# http://money.163.com/14/0217/09/9L9BH7FD00253JGA.html 华宝证券：债市需求旺盛 分
# http://money.163.com/14/0218/14/9LCF742200253JGA.html 凯石投资2月新发基金报告：
# http://money.163.com/14/0217/16/9LA3GPMK00253JGA.html 好买：大盘蓝筹反弹有机会
# http://money.163.com/14/0214/09/9L1KAJPN00253JGA.html 好买：余额宝理财通零钱宝百
0 0