极客学院爬虫:re(正则表达式)基本知识

来源:互联网 发布:python 退火算法 编辑:程序博客网 时间:2024/06/05 08:18

把代码放在这里,比放在电脑里查看更方便。


极客学院爬虫re基本知识代码

# -*- coding: utf-8 -*-
"""
Basic ``re`` usage for a simple crawler: search, findall and sub.

Created on Sat May  7 07:58:13 2016

@author: s
"""
import re

old_url = 'http://www.jikexueyuan.com/course/android/?pageNum=2'
total_page = 20


def read_html(path):
    """Return the content of the file at *path*, decoded from bytes.

    The file is read in binary mode and decoded with ``bytes.decode()``'s
    default codec (UTF-8), so line endings are left untouched.
    """
    with open(path, 'rb') as f:
        return f.read().decode()


def extract_title(html):
    """Return the text inside the first ``<title>`` tag, or None if absent."""
    match = re.search('<title>(.*?)</title>', html)
    return match.group(1) if match else None


def extract_links(html):
    """Return every value captured by the ``<a href="...">`` pattern in *html*."""
    return re.findall('<a href="(.*?)">', html, re.S)


def extract_ul_link_texts(html):
    """Return the anchor texts found inside the first ``<ul>...</ul>`` of *html*.

    Narrowing to the ``<ul>`` first, then matching inside it, keeps anchors
    elsewhere on the page out of the result. Returns an empty list when
    there is no ``<ul>`` block.
    """
    ul_match = re.search('<ul>(.*?)</ul>', html, re.S)
    if ul_match is None:
        return []
    return re.findall('">(.*?)</a>', ul_match.group(1), re.S)


def build_page_urls(url, last_page, first_page=2):
    """Return *url* rewritten for each page number in first_page..last_page.

    The ``pageNum=<digits>`` part of *url* is replaced. ``\\d+`` is used so a
    multi-digit page number in *url* is replaced as a whole, and no flag is
    passed positionally: the fourth positional parameter of ``re.sub`` is
    ``count``, not ``flags``.
    """
    return [
        re.sub(r'pageNum=\d+', 'pageNum=%d' % page, url)
        for page in range(first_page, last_page + 1)
    ]


def main(path=r'...\text.txt'):  # '...' stands for the directory of the file
    """Run the demo: print all links, the <ul> anchor texts, and the page URLs."""
    html = read_html(path)

    for link in extract_links(html):
        print(link)

    for each_line in extract_ul_link_texts(html):
        print(each_line)

    for new_url in build_page_urls(old_url, total_page):
        print(new_url)


if __name__ == '__main__':
    main()
0 0
原创粉丝点击