正则表达式的使用

来源:互联网 发布:阿里云 网站端口 编辑:程序博客网 时间:2024/06/05 23:56
2016-2-27

常用符号
# -*- coding: utf8 -*-
"""Walk through the most common regular-expression symbols with `re`.

Each step runs one pattern against a small sample string and prints the
result so the effect of the symbol can be seen directly.
"""
import re
# Alternative import style: from re import findall, search, S

print('hello world')

secret_code = 'feafefexxixx23fe23xxlovexxafeifvaxxyouxx32fe'

# '.' acts as a placeholder for any single character.
# NOTE(review): the pattern below contains no '.'; it may have been lost
# from the original article -- confirm.
a = 'xz123'
b = re.findall(r'x', a)
print(b)

# 'x*' matches zero or more 'x' at every position, so the result shows
# where the x's are ('x') and an empty string everywhere else.
a = 'xyxy123'
b = re.findall(r'x*', a)
print(b)

# 'x?' matches zero or one 'x' at every position.
b = re.findall(r'x?', a)
print(b)

# '.*' is greedy: it grabs as much as possible between the first 'xx'
# and the last 'xx', producing a single long match.
b = re.findall(r'xx.*xx', secret_code)
print(b)

# '.*?' is non-greedy: it stops at the nearest closing 'xx', producing
# as many short matches as possible.
c = re.findall(r'xx.*?xx', secret_code)
print(c)

# Put the part you want inside (), and the part you do not want outside:
# findall then returns only the captured text.
d = re.findall(r'xx(.*?)xx', secret_code)
print(d)
for each in d:
    print(each)

# re.S makes '.' match newline characters as well.
# NOTE(review): the original comments describe a multi-line string (an
# expected result containing 'hello\n'); the line breaks inside this
# literal appear to have been lost when the page was extracted -- confirm
# against the original article.
s = '''sdfxxhelloxxfsdfxxworldxxasdf'''
e = re.findall(r'xx(.*?)xx', s, re.S)
print(e)

# search versus findall: search returns a single match object, and
# group(n) selects the text captured by the n-th pair of parentheses.
s2 = 'asdfxxixx123xxlovexxdfd'
f = re.search(r'xx(.*?)xx123xx(.*?)xx', s2).group(2)
print(f)
# The match object itself; captured groups are read with f2.group(1),
# f2.group(2), and so on.
f2 = re.search(r'xx(.*?)xx123xx(.*?)xx', s2)

# sub: replace whatever sits between the two '123' markers with 789.
s = '123abcssfasdfas123'
output = re.sub(r'123(.*?)123', '123%d123' % 789, s)
print(output)

# (\d+) captures each run of digits. A raw string keeps the backslash
# from being treated as an (invalid) string escape.
a = 'asdfasf1234567fasd55fas'
b = re.findall(r'(\d+)', a)
print(b)
# -*- coding: utf8 -*-
"""Extract the title and links from a saved HTML page, then build page URLs.

Reads the page source from 'test.txt' in the current directory, prints the
<title> text and every href value, and finally prints the URL of each
following page derived from `old_url`.
"""
import re

old_url = 'http://www.pythontab.com/html/2013/pythonhexinbiancheng001.html'
total_page = 20

# The context manager closes the file even if read() raises.
with open('test.txt', 'r') as f:
    html = f.read()

# search stops at the first place that matches;
# findall walks through the whole document.
title = re.search(r'<title>(.*?)</title>', html, re.S).group(1)
print(title)

links = re.findall(r'href="(.*?)"', html, re.S)
for each in links:
    print(each)

# Grab the large enclosing region first, then the small pieces inside it,
# e.g. first re.findall('<ul>(.*?)</ul>', html, re.S) and then a second
# findall over each captured chunk.

# Pagination: rewrite the page number at the end of the URL.
for i in range(2, total_page + 1):
    # The flag must be passed by keyword: the fourth positional argument
    # of re.sub is `count`, so a positional re.S would silently be used
    # as a replacement limit instead of a flag.
    new_link = re.sub(
        r'pythonhexinbiancheng00\d+',
        'pythonhexinbiancheng00%d' % i,
        old_url,
        flags=re.S,
    )
    print(new_link)

0 0
原创粉丝点击