正则表达式——Python版

来源：互联网发布：淘宝上最好卖的是什么编辑：程序博客网时间：2024/05/19 19:42

正则表达式是 Python 程序员面试中经常被考察的知识点，应用也十分广泛。它的主要功能是定义一种格式（模式），用来在给定的字符串中匹配、查找符合要求的内容。re 模块就是 Python 的正则模块，提供了全部的正则表达式功能。

正则里面的特殊字符类：

\d 匹配一个数字字符。等价于 [0-9]。
\D 匹配一个非数字字符。等价于 [^0-9]。
\s 匹配任何空白字符，包括空格、制表符、换页符等等。等价于 [ \f\n\r\t\v]。
\S 匹配任何非空白字符。等价于 [^ \f\n\r\t\v]。
\w 匹配包括下划线的任何单词字符。等价于 [A-Za-z0-9_]。
\W 匹配任何非单词字符。等价于 [^A-Za-z0-9_]。

正则表达式经常用到的元字符有：. ^ $ * + ? {} [] \ | ()。下面举例介绍它们的用法。

import re
s='abc'
s=r'abc'
re.findall(s,"aaaaaaaaa")
Out[5]: []
re.findall(s,"abcaaaaaaaaaaaaaabc")
Out[6]: ['abc', 'abc']
st="top tip tap tqp tep"
res=r"top"
re.findall(res,st)
Out[9]: ['top']
res=r"tip"
re.findall(res,st)
Out[11]: ['tip']
res=r"t[io]p"
re.findall(res,st)
Out[13]: ['top', 'tip']
res=r"t[^io]p"         # 非的意思  除此之外
re.findall(res,st)
Out[15]: ['tap', 'tqp', 'tep']
s="hello world,hello boy"
r=r"hello"
re.findall(r,s)
Out[18]: ['hello', 'hello']
r=r"^hello"
re.findall(r,s)  # ^ 代表匹配行首
Out[20]: ['hello']
s="world,hello boy"
re.findall(r,s)  # ^ 代表匹配行首
Out[22]: []
r=r"boy$"   # $匹配行尾
re.findall(r,s)
Out[24]: ['boy']
r=r"t[abc$]"
re.findall(r,'ta')
Out[26]: ['ta']
re.findall(r,'tb')
Out[27]: ['tb']
re.findall(r,'tc')
Out[28]: ['tc']
re.findall(r,'t$')
Out[29]: ['t$']
re.findall(r,'tax')
Out[30]: ['ta']
r=r"t[abc^]"
re.findall(r,'t^')
Out[32]: ['t^']
r=r"x[0-9]x"
re.findall(r,'x1x x2x x9x')
Out[34]: ['x1x', 'x2x', 'x9x']
r=r"x[a-zA-Z0-9]x"         # 大小写字母与数字

r=r"\^abc"
re.findall(r,"^abc ^abc ^abc")
Out[37]: ['^abc', '^abc', '^abc']
r=r"\d"          #  所有的数字
re.findall(r,'123456')
Out[39]: ['1', '2', '3', '4', '5', '6']
# \w 表示大小写字母、数字和下划线 [A-Za-z0-9_]
r=r"^010-a{8}"   # 重复八次
re.findall(r,'010-87654321')
Out[42]: []
re.findall(r,'010-aaaaaaaa')
Out[43]: ['010-aaaaaaaa']
r=r"^010-\d{8}"   # 重复八次
re.findall(r,'010-87654321')
Out[45]: ['010-87654321']
r=r"ab*"          # 大于等于0次
re.findall(r,"abbbbbbb")
Out[48]: ['abbbbbbb']
re.findall(r,"a")
Out[49]: ['a']
r=r"ab+"         # 大于等于1次
re.findall(r,'abbbbbbbb')
Out[52]: ['abbbbbbbb']
re.findall(r,'a')
Out[53]: []
r=r"^010-?\d{8}$"
re.findall(r,"010-12345678")
Out[55]: ['010-12345678']
re.findall(r,"01012345678")
Out[56]: ['01012345678']       #  ？可有可无
r=r"a{1,3}"
re.findall(r,'a')
Out[58]: ['a']
re.findall(r,'aaa')
Out[59]: ['aaa']
re.findall(r,'aaaa')
Out[60]: ['aaa', 'a']       #  最多3次

r1=r"\d{3,4}-?\d{8}"
re.findall(r1,"010-12345678")
Out[62]: ['010-12345678']
p_tel=re.compile(r1)          #  编译之后使用方便
p_tel.findall('010-12345678')
Out[64]: ['010-12345678']
p_tel.findall('010-123456789')
Out[65]: ['010-12345678']
csvt_re=re.compile(r'csvt',re.I)    #  不管大小写
csvt_re.findall("CSVT")
Out[67]: ['CSVT']
csvt_re.findall("CSsT")
Out[68]: []
csvt_re.findall("CSvT")
Out[69]: ['CSvT']
csvt_re.match('csvt hello')               # 匹配开头
Out[70]: <_sre.SRE_Match at 0x3b63648>
csvt_re.match('hello')
csvt_re.match('hello csvt')
csvt_re.search('hello csvt')                # 全部扫描
Out[73]: <_sre.SRE_Match at 0x3b637e8>
csvt_re.search('hello hello')
csvt_re.findall('hello csvt hello csvt csvt')
Out[75]: ['csvt', 'csvt', 'csvt']
csvt_re.finditer('hello csvt hello csvt csvt')   #  返回一个对象
Out[76]: <callable-iterator at 0x3b47908>
x=csvt_re.finditer('hello csvt hello csvt csvt')
x
Out[78]: <callable-iterator at 0x3b47f28>
x.next()
Out[79]: <_sre.SRE_Match at 0x3b63bf8>
x=csvt_re.match('hello csvt')     # 使用match如何看到那个结果
x=csvt_re.match('csvt hello')
x
Out[82]: <_sre.SRE_Match at 0x3b63ed0>
x.group()                         # 查看match的结果
Out[83]: 'csvt'
#  match 匹配开头
s="hello csvt"
s.replace('csvt','python')       # 替代
Out[86]: 'hello python'
rs=r"c..t"
re.sub(rs,'python','csft ckst clot cccc')
Out[88]: 'python python python cccc'
re.split(r'[\+\-\*]',"123+456-789*000")
Out[89]: ['123', '456', '789', '000']
r1=r"csvt.net"
re.findall(r1,'csvt.net')
Out[91]: ['csvt.net']
re.findall(r1,'csvtonet')
Out[92]: ['csvtonet']
re.findall(r1,'csvt\nnet',re.S)
Out[93]: ['csvt\nnet']
re.findall(r1,'csvt\tnet',re.S)           # 特殊字符
Out[94]: ['csvt\tnet']
s="""
hello csvt
csvt hello
hello csvt hello
csvt hehe
"""
r2=r"^csvt"
re.findall(r2,s)
Out[97]: []
s
Out[98]: '\nhello csvt\ncsvt hello\nhello csvt hello\ncsvt hehe\n'
re.findall(r2,s,re.M)           # 多行数据要加 M
Out[99]: ['csvt', 'csvt']
tel=r"""
\d{3,4}
-?
\d{8}
"""
re.findall(tel,'010-12345678',re.X)   #  多行正则要加 X
Out[101]: ['010-12345678']

email=r"\w{3}@\w+(\.com|\.cn)"        # 邮件匹配正则 很好！
re.match(email,'zzz@gmail.com')
Out[103]: <_sre.SRE_Match at 0x3a52dc8>
s="""
lsddk lsdjf hello src=jj yes lsdj
lsdj src=123 yes jdf
src=45 yes
hello src=089 yes lj
"""
r3=r"hello src=.+ yes"
re.findall(r3,s)
Out[106]: ['hello src=jj yes', 'hello src=089 yes']
r3=r"hello src=(.+) yes"            # 把感兴趣的值提取出来
re.findall(r3,s)
Out[108]: ['jj', '089']

match

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
print(re.match('www','www.runoob.com').span())
print(re.match('com','www.runoob.com'))

run C:\Anaconda\zz.py
(0, 3)
None

#!/usr/bin/python
import re
line = "Cats are smarter than dogs"
matchObj = re.match( r'(.*) are (.*?) .*', line, re.M|re.I)       # 多行 不区分大小写
if matchObj:
   print "matchObj.group() : ", matchObj.group()
   print "matchObj.group(1) : ", matchObj.group(1)
   print "matchObj.group(2) : ", matchObj.group(2)
else:
   print "No match!!"

以上实例执行结果如下：

matchObj.group() :  Cats are smarter than dogs
matchObj.group(1) :  Cats             # 不算 are？——不算：group(1) 只是第一个括号 (.*) 捕获的内容，" are " 在括号之外
matchObj.group(2) :  smarter

search 的用法

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
print(re.search('www', 'www.runoob.com').span())  # 在起始位置匹配
print(re.search('com', 'www.runoob.com').span())         # 不在起始位置匹配

以上实例运行输出结果为：

(0, 3)
(11, 14)

0 0