正则表达式

来源：互联网发布：金英杰免费网络课登录编辑：程序博客网时间：2024/05/29 02:46

Python正则表达式代码示例

使用match()方法匹配字符串，从字符串起始部分对模式进行匹配。成功返回一个匹配对象，否则返回None，匹配对象的group()方法能够用于显示那个成功的匹配

import re
m=re.match('foo','foo')#模式匹配字符串
if m is not None:#如果匹配成功，就输出匹配内容
    m.group()

m#确认返回的匹配对象

<_sre.SRE_Match object; span=(0, 3), match='foo'>

m.group()

'foo'

#如下是一个失败的匹配，它返回None
m=re.match('foo','bar')#模式并不能匹配字符串
if m is not None:m.group()#单行版本的if语句
#如果没有if语句，会出现AttributeError异常（None是返回值，并没有group()属性【方法】）

m=re.match('foo','food on the table')#匹配成功,尽管字符串比模式要长，但从字符串的起始位置开始匹配就会成功
m.group()

'foo'

使用search()在一个字符串中查找模式（搜索与匹配的对比）

match()和search()的不同在于match()试图从字符串的起始部分开始匹配模式，而search()可以从任意位置开始匹配

m=re.match('foo','seafood')#匹配失败 试图将模式中的'f'匹配到字符串的首字母's'上
if m is not None:
    m.group()

m=re.search('foo','seafood')#使用search()代替
if m is not None:m.group()
m.group()

'foo'

匹配多个字符串

bt='bat|bet|bit'   #正则表达式模式：bat、bet、bit
m=re.match(bt,'bat')  #'bat'是一个匹配
m.group()

'bat'

m=re.match(bt,'blt')   #对'blt'没有匹配
# m.group()#匹配失败，
print(m)

None

m=re.match(bt,'He bit me!') #不能匹配字符串
if m is not None:m.group()

m=re.search(bt,'He bit me!')#通过搜索查找'bit'
m.group()

'bit'

匹配任何单个字符

点号（.）不能匹配一个换行符\n或者非字符，也就是说，一个空字符串

anyend='.end'
m=re.match(anyend,'bend') #点号匹配'b'
m.group()

'bend'

m=re.match(anyend,'end')  #不匹配任何字符
print(m)

None

m=re.match(anyend,'\nend')   #除了\n之外的任何字符
print(m)

None

m=re.search('.end','The end.')  #在搜索中匹配' '
m.group()

' end'

#下面的示例在正则表达式中搜索一个真正的句号（小数点），通过使用一个反斜线对句点的功能进行转义
patt314='3.14'    #表示正则表达式的点号
pi_patt='3\.14'   #表示字面量的点号（dec. point）
m=re.match(pi_patt,'3.14')  #精确匹配
m.group()

'3.14'

m=re.match(patt314,'3014')  #点号匹配'0'
m.group()

'3014'

m=re.match(patt314,'3.14')#点号匹配'.'
m.group()

'3.14'

创建字符集（[]）

m=re.match('[cr][23][dp][o2]','c3po') #匹配'c3po'
m.group()

'c3po'

m=re.match('[cr][23][dp][o2]','c2do')#匹配'c2do'
m.group()

'c2do'

m=re.match('r2d2|c3po','c2do')#不匹配'c2do'
print(m)

None

m=re.match('r2d2|c3po','r2d2')#匹配'r2d2'
m.group()

'r2d2'

重复、特殊字符以及分组

patt='\w+@(\w+\.)?\w+\.com'
re.match(patt,'nobody@xxx.com').group()

'nobody@xxx.com'

re.match(patt,'nobody@www.xxx.com').group()

'nobody@www.xxx.com'

#对上面的正则模式进行拓展，允许任意数量的中间子域名存在，注意细节变化，将'?'改为'*'
patt='\w+@(\w+\.)*\w+\.com'
re.match(patt,'nobody@www.xxx.yyy.zzzz.com').group()

'nobody@www.xxx.yyy.zzzz.com'

#（）具有匹配和保存子组的作用
m=re.match('\w\w\w-\d\d\d','abc-123')
m.group()

'abc-123'

m=re.match('\w\w\w-\d\d\d','abc-xyz')
print(m)

None

#注意如何使用group()方法访问每个独立的子组以及groups()方法以获取一个包含所有匹配子组的元组
m=re.match('(\w\w\w)-(\d\d\d)','abc-123')  #完全匹配
m.group()

'abc-123'

m.group(1)  #子组1

'abc'

m.group(2)  #子组2

'123'

m.groups()  #全部子组

('abc', '123')

#下面示例展示了不同的分组排列
m=re.match('ab','ab')  #没有子组
m.group() #完全匹配

'ab'

m.groups() #所有子组

()

m=re.match('(ab)','ab') #一个子组
m.group() #完整匹配

'ab'

m.group(1) #子组1

'ab'

m.groups() #全部子组

('ab',)

m=re.match('(a)(b)','ab')  #两个子组
m.group()  #完整匹配

'ab'

m.group(1)  #子组1

'a'

m.group(2)  #子组2

'b'

m.groups()  #所有子组

('a', 'b')

m=re.match('(a(b))','ab') #两个子组
m.group()  #完整匹配

'ab'

m.group(1)  #子组1

'ab'

m.group(2)  #子组2

'b'

m.groups() #所有子组

('ab', 'b')

匹配字符串的起始和结尾以及单词边界

#该操作符更多用于表示搜索而不是匹配，因为match()总是从字符串开始位置进行匹配
#下面的正则字符串使用了原始字符串，即以字母r开头的字符串
m=re.search('^The','The end.') #匹配
m.group()

'The'

m=re.search('^The','end. The')  #不作为起始
print(m)

None

m=re.search(r'\bthe','bite the dog') #在边界
m.group()

'the'

m=re.search(r'\bthe','bitethe dog')#有边界
print(m)

None

m=re.search(r'\Bthe','bitethe dog')#没有边界
m.group()

'the'

使用findall()和finditer()查找每一次出现的位置

findall()查询字符串中某个正则表达式模式全部的非重叠出现情况，返回值总是一个列表。如果没有匹配部分则返回一个空列表

re.findall('car','car')

['car']

re.findall('car','scary')

['car']

re.findall('car','carry the barcardi to the car')

['car', 'car', 'car']

#finditer()返回的是一个迭代器
s='This and that.'
re.findall(r'(th\w+) and (th\w+)',s,re.I)

[('This', 'that')]

[g.groups() for g in re.finditer(r'(th\w+) and (th\w+)',s,re.I)]

[('This', 'that')]

for  m in re.finditer(r'(th\w+) and (th\w+)',s,re.I):
    print(m.group())

This and that

使用sub()和subn()搜索与替换

#subn()和sub()一样，但subn()还返回一个表示替换的总数，替换后的字符串和表示替换总数的数字一起作为一个拥有两个元素的元组返回
re.sub('X','Mr. Smith','attn: X\n\nDear X,\n')

'attn: Mr. Smith\n\nDear Mr. Smith,\n'

re.subn('X','Mr. Smith','attn: X\n\nDear X,\n')

('attn: Mr. Smith\n\nDear Mr. Smith,\n', 2)

print(re.sub('X','Mr. Smith','attn: X\n\nDear X,\n'))

attn: Mr. Smith

Dear Mr. Smith,

re.sub('[ae]','X','abcdef')

'XbcdXf'

re.subn('[ae]','X','abcdef')

('XbcdXf', 2)

在限定模式上使用split()分隔字符串

import re
DATA=(
    'Mountain View,CA 94040',
    'Sunnyvale,CA',
    'Los Altos,94023',
    'Cupertino 95014',
    'Palo Alto CA',
)
for datum in DATA:
    print(re.split(',|(?=(?:\d{5}|[A-Z]{2}))',datum))#如果空格后面紧跟五个数字（ZIP编码）或者两个大写字母
    # （美国联邦州缩写），就用split语句分割该空格。

['Mountain View', 'CA 94040']
['Sunnyvale', 'CA']
['Los Altos', '94023']
['Cupertino 95014']
['Palo Alto CA']
C:\Users\ZJL\AppData\Local\Programs\Python\Python35\lib\re.py:203: FutureWarning: split() requires a non-empty pattern match.
  return _compile(pattern, flags).split(string, maxsplit)

扩展符号

import re
re.findall(r'(?i)yes','yes? Yes. YES!!')

['yes', 'Yes', 'YES']

re.findall(r'(?i)th\w+','The quickest way is through this tunnel')

['The', 'through', 'this']

re.findall(r'(?im)(^th[\w ]+)','''
This line is the first,
annother line
that line,it's the best
''')

['This line is the first', 'that line']

#下一组演示使用re.S/DOTALL。该标记表明点号(.)能够用来表示\n符号
re.findall(r'th.+','''
The first line
the second line
the third line
''')

['the second line', 'the third line']

re.findall(r'(?s)th.+','''
The first line
the second line
the third line
''')

['the second line\nthe third line\n']

#re.X/VERBOSE标记，可以抑制正则表达式中的空白符。使正则表达式更具可读性
re.search(r'''(?x)
    \((\d{3})\)#区号
    [ ]      #空白符
    (\d{3})  #前缀
    -       #横线
    (\d{4})   #终点数字
''','(800) 555-1212').groups()

('800', '555', '1212')

#(?:...)可以对部分正则表达式进行分组，但是不会保存该分组用于后续的检索或者应用
re.findall(r'http://(?:\w+\.)*(\w+\.com)','http://google.com http://www.google.com http://code.google.com ')

['google.com', 'google.com', 'google.com']

re.search(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-(?:\d{4})','(800) 555-1212').groupdict()

{'areacode': '800', 'prefix': '555'}

re.sub(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-(?:\d{4})','(\g<areacode>) \g<prefix>-xxxx)','(800) 555-1212')

'(800) 555-xxxx)'

#(?=...)和(?!...)符号在目标字符串中实现一个前视匹配，而不必实际上使用这些字符串。前者是正向前视断言，后者
#是负向前视断言，具体作用看例子意会
re.findall(r'\w+(?= van Rossum)','''
Guido van Rossum
Tim Peters
Alex Martelli
Just van Rossum
Raymond Hettinger
''')

['Guido', 'Just']

re.findall(r'(?m)^\s+(?!noreply|postmaster)(\w+)',
           '''
        sales@phptr.com
        postmaster@phptr.com
        eng@phptr.com
        noreply@phptr.com
        admin@phptr.com
                         ''')

['sales', 'eng', 'admin']

['%s@aw.com' %e.group(1) for e in  re.finditer(r'(?m)^\s+(?!noreply|postmaster)(\w+)',
           '''
        sales@phptr.com
        postmaster@phptr.com
        eng@phptr.com
        noreply@phptr.com
        admin@phptr.com
                         ''')]

['sales@aw.com', 'eng@aw.com', 'admin@aw.com']

#下面一个示例展示了使用条件正则表达式匹配
bool(re.search(r'(?:(x)|y)(?(1)y|x)','xy'))

True

bool(re.search(r'(?:(x)|y)(?(1)y|x)','xx'))

False

#\w和\W字母数字字符集同时受re.L/LOCALE和Unicode(re.U/UNICODE)标记所影响

关于使用Python原始字符串，退格符\b和正则表达式\b之间的差异，下面举例说明

m=re.match('\bblow','blow')#backspace、no match
if m:m.group()

m=re.match('\\bblow','blow')#escaped\,now it works
if m:m.group()
m.group()

'blow'

m=re.match(r'\bblow','blow')#use raw string instead
m.group()

'blow'

一些正则表达式示例

#这段代码作用是列出所有登录当前系统中的用户信息,linux系统下有用
import os
import re
with os.popen('who','r') as f:#在windows中使用tasklist命令，类似
    for eachLine in f:
        print(re.split(r'\s\s+|\t',eachLine.strip()))

更长的正则表达式示例

#下面的代码是用于正则表达式练习的数据生成器
from random import randrange,choice
from string import ascii_lowercase as lc
from sys import maxsize
from time import ctime
tlds=('com','edu','net','org','gov')
for i in range(randrange(5,11)):
    dtint=randrange(8381231212) #pick date 这里不能使用maxsize做上限，太大超出参数范围了
    dtstr=ctime(dtint)  #date string
    llen=randrange(4,8)  #login is shorter
    login=''.join(choice(lc) for j in range(llen))
    #注意：这里的负号使dlen为负数，range(dlen)为空，所以下面输出中的域名是空的、最后一个数字为负；
    #去掉负号才能生成比登录名更长的域名
    dlen=-randrange(llen,13)  #domain is longer
    dom=''.join(choice(lc) for j in range(dlen))
    print('%s::%s@%s.%s::%d-%d-%d' %(dtstr,login,dom,choice(tlds),dtint,llen,dlen))

Sun Oct 14 04:16:10 2153::sowi@.edu::5799672970-4--5
Sun Dec 14 15:54:33 2064::rmdlm@.gov::2996466873-5--5
Fri Mar 23 12:26:49 2001::wylys@.org::985321609-5--8
Tue Jan  9 10:49:31 2148::yyujmo@.org::5617824571-6--12
Fri Apr  2 18:32:58 2162::wnagmca@.gov::6066873178-7--12
Wed Jan  3 01:39:36 2091::onfqs@.org::3818597976-5--8
Tue Dec 13 10:45:55 1988::gklecis@.org::597984355-7--11

阅读全文

0 0