Contents
11.5. 初始化正则表达式¶
11.5.1. re.compile(pattern[,flags]) 编译正则表达式,可以重复使用,减少正则表达式的解析和验证¶
pattern #正则表达式的匹配模式;
flags #可选参数,编译标志。 re.I 忽略大小写
代码示例1
s = '''Life can be good;
Life can be bad;
Life is mostly cheerful;
But sometimes sad.
'''
r = re.compile("b\w*", re.I) #编译正则表达式,忽略大小写
new = r.sub("*", s) #使用sub()替换字符
print(new) #输出结果,可以看到所有以“b”开头的单词都被替换
new2 = r.sub("*", s, 2) #只在字符串中替换两次
print(new2)
r = re.compile('b\w*') #重新编译,不忽略大小写
new = r.subn("*", s)
print(new[0])
print(new[1]) #输出替换的次数
new3 = r.subn("*", s, 1) #只在字符串中替换一次
11.5.2. 搜寻时使用大括号设定比对次数¶
import re
def serachStr(pattern, msg):
txt = re.search(pattern, msg)
if txt == None:
print("Match fail....", txt) # 匹配成功
else:
print("Match successful...", txt.group()) # 匹配失败
msg1 = "son"
msg2 = "son" * 2
msg3 = "son" * 3
msg4 = "son" * 4
msg5 = "son" * 5
pattern = '(son){3,5}'
serachStr(pattern, msg1)
serachStr(pattern, msg2)
serachStr(pattern, msg3)
serachStr(pattern, msg4)
serachStr(pattern, msg5)
# Match fail.... None
# Match fail.... None
# Match successful... sonsonson
# Match successful... sonsonsonson
# Match successful... sonsonsonsonson
11.5.3. 贪婪匹配和懒惰匹配¶
#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/5/21 17:31
# filename: 贪婪匹配和懒惰匹配.py
import re
# 使用贪婪匹配
m = re.search(r'\d{5,8}', '87654321')
print(m) #<_sre.SRE_Match object; span=(0, 8), match='87654321'>
print(m.group()) #87654321
# 使用惰性匹配
m = re.search(r'\d{5,8}?', '87654321') #<_sre.SRE_Match object; span=(0, 5), match='87654'>
print(m)
print(m.group()) #87654
代码示例2
import re
def serachStr(pattern, msg):
txt = re.search(pattern, msg)
if txt == None:
print("Match fail....", txt) # 匹配成功
else:
print("Match successful...", txt.group()) # 匹配失败
msg1 = "son" * 5
pattern1 = '(son){3,5}'
pattern2 = '(son){3,5}?'
serachStr(pattern1, msg1) #Match successful... sonsonsonsonson # 贪婪模式,最长匹配
serachStr(pattern2, msg1) #Match successful... sonsonson # 非贪婪模式,最短匹配
11.5.4. 正则的分组¶
代码示例
#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/12 17:01
# filename: 正则表达式分组1.py
import re
# 用()表示的就是要提取的分组^(\d{3})\-(\d{3,8})$ 分别定义了两个组。可以直接从匹配的字符串中提取出区号和本地号码
m = re.match(r'^(\d{3})\-(\d{3,8})$', '010-12345')
print(m.group(0)) # 010-12345
print(m.group(1)) # 010
print(m.group(2)) # 12345
命名分组¶
代码示例
#!/usr/bin/env python
# -*- coding:utf8 -*-
import re
s = "Phone No . 010-87654321"
r = re.compile(r'(\d+)-(\d+)')
m = r.search(s)
print(m) # <_sre.SRE_Match object; span=(11, 23), match='010-87654321'>
print(m.group(1)) # 010
print(m.group(2)) # 87654321
print(m.groups()) # ('010', '87654321')
r2 = re.compile(r'(?P<Area>\d+)-(?P<No>\d+)')
m = r2.search(s)
print(m) ##<_sre.SRE_Match object; span=(11, 23), match='010-87654321'>
print(m.groupdict(2)) # {'Area': '010', 'No': '87654321'}
print(m.group("No")) # 87654321
print(m.group("Area")) # 010
11.5.5. 代码示例¶
#!/usr/bin/env python
#-*- coding:utf8 -*-
import re
s = '''
Life can be good;
Life can be bad;
LIfe is mostly cheerful;
But sometimes sad.
'''
r = re.compile(r'be(?=\sgood)') #编译正则表达式,只匹配单词"good"的"be"
m = r.search(s)
print(m) #<_sre.SRE_Match object; span=(10, 12), match='be'>
m.span() #编译正则表达式,只匹配其后单词"good"的"be"
print(r.findall(s)) #搜索字符串 #['be']
r = re.compile('be') #查看m
r.findall(s)
r = re.compile(r'be(?!\sgood)')
m = r.search(s)
print(m) #<_sre.SRE_Match object; span=(28, 30), match='be'>
r = re.compile(r"(?:can\s)be(\sgood)") #使用组来匹配"be good"
m = r.search(s)
print(m) #<_sre.SRE_Match object; span=(6, 17), match='can be good'>
print(m.groups()) #(' good',)
print(m.group(1)) # good
r = re.compile(r'(?P<first>\w)(?P=first)') #使用组名重复,此处匹配具有两个重复字母的单词
print(r.findall(s)) #输出匹配到的字母 #['o', 'e']
r = re.compile(r'(?<=can\s)b\w*\b') #匹配以字母“b”开头在“can”之后的单词
print(r.findall(s)) #输出匹配到的单词 #['be', 'be']
r = re.compile(r"(?<!can\s)b\w*\b") #匹配以字母"b"开头不在"can"之后单词
print(r.findall(s)) #['bad']
r = re.compile(r'(?<!can\s)(?i)b\w*\b') #重新编译忽略大小写
print(r.findall(s)) #['bad', 'But']
11.5.6. 匹配对象与索引的使用¶
代码示例
#!/usr/bin/env python
# -*- coding:utf8 -*-
import re # 导入re模块
s = """
life can be dreams,
Life can be great thoughts;
Life can mean a person,
Sitting in a court.
"""
# 编译正则表达式,匹配所有包含字母“a”的单词
r = re.compile('\\b(?P<first>\w+)a(\w+)\\b')
m = r.search(s)
print(m.groupdict()) # {'first': 'c'}
print(m.groups()) # ('c', 'n')
# 从指定位置开始重新搜索
m = r.search(s, 9)
print(m.group()) # dreams
print(m.group((1))) # dre
print(m.group((2))) # ms
print(m.group(1, 2)) # ('dre', 'ms')
print(m.groupdict()) # {'first': 'dre'}
print(m.groups()) # ('dre', 'ms')
11.5.7. 匹配对象与组的使用¶
代码示例
#!/usr/bin/env python
#-*- coding:utf8 -*-
import re #导入re模块
s = """
life can be dreams,
Life can be great thoughts;
Life can mean a person,
Sitting in a court.
"""
r = re.compile('\\b(?P<first>\w+)a(\w+)\\b') #编译正则表达式,匹配所有包含字母“a”的单词
m = r.search(s)
print(m.groupdict())
print(m.groups())
m = r.search(s,9) #从指定位置开始重新搜索
print(m.group())
print(m.group((1)))
print(m.group((2)))
print(m.group(1,2))
print(m.groupdict())
print(m.groups())
11.5.8. 零宽断言¶
str = 'aaa111aaa , bbb222&, 333ccc'
re.compile('\d+(?=[a-z]+)').findall(str) # 前向界定 (?=exp) 找出连续的数字并且最后一个数字跟着至少一个a-z ['111', '333']
re.compile(r"\d+(?![a-z]+)").findall(str) # 前向否定界定 (?!exp) 找出连续数字,且最后一个数字后不能跟a-z ['11', '222', '33']
re.compile(r"(?<=[a-z])\d+").findall(str) # 反向界定 (?<=exp) 逆序环视 找出连续的数字,且第一个数字前面是a-z ['111', '222']
re.compile(r"(?<![a-z])\d+").findall(str) # 反向否定界定 (?<!exp) 否定逆序环视 找出连续的数字,且第一个数字前不能是a-z ['11', '22', '333']
re.compile(r"(?:\d+)").findall(str) # 无捕获的匹配 (?:exp)
s= 'Tom:9527 , Sharry:0003 '
re.match( r'(?P<name>\w+):(?P<num>\d+)' , s).group(0) # 捕获组 <num>第二个标签变量[9527] 获取 group("num") 等同 group(2)[9527], group(0)全部[Tom:9527]