11.5. 初始化正则表达式

11.5.1. re.compile(pattern[, flags]) 将正则表达式编译为模式对象,该对象可以重复使用,避免每次匹配时重复解析和验证正则表达式

pattern         #正则表达式的匹配模式;
flags           #可选参数,编译标志,例如 re.I 表示忽略大小写

代码示例1

import re  # this snippet uses re.compile; import it so the example runs on its own

s = '''Life can be good;
Life can be bad;
Life is mostly cheerful;
But sometimes sad.
'''

# Compile once, reuse many times. The pattern matches a "b" followed by any
# word characters; re.I makes the match case-insensitive.
r = re.compile(r"b\w*", re.I)
new = r.sub("*", s)              # replace every match with "*"
print(new)                       # "be", "be", "bad" and "But" are all replaced

new2 = r.sub("*", s, count=2)    # replace only the first two matches
print(new2)

r = re.compile(r'b\w*')          # recompile without re.I: case-sensitive again
new = r.subn("*", s)             # subn() returns (new_string, number_of_substitutions)
print(new[0])

print(new[1])                    # number of replacements made (3: "But" no longer matches)
new3 = r.subn("*", s, count=1)   # replace only the first match
print(new3)                      # (text with one replacement, 1)

11.5.2. 搜寻时使用大括号设定比对次数

import re


def serachStr(pattern, msg):
    """Search *msg* for *pattern* and print whether a match was found.

    Args:
        pattern: Regular expression handed to ``re.search``.
        msg: Text to scan.

    Prints ``Match fail.... None`` when nothing matches, otherwise
    ``Match successful...`` followed by the matched text. Returns ``None``.
    """
    txt = re.search(pattern, msg)
    if txt is None:
        print("Match fail....", txt)  # no match: txt is None
    else:
        print("Match successful...", txt.group())  # match found: show the matched text


# "son" repeated one through five times.
msg1, msg2, msg3, msg4, msg5 = ("son" * repeat for repeat in range(1, 6))

# {3,5} applies to the whole group: three to five consecutive "son"s.
pattern = '(son){3,5}'
for candidate in (msg1, msg2, msg3, msg4, msg5):
    serachStr(pattern, candidate)

# Expected output:
# Match fail.... None
# Match fail.... None
# Match successful... sonsonson
# Match successful... sonsonsonson
# Match successful... sonsonsonsonson

11.5.3. 贪婪匹配和懒惰匹配

#!/usr/bin/env python
# -*- coding:utf8 -*-
# author: 18793
# Date:2019/5/21 17:31
# filename: 贪婪匹配和懒惰匹配.py
import re

# Greedy vs. lazy quantifier on the same eight-digit subject:
#   \d{5,8}   greedy - takes as many digits as allowed -> '87654321'
#   \d{5,8}?  lazy   - stops at the minimum of five    -> '87654'
for quantified in (r'\d{5,8}', r'\d{5,8}?'):
    m = re.search(quantified, '87654321')
    print(m)            # match object repr: span=(0, 8), then span=(0, 5)
    print(m.group())    # 87654321, then 87654

代码示例2

import re


def serachStr(pattern, msg):
    """Search *msg* for *pattern* and print whether a match was found.

    Args:
        pattern: Regular expression handed to ``re.search``.
        msg: Text to scan.

    Prints ``Match fail.... None`` when nothing matches, otherwise
    ``Match successful...`` followed by the matched text. Returns ``None``.
    """
    txt = re.search(pattern, msg)
    if txt is None:
        print("Match fail....", txt)  # no match: txt is None
    else:
        print("Match successful...", txt.group())  # match found: show the matched text



msg1 = "son" * 5

pattern1 = '(son){3,5}'    # greedy: longest match  -> Match successful... sonsonsonsonson
pattern2 = '(son){3,5}?'   # lazy: shortest match   -> Match successful... sonsonson
for quantified in (pattern1, pattern2):
    serachStr(quantified, msg1)

11.5.4. 正则的分组

代码示例

#!/usr/bin/env python
# -*- coding:utf8 -*-
# author: 18793
# Date:2019/8/12 17:01
# filename: 正则表达式分组1.py
import re

# Parentheses mark the groups to extract. ^(\d{3})\-(\d{3,8})$ defines two
# groups, so the area code and the local number can be pulled straight out
# of the matched string.

m = re.match(r'^(\d{3})\-(\d{3,8})$', '010-12345')
# group(0) is the whole match; group(1) and group(2) are the captured parts.
# Prints, in order: 010-12345, 010, 12345
for index in range(3):
    print(m.group(index))

命名分组

代码示例

#!/usr/bin/env python
# -*- coding:utf8 -*-
import re

s = "Phone No . 010-87654321"

# Numbered groups: two runs of digits separated by a hyphen.
r = re.compile(r'(\d+)-(\d+)')
m = r.search(s)

print(m)                    # match object repr: span=(11, 23), match='010-87654321'
print(m.group(1))           # 010
print(m.group(2))           # 87654321
print(m.groups())           # ('010', '87654321')

# Same pattern with named groups: (?P<name>...) lets groups be read by name.
r2 = re.compile(r'(?P<Area>\d+)-(?P<No>\d+)')
m = r2.search(s)
print(m)                    # match object repr: span=(11, 23), match='010-87654321'
# groupdict() takes an optional *default* for groups that did not take part
# in the match, not a group index, so it is called without arguments here.
print(m.groupdict())        # {'Area': '010', 'No': '87654321'}
print(m.group("No"))        # 87654321
print(m.group("Area"))      # 010

11.5.5. 代码示例:环视断言、非捕获组与命名反向引用

#!/usr/bin/env python
#-*- coding:utf8 -*-


import re

s = '''
Life can be good;
Life can be bad;
LIfe is mostly cheerful;
But sometimes sad.
'''

# Positive lookahead: match "be" only when it is followed by " good".
r = re.compile(r'be(?=\sgood)')
m = r.search(s)
print(m)                    # match object repr: span=(10, 12), match='be'
print(m.span())             # (10, 12)
print(r.findall(s))         # ['be']

# Plain "be" for comparison: both occurrences match.
r = re.compile('be')
print(r.findall(s))         # ['be', 'be']

# Negative lookahead: "be" that is NOT followed by " good".
r = re.compile(r'be(?!\sgood)')
m = r.search(s)
print(m)                    # match object repr: span=(28, 30), match='be'

# (?:...) groups without capturing, so only "\sgood" becomes group 1.
r = re.compile(r"(?:can\s)be(\sgood)")
m = r.search(s)
print(m)                    # match object repr: span=(6, 17), match='can be good'
print(m.groups())           # (' good',)
print(m.group(1))           # ' good'

# Named backreference: (?P=first) repeats whatever group "first" captured,
# so this finds doubled characters.
r = re.compile(r'(?P<first>\w)(?P=first)')
print(r.findall(s))         # ['o', 'e']

# Positive lookbehind: b-words that come right after "can ".
r = re.compile(r'(?<=can\s)b\w*\b')
print(r.findall(s))         # ['be', 'be']

# Negative lookbehind: b-words that do NOT come right after "can ".
r = re.compile(r"(?<!can\s)b\w*\b")
print(r.findall(s))         # ['bad']

# Same pattern, case-insensitive. The flag is passed to compile() because an
# inline global flag such as (?i) is only accepted at the very start of a
# pattern (placing it mid-pattern is an error on Python 3.11+).
r = re.compile(r'(?<!can\s)b\w*\b', re.I)
print(r.findall(s))         # ['bad', 'But']

11.5.6. 匹配对象与索引的使用

代码示例

#!/usr/bin/env python
# -*- coding:utf8 -*-

import re  # 导入re模块

s = """
life can be dreams,
Life can be great thoughts;
Life can mean a person,
Sitting in a court.
"""
# Compile a pattern for words that have an "a" with at least one word
# character on each side. A raw string keeps \b and \w intact without
# doubled backslashes or invalid string escapes.
r = re.compile(r'\b(?P<first>\w+)a(\w+)\b')
m = r.search(s)
print(m.groupdict())                # {'first': 'c'}
print(m.groups())                   # ('c', 'n')
# Search again, starting at index 9 of the string.
m = r.search(s, 9)
print(m.group())                    # dreams
print(m.group(1))                   # dre
print(m.group(2))                   # ms
print(m.group(1, 2))                # ('dre', 'ms')
print(m.groupdict())                # {'first': 'dre'}
print(m.groups())                   # ('dre', 'ms')

11.5.7. 匹配对象与组的使用

代码示例

#!/usr/bin/env python
#-*- coding:utf8 -*-

import re                   #导入re模块
s = """
life can be dreams,
Life can be great thoughts;
Life can mean a person,
Sitting in a court.
"""

# Words that have an "a" with at least one word character on each side.
# Written as a raw string so \b and \w need no doubled backslashes and
# produce no invalid string escapes.
r = re.compile(r'\b(?P<first>\w+)a(\w+)\b')
m = r.search(s)
print(m.groupdict())            # named groups of the first match: {'first': 'c'}
print(m.groups())               # all groups of the first match: ('c', 'n')
m = r.search(s, 9)              # search again, starting at index 9
print(m.group())                # whole match: dreams
print(m.group(1))               # dre
print(m.group(2))               # ms
print(m.group(1, 2))            # several groups at once: ('dre', 'ms')
print(m.groupdict())            # {'first': 'dre'}
print(m.groups())               # ('dre', 'ms')

11.5.8. 零宽断言

import re  # this snippet uses re.compile / re.match; import it so the example runs on its own

# Named `text` rather than `str` so the builtin str type is not shadowed.
text = 'aaa111aaa , bbb222&, 333ccc'

# Positive lookahead (?=exp): runs of digits whose last digit is followed by
# at least one a-z letter.
print(re.compile(r'\d+(?=[a-z]+)').findall(text))     # ['111', '333']

# Negative lookahead (?!exp): runs of digits whose last digit is NOT followed
# by an a-z letter.
print(re.compile(r"\d+(?![a-z]+)").findall(text))     # ['11', '222', '33']

# Positive lookbehind (?<=exp): runs of digits whose first digit is preceded
# by an a-z letter.
print(re.compile(r"(?<=[a-z])\d+").findall(text))     # ['111', '222']

# Negative lookbehind (?<!exp): runs of digits whose first digit is NOT
# preceded by an a-z letter.
print(re.compile(r"(?<![a-z])\d+").findall(text))     # ['11', '22', '333']

# Non-capturing group (?:exp): groups without creating a capture, so
# findall() returns the whole matches.
print(re.compile(r"(?:\d+)").findall(text))           # ['111', '222', '333']

s = 'Tom:9527 , Sharry:0003 '

# Named capture groups: group("num") is the same as group(2) -> '9527';
# group(0) is the whole match -> 'Tom:9527'.
print(re.match(r'(?P<name>\w+):(?P<num>\d+)', s).group(0))   # Tom:9527