re模块详细介绍

Posted 2020-09-28 sunnymn

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了re模块详细介绍相关的知识，希望对你有一定的参考价值。

\w  匹配字母、数字及下划线
\W  匹配除字母、数字、下划线以外的任意字符
\s  匹配任意空白字符
\S  匹配任意非空白字符
\d  匹配任意数字，等价于[0-9]
\D  匹配任意非数字
\A  匹配字符串开始
\Z  匹配字符串结束
\n  匹配一个换行符
\t  匹配一个制表符
^   匹配字符串的开头
$   匹配字符串的结尾
.   匹配任意字符，除了换行符，当re.DOTALL标记被指定时，则可以匹配包括换行符的任意字符。
[...] 用来表示一组字符，单独列出：[amk]匹配‘a‘或‘m‘或‘k‘
[^...]不在[]中的字符：[^amk]匹配除了a,m,k之外的字符
*   匹配0个或多个的表达式
+   匹配1个或多个的表达式
?   匹配0个或1个由前面的正则表达式定义的片段，贪婪方式；跟在其他量词之后（如*?、+?、{n,m}?）时表示非贪婪方式
{n}   精确匹配n个前面表达式
{n,m} 匹配n到m次由前面的正则表达式定义的片段，贪婪方式
a|b   匹配a或b
()    匹配括号内的表达式，也表示一个组
.*  默认为贪婪匹配
.*? 为非贪婪匹配：推荐使用
总结：尽量精简，详细的如下
    尽量使用泛匹配模式：.*
    尽量使用非贪婪模式：.*?
    使用括号得到匹配目标：用group(n)去取得结果
    有换行符就用re.S修改模式
------------------------------------------------------------------------------------------------------------------
例子：

# coding=utf-8

import re
----- \w  \W
ret = re.findall(‘\w‘,‘hello egon 123‘)
print(ret)
ret = re.findall(‘\W‘,‘hello egon 123‘)
print(ret)

------\s  \S
ret = re.findall(‘\s‘,‘hello egon 123‘)
print(ret)
ret = re.findall(‘\S‘,‘hello egon 123‘)
print(ret)

-----\d  \D
ret = re.findall(‘\d‘,‘hello egon 123‘)
print(ret)
ret = re.findall(‘\D‘,‘hello egon 123‘)
print(ret)

-----\A  \Z
ret = re.findall(‘\Ah‘,‘hello egon 123‘)
print(ret)
ret = re.findall(‘123\Z‘,‘hello egon 123‘)
print(ret)

-----\n  \t
ret = re.findall(r‘\n‘,‘hello egon \n123‘)
print(ret)
ret= re.findall(r‘\t‘,‘hello egon \t123‘)
print(ret)

----^  $
print(re.findall(‘^h‘,‘hello egon 123‘))
print(re.findall(‘123$‘,‘hello egon 123‘))

---- .
print(re.findall(‘a.b‘,‘alb‘))

---- ?
print(re.findall(‘ab?‘,‘a‘))
print(re.findall(‘ab?‘,‘abbb‘))

匹配包含小数在内的数字
print(re.findall(‘\d+\.?\d*‘,‘asdfasdf123as1.13dfa12adsf1asdf3‘))
[‘123‘, ‘1.13‘, ‘12‘, ‘1‘, ‘3‘]

---- .* 默认为贪婪匹配
print(re.findall(‘a.*b‘,‘a1b22222222b‘))#[‘a1b22222222b‘]

----- .*?为非贪婪匹配：推荐使用
print(re.findall(‘a(.*?)b‘,‘a1b22222222b‘))#[‘1‘]

---- +
print(re.findall(‘ab+‘,‘a‘))#[]
print(re.findall(‘ab+‘,‘abbbb123bbb‘))#[‘abbbb‘]

---- {n,m}
print(re.findall(‘ab{2}‘,‘abbbb‘))#[‘abb‘]
print(re.findall(‘ab{2,4}‘,‘abbbb‘))#[‘abbbb‘]
print(re.findall(‘ab{1,}‘,‘abbbb‘))#[‘abbbb‘]
print(re.findall(‘ab{2}‘,‘abbbb‘))#[‘abb‘]

----- []
print(re.findall(‘a[l*-]b‘,‘alb a*b a-b‘))#[‘alb‘, ‘a*b‘, ‘a-b‘]#[]内的都为普通字符了，且如果-没有被转义的话，应该放到[]的开头或结尾
print(re.findall(‘a[^1*-]b‘,‘a1b a*b a-b a=b‘))#[‘a=b‘]#[]内的^代表的意思是取反
print(re.findall(‘a[a-z]b‘,‘alb a*b a-b a=b aeb‘))#[‘alb‘, ‘aeb‘]
print(re.findall(‘a[a-zA-Z]b‘,‘a2b a*b a-b a=b aeb aEb‘))#[‘aeb‘, ‘aEb‘]

------- \
print(re.findall(r‘a\\c‘,‘a\c‘))#[‘a\\c‘]

re_str_patt = "\\\\d\\+"
print(re_str_patt) #\\d\+
reObj = re.compile(re_str_patt)
print(reObj.findall("\\d+"))#[‘\\d+‘]

-------- ():分组
print(re.findall(‘(ab)+123‘,‘ababab123‘))#[‘ab‘],匹配到末尾的ab123中的ab
print(re.findall(‘(?:ab)+123‘,‘ababab123‘))#[‘ababab123‘],findall的结果不是匹配的全部内容，而是组内的内容,?:可以让结果为匹配的全部内容

--------- |
print(re.findall(‘compan(?:y|ies)‘,‘Too many companies have gone bankrupt, and the next one is my company‘))
[‘companies‘, ‘company‘]


------re模块提供的方法介绍
findall
print(re.findall(‘e‘,‘alex make love‘))#[‘e‘, ‘e‘, ‘e‘],返回所有满足匹配条件的结果,放在列表里
search
print(re.search(‘e‘,‘alex make love‘).group())#e,只找到第一个匹配就返回一个包含匹配信息的对象,该对象可以通过调用group()方法得到匹配的字符串,如果字符串没有匹配，则返回None。
match
print(re.match(‘e‘,‘alex make love‘))#None,同search，不过在字符串开始处进行匹配，完全可以用search+^ 代替match
split
print(re.split(‘[ab]‘,‘abcd‘))#[‘‘, ‘‘, ‘cd‘]，先按‘a‘分割得到‘‘和‘bcd‘,再对‘‘和‘bcd‘分别按‘b‘分割
sub
print(re.sub(‘a‘,‘A‘,‘alex make love‘))#Alex mAke love

print(re.sub(‘a‘,‘A‘,‘alex make love‘,1))#Alex make love

print(re.sub(‘a‘,‘A‘,‘alex make love‘,2))#Alex mAke love

print(re.sub(‘^(\w+)(.*?\s)(\w+)(.*?\s)(\w+)(.*?)$‘,r‘\5\2\3\4\1‘,‘alex make love‘))#love make alex

print(re.subn(‘a‘,‘A‘,‘alex make love‘))#(‘Alex mAke love‘, 2)结果带有总替换个数

compile
obj = re.compile(‘\d{2}‘)
s = ‘abc123eeee‘
print(obj.findall(s))#[‘12‘]
print(obj.search(s).group())#12

--------补充

print(re.findall(‘<(?P<tag_name>\w+)>\w+</(?P=tag_name)>‘,‘<h1>hello</h1>‘))#[‘h1‘]

print(re.search(‘<(?P<tag_name>\w+)>\w+</(?P=tag_name)>‘,‘<h1>hello</h1>‘).group())#<h1>hello</h1>

print(re.search(r‘<(\w+)>\w+</(\w+)>‘,‘<h1>hello</h1>‘).group())#<h1>hello</h1>
print(re.search(r‘<(\w+)>\w+</\1>‘,‘<h1>hello</h1>‘).group())#<h1>hello</h1>

print(re.findall(‘-?\d+\.\d*|(-?\d+)‘,‘1-2*(60+(-40.35/5)-(-4*3))‘))
找出所有整数[‘1‘, ‘-2‘, ‘60‘, ‘‘, ‘5‘, ‘-4‘, ‘3‘]
print(re.findall(‘-?\d+\.?\d*‘,‘1-2*(60+(-40.35/5)-(-4*3))‘))#[‘1‘, ‘-2‘, ‘60‘, ‘-40.35‘, ‘5‘, ‘-4‘, ‘3‘]

expression=‘1-2*((60+2*(-3-40.0/5)*(9-2*5/3+7/3*99/4*2998+10*568/14))-(-4*3)/(16-3*2))‘
print(re.search(‘\(([\+\-\*/]\d+\.?\d*)+\)‘,expression).group())##(-3-40.0/5)

最常规匹配
content=‘Hello 123 456 World_This is a Regex Demo‘
res=re.match(‘Hello\s\d\d\d\s\d{3}\s\w{10}.*Demo‘,content)
print(res)
print(res.group())
print(res.span())

泛匹配
content=‘Hello 123 456 World_This is a Regex Demo‘
res=re.match(‘^Hello.*‘,content)
print(res.group())

匹配目标,获得指定数据

content=‘Hello 123 456 World_This is a Regex Demo‘
res=re.match(‘^Hello\s(\d+)\s(\d+)\s.*Demo‘,content)
print(res.group()) #取所有匹配的内容
print(res.group(1)) #取匹配的第一个括号内的内容
print(res.group(2)) #取匹配的第二个括号内的内容

贪婪匹配：.*代表匹配尽可能多的字符
content=‘Hello 123 456 World_This is a Regex Demo‘
res = re.match(‘^He.*(\d+).*Demo$‘,content)
print(res.group(1))#6,因为.*会尽可能多的匹配，然后后面跟至少一个数字

非贪婪匹配：.*?代表匹配尽可能少的字符
content=‘Hello 123 456 World_This is a Regex Demo‘
res = re.match(‘^He.*?(\d+).*Demo$‘,content)
print(res.group(1))#123

匹配模式：.默认不能匹配换行符，需要指定re.S（即re.DOTALL）才能匹配
content=‘‘‘Hello 123 456 World_This
is a Regex Demo
‘‘‘
res = re.match(‘He.*?(\d+).*?Demo$‘,content)
print(res)#None
res = re.match(‘He.*?(\d+).*?Demo$‘,content,re.S)
print(res.group(1))#123

转义:\
content=‘price is $5.00‘
res=re.match(‘price is $5.00‘,content)
print(res)#None

res=re.match(‘price is \$5\.00‘,content)
print(res.group())#price is $5.00

以上是关于re模块详细介绍的主要内容，如果未能解决你的问题，请参考以下文章