perl regular
时间:2007-03-29 来源:我爱spring
Perl is widely renowned for excellence in text processing, and regular
expressions are one of the big factors behind this fame. Perl regular
expressions display an efficiency and flexibility unknown in most
other computer languages. Mastering even the basics of regular
expressions will allow you to manipulate text with surprising ease.
$greeting = "World";
if ("Hello World" =~ /$greeting/) {
print "It matches\n";
}
else {
print "It doesn't match\n";
$_ = "Hello World";
if (/World/) {
print "It matches\n";
}
else {
print "It doesn't match\n";
"Hello World" =~ m!World!; # matches, delimited by '!'
"Hello World" =~ m{World}; # matches, note the matching '{}'
"/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin',
# '/' becomes an ordinary char
"Hello World" =~ /world/; # doesn't match
"Hello World" =~ /o W/; # matches
"Hello World" =~ /oW/; # doesn't match
"Hello World" =~ /World /; # doesn't match
metacharacters: {}[]()^$.|*+?\
"2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter
"2+2=4" =~ /2\+2/; # matches, \+ is treated like an ordinary +
"The interval is [0,1)." =~ /[0,1)./ # is a syntax error!
"The interval is [0,1)." =~ /\[0,1\)\./ # matches
"/usr/bin/perl" =~ /\/usr\/bin\/perl/; # matches
"/usr/bin/perl" =~ m!/usr/bin/perl!; # easier to read
$foo = 'house';
'housecat' =~ /$foo/; # matches
'cathouse' =~ /cat$foo/; # matches
'housecat' =~ /${foo}cat/; # matches
% cat > simple_grep
#!/usr/bin/perl
$regexp = shift;
while (<>) {
print if /$regexp/;
}
^D
% chmod +x simple_grep
% simple_grep abba /usr/dict/words
Babbage
cabbage
cabbages
sabbath
Sabbathize
Sabbathizes
sabbatical
scabbard
scabbards
"keeper" =~ /^keep$/; # doesn't match
"keeper" =~ /^keeper$/; # matches
"" =~ /^$/; # ^$ matches an empty string
/cat/; # matches 'cat'
/[bcr]at/; # matches 'bat, 'cat', or 'rat'
/item[0123456789]/; # matches 'item0' or ... or 'item9'
"abc" =~ /[cab]/; # matches 'a'
/[yY][eE][sS]/; # match 'yes' in a case-insensitive way
# 'yes', 'Yes', 'YES', etc.
/[\]c]def/; # matches ']def' or 'cdef'
$x = 'bcr';
/[$x]at/; # matches 'bat', 'cat', or 'rat'
/[\$x]at/; # matches '$at' or 'xat'
/[\\$x]at/; # matches '\at', 'bat, 'cat', or 'rat'
/item[0-9]/; # matches 'item0' or ... or 'item9'
/[0-9bx-z]aa/; # matches '0aa', ..., '9aa',
# 'baa', 'xaa', 'yaa', or 'zaa'
/[0-9a-fA-F]/; # matches a hexadecimal digit
/[0-9a-zA-Z_]/; # matches a "word" character,
# like those in a perl variable name
/[^a]at/; # doesn't match 'aat' or 'at', but matches
# all other 'bat', 'cat, '0at', '%at', etc.
/[^0-9]/; # matches a non-numeric character
/[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary
-
\d is a digit and represents [0-9]
-
\s is a whitespace character and represents [\ \t\r\n\f]
-
\w is a word character (alphanumeric or _) and represents [0-9a-zA-Z_]
-
\D is a negated \d; it represents any character but a digit [^0-9]
-
\S is a negated \s; it represents any non-whitespace character [^\s]
-
\W is a negated \w; it represents any non-word character [^\w]
-
The period '.' matches any character but "\n"
/\d\d:\d\d:\d\d/; # matches a hh:mm:ss time format
/[\d\s]/; # matches any digit or whitespace character
/\w\W\w/; # matches a word char, followed by a
# non-word char, followed by a word char
/..rt/; # matches any two chars, followed by 'rt'
/end\./; # matches 'end.'
/end[.]/; # same thing, matches 'end.'
$x = "Housecat catenates house and cat";
$x =~ /cat/; # matches cat in 'housecat'
$x =~ /\bcat/; # matches cat in 'catenates'
$x =~ /cat\b/; # matches cat in 'housecat'
$x =~ /\bcat\b/; # matches 'cat' at end of string
$x = "There once was a girl\nWho programmed in Perl\n";
$x =~ /^Who/; # doesn't match, "Who" not at start of string
$x =~ /^Who/s; # doesn't match, "Who" not at start of string
$x =~ /^Who/m; # matches, "Who" at start of second line
$x =~ /^Who/sm; # matches, "Who" at start of second line
$x =~ /girl.Who/; # doesn't match, "." doesn't match "\n"
$x =~ /girl.Who/s; # matches, "." matches "\n"
$x =~ /girl.Who/m; # doesn't match, "." doesn't match "\n"
$x =~ /girl.Who/sm; # matches, "." matches "\n"
$x =~ /^Who/m; # matches, "Who" at start of second line
$x =~ /\AWho/m; # doesn't match, "Who" is not at start of string
$x =~ /girl$/m; # matches, "girl" at end of first line
$x =~ /girl\Z/m; # doesn't match, "girl" is not at end of string
$x =~ /Perl\Z/m; # matches, "Perl" is at newline before end
$x =~ /Perl\z/m; # doesn't match, "Perl" is not at end of string
"cats and dogs" =~ /cat|dog|bird/; # matches "cat"
"cats and dogs" =~ /dog|cat|bird/; # matches "cat"
/(a|b)b/; # matches 'ab' or 'bb'
/(ac|b)b/; # matches 'acb' or 'bb'
/(^a|b)c/; # matches 'ac' at start of string or 'bc' anywhere
/(a|[bc])d/; # matches 'ad', 'bd', or 'cd'
/house(cat|)/; # matches either 'housecat' or 'house'
/house(cat(s|)|)/; # matches either 'housecats' or 'housecat' or
# 'house'. Note groups can be nested.
# extract hours, minutes, seconds
if ($time =~ /(\d\d):(\d\d):(\d\d)/) { # match hh:mm:ss format
$hours = $1;
$minutes = $2;
$seconds = $3;
}
-
a? = match 'a' 1 or 0 times
-
a* = match 'a' 0 or more times, i.e., any number of times
-
a+ = match 'a' 1 or more times, i.e., at least once
-
a{n,m} = match at least n times, but not more than m times.
-
a{n,} = match at least n or more times
-
a{n} = match exactly n times
$x = "the cat in the hat";
$x =~ /^(.*)(cat)(.*)$/; # matches,
# $1 = 'the '
# $2 = 'cat'
# $3 = ' in the hat'
$x =~ /^(.*)(at)(.*)$/; # matches,
# $1 = 'the cat in the h'
# $2 = 'at'
# $3 = '' (0 matches)
$x = "The programming republic of Perl";
$x =~ /^(.+)(e|r)(.*)$/; # matches,
# $1 = 'The programming republic of Pe'
# $2 = 'r'
# $3 = 'l'
$x =~ /(m{1,2})(.*)$/; # matches,
# $1 = 'mm'
# $2 = 'ing republic of Perl'
$x =~ /.*(m{1,2})(.*)$/; # matches,
# $1 = 'm'
# $2 = 'ing republic of Perl'
$x =~ /(.?)(m{1,2})(.*)$/; # matches,
# $1 = 'a'
# $2 = 'mm'
# $3 = 'ing republic of Perl'
-
a?? = match 'a' 0 or 1 times. Try 0 first, then 1.
-
a*? = match 'a' 0 or more times, i.e., any number of times, but as few times as possible
-
a+? = match 'a' 1 or more times, i.e., at least once, but as few times as possible
-
a{n,m}? = match at least n times, not more than m times, as few times as possible
-
a{n,}? = match at least n times, but as few times as possible
-
a{n}? = match exactly n times. Because we match exactly n times, a{n}? is equivalent to a{n} and is just there for notational consistency
$x = "The programming republic of Perl";
$x =~ /^(.+?)(e|r)(.*)$/; # matches,
# $1 = 'Th'
# $2 = 'e'
# $3 = ' programming republic of Perl'
$x =~ /(m{1,2}?)(.*?)$/; # matches,
# $1 = 'm'
# $2 = 'ming republic of Perl'
$x =~ /(.*?)(m{1,2}?)(.*)$/; # matches,
# $1 = 'The progra'
# $2 = 'm'
# $3 = 'ming republic of Perl'
$x =~ /(.??)(m{1,2})(.*)$/; # matches,
# $1 = 'a'
# $2 = 'mm'
# $3 = 'ing republic of Perl'
$x = "the cat in the hat";
$x =~ /^(.*)(at)(.*)$/; # matches,
# $1 = 'the cat in the h'
# $2 = 'at'
# $3 = '' (0 matches)
$x = "cat dog house"; # 3 words
@words = ($x =~ /(\w+)/g); # matches,
# $word[0] = 'cat'
# $word[1] = 'dog'
# $word[2] = 'house
% perl -e 'use re "debug"; "abc" =~ /a*b+c/;'