Skip to content

Commit 224fe75

Browse files
ntBreMichaReiserAlexWaygoodsbrugman
authored
[ruff] Implement unnecessary-regular-expression (RUF055) (#14659)
Co-authored-by: Micha Reiser <[email protected]> Co-authored-by: Alex Waygood <[email protected]> Co-authored-by: Simon Brugman <[email protected]>
1 parent dc29f52 commit 224fe75

File tree

8 files changed

+464
-0
lines changed

8 files changed

+464
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import re
2+
3+
s = "str"
4+
5+
# this should be replaced with s.replace("abc", "")
6+
re.sub("abc", "", s)
7+
8+
9+
# this example, adapted from https://docs.python.org/3/library/re.html#re.sub,
10+
# should *not* be replaced because repl is a function, not a string
11+
def dashrepl(matchobj):
12+
if matchobj.group(0) == "-":
13+
return " "
14+
else:
15+
return "-"
16+
17+
18+
re.sub("-", dashrepl, "pro----gram-files")
19+
20+
# this one should be replaced with s.startswith("abc") because the Match is
21+
# used in an if context for its truth value
22+
if re.match("abc", s):
23+
pass
24+
if m := re.match("abc", s): # this should *not* be replaced
25+
pass
26+
re.match("abc", s) # this should not be replaced because match returns a Match
27+
28+
# this should be replaced with "abc" in s
29+
if re.search("abc", s):
30+
pass
31+
re.search("abc", s) # this should not be replaced
32+
33+
# this should be replaced with "abc" == s
34+
if re.fullmatch("abc", s):
35+
pass
36+
re.fullmatch("abc", s) # this should not be replaced
37+
38+
# this should be replaced with s.split("abc")
39+
re.split("abc", s)
40+
41+
# these currently should not be modified because the patterns contain regex
42+
# metacharacters
43+
re.sub("ab[c]", "", s)
44+
re.match("ab[c]", s)
45+
re.search("ab[c]", s)
46+
re.fullmatch("ab[c]", s)
47+
re.split("ab[c]", s)
48+
49+
# test that all of the metacharacters prevent the rule from triggering, also
50+
# use raw strings in line with RUF039
51+
re.sub(r"abc.", "", s)
52+
re.sub(r"^abc", "", s)
53+
re.sub(r"abc$", "", s)
54+
re.sub(r"abc*", "", s)
55+
re.sub(r"abc+", "", s)
56+
re.sub(r"abc?", "", s)
57+
re.sub(r"abc{2,3}", "", s)
58+
re.sub(r"abc\n", "", s) # this one could be fixed but is not currently
59+
re.sub(r"abc|def", "", s)
60+
re.sub(r"(a)bc", "", s)
61+
62+
# and these should not be modified because they have extra arguments
63+
re.sub("abc", "", s, flags=re.A)
64+
re.match("abc", s, flags=re.I)
65+
re.search("abc", s, flags=re.L)
66+
re.fullmatch("abc", s, flags=re.M)
67+
re.split("abc", s, maxsplit=2)
68+
69+
# this should trigger an unsafe fix because of the presence of comments
70+
re.sub(
71+
# pattern
72+
"abc",
73+
# repl
74+
"",
75+
s, # string
76+
)

crates/ruff_linter/src/checkers/ast/analyze/expression.rs

+3
Original file line numberDiff line numberDiff line change
@@ -1081,6 +1081,9 @@ pub(crate) fn expression(expr: &Expr, checker: &mut Checker) {
10811081
if checker.enabled(Rule::AirflowDagNoScheduleArgument) {
10821082
airflow::rules::dag_no_schedule_argument(checker, expr);
10831083
}
1084+
if checker.enabled(Rule::UnnecessaryRegularExpression) {
1085+
ruff::rules::unnecessary_regular_expression(checker, call);
1086+
}
10841087
}
10851088
Expr::Dict(dict) => {
10861089
if checker.any_enabled(&[

crates/ruff_linter/src/codes.rs

+1
Original file line numberDiff line numberDiff line change
@@ -984,6 +984,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> {
984984
(Ruff, "040") => (RuleGroup::Preview, rules::ruff::rules::InvalidAssertMessageLiteralArgument),
985985
(Ruff, "041") => (RuleGroup::Preview, rules::ruff::rules::UnnecessaryNestedLiteral),
986986
(Ruff, "048") => (RuleGroup::Preview, rules::ruff::rules::MapIntVersionParsing),
987+
(Ruff, "055") => (RuleGroup::Preview, rules::ruff::rules::UnnecessaryRegularExpression),
987988
(Ruff, "100") => (RuleGroup::Stable, rules::ruff::rules::UnusedNOQA),
988989
(Ruff, "101") => (RuleGroup::Stable, rules::ruff::rules::RedirectedNOQA),
989990

crates/ruff_linter/src/rules/ruff/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,7 @@ mod tests {
409409
#[test_case(Rule::MapIntVersionParsing, Path::new("RUF048_1.py"))]
410410
#[test_case(Rule::UnrawRePattern, Path::new("RUF039.py"))]
411411
#[test_case(Rule::UnrawRePattern, Path::new("RUF039_concat.py"))]
412+
#[test_case(Rule::UnnecessaryRegularExpression, Path::new("RUF055.py"))]
412413
fn preview_rules(rule_code: Rule, path: &Path) -> Result<()> {
413414
let snapshot = format!(
414415
"preview__{}_{}",

crates/ruff_linter/src/rules/ruff/rules/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ pub(crate) use test_rules::*;
3333
pub(crate) use unnecessary_iterable_allocation_for_first_element::*;
3434
pub(crate) use unnecessary_key_check::*;
3535
pub(crate) use unnecessary_nested_literal::*;
36+
pub(crate) use unnecessary_regular_expression::*;
3637
pub(crate) use unraw_re_pattern::*;
3738
pub(crate) use unsafe_markup_use::*;
3839
pub(crate) use unused_async::*;
@@ -79,6 +80,7 @@ pub(crate) mod test_rules;
7980
mod unnecessary_iterable_allocation_for_first_element;
8081
mod unnecessary_key_check;
8182
mod unnecessary_nested_literal;
83+
mod unnecessary_regular_expression;
8284
mod unraw_re_pattern;
8385
mod unsafe_markup_use;
8486
mod unused_async;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
use ruff_diagnostics::{AlwaysFixableViolation, Applicability, Diagnostic, Edit, Fix};
2+
use ruff_macros::{derive_message_formats, ViolationMetadata};
3+
use ruff_python_ast::{
4+
Arguments, CmpOp, Expr, ExprAttribute, ExprCall, ExprCompare, ExprContext, Identifier,
5+
};
6+
use ruff_python_semantic::{Modules, SemanticModel};
7+
use ruff_text_size::TextRange;
8+
9+
use crate::checkers::ast::Checker;
10+
11+
/// ## What it does
12+
///
13+
/// Checks for uses of the `re` module that can be replaced with builtin `str` methods.
14+
///
15+
/// ## Why is this bad?
16+
///
17+
/// Performing checks on strings directly can make the code simpler, may require
18+
/// less escaping, and will often be faster.
19+
///
20+
/// ## Example
21+
///
22+
/// ```python
23+
/// re.sub("abc", "", s)
24+
/// ```
25+
///
26+
/// Use instead:
27+
///
28+
/// ```python
29+
/// s.replace("abc", "")
30+
/// ```
31+
///
32+
/// ## Details
33+
///
34+
/// The rule reports the following calls when the first argument to the call is
35+
/// a plain string literal, and no additional flags are passed:
36+
///
37+
/// - `re.sub`
38+
/// - `re.match`
39+
/// - `re.search`
40+
/// - `re.fullmatch`
41+
/// - `re.split`
42+
///
43+
/// For `re.sub`, the `repl` (replacement) argument must also be a string literal,
44+
/// not a function. For `re.match`, `re.search`, and `re.fullmatch`, the return
45+
/// value must also be used only for its truth value.
46+
///
47+
/// ## Fix safety
48+
///
49+
/// This rule's fix is marked as unsafe if the affected expression contains comments. Otherwise,
50+
/// the fix can be applied safely.
51+
///
52+
/// ## References
53+
/// - [Python Regular Expression HOWTO: Common Problems - Use String Methods](https://docs.python.org/3/howto/regex.html#use-string-methods)
54+
#[derive(ViolationMetadata)]
55+
pub(crate) struct UnnecessaryRegularExpression {
56+
replacement: String,
57+
}
58+
59+
impl AlwaysFixableViolation for UnnecessaryRegularExpression {
60+
#[derive_message_formats]
61+
fn message(&self) -> String {
62+
"Plain string pattern passed to `re` function".to_string()
63+
}
64+
65+
fn fix_title(&self) -> String {
66+
format!("Replace with `{}`", self.replacement)
67+
}
68+
}
69+
70+
/// RUF055
71+
pub(crate) fn unnecessary_regular_expression(checker: &mut Checker, call: &ExprCall) {
72+
// adapted from unraw_re_pattern
73+
let semantic = checker.semantic();
74+
75+
if !semantic.seen_module(Modules::RE) {
76+
return;
77+
}
78+
79+
let Some(qualified_name) = semantic.resolve_qualified_name(&call.func) else {
80+
return;
81+
};
82+
83+
let ["re", func] = qualified_name.segments() else {
84+
return;
85+
};
86+
87+
// skip calls with more than `pattern` and `string` arguments (and `repl`
88+
// for `sub`)
89+
let Some(re_func) = ReFunc::from_call_expr(semantic, call, func) else {
90+
return;
91+
};
92+
93+
// For now, restrict this rule to string literals
94+
let Some(string_lit) = re_func.pattern.as_string_literal_expr() else {
95+
return;
96+
};
97+
98+
// For now, reject any regex metacharacters. Compare to the complete list
99+
// from https://docs.python.org/3/howto/regex.html#matching-characters
100+
let has_metacharacters = string_lit
101+
.value
102+
.to_str()
103+
.contains(['.', '^', '$', '*', '+', '?', '{', '[', '\\', '|', '(']);
104+
105+
if has_metacharacters {
106+
return;
107+
}
108+
109+
// Here we know the pattern is a string literal with no metacharacters, so
110+
// we can proceed with the str method replacement
111+
let new_expr = re_func.replacement();
112+
113+
let repl = checker.generator().expr(&new_expr);
114+
let diagnostic = Diagnostic::new(
115+
UnnecessaryRegularExpression {
116+
replacement: repl.clone(),
117+
},
118+
call.range,
119+
);
120+
121+
let fix = Fix::applicable_edit(
122+
Edit::range_replacement(repl, call.range),
123+
if checker
124+
.comment_ranges()
125+
.has_comments(call, checker.source())
126+
{
127+
Applicability::Unsafe
128+
} else {
129+
Applicability::Safe
130+
},
131+
);
132+
133+
checker.diagnostics.push(diagnostic.with_fix(fix));
134+
}
135+
136+
/// The `re` functions supported by this rule.
137+
#[derive(Debug)]
138+
enum ReFuncKind<'a> {
139+
Sub { repl: &'a Expr },
140+
Match,
141+
Search,
142+
Fullmatch,
143+
Split,
144+
}
145+
146+
#[derive(Debug)]
147+
struct ReFunc<'a> {
148+
kind: ReFuncKind<'a>,
149+
pattern: &'a Expr,
150+
string: &'a Expr,
151+
}
152+
153+
impl<'a> ReFunc<'a> {
154+
fn from_call_expr(
155+
semantic: &SemanticModel,
156+
call: &'a ExprCall,
157+
func_name: &str,
158+
) -> Option<Self> {
159+
// the proposed fixes for match, search, and fullmatch rely on the
160+
// return value only being used for its truth value
161+
let in_if_context = semantic.in_boolean_test();
162+
163+
match (func_name, call.arguments.len()) {
164+
// `split` is the safest of these to fix, as long as metacharacters
165+
// have already been filtered out from the `pattern`
166+
("split", 2) => Some(ReFunc {
167+
kind: ReFuncKind::Split,
168+
pattern: call.arguments.find_argument("pattern", 0)?,
169+
string: call.arguments.find_argument("string", 1)?,
170+
}),
171+
// `sub` is only safe to fix if `repl` is a string. `re.sub` also
172+
// allows it to be a function, which will *not* work in the str
173+
// version
174+
("sub", 3) => {
175+
let repl = call.arguments.find_argument("repl", 1)?;
176+
if !repl.is_string_literal_expr() {
177+
return None;
178+
}
179+
Some(ReFunc {
180+
kind: ReFuncKind::Sub { repl },
181+
pattern: call.arguments.find_argument("pattern", 0)?,
182+
string: call.arguments.find_argument("string", 2)?,
183+
})
184+
}
185+
("match", 2) if in_if_context => Some(ReFunc {
186+
kind: ReFuncKind::Match,
187+
pattern: call.arguments.find_argument("pattern", 0)?,
188+
string: call.arguments.find_argument("string", 1)?,
189+
}),
190+
("search", 2) if in_if_context => Some(ReFunc {
191+
kind: ReFuncKind::Search,
192+
pattern: call.arguments.find_argument("pattern", 0)?,
193+
string: call.arguments.find_argument("string", 1)?,
194+
}),
195+
("fullmatch", 2) if in_if_context => Some(ReFunc {
196+
kind: ReFuncKind::Fullmatch,
197+
pattern: call.arguments.find_argument("pattern", 0)?,
198+
string: call.arguments.find_argument("string", 1)?,
199+
}),
200+
_ => None,
201+
}
202+
}
203+
204+
fn replacement(&self) -> Expr {
205+
match self.kind {
206+
// string.replace(pattern, repl)
207+
ReFuncKind::Sub { repl } => {
208+
self.method_expr("replace", vec![self.pattern.clone(), repl.clone()])
209+
}
210+
// string.startswith(pattern)
211+
ReFuncKind::Match => self.method_expr("startswith", vec![self.pattern.clone()]),
212+
// pattern in string
213+
ReFuncKind::Search => self.compare_expr(CmpOp::In),
214+
// string == pattern
215+
ReFuncKind::Fullmatch => self.compare_expr(CmpOp::Eq),
216+
// string.split(pattern)
217+
ReFuncKind::Split => self.method_expr("split", vec![self.pattern.clone()]),
218+
}
219+
}
220+
221+
/// Return a new compare expr of the form `self.pattern op self.string`
222+
fn compare_expr(&self, op: CmpOp) -> Expr {
223+
Expr::Compare(ExprCompare {
224+
left: Box::new(self.pattern.clone()),
225+
ops: Box::new([op]),
226+
comparators: Box::new([self.string.clone()]),
227+
range: TextRange::default(),
228+
})
229+
}
230+
231+
/// Return a new method call expression on `self.string` with `args` like
232+
/// `self.string.method(args...)`
233+
fn method_expr(&self, method: &str, args: Vec<Expr>) -> Expr {
234+
let method = Expr::Attribute(ExprAttribute {
235+
value: Box::new(self.string.clone()),
236+
attr: Identifier::new(method, TextRange::default()),
237+
ctx: ExprContext::Load,
238+
range: TextRange::default(),
239+
});
240+
Expr::Call(ExprCall {
241+
func: Box::new(method),
242+
arguments: Arguments {
243+
args: args.into_boxed_slice(),
244+
keywords: Box::new([]),
245+
range: TextRange::default(),
246+
},
247+
range: TextRange::default(),
248+
})
249+
}
250+
}

0 commit comments

Comments
 (0)