fix: handle identifier with regex characters

Provides a test for and the solution to GitHub issue #242. Namely, an identifier which contains regex characters (e.g. "[TODO]") is properly handled by having the parser look for literal "[" and "]" characters rather than treating those characters as part of a regex pattern. The word boundary regex pattern '\b' does NOT properly handle this, so a slightly different pattern is used to identify the boundary.
2026-01-22 22:06:43 +00:00 · 2024-11-12 16:07:05 +00:00 · 2024-11-12 16:07:05 +00:00 · 2d98b5c359
commit 2d98b5c359
parent 59c6b5395c
5 changed files with 30 additions and 10 deletions
--- a/TodoParser.py
+++ b/TodoParser.py
@ -563,13 +563,14 @@ class TodoParser(object):
        title_identifier_actual = None
        title_identifier = None
        for identifier in self.identifiers:
-            title_pattern = re.compile(fr'(^.*?)(\s*?)(\b{re.escape(identifier)}\b)(\(([^)]+)\))?\s*:?\s*(.+)', re.IGNORECASE)
+            title_pattern = re.compile(fr'(^|(^.*?)(\s*?)\s)({re.escape(identifier)})(\(([^)]+)\))?\s*(:|\s)\s*(.+)',
                                       re.IGNORECASE)
            title_search = title_pattern.search(comment)
            if title_search:
-                title_identifier_actual = title_search.group(3)
+                title_identifier_actual = title_search.group(4)
                title_identifier = identifier
-                ref = title_search.group(5) # may be empty, which is OK
+                ref = title_search.group(6) # may be empty, which is OK
-                title = title_search.group(6)
+                title = title_search.group(8)
                break
        return title, ref, title_identifier, title_identifier_actual
--- a/main.py
+++ b/main.py
@ -76,7 +76,7 @@ def process_diff(diff, client=Client(), insert_issue_urls=False, parser=TodoPars
                    if line_number < len(file_lines):
                        # Duplicate the line to retain the comment syntax.
                        old_line = file_lines[line_number]
-                        remove = fr'(?i:{raw_issue.identifier}).*{re.escape(raw_issue.title)}'
+                        remove = fr'(?i:{re.escape(raw_issue.identifier)}).*{re.escape(raw_issue.title)}'
                        insert = f'Issue URL: {client.get_issue_url(new_issue_number)}'
                        new_line = re.sub('^.*'+remove, raw_issue.prefix + insert, old_line)
                        # make sure the above operation worked as intended
--- a/tests/test_new.diff
+++ b/tests/test_new.diff
@ -58,7 +58,7 @@ index 0000000..525e25d
 +    # TODO: Come up with a more imaginative greeting
 +    print('Hello world')
 +
-+    # TODO: Do more stuff
+    # [TODO]: Do more stuff
 +    #  This function should probably do something more interesting
 +    #  labels: help wanted
 +
--- a/tests/test_process_diff.py
+++ b/tests/test_process_diff.py
@ -78,7 +78,7 @@ class IssueUrlInsertionTest(unittest.TestCase):
                     "Skipping because 'SKIP_PROCESS_DIFF_TEST' is 'true'")
    def test_url_insertion(self):
        self._setUp(['test_new.diff'])
-        self._standardTest(80)
+        self._standardTest(79)
    def test_line_numbering_with_deletions(self):
        self._setUp(['test_new_py.diff', 'test_edit_py.diff'])
--- a/tests/test_todo_parser.py
+++ b/tests/test_todo_parser.py
@ -45,7 +45,7 @@ class NewIssueTest(unittest.TestCase):
    def test_python_issues(self):
        # Includes 4 tests for Starlark.
-        self.assertEqual(count_issues_for_file_type(self.raw_issues, 'python'), 8)
+        self.assertEqual(count_issues_for_file_type(self.raw_issues, 'python'), 7)
    def test_yaml_issues(self):
        self.assertEqual(count_issues_for_file_type(self.raw_issues, 'yaml'), 2)
@ -80,7 +80,7 @@ class NewIssueTest(unittest.TestCase):
        self.assertEqual(count_issues_for_file_type(self.raw_issues, 'julia'), 2)
    def test_starlark_issues(self):
-        self.assertEqual(count_issues_for_file_type(self.raw_issues, 'python'), 8)
+        self.assertEqual(count_issues_for_file_type(self.raw_issues, 'python'), 7)
    def test_autohotkey_issues(self):
        self.assertEqual(count_issues_for_file_type(self.raw_issues, 'autohotkey'), 1)
@ -133,7 +133,9 @@ class CustomOptionsTest(unittest.TestCase):
    def setUp(self):
        parser = TodoParser(options={"identifiers":
                                     [{"name": "FIX", "labels": []},
-                                      {"name": "TODO", "labels": []}]})
+                                      {"name": "[TODO]", "labels": []},
                                      {"name": "TODO", "labels": []}
                                      ]})
        self.raw_issues = []
        with open('syntax.json', 'r') as syntax_json:
            parser.syntax_dict = json.load(syntax_json)
@ -157,6 +159,23 @@ class CustomOptionsTest(unittest.TestCase):
        self.assertEqual(len(matching_issues), 0,
                         msg=print_unexpected_issues(matching_issues))
    # See GitHub issue #242
    def test_regex_identifier_chars(self):
        """
        Verify that the presence of regex characters in the identifier
        doesn't confuse the parser
        An identifier such as "[TODO]" should be matched literally, not treating
        the "[" and "]" characters as part of a regular expression pattern.
        """
        matching_issues = get_issues_for_fields(self.raw_issues,
                                                {
                                                    "file_name": "example_file.py",
                                                    "identifier": "[TODO]"
                                                })
        self.assertEqual(len(matching_issues), 1,
                         msg=print_unexpected_issues(matching_issues))
    # See GitHub issue #235
    @unittest.expectedFailure
    def test_multiple_identifiers(self):