From 46218738c9482c4495035003bf1adf0f04f12f06 Mon Sep 17 00:00:00 2001
From: Robert Alonso <17463757+rgalonso@users.noreply.github.com>
Date: Tue, 12 Nov 2024 03:26:05 +0000
Subject: [PATCH] fix: get correct line numbering of old and new lines

track context of old and new lines separately to
get proper line numbers relative to old and
new version of a file

Closes GitHub #236
---
 TodoParser.py              | 186 ++++++++++++++++++++-----------------
 tests/test_process_diff.py |   2 -
 2 files changed, 103 insertions(+), 85 deletions(-)

diff --git a/TodoParser.py b/TodoParser.py
index 801702e..859be83 100644
--- a/TodoParser.py
+++ b/TodoParser.py
@@ -11,9 +11,7 @@ import itertools
 class TodoParser(object):
     """Parser for extracting information from a given diff file."""
     FILE_HUNK_PATTERN = r'(?<=diff)(.*?)(?=diff\s--git\s)'
-    HEADER_PATTERN = r'(?<=--git).*?(?=$\n(index|new|deleted))'
-    LINE_PATTERN = r'^.*$'
-    FILENAME_PATTERN = re.compile(r'(?<=a/).+?(?=\sb/)')
+    HEADERS_PATTERN = re.compile(r'(?<=--git) a/(.*?) b/(.*?)$\n(?=((new|deleted).*?$\n)?index ([0-9a-f]+)\.\.([0-9a-f]+))', re.MULTILINE)
     LINE_NUMBERS_PATTERN = re.compile(r'@@[\d\s,\-+]*\s@@.*')
     LINE_NUMBERS_INNER_PATTERN = re.compile(r'@@[\d\s,\-+]*\s@@')
     ADDITION_PATTERN = re.compile(r'(?<=^\+).*')
@@ -154,15 +152,10 @@ class TodoParser(object):
         # Iterate through each section extracted above.
         for hunk in extracted_file_hunks:
             # Extract the file information so we can figure out the Markdown language and comment syntax.
-            header_search = re.search(self.HEADER_PATTERN, hunk, re.MULTILINE)
-            if not header_search:
+            headers = self.HEADERS_PATTERN.search(hunk)
+            if not headers:
                 continue
-            files = header_search.group(0)
-
-            filename_search = re.search(self.FILENAME_PATTERN, files)
-            if not filename_search:
-                continue
-            curr_file = filename_search.group(0)
+            curr_file = headers.group(2)
             if self._should_ignore(curr_file):
                 continue
             curr_markers, curr_markdown_language = self._get_file_details(curr_file)
@@ -175,15 +168,18 @@ class TodoParser(object):
             for i, line_numbers in enumerate(line_numbers_iterator):
                 line_numbers_inner_search = re.search(self.LINE_NUMBERS_INNER_PATTERN, line_numbers.group(0))
                 line_numbers_str = line_numbers_inner_search.group(0).strip('@@ -')
-                start_line = line_numbers_str.split(' ')[1].strip('+')
-                start_line = int(start_line.split(',')[0])
+                deleted_start_line = line_numbers_str.split(' ')[0]
+                deleted_start_line = int(deleted_start_line.split(',')[0])
+                added_start_line = line_numbers_str.split(' ')[1].strip('+')
+                added_start_line = int(added_start_line.split(',')[0])
 
                 # Put this information into a temporary dict for simplicity.
                 block = {
                     'file': curr_file,
                     'markers': curr_markers,
                     'markdown_language': curr_markdown_language,
-                    'start_line': start_line,
+                    'deleted_start_line': deleted_start_line,
+                    'added_start_line': added_start_line,
                     'hunk': hunk,
                     'hunk_start': line_numbers.end(),
                     'hunk_end': None
@@ -209,8 +205,25 @@ class TodoParser(object):
 
         # Now for each code block, check for comments, then those comments for TODOs.
         for block in code_blocks:
-            # convert hunk string into newline-separated list (excluding first element which is always null and not actually first line of hunk)
-            bl=block['hunk'].split('\n')[1:]
+            # for both the set of deleted lines and set of new lines, convert hunk string into
+            # newline-separated list (excluding first element which is always null and not
+            # actually first line of hunk)
+            old=[]
+            new=[]
+            for line in block['hunk'].split('\n')[1:]:
+                if line: # if not empty
+                    match line[0]:
+                        case '-':
+                            old.append(line)
+                        case '+':
+                            new.append(line)
+                        case _:
+                            if line != '\\ No newline at end of file':
+                                old.append(line)
+                                new.append(line)
+                elif line != '\\ No newline at end of file':
+                    old.append(line)
+                    new.append(line)
 
             for marker in block['markers']:
                 # initialize list
@@ -250,19 +263,22 @@ class TodoParser(object):
 
                     # create regex object to search for comments
                     compiled_pattern=re.compile(comment_pattern)
-                    # for each element of list, enumerate it and if value is a regex match, include it in list that is returned,
-                    # where each element of the list is a dictionary that is the start end end lines of the match (relative to
-                    # start of the hunk) and the matching string itself
-                    comments_and_positions = [{'start': i, 'end': i, 'comment': x} for i, x in enumerate(bl) if compiled_pattern.search(x)]
-                    if len(comments_and_positions) > 0:
-                        # create filtered list which consolidates contiguous lines
-                        contiguous_comments_and_positions=[comments_and_positions[0]]
-                        for j, x in enumerate(comments_and_positions[1:]):
-                            if x['start'] == (comments_and_positions[j]['end'] + 1):
-                                contiguous_comments_and_positions[-1]['end']+=1
-                                contiguous_comments_and_positions[-1]['comment'] += '\n' + x['comment']
-                            else:
-                                contiguous_comments_and_positions.append(x)
+                    # analyze the set of old lines and new lines separately, so that we don't, for example,
+                    # accidentally treat deleted lines as if they were being added in this diff
+                    for block_lines in [old, new]:
+                        # for each element of list, enumerate it and if value is a regex match, include it in list that is returned,
+                        # where each element of the list is a dictionary that is the start and end lines of the match (relative to
+                        # start of the hunk) and the matching string itself
+                        comments_and_positions = [{'start': i, 'end': i, 'comment': x} for i, x in enumerate(block_lines) if compiled_pattern.search(x)]
+                        if len(comments_and_positions) > 0:
+                            # append filtered list which consolidates contiguous lines
+                            contiguous_comments_and_positions.append(comments_and_positions[0])
+                            for j, x in enumerate(comments_and_positions[1:]):
+                                if x['start'] == (comments_and_positions[j]['end'] + 1):
+                                    contiguous_comments_and_positions[-1]['end']+=1
+                                    contiguous_comments_and_positions[-1]['comment'] += '\n' + x['comment']
+                                else:
+                                    contiguous_comments_and_positions.append(x)
                 else:
                     # pattern consists of one group, the first comment block encountered
                     pattern=(r'([+\-\s]\s*' + marker['pattern']['start'] + r'.*?'
@@ -270,67 +286,70 @@ class TodoParser(object):
                     # compile above pattern
                     compiled_pattern = re.compile(pattern, re.DOTALL)
 
-                    # search for the pattern within the hunk and
-                    # return a list of iterators to all of the matches
-                    match_iters = compiled_pattern.finditer(block['hunk'])
+                    # analyze the set of old lines and new lines separately, so that we don't, for example,
+                    # accidentally treat deleted lines as if they were being added in this diff
+                    for block_lines in [old, new]:
+                        # convert list to string
+                        block_lines_str = '\n'.join(block_lines)
+                        # search for the pattern within the hunk and
+                        # return a list of iterators to all of the matches
+                        match_iters = compiled_pattern.finditer(block_lines_str)
 
-                    # split off into overlapping pairs. i.e. ['A', 'B', C'] => [('A', 'B'), ('B', 'C')]
-                    pairs = itertools.pairwise(match_iters)
+                        # split off into overlapping pairs. i.e. ['A', 'B', C'] => [('A', 'B'), ('B', 'C')]
+                        pairs = itertools.pairwise(match_iters)
 
-                    for i, pair in enumerate(pairs):
-                        # get start/end index (within hunk) of previous section
-                        prev_span = pair[0].span()
+                        for i, pair in enumerate(pairs):
+                            # get start/end index (within hunk) of previous section
+                            prev_span = pair[0].span()
 
-                        # if first iteration, special handling
-                        if i == 0:
-                            # set start line and comment string of first section
+                            # if first iteration, special handling
+                            if i == 0:
+                                # set start line and comment string of first section
+                                contiguous_comments_and_positions.append({
+                                            'start': block_lines_str.count('\n', 0, prev_span[0]),
+                                            'end': 0,
+                                            'comment': pair[0].group(0)
+                                        })
+                                # get number of lines in first section
+                                num_lines_in_first_section = block_lines_str.count('\n', prev_span[0], prev_span[1])
+                                # set end line of first section relative to its start
+                                contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_first_section
+
+                            # get start/end index (within hunk) of current section
+                            curr_span = pair[1].span()
+                            # determine number of lines between previous end and current start
+                            num_lines_from_prev_section_end_line = block_lines_str.count('\n', prev_span[1], curr_span[0])
+                            # set start line of current section based on previous end
                             contiguous_comments_and_positions.append({
-                                        # -1 to ignore first newline, which isn't actually part of the hunk
-                                        'start': block['hunk'].count('\n', 0, prev_span[0]) - 1,
+                                        'start': contiguous_comments_and_positions[-1]['end'] + num_lines_from_prev_section_end_line,
                                         'end': 0,
-                                        'comment': pair[0].group(0)
+                                        'comment': pair[1].group(0)
                                     })
-                            # get number of lines in first section
-                            num_lines_in_first_section = block['hunk'].count('\n', prev_span[0], prev_span[1])
-                            # set end line of first section relative to its start
-                            contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_first_section
+                            # get number of lines in current section
+                            num_lines_in_curr_section = block_lines_str.count('\n', curr_span[0], curr_span[1])
+                            # set end line of current section relative to its start
+                            contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_curr_section
 
-                        # get start/end index (within hunk) of current section
-                        curr_span = pair[1].span()
-                        # determine number of lines between previous end and current start
-                        num_lines_from_prev_section_end_line = block['hunk'].count('\n', prev_span[1], curr_span[0])
-                        # set start line of current section based on previous end
-                        contiguous_comments_and_positions.append({
-                                    'start': contiguous_comments_and_positions[-1]['end'] + num_lines_from_prev_section_end_line,
-                                    'end': 0,
-                                    'comment': pair[1].group(0)
-                                })
-                        # get number of lines in current section
-                        num_lines_in_curr_section = block['hunk'].count('\n', curr_span[0], curr_span[1])
-                        # set end line of current section relative to its start
-                        contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_curr_section
+                        # handle potential corner case where there was only one match
+                        # and therefore it couldn't be paired off
+                        if len(contiguous_comments_and_positions) == 0:
+                            # redo the search, this time returning the
+                            # result directly rather than an iterator
+                            match = compiled_pattern.search(block_lines_str)
 
-                    # handle potential corner case where there was only one match
-                    # and therefore it couldn't be paired off
-                    if len(contiguous_comments_and_positions) == 0:
-                        # redo the search, this time returning the
-                        # result directly rather than an iterator
-                        match = compiled_pattern.search(block['hunk'])
-
-                        if match:
-                            # get start/end index (within hunk) of this section
-                            span = match.span()
-                            # set start line and comment string of first section
-                            contiguous_comments_and_positions.append({
-                                        # -1 to ignore first newline, which isn't actually part of the hunk
-                                        'start': block['hunk'].count('\n', 0, span[0]) - 1,
-                                        'end': 0,
-                                        'comment': match.group(0)
-                                    })
-                            # get number of lines in first section
-                            num_lines_in_first_section = block['hunk'].count('\n', span[0], span[1])
-                            # set end line of first section relative to its start
-                            contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_first_section
+                            if match:
+                                # get start/end index (within hunk) of this section
+                                span = match.span()
+                                # set start line and comment string of first section
+                                contiguous_comments_and_positions.append({
+                                            'start': block_lines_str.count('\n', 0, span[0]),
+                                            'end': 0,
+                                            'comment': match.group(0)
+                                        })
+                                # get number of lines in first section
+                                num_lines_in_first_section = block_lines_str.count('\n', span[0], span[1])
+                                # set end line of first section relative to its start
+                                contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_first_section
 
                 for comment_and_position in contiguous_comments_and_positions:
                     extracted_issues = self._extract_issue_if_exists(comment_and_position, marker, block)
@@ -400,7 +419,8 @@ class TodoParser(object):
                     body=[],
                     hunk=hunk_info['hunk'],
                     file_name=hunk_info['file'],
-                    start_line=hunk_info['start_line'] + comment_block['start'] + line_number_within_comment_block,
+                    start_line=((hunk_info['deleted_start_line'] if line_status == LineStatus.DELETED else hunk_info['added_start_line'])
+                                + comment_block['start'] + line_number_within_comment_block),
                     start_line_within_hunk=comment_block['start'] + line_number_within_comment_block + 1,
                     num_lines=1,
                     markdown_language=hunk_info['markdown_language'],
diff --git a/tests/test_process_diff.py b/tests/test_process_diff.py
index 689d47a..f318bb2 100644
--- a/tests/test_process_diff.py
+++ b/tests/test_process_diff.py
@@ -80,8 +80,6 @@ class IssueUrlInsertionTest(unittest.TestCase):
         self._setUp(['test_new.diff'])
         self._standardTest(80)
 
-    # See GitHub issue #236
-    @unittest.expectedFailure
     def test_line_numbering_with_deletions(self):
         self._setUp(['test_new_py.diff', 'test_edit_py.diff'])
         with self.subTest("Issue URL insertion"):