fix: get correct line numbering of old and new lines

track context of old and new lines separately to
get proper line numbers relative to old and
new version of a file

Closes GitHub #236
This commit is contained in:
Robert Alonso 2024-11-12 03:26:05 +00:00
parent f54fbebd3b
commit 46218738c9
2 changed files with 103 additions and 85 deletions

View File

@@ -11,9 +11,7 @@ import itertools
class TodoParser(object): class TodoParser(object):
"""Parser for extracting information from a given diff file.""" """Parser for extracting information from a given diff file."""
FILE_HUNK_PATTERN = r'(?<=diff)(.*?)(?=diff\s--git\s)' FILE_HUNK_PATTERN = r'(?<=diff)(.*?)(?=diff\s--git\s)'
HEADER_PATTERN = r'(?<=--git).*?(?=$\n(index|new|deleted))' HEADERS_PATTERN = re.compile(r'(?<=--git) a/(.*?) b/(.*?)$\n(?=((new|deleted).*?$\n)?index ([0-9a-f]+)\.\.([0-9a-f]+))', re.MULTILINE)
LINE_PATTERN = r'^.*$'
FILENAME_PATTERN = re.compile(r'(?<=a/).+?(?=\sb/)')
LINE_NUMBERS_PATTERN = re.compile(r'@@[\d\s,\-+]*\s@@.*') LINE_NUMBERS_PATTERN = re.compile(r'@@[\d\s,\-+]*\s@@.*')
LINE_NUMBERS_INNER_PATTERN = re.compile(r'@@[\d\s,\-+]*\s@@') LINE_NUMBERS_INNER_PATTERN = re.compile(r'@@[\d\s,\-+]*\s@@')
ADDITION_PATTERN = re.compile(r'(?<=^\+).*') ADDITION_PATTERN = re.compile(r'(?<=^\+).*')
@@ -154,15 +152,10 @@ class TodoParser(object):
# Iterate through each section extracted above. # Iterate through each section extracted above.
for hunk in extracted_file_hunks: for hunk in extracted_file_hunks:
# Extract the file information so we can figure out the Markdown language and comment syntax. # Extract the file information so we can figure out the Markdown language and comment syntax.
header_search = re.search(self.HEADER_PATTERN, hunk, re.MULTILINE) headers = self.HEADERS_PATTERN.search(hunk)
if not header_search: if not headers:
continue continue
files = header_search.group(0) curr_file = headers.group(2)
filename_search = re.search(self.FILENAME_PATTERN, files)
if not filename_search:
continue
curr_file = filename_search.group(0)
if self._should_ignore(curr_file): if self._should_ignore(curr_file):
continue continue
curr_markers, curr_markdown_language = self._get_file_details(curr_file) curr_markers, curr_markdown_language = self._get_file_details(curr_file)
@@ -175,15 +168,18 @@ class TodoParser(object):
for i, line_numbers in enumerate(line_numbers_iterator): for i, line_numbers in enumerate(line_numbers_iterator):
line_numbers_inner_search = re.search(self.LINE_NUMBERS_INNER_PATTERN, line_numbers.group(0)) line_numbers_inner_search = re.search(self.LINE_NUMBERS_INNER_PATTERN, line_numbers.group(0))
line_numbers_str = line_numbers_inner_search.group(0).strip('@@ -') line_numbers_str = line_numbers_inner_search.group(0).strip('@@ -')
start_line = line_numbers_str.split(' ')[1].strip('+') deleted_start_line = line_numbers_str.split(' ')[0]
start_line = int(start_line.split(',')[0]) deleted_start_line = int(deleted_start_line.split(',')[0])
added_start_line = line_numbers_str.split(' ')[1].strip('+')
added_start_line = int(added_start_line.split(',')[0])
# Put this information into a temporary dict for simplicity. # Put this information into a temporary dict for simplicity.
block = { block = {
'file': curr_file, 'file': curr_file,
'markers': curr_markers, 'markers': curr_markers,
'markdown_language': curr_markdown_language, 'markdown_language': curr_markdown_language,
'start_line': start_line, 'deleted_start_line': deleted_start_line,
'added_start_line': added_start_line,
'hunk': hunk, 'hunk': hunk,
'hunk_start': line_numbers.end(), 'hunk_start': line_numbers.end(),
'hunk_end': None 'hunk_end': None
@@ -209,8 +205,25 @@ class TodoParser(object):
# Now for each code block, check for comments, then those comments for TODOs. # Now for each code block, check for comments, then those comments for TODOs.
for block in code_blocks: for block in code_blocks:
# convert hunk string into newline-separated list (excluding first element which is always null and not actually first line of hunk) # for both the set of deleted lines and set of new lines, convert hunk string into
bl=block['hunk'].split('\n')[1:] # newline-separated list (excluding first element which is always null and not
# actually first line of hunk)
old=[]
new=[]
for line in block['hunk'].split('\n')[1:]:
if line: # if not empty
match line[0]:
case '-':
old.append(line)
case '+':
new.append(line)
case _:
if line != '\\ No newline at end of file':
old.append(line)
new.append(line)
elif line != '\\ No newline at end of file':
old.append(line)
new.append(line)
for marker in block['markers']: for marker in block['markers']:
# initialize list # initialize list
@@ -250,13 +263,16 @@ class TodoParser(object):
# create regex object to search for comments # create regex object to search for comments
compiled_pattern=re.compile(comment_pattern) compiled_pattern=re.compile(comment_pattern)
# analyze the set of old lines and new lines separately, so that we don't, for example,
# accidentally treat deleted lines as if they were being added in this diff
for block_lines in [old, new]:
# for each element of list, enumerate it and if value is a regex match, include it in list that is returned, # for each element of list, enumerate it and if value is a regex match, include it in list that is returned,
# where each element of the list is a dictionary that is the start end end lines of the match (relative to # where each element of the list is a dictionary that is the start and end lines of the match (relative to
# start of the hunk) and the matching string itself # start of the hunk) and the matching string itself
comments_and_positions = [{'start': i, 'end': i, 'comment': x} for i, x in enumerate(bl) if compiled_pattern.search(x)] comments_and_positions = [{'start': i, 'end': i, 'comment': x} for i, x in enumerate(block_lines) if compiled_pattern.search(x)]
if len(comments_and_positions) > 0: if len(comments_and_positions) > 0:
# create filtered list which consolidates contiguous lines # append filtered list which consolidates contiguous lines
contiguous_comments_and_positions=[comments_and_positions[0]] contiguous_comments_and_positions.append(comments_and_positions[0])
for j, x in enumerate(comments_and_positions[1:]): for j, x in enumerate(comments_and_positions[1:]):
if x['start'] == (comments_and_positions[j]['end'] + 1): if x['start'] == (comments_and_positions[j]['end'] + 1):
contiguous_comments_and_positions[-1]['end']+=1 contiguous_comments_and_positions[-1]['end']+=1
@@ -270,9 +286,14 @@ class TodoParser(object):
# compile above pattern # compile above pattern
compiled_pattern = re.compile(pattern, re.DOTALL) compiled_pattern = re.compile(pattern, re.DOTALL)
# analyze the set of old lines and new lines separately, so that we don't, for example,
# accidentally treat deleted lines as if they were being added in this diff
for block_lines in [old, new]:
# convert list to string
block_lines_str = '\n'.join(block_lines)
# search for the pattern within the hunk and # search for the pattern within the hunk and
# return a list of iterators to all of the matches # return a list of iterators to all of the matches
match_iters = compiled_pattern.finditer(block['hunk']) match_iters = compiled_pattern.finditer(block_lines_str)
# split off into overlapping pairs. i.e. ['A', 'B', C'] => [('A', 'B'), ('B', 'C')] # split off into overlapping pairs. i.e. ['A', 'B', C'] => [('A', 'B'), ('B', 'C')]
pairs = itertools.pairwise(match_iters) pairs = itertools.pairwise(match_iters)
@@ -285,20 +306,19 @@ class TodoParser(object):
if i == 0: if i == 0:
# set start line and comment string of first section # set start line and comment string of first section
contiguous_comments_and_positions.append({ contiguous_comments_and_positions.append({
# -1 to ignore first newline, which isn't actually part of the hunk 'start': block_lines_str.count('\n', 0, prev_span[0]),
'start': block['hunk'].count('\n', 0, prev_span[0]) - 1,
'end': 0, 'end': 0,
'comment': pair[0].group(0) 'comment': pair[0].group(0)
}) })
# get number of lines in first section # get number of lines in first section
num_lines_in_first_section = block['hunk'].count('\n', prev_span[0], prev_span[1]) num_lines_in_first_section = block_lines_str.count('\n', prev_span[0], prev_span[1])
# set end line of first section relative to its start # set end line of first section relative to its start
contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_first_section contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_first_section
# get start/end index (within hunk) of current section # get start/end index (within hunk) of current section
curr_span = pair[1].span() curr_span = pair[1].span()
# determine number of lines between previous end and current start # determine number of lines between previous end and current start
num_lines_from_prev_section_end_line = block['hunk'].count('\n', prev_span[1], curr_span[0]) num_lines_from_prev_section_end_line = block_lines_str.count('\n', prev_span[1], curr_span[0])
# set start line of current section based on previous end # set start line of current section based on previous end
contiguous_comments_and_positions.append({ contiguous_comments_and_positions.append({
'start': contiguous_comments_and_positions[-1]['end'] + num_lines_from_prev_section_end_line, 'start': contiguous_comments_and_positions[-1]['end'] + num_lines_from_prev_section_end_line,
@@ -306,7 +326,7 @@ class TodoParser(object):
'comment': pair[1].group(0) 'comment': pair[1].group(0)
}) })
# get number of lines in current section # get number of lines in current section
num_lines_in_curr_section = block['hunk'].count('\n', curr_span[0], curr_span[1]) num_lines_in_curr_section = block_lines_str.count('\n', curr_span[0], curr_span[1])
# set end line of current section relative to its start # set end line of current section relative to its start
contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_curr_section contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_curr_section
@@ -315,20 +335,19 @@ class TodoParser(object):
if len(contiguous_comments_and_positions) == 0: if len(contiguous_comments_and_positions) == 0:
# redo the search, this time returning the # redo the search, this time returning the
# result directly rather than an iterator # result directly rather than an iterator
match = compiled_pattern.search(block['hunk']) match = compiled_pattern.search(block_lines_str)
if match: if match:
# get start/end index (within hunk) of this section # get start/end index (within hunk) of this section
span = match.span() span = match.span()
# set start line and comment string of first section # set start line and comment string of first section
contiguous_comments_and_positions.append({ contiguous_comments_and_positions.append({
# -1 to ignore first newline, which isn't actually part of the hunk 'start': block_lines_str.count('\n', 0, span[0]),
'start': block['hunk'].count('\n', 0, span[0]) - 1,
'end': 0, 'end': 0,
'comment': match.group(0) 'comment': match.group(0)
}) })
# get number of lines in first section # get number of lines in first section
num_lines_in_first_section = block['hunk'].count('\n', span[0], span[1]) num_lines_in_first_section = block_lines_str.count('\n', span[0], span[1])
# set end line of first section relative to its start # set end line of first section relative to its start
contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_first_section contiguous_comments_and_positions[-1]['end'] = contiguous_comments_and_positions[-1]['start'] + num_lines_in_first_section
@@ -400,7 +419,8 @@ class TodoParser(object):
body=[], body=[],
hunk=hunk_info['hunk'], hunk=hunk_info['hunk'],
file_name=hunk_info['file'], file_name=hunk_info['file'],
start_line=hunk_info['start_line'] + comment_block['start'] + line_number_within_comment_block, start_line=((hunk_info['deleted_start_line'] if line_status == LineStatus.DELETED else hunk_info['added_start_line'])
+ comment_block['start'] + line_number_within_comment_block),
start_line_within_hunk=comment_block['start'] + line_number_within_comment_block + 1, start_line_within_hunk=comment_block['start'] + line_number_within_comment_block + 1,
num_lines=1, num_lines=1,
markdown_language=hunk_info['markdown_language'], markdown_language=hunk_info['markdown_language'],

View File

@@ -80,8 +80,6 @@ class IssueUrlInsertionTest(unittest.TestCase):
self._setUp(['test_new.diff']) self._setUp(['test_new.diff'])
self._standardTest(80) self._standardTest(80)
# See GitHub issue #236
@unittest.expectedFailure
def test_line_numbering_with_deletions(self): def test_line_numbering_with_deletions(self):
self._setUp(['test_new_py.diff', 'test_edit_py.diff']) self._setUp(['test_new_py.diff', 'test_edit_py.diff'])
with self.subTest("Issue URL insertion"): with self.subTest("Issue URL insertion"):