From 2f1ea87bd0138fcc19f37a1cc665fc0332388ef0 Mon Sep 17 00:00:00 2001
From: alstr <alstr@users.noreply.github.com>
Date: Fri, 20 Mar 2020 13:50:01 +0000
Subject: [PATCH] Improve diff parsing

Create link to each TODO start line
Allow multiline TODOs
---
 main.py | 99 ++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 74 insertions(+), 25 deletions(-)

diff --git a/main.py b/main.py
index f3bc141..321c005 100644
--- a/main.py
+++ b/main.py
@@ -15,6 +15,7 @@ def main():
     repo = os.getenv('INPUT_REPO')
     before = os.getenv('INPUT_BEFORE')
     sha = os.getenv('INPUT_SHA')
+    comment_marker = os.getenv('COMMENT_MARKER')
     label = os.getenv('INPUT_LABEL')
     params = {
         'access_token': os.getenv('INPUT_TOKEN')
@@ -30,17 +31,74 @@ def main():
     if diff_request.status_code == 200:
         diff = diff_request.text
 
-        # Check for additions in the diff.
-        addition_pattern = re.compile(r'(?<=^\+).*', re.MULTILINE)
-        additions = addition_pattern.findall(diff)
-        new_issues = []
+        header_pattern = re.compile(r'(?<=diff\s--git\s).+')
+        hunk_start_pattern = re.compile(r'((?<=^@@\s).+(?=\s@@))')
+        line_num_pattern = re.compile(r'(?<=\+).+')
+        addition_pattern = re.compile(r'(?<=^\+\s).+')
+        deletion_pattern = re.compile(r'(?<=^-\s).+')
+        todo_pattern = re.compile(r'(?<=' + label + r'\s).+')
+        comment_pattern = re.compile(r'(?<=' + comment_marker + r'\s).+')
 
-        # Filter the additions down to newly added TODOs.
-        for addition in additions:
-            todo_pattern = re.compile(r'(?<=' + label + r'\s).*')
-            todos = todo_pattern.search(addition)
-            if todos:
-                new_issues.append(todos.group(0))
+        new_issues = []
+        closed_issues = []
+
+        # Read the diff file one line at a time, checking for additions/deletions in each hunk.
+        with open(diff, 'r') as diff_file:
+            curr_file = None
+            hunk_start = None
+            hunk_index = None
+            recording = False
+
+            for n, line in enumerate(diff_file):
+                # First look for a diff header so we can determine the file the changes relate to.
+                header_search = header_pattern.search(line)
+                if header_search:
+                    files = header_search.group(0).split(' ')
+                    curr_file = files[1][2:]
+                else:
+                    # Look for hunks so we can get the line numbers for the changes.
+                    hunk_search = hunk_start_pattern.search(line)
+                    if hunk_search:
+                        hunk = hunk_search.group(0)
+                        line_nums = line_num_pattern.search(hunk).group(0).split(',')
+                        hunk_start = int(line_nums[0])
+                        hunk_index = n
+                    else:
+                        # Look for additions and deletions (specifically TODOs) within each hunk.
+                        addition_search = addition_pattern.search(line)
+                        if addition_search:
+                            addition = addition_search.group(0)
+                            todo_search = todo_pattern.search(addition)
+                            if todo_search:
+                                # Start recording so we can capture multiline TODOs.
+                                recording = True
+                                todo = todo_search.group(0)
+                                new_issue = {
+                                    'labels': ['todo'],
+                                    'todo': todo,
+                                    'body': todo,
+                                    'file': curr_file,
+                                    'line_num': hunk_start + (n - hunk_index - 1)
+                                }
+                                new_issues.append(new_issue)
+                                continue
+                            elif recording:
+                                # If we are recording, check if the current line continues the last.
+                                comment_search = comment_pattern.search(addition)
+                                if comment_search:
+                                    comment = comment_search.group(0).lstrip()
+                                    last_issue = new_issues[len(new_issues) - 1]
+                                    last_issue['body'] += '\n' + comment
+                                continue
+                        else:
+                            deletion_search = deletion_pattern.search(line)
+                            if deletion_search:
+                                deletion = deletion_search.group(0)
+                                todo_search = todo_pattern.search(deletion)
+                                if todo_search:
+                                    closed_issues.append(todo_search.group(0))
+                if recording:
+                    recording = False
 
         # Create new issues for any newly added TODOs.
         issues_url = f'{base_url}{repo}/issues'
@@ -48,27 +106,18 @@ def main():
             'Content-Type': 'application/json',
         }
         for issue in new_issues:
-            title = issue
+            title = issue['todo']
             # Truncate the title if it's longer than 50 chars.
             if len(title) > 50:
-                title = issue[:50] + '...'
-            new_issue_body = {'title': title, 'body': issue, 'labels': ['todo']}
+                title = title[:50] + '...'
+            file = issue['file']
+            line = issue['line_num']
+            body = issue['body'] + '\n' + f'https://github.com/{ repo }/blob/{ sha }/{ file }#L{ line }'
+            new_issue_body = {'title': title, 'body': body, 'labels': ['todo']}
             requests.post(url=issues_url, headers=issue_headers, params=params, data=json.dumps(new_issue_body))
             # Don't add too many issues too quickly.
             sleep(1)
 
-        # Check for deletions in the diff.
-        deletion_pattern = re.compile(r'(?<=^-).*', re.MULTILINE)
-        deletions = deletion_pattern.findall(diff)
-        closed_issues = []
-
-        # Filter the deletions down to removed TODOs.
-        for deletion in deletions:
-            todo_pattern = re.compile(r'(?<=' + label + r'\s).*')
-            todos = todo_pattern.search(deletion)
-            if todos:
-                closed_issues.append(todos.group(0))
-
         if len(closed_issues) > 0:
             # Get the list of current issues.
             list_issues_request = requests.get(issues_url, headers=issue_headers, params=params)