[Release] Correctly translate contributor names in release notes

This commit involves three main changes: (1) It separates the translation of contributor names from the generation of the contributors list. This is largely motivated by the GitHub API limit; even if we exceed this limit, we should at least be able to proceed manually as before. This is why the translation logic is abstracted into its own script, translate-contributors.py. (2) When we look for candidate replacements for invalid author names, we should also look at the assignees of the associated JIRAs. As a result, the intermediate file must keep track of these. (3) It provides an interactive mode in which the user can sit at the terminal and manually pick the candidate replacement that they think makes the most sense. As before, there is a non-interactive mode that picks the first candidate that the script considers "valid." TODO: We should have a known_contributors file that stores known mappings so we don't have to go through all of this translation every time. This is also valuable because some contributors simply cannot be translated automatically.
2014-12-03 19:08:29 -08:00 · 2014-12-03 19:08:29 -08:00 · a4dfb4efef
parent 657a88835d
commit a4dfb4efef
4 changed files with 229 additions and 55 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,7 @@
 *.ipr
 *.iml
 *.iws
+*.pyc
 .idea/
 .idea_modules/
 sbt/*.jar
@ -49,6 +50,8 @@ dependency-reduced-pom.xml
 checkpoint
 derby.log
 dist/
+dev/create-release/*txt
+dev/create-release/*new
 spark-*-bin-*.tgz
 unit-tests.log
 /lib/
--- a/dev/create-release/generate-contributors.py
+++ b/dev/create-release/generate-contributors.py
@ -26,8 +26,6 @@ from releaseutils import *

 # You must set the following before use!
 JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
-JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
-JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
 START_COMMIT = os.environ.get("START_COMMIT", "37b100")
 END_COMMIT = os.environ.get("END_COMMIT", "3693ae")

@ -40,8 +38,6 @@ if not START_COMMIT or not END_COMMIT:
        END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ")

 # Verify provided arguments
-if not JIRA_USERNAME: sys.exit("JIRA_USERNAME must be provided")
-if not JIRA_PASSWORD: sys.exit("JIRA_PASSWORD must be provided")
 start_commit_line = get_one_line(START_COMMIT)
 end_commit_line = get_one_line(END_COMMIT)
 num_commits = num_commits_in_range(START_COMMIT, END_COMMIT)
@ -60,14 +56,6 @@ if response.lower() != "y" and response:
    sys.exit("Ok, exiting")
 print "==================================================================================\n"

-# Setup JIRA and github clients. We use two JIRA clients, one with authentication
-# and one without, because authentication is slow and required only when we query
-# JIRA user details but not Spark issues
-jira_options = { "server": JIRA_API_BASE }
-jira_client = JIRA(options = jira_options)
-jira_client_auth = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
-github_client = Github()
-
 # Find all commits within this range
 print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT)
 commits = get_one_line_commits(START_COMMIT, END_COMMIT)
@ -105,13 +93,17 @@ if releases or reverts or nojiras:
    if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts)
    if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
    print "==================== Warning: the above commits will be ignored ==================\n"
-response = raw_input("%d commits left to process. Ok to proceed? [y/N] " % len(filtered_commits))
-if response.lower() != "y":
+response = raw_input("%d commits left to process. Ok to proceed? [Y/n] " % len(filtered_commits))
+if response.lower() != "y" and response:
    sys.exit("Ok, exiting.")

 # Keep track of warnings to tell the user at the end
 warnings = []

+# Mapping from the invalid author name to its associated JIRA issues
+# E.g. andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471")
+invalid_authors = {}
+
 # Populate a map that groups issues and components by author
 # It takes the form: Author name -> { Contribution type -> Spark components }
 # For instance,
@ -127,16 +119,23 @@ warnings = []
 # }
 #
 author_info = {}
+jira_options = { "server": JIRA_API_BASE }
+jira_client = JIRA(options = jira_options)
 print "\n=========================== Compiling contributor list ==========================="
 for commit in filtered_commits:
    commit_hash = re.findall("^[a-z0-9]+", commit)[0]
    issues = re.findall("SPARK-[0-9]+", commit.upper())
-    # Translate the author in case the github username is not an actual name
-    # Also guard against any special characters used in the name
-    # Note the JIRA client we use here must have authentication enabled
    author = get_author(commit_hash)
-    author = unidecode.unidecode(unicode(author, "UTF-8"))
-    author = translate_author(author, github_client, jira_client_auth, warnings)
+    author = unidecode.unidecode(unicode(author, "UTF-8")).strip() # guard against special characters
+    # If the author name is invalid, keep track of it along
+    # with all associated issues so we can translate it later
+    if is_valid_author(author):
+        author = capitalize_author(author)
+    else:
+        if author not in invalid_authors:
+            invalid_authors[author] = set()
+        for issue in issues:
+            invalid_authors[author].add(issue)
    date = get_date(commit_hash)
    # Parse components from the commit message, if any
    commit_components = find_components(commit, commit_hash)
@ -147,7 +146,7 @@ for commit in filtered_commits:
            author_info[author] = {}
        if issue_type not in author_info[author]:
            author_info[author][issue_type] = set()
-        for component in all_components:
+        for component in components:
            author_info[author][issue_type].add(component)
    # Find issues and components associated with this commit
    for issue in issues:
@ -168,7 +167,6 @@ print "=========================================================================
 # Each line takes the format "Author name - semi-colon delimited contributions"
 # e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core
 # e.g. Tathagata Das - Bug fixes and new features in Streaming
-contributors_file_name = "contributors.txt"
 contributors_file = open(contributors_file_name, "w")
 authors = author_info.keys()
 authors.sort()
@ -192,11 +190,23 @@ for author in authors:
    # Do not use python's capitalize() on the whole string to preserve case
    assert contribution
    contribution = contribution[0].capitalize() + contribution[1:]
+    # If the author name is invalid, use an intermediate format that
+    # can be translated through translate-contributors.py later
+    # E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672
+    if author in invalid_authors and invalid_authors[author]:
+        author = author + "/" + "/".join(invalid_authors[author])
    line = "%s - %s" % (author, contribution)
    contributors_file.write(line + "\n")
 contributors_file.close()
 print "Contributors list is successfully written to %s!" % contributors_file_name

+# Prompt the user to translate author names if necessary
+if invalid_authors:
+    warnings.append("Found the following invalid authors:")
+    for a in invalid_authors:
+        warnings.append("\t%s" % a)
+    warnings.append("Please run './translate-contributors.py' to translate them.")
+
 # Log any warnings encountered in the process
 if warnings:
    print "\n============ Warnings encountered while creating the contributor list ============"
--- a/dev/create-release/releaseutils.py
+++ b/dev/create-release/releaseutils.py
@ -44,6 +44,9 @@ except ImportError:
    print "Install using 'sudo pip install unidecode'"
    sys.exit(-1)

+# Contributors list file name
+contributors_file_name = "contributors.txt"
+
 # Utility functions run git commands (written with Git 1.8.5)
 def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0]
 def get_author(commit_hash):
@ -69,7 +72,8 @@ known_issue_types = {
    "build": "build fixes",
    "improvement": "improvements",
    "new feature": "new features",
-    "documentation": "documentation"
+    "documentation": "documentation",
+    "test": "test"
 }

 # Maintain a mapping for translating component names when creating the release notes
@ -182,36 +186,3 @@ def capitalize_author(author):
    words = [w[0].capitalize() + w[1:] for w in words if w]
    return " ".join(words)

-# Maintain a mapping of translated author names as a cache
-translated_authors = {}
-
-# Format the given author in a format appropriate for the contributors list.
-# If the author is not an actual name, search github and JIRA for potential
-# replacements and log all candidates as a warning.
-def translate_author(github_author, github_client, jira_client, warnings):
-    if is_valid_author(github_author):
-        return capitalize_author(github_author)
-    # If the translated author is already cached, just return it
-    if github_author in translated_authors:
-        return translated_authors[github_author]
-    # Otherwise, author name is not found, so we need to search for an alternative name
-    candidates = set()
-    github_name = get_github_name(github_author, github_client)
-    jira_name = get_jira_name(github_author, jira_client)
-    if is_valid_author(github_name): github_name = capitalize_author(github_name)
-    if is_valid_author(jira_name): jira_name = capitalize_author(jira_name)
-    if github_name: candidates.add(github_name)
-    if jira_name: candidates.add(jira_name)
-    # Only use the github name as a replacement automatically
-    # The JIRA name may not make sense because it can belong to someone else
-    if is_valid_author(github_name):
-        candidates_message = " (another candidate is %s)" % jira_name if jira_name else ""
-        warnings.append("Replacing github user %s with %s%s" % (github_author, github_name, candidates_message))
-        translated_authors[github_name] = github_name
-        return translated_authors[github_name]
-    # No direct replacement, so return the original author and list any candidates found
-    candidates_message = " (candidates: %s)" % nice_join(candidates) if candidates else ""
-    warnings.append("Unable to find a replacement for github user %s%s" % (github_author, candidates_message))
-    translated_authors[github_author] = github_author
-    return translated_authors[github_author]
-
--- a/dev/create-release/translate-contributors.py
+++ b/dev/create-release/translate-contributors.py
@ -0,0 +1,190 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script translates invalid authors in the contributors list generated
+# by generate-contributors.py. When the script encounters an author name that
+# is considered invalid, it searches Github and JIRA in an attempt to find
+# replacements. This tool runs in two modes:
+#
+# (1) Interactive mode: For each invalid author name, this script presents
+# all candidate replacements to the user and awaits user response. In this
+# mode, the user may also input a custom name. This is the default.
+#
+# (2) Non-interactive mode: For each invalid author name, this script replaces
+# the name with the first valid candidate it can find. If there is none, it
+# uses the original name. This can be enabled through the --non-interactive flag.
+
+import os
+import sys
+
+from releaseutils import *
+
+# You must set the following before use!
+JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
+JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
+JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
+if not JIRA_USERNAME or not JIRA_PASSWORD:
+    sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set")
+
+# Write new contributors list to <old_file_name>.new
+if not os.path.isfile(contributors_file_name):
+    print "Contributors file %s does not exist!" % contributors_file_name
+    print "Have you run ./generate-contributors.py yet?"
+    sys.exit(1)
+contributors_file = open(contributors_file_name, "r")
+new_contributors_file_name = contributors_file_name + ".new"
+new_contributors_file = open(new_contributors_file_name, "w")
+warnings = []
+
+# In non-interactive mode, this script will choose the first replacement that is valid
+INTERACTIVE_MODE = True
+if len(sys.argv) > 1:
+    options = set(sys.argv[1:])
+    if "--non-interactive" in options:
+        INTERACTIVE_MODE = False
+if INTERACTIVE_MODE:
+    print "Running in interactive mode. To disable this, provide the --non-interactive flag."
+
+# Setup Github and JIRA clients
+jira_options = { "server": JIRA_API_BASE }
+jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
+github_client = Github()
+
+# Generate candidates for the given author. This should only be called if the given author
+# name does not represent a full name as this operation is somewhat expensive. Under the
+# hood, it makes several calls to the Github and JIRA API servers to find the candidates.
+#
+# This returns a list of (candidate name, source) 2-tuples. E.g.
+# [
+#   (NOT_FOUND, "No full name found for Github user andrewor14"),
+#   ("Andrew Or", "Full name of JIRA user andrewor14"),
+#   ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"),
+#   ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"),
+#   (NOT_FOUND, "No assignee found for SPARK-1763")
+# ]
+NOT_FOUND = "Not found"
+def generate_candidates(author, issues):
+    candidates = []
+    # First check for full name of Github user
+    github_name = get_github_name(new_author, github_client)
+    if github_name:
+        candidates.append((github_name, "Full name of Github user %s" % new_author))
+    else:
+        candidates.append((NOT_FOUND, "No full name found for Github user %s" % new_author))
+    # Then do the same for JIRA user
+    jira_name = get_jira_name(new_author, jira_client)
+    if jira_name:
+        candidates.append((jira_name, "Full name of JIRA user %s" % new_author))
+    else:
+        candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % new_author))
+    # Then do the same for the assignee of each of the associated JIRAs
+    # Note that a given issue may not have an assignee, or the assignee may not have a full name
+    for issue in issues:
+        jira_issue = jira_client.issue(issue)
+        jira_assignee = jira_issue.fields.assignee
+        if jira_assignee:
+            user_name = jira_assignee.name
+            display_name = jira_assignee.displayName
+            if display_name:
+                candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name)))
+            else:
+                candidates.append((NOT_FOUND, "No full name found for %s assignee %" % (issue, user_name)))
+        else:
+            candidates.append((NOT_FOUND, "No assignee found for %s" % issue))
+    # Guard against special characters in candidate names
+    # Note that the candidate name may already be in unicode (JIRA returns this)
+    for i, (candidate, source) in enumerate(candidates):
+        try:
+            candidate = unicode(candidate, "UTF-8")
+        except TypeError:
+            # already in unicode
+            pass
+        candidate = unidecode.unidecode(candidate).strip()
+        candidates[i] = (candidate, source)
+    return candidates
+
+# Translate each invalid author by searching for possible candidates from Github and JIRA
+# In interactive mode, this script presents the user with a list of choices and have the user
+# select from this list. Additionally, the user may also choose to enter a custom name.
+# In non-interactive mode, this script picks the first valid author name from the candidates
+# If no such name exists, the original name is used (without the JIRA numbers).
+print "\n========================== Translating contributor list =========================="
+for line in contributors_file:
+    author = line.split(" - ")[0]
+    print "Processing author %s" % author
+    if not author:
+        print "    ERROR: Expected the following format <author> - <contributions>"
+        print "    ERROR: Actual = %s" % line
+    if not is_valid_author(author):
+        new_author = author.split("/")[0]
+        issues = author.split("/")[1:]
+        candidates = generate_candidates(new_author, issues)
+        # Print out potential replacement candidates along with the sources, e.g.
+        #   [X] No full name found for Github user andrewor14
+        #   [0] Andrew Or - Full name of JIRA user andrewor14
+        #   [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14
+        #   [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14
+        #   [X] No assignee found for SPARK-1763
+        #   [3] Custom
+        candidate_names = []
+        for candidate, source in candidates:
+            if candidate == NOT_FOUND:
+                print "    [X] %s" % source
+            else:
+                index = len(candidate_names)
+                candidate_names.append(candidate)
+                print "    [%d] %s - %s" % (index, candidate, source)
+        custom_index = len(candidate_names)
+        # In interactive mode, additionally provide "custom" option and await user response
+        if INTERACTIVE_MODE:
+            print "    [%d] Custom" % custom_index
+            response = raw_input("    Your choice: ")
+            while not response.isdigit() or int(response) > custom_index:
+                response = raw_input("    Please enter an integer between 0 and %d: " % custom_index)
+            response = int(response)
+            if response == custom_index:
+                new_author = raw_input("    Please type a custom name for this author: ")
+            else:
+                new_author = candidate_names[response]
+        # In non-interactive mode, just pick the first candidate
+        else:
+            valid_candidate_names = [name for name, _ in candidates\
+                if is_valid_author(name) and name != NOT_FOUND]
+            if valid_candidate_names:
+                new_author = valid_candidate_names[0]
+        # Finally, capitalize the author and replace the original one with it
+        # If the final replacement is still invalid, log a warning
+        if is_valid_author(new_author):
+            new_author = capitalize_author(new_author)
+        else:
+            warnings.append("Unable to find a valid name %s for author %s" % (new_author, author))
+        print "    * Replacing %s with %s" % (author, new_author)
+        line = line.replace(author, new_author)
+    new_contributors_file.write(line)
+print "==================================================================================\n"
+contributors_file.close()
+new_contributors_file.close()
+
+print "Translated contributors list successfully written to %s!" % new_contributors_file_name
+
+# Log any warnings encountered in the process
+if warnings:
+    print "\n========== Warnings encountered while translating the contributor list ==========="
+    for w in warnings: print w
+    print "Please manually correct these in the final contributors list at %s." % new_contributors_file_name
+    print "==================================================================================\n"
+