[Release] Correctly translate contributor names in release notes

This commit involves three main changes:

(1) It separates the translation of contributor names from the
generation of the contributors list. This is largely motivated
by the GitHub API rate limit; even if we exceed this limit, we
should at least be able to proceed manually as before. This is
why the translation logic is abstracted into its own script,
translate-contributors.py.

(2) When we look for candidate replacements for invalid author
names, we should look for the assignees of the associated JIRAs
too. As a result, the intermediate file must keep track of these.

(3) This provides an interactive mode with which the user can
sit at the terminal and manually pick the candidate replacement
that he/she thinks makes the most sense. As before, there is a
non-interactive mode that picks the first candidate that the
script considers "valid."

TODO: We should have a known_contributors file that stores
known mappings so we don't have to go through all of this
translation every time. This is also valuable because some
contributors simply cannot be automatically translated.
This commit is contained in:
Andrew Or 2014-12-03 19:08:29 -08:00
parent 657a88835d
commit a4dfb4efef
4 changed files with 229 additions and 55 deletions

3
.gitignore vendored
View file

@ -5,6 +5,7 @@
*.ipr
*.iml
*.iws
*.pyc
.idea/
.idea_modules/
sbt/*.jar
@ -49,6 +50,8 @@ dependency-reduced-pom.xml
checkpoint
derby.log
dist/
dev/create-release/*txt
dev/create-release/*new
spark-*-bin-*.tgz
unit-tests.log
/lib/

View file

@ -26,8 +26,6 @@ from releaseutils import *
# You must set the following before use!
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
START_COMMIT = os.environ.get("START_COMMIT", "37b100")
END_COMMIT = os.environ.get("END_COMMIT", "3693ae")
@ -40,8 +38,6 @@ if not START_COMMIT or not END_COMMIT:
END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ")
# Verify provided arguments
if not JIRA_USERNAME: sys.exit("JIRA_USERNAME must be provided")
if not JIRA_PASSWORD: sys.exit("JIRA_PASSWORD must be provided")
start_commit_line = get_one_line(START_COMMIT)
end_commit_line = get_one_line(END_COMMIT)
num_commits = num_commits_in_range(START_COMMIT, END_COMMIT)
@ -60,14 +56,6 @@ if response.lower() != "y" and response:
sys.exit("Ok, exiting")
print "==================================================================================\n"
# Setup JIRA and github clients. We use two JIRA clients, one with authentication
# and one without, because authentication is slow and required only when we query
# JIRA user details but not Spark issues
jira_options = { "server": JIRA_API_BASE }
jira_client = JIRA(options = jira_options)
jira_client_auth = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
github_client = Github()
# Find all commits within this range
print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT)
commits = get_one_line_commits(START_COMMIT, END_COMMIT)
@ -105,13 +93,17 @@ if releases or reverts or nojiras:
if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts)
if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
print "==================== Warning: the above commits will be ignored ==================\n"
response = raw_input("%d commits left to process. Ok to proceed? [y/N] " % len(filtered_commits))
if response.lower() != "y":
response = raw_input("%d commits left to process. Ok to proceed? [Y/n] " % len(filtered_commits))
if response.lower() != "y" and response:
sys.exit("Ok, exiting.")
# Keep track of warnings to tell the user at the end
warnings = []
# Mapping from the invalid author name to its associated JIRA issues
# E.g. andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471")
invalid_authors = {}
# Populate a map that groups issues and components by author
# It takes the form: Author name -> { Contribution type -> Spark components }
# For instance,
@ -127,16 +119,23 @@ warnings = []
# }
#
author_info = {}
jira_options = { "server": JIRA_API_BASE }
jira_client = JIRA(options = jira_options)
print "\n=========================== Compiling contributor list ==========================="
for commit in filtered_commits:
commit_hash = re.findall("^[a-z0-9]+", commit)[0]
issues = re.findall("SPARK-[0-9]+", commit.upper())
# Translate the author in case the github username is not an actual name
# Also guard against any special characters used in the name
# Note the JIRA client we use here must have authentication enabled
author = get_author(commit_hash)
author = unidecode.unidecode(unicode(author, "UTF-8"))
author = translate_author(author, github_client, jira_client_auth, warnings)
author = unidecode.unidecode(unicode(author, "UTF-8")).strip() # guard against special characters
# If the author name is invalid, keep track of it along
# with all associated issues so we can translate it later
if is_valid_author(author):
author = capitalize_author(author)
else:
if author not in invalid_authors:
invalid_authors[author] = set()
for issue in issues:
invalid_authors[author].add(issue)
date = get_date(commit_hash)
# Parse components from the commit message, if any
commit_components = find_components(commit, commit_hash)
@ -147,7 +146,7 @@ for commit in filtered_commits:
author_info[author] = {}
if issue_type not in author_info[author]:
author_info[author][issue_type] = set()
for component in all_components:
for component in components:
author_info[author][issue_type].add(component)
# Find issues and components associated with this commit
for issue in issues:
@ -168,7 +167,6 @@ print "=========================================================================
# Each line takes the format "Author name - semi-colon delimited contributions"
# e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core
# e.g. Tathagata Das - Bug fixes and new features in Streaming
contributors_file_name = "contributors.txt"
contributors_file = open(contributors_file_name, "w")
authors = author_info.keys()
authors.sort()
@ -192,11 +190,23 @@ for author in authors:
# Do not use python's capitalize() on the whole string to preserve case
assert contribution
contribution = contribution[0].capitalize() + contribution[1:]
# If the author name is invalid, use an intermediate format that
# can be translated through translate-contributors.py later
# E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672
if author in invalid_authors and invalid_authors[author]:
author = author + "/" + "/".join(invalid_authors[author])
line = "%s - %s" % (author, contribution)
contributors_file.write(line + "\n")
contributors_file.close()
print "Contributors list is successfully written to %s!" % contributors_file_name
# Prompt the user to translate author names if necessary
if invalid_authors:
warnings.append("Found the following invalid authors:")
for a in invalid_authors:
warnings.append("\t%s" % a)
warnings.append("Please run './translate-contributors.py' to translate them.")
# Log any warnings encountered in the process
if warnings:
print "\n============ Warnings encountered while creating the contributor list ============"

View file

@ -44,6 +44,9 @@ except ImportError:
print "Install using 'sudo pip install unidecode'"
sys.exit(-1)
# Contributors list file name
contributors_file_name = "contributors.txt"
# Utility functions that run git commands (written with Git 1.8.5)
def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0]
def get_author(commit_hash):
@ -69,7 +72,8 @@ known_issue_types = {
"build": "build fixes",
"improvement": "improvements",
"new feature": "new features",
"documentation": "documentation"
"documentation": "documentation",
"test": "test"
}
# Maintain a mapping for translating component names when creating the release notes
@ -182,36 +186,3 @@ def capitalize_author(author):
words = [w[0].capitalize() + w[1:] for w in words if w]
return " ".join(words)
# Maintain a mapping of translated author names as a cache
translated_authors = {}
# Format the given author in a format appropriate for the contributors list.
# If the author is not an actual name, search github and JIRA for potential
# replacements and log all candidates as a warning.
def translate_author(github_author, github_client, jira_client, warnings):
if is_valid_author(github_author):
return capitalize_author(github_author)
# If the translated author is already cached, just return it
if github_author in translated_authors:
return translated_authors[github_author]
# Otherwise, author name is not found, so we need to search for an alternative name
candidates = set()
github_name = get_github_name(github_author, github_client)
jira_name = get_jira_name(github_author, jira_client)
if is_valid_author(github_name): github_name = capitalize_author(github_name)
if is_valid_author(jira_name): jira_name = capitalize_author(jira_name)
if github_name: candidates.add(github_name)
if jira_name: candidates.add(jira_name)
# Only use the github name as a replacement automatically
# The JIRA name may not make sense because it can belong to someone else
if is_valid_author(github_name):
candidates_message = " (another candidate is %s)" % jira_name if jira_name else ""
warnings.append("Replacing github user %s with %s%s" % (github_author, github_name, candidates_message))
translated_authors[github_name] = github_name
return translated_authors[github_name]
# No direct replacement, so return the original author and list any candidates found
candidates_message = " (candidates: %s)" % nice_join(candidates) if candidates else ""
warnings.append("Unable to find a replacement for github user %s%s" % (github_author, candidates_message))
translated_authors[github_author] = github_author
return translated_authors[github_author]

View file

@ -0,0 +1,190 @@
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script translates invalid authors in the contributors list generated
# by generate-contributors.py. When the script encounters an author name that
# is considered invalid, it searches Github and JIRA for possible
# replacements. This tool runs in two modes:
#
# (1) Interactive mode: For each invalid author name, this script presents
# all candidate replacements to the user and awaits user response. In this
# mode, the user may also input a custom name. This is the default.
#
# (2) Non-interactive mode: For each invalid author name, this script replaces
# the name with the first valid candidate it can find. If there is none, it
# uses the original name. This can be enabled through the --non-interactive flag.
import os
import sys
from releaseutils import *
# You must set the following before use!
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
if not JIRA_USERNAME or not JIRA_PASSWORD:
sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set")
# Write new contributors list to <old_file_name>.new
if not os.path.isfile(contributors_file_name):
print "Contributors file %s does not exist!" % contributors_file_name
print "Have you run ./generate-contributors.py yet?"
sys.exit(1)
contributors_file = open(contributors_file_name, "r")
new_contributors_file_name = contributors_file_name + ".new"
new_contributors_file = open(new_contributors_file_name, "w")
warnings = []
# In non-interactive mode, this script will choose the first replacement that is valid
INTERACTIVE_MODE = True
if len(sys.argv) > 1:
options = set(sys.argv[1:])
if "--non-interactive" in options:
INTERACTIVE_MODE = False
if INTERACTIVE_MODE:
print "Running in interactive mode. To disable this, provide the --non-interactive flag."
# Setup Github and JIRA clients
jira_options = { "server": JIRA_API_BASE }
jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
github_client = Github()
# Generate candidates for the given author. This should only be called if the given author
# name does not represent a full name as this operation is somewhat expensive. Under the
# hood, it makes several calls to the Github and JIRA API servers to find the candidates.
#
# Arguments:
#   author - the user name to look up on Github and JIRA
#   issues - JIRA issue keys (e.g. "SPARK-1444") whose assignees are also considered
#
# This returns a list of (candidate name, source) 2-tuples. E.g.
# [
#   (NOT_FOUND, "No full name found for Github user andrewor14"),
#   ("Andrew Or", "Full name of JIRA user andrewor14"),
#   ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"),
#   ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"),
#   (NOT_FOUND, "No assignee found for SPARK-1763")
# ]
NOT_FOUND = "Not found"
def generate_candidates(author, issues):
    candidates = []
    # First check for full name of Github user
    # Note: look up the `author` argument itself rather than relying on a
    # same-named global set by the caller
    github_name = get_github_name(author, github_client)
    if github_name:
        candidates.append((github_name, "Full name of Github user %s" % author))
    else:
        candidates.append((NOT_FOUND, "No full name found for Github user %s" % author))
    # Then do the same for JIRA user
    jira_name = get_jira_name(author, jira_client)
    if jira_name:
        candidates.append((jira_name, "Full name of JIRA user %s" % author))
    else:
        candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author))
    # Then do the same for the assignee of each of the associated JIRAs
    # Note that a given issue may not have an assignee, or the assignee may not have a full name
    for issue in issues:
        jira_issue = jira_client.issue(issue)
        jira_assignee = jira_issue.fields.assignee
        if jira_assignee:
            user_name = jira_assignee.name
            display_name = jira_assignee.displayName
            if display_name:
                candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name)))
            else:
                candidates.append((NOT_FOUND, "No full name found for %s assignee %s" % (issue, user_name)))
        else:
            candidates.append((NOT_FOUND, "No assignee found for %s" % issue))
    # Guard against special characters in candidate names
    # Note that the candidate name may already be in unicode (JIRA returns this)
    for i, (candidate, source) in enumerate(candidates):
        try:
            candidate = unicode(candidate, "UTF-8")
        except TypeError:
            # already in unicode
            pass
        candidate = unidecode.unidecode(candidate).strip()
        candidates[i] = (candidate, source)
    return candidates
# Translate each invalid author by searching for possible candidates from Github and JIRA
# In interactive mode, this script presents the user with a list of choices and have the user
# select from this list. Additionally, the user may also choose to enter a custom name.
# In non-interactive mode, this script picks the first valid author name from the candidates
# If no such name exists, the original name is used (without the JIRA numbers).
print "\n========================== Translating contributor list =========================="
for line in contributors_file:
author = line.split(" - ")[0]
print "Processing author %s" % author
if not author:
print " ERROR: Expected the following format <author> - <contributions>"
print " ERROR: Actual = %s" % line
if not is_valid_author(author):
new_author = author.split("/")[0]
issues = author.split("/")[1:]
candidates = generate_candidates(new_author, issues)
# Print out potential replacement candidates along with the sources, e.g.
# [X] No full name found for Github user andrewor14
# [0] Andrew Or - Full name of JIRA user andrewor14
# [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14
# [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14
# [X] No assignee found for SPARK-1763
# [3] Custom
candidate_names = []
for candidate, source in candidates:
if candidate == NOT_FOUND:
print " [X] %s" % source
else:
index = len(candidate_names)
candidate_names.append(candidate)
print " [%d] %s - %s" % (index, candidate, source)
custom_index = len(candidate_names)
# In interactive mode, additionally provide "custom" option and await user response
if INTERACTIVE_MODE:
print " [%d] Custom" % custom_index
response = raw_input(" Your choice: ")
while not response.isdigit() or int(response) > custom_index:
response = raw_input(" Please enter an integer between 0 and %d: " % custom_index)
response = int(response)
if response == custom_index:
new_author = raw_input(" Please type a custom name for this author: ")
else:
new_author = candidate_names[response]
# In non-interactive mode, just pick the first candidate
else:
valid_candidate_names = [name for name, _ in candidates\
if is_valid_author(name) and name != NOT_FOUND]
if valid_candidate_names:
new_author = valid_candidate_names[0]
# Finally, capitalize the author and replace the original one with it
# If the final replacement is still invalid, log a warning
if is_valid_author(new_author):
new_author = capitalize_author(new_author)
else:
warnings.append("Unable to find a valid name %s for author %s" % (new_author, author))
print " * Replacing %s with %s" % (author, new_author)
line = line.replace(author, new_author)
new_contributors_file.write(line)
print "==================================================================================\n"
contributors_file.close()
new_contributors_file.close()
print "Translated contributors list successfully written to %s!" % new_contributors_file_name
# Log any warnings encountered in the process
if warnings:
print "\n========== Warnings encountered while translating the contributor list ==========="
for w in warnings: print w
print "Please manually correct these in the final contributors list at %s." % new_contributors_file_name
print "==================================================================================\n"