spark-instrumented-optimizer/dev/create-release/generate-contributors.py

#!/usr/bin/env python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script automates the process of creating release notes.

import os
import re
import sys

from releaseutils import *

# You must set the following before use!
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2")
PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0")

# If the release tags are not provided, prompt the user to provide them
while not tag_exists(RELEASE_TAG):
    RELEASE_TAG = raw_input("Please provide a valid release tag: ")
while not tag_exists(PREVIOUS_RELEASE_TAG):
    print("Please specify the previous release tag.")
    PREVIOUS_RELEASE_TAG = raw_input(
        "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ")

# Gather commits found in the new tag but not in the old tag.
# This filters commits based on both the git hash and the PR number.
# If either is present in the old tag, then we ignore the commit.
print("Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))
release_commits = get_commits(RELEASE_TAG)
previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG)
previous_release_hashes = set()
previous_release_prs = set()
for old_commit in previous_release_commits:
    previous_release_hashes.add(old_commit.get_hash())
    if old_commit.get_pr_number():
        previous_release_prs.add(old_commit.get_pr_number())
new_commits = []
for this_commit in release_commits:
    this_hash = this_commit.get_hash()
    this_pr_number = this_commit.get_pr_number()
    if this_hash in previous_release_hashes:
        continue
    if this_pr_number and this_pr_number in previous_release_prs:
        continue
    new_commits.append(this_commit)
if not new_commits:
    sys.exit("There are no new commits between %s and %s!" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))

# Prompt the user for confirmation that the commit range is correct
print("\n==================================================================================")
print("JIRA server: %s" % JIRA_API_BASE)
print("Release tag: %s" % RELEASE_TAG)
print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
print("Number of commits in this range: %s" % len(new_commits))
print("")


def print_indented(_list):
    for x in _list:
        print("  %s" % x)
if yesOrNoPrompt("Show all commits?"):
    print_indented(new_commits)
print("==================================================================================\n")
if not yesOrNoPrompt("Does this look correct?"):
    sys.exit("Ok, exiting")

# Filter out special commits
releases = []
maintenance = []
reverts = []
nojiras = []
filtered_commits = []


def is_release(commit_title):
    return ("[release]" in commit_title.lower() or
            "preparing spark release" in commit_title.lower() or
            "preparing development version" in commit_title.lower() or
            "CHANGES.txt" in commit_title)


def is_maintenance(commit_title):
    return "maintenance" in commit_title.lower() or \
        "manually close" in commit_title.lower()


def has_no_jira(commit_title):
    return not re.findall("SPARK-[0-9]+", commit_title.upper())


def is_revert(commit_title):
    return "revert" in commit_title.lower()


def is_docs(commit_title):
    return re.findall("docs*", commit_title.lower()) or \
        "programming guide" in commit_title.lower()


for c in new_commits:
    t = c.get_title()
    if not t:
        continue
    elif is_release(t):
        releases.append(c)
    elif is_maintenance(t):
        maintenance.append(c)
    elif is_revert(t):
        reverts.append(c)
    elif is_docs(t):
        filtered_commits.append(c)  # docs may not have JIRA numbers
    elif has_no_jira(t):
        nojiras.append(c)
    else:
        filtered_commits.append(c)

# Warn against ignored commits
if releases or maintenance or reverts or nojiras:
    print("\n==================================================================================")
    if releases:
        print("Found %d release commits" % len(releases))
    if maintenance:
        print("Found %d maintenance commits" % len(maintenance))
    if reverts:
        print("Found %d revert commits" % len(reverts))
    if nojiras:
        print("Found %d commits with no JIRA" % len(nojiras))
    print("* Warning: these commits will be ignored.\n")
    if yesOrNoPrompt("Show ignored commits?"):
        if releases:
            print("Release (%d)" % len(releases))
            print_indented(releases)
        if maintenance:
            print("Maintenance (%d)" % len(maintenance))
            print_indented(maintenance)
        if reverts:
            print("Revert (%d)" % len(reverts))
            print_indented(reverts)
        if nojiras:
            print("No JIRA (%d)" % len(nojiras))
            print_indented(nojiras)
    print("==================== Warning: the above commits will be ignored ==================\n")
prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits)
if not yesOrNoPrompt(prompt_msg):
    sys.exit("Ok, exiting.")

# Keep track of warnings to tell the user at the end
warnings = []

# Mapping from the invalid author name to its associated JIRA issues
# E.g. andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471")
invalid_authors = {}

# Populate a map that groups issues and components by author
# It takes the form: Author name -> { Contribution type -> Spark components }
# For instance,
# {
#   'Andrew Or': {
#     'bug fixes': ['windows', 'core', 'web ui'],
#     'improvements': ['core']
#   },
#   'Tathagata Das' : {
#     'bug fixes': ['streaming']
#     'new feature': ['streaming']
#   }
# }
#
author_info = {}
jira_options = {"server": JIRA_API_BASE}
jira_client = JIRA(options=jira_options)
print("\n=========================== Compiling contributor list ===========================")
for commit in filtered_commits:
    _hash = commit.get_hash()
    title = commit.get_title()
    issues = re.findall("SPARK-[0-9]+", title.upper())
    author = commit.get_author()
    date = get_date(_hash)
    # If the author name is invalid, keep track of it along
    # with all associated issues so we can translate it later
    if is_valid_author(author):
        author = capitalize_author(author)
    else:
        if author not in invalid_authors:
            invalid_authors[author] = set()
        for issue in issues:
            invalid_authors[author].add(issue)
    # Parse components from the commit title, if any
    commit_components = find_components(title, _hash)
    # Populate or merge an issue into author_info[author]

    def populate(issue_type, components):
        components = components or [CORE_COMPONENT]  # assume core if no components provided
        if author not in author_info:
            author_info[author] = {}
        if issue_type not in author_info[author]:
            author_info[author][issue_type] = set()
        for component in components:
            author_info[author][issue_type].add(component)
    # Find issues and components associated with this commit
    for issue in issues:
        try:
            jira_issue = jira_client.issue(issue)
            jira_type = jira_issue.fields.issuetype.name
            jira_type = translate_issue_type(jira_type, issue, warnings)
            jira_components = [translate_component(c.name, _hash, warnings)
                               for c in jira_issue.fields.components]
            all_components = set(jira_components + commit_components)
            populate(jira_type, all_components)
        except Exception as e:
            print("Unexpected error:", e)
    # For docs without an associated JIRA, manually add it ourselves
    if is_docs(title) and not issues:
        populate("documentation", commit_components)
    print("  Processed commit %s authored by %s on %s" % (_hash, author, date))
print("==================================================================================\n")

# Write to contributors file ordered by author names
# Each line takes the format " * Author name -- semi-colon delimited contributions"
# e.g. * Andrew Or -- Bug fixes in Windows, Core, and Web UI; improvements in Core
# e.g. * Tathagata Das -- Bug fixes and new features in Streaming
contributors_file = open(contributors_file_name, "w")
authors = author_info.keys()
authors.sort()
for author in authors:
    contribution = ""
    components = set()
    issue_types = set()
    for issue_type, comps in author_info[author].items():
        components.update(comps)
        issue_types.add(issue_type)
    # If there is only one component, mention it only once
    # e.g. Bug fixes, improvements in MLlib
    if len(components) == 1:
        contribution = "%s in %s" % (nice_join(issue_types), next(iter(components)))
    # Otherwise, group contributions by issue types instead of modules
    # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN
    else:
        contributions = ["%s in %s" % (issue_type, nice_join(comps))
                         for issue_type, comps in author_info[author].items()]
        contribution = "; ".join(contributions)
    # Do not use python's capitalize() on the whole string to preserve case
    assert contribution
    contribution = contribution[0].capitalize() + contribution[1:]
    # If the author name is invalid, use an intermediate format that
    # can be translated through translate-contributors.py later
    # E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672
    if author in invalid_authors and invalid_authors[author]:
        author = author + "/" + "/".join(invalid_authors[author])
    # line = " * %s -- %s" % (author, contribution)
    line = author
    contributors_file.write(line + "\n")
contributors_file.close()
print("Contributors list is successfully written to %s!" % contributors_file_name)

# Prompt the user to translate author names if necessary
if invalid_authors:
    warnings.append("Found the following invalid authors:")
    for a in invalid_authors:
        warnings.append("\t%s" % a)
    warnings.append("Please run './translate-contributors.py' to translate them.")

# Log any warnings encountered in the process
if warnings:
    print("\n============ Warnings encountered while creating the contributor list ============")
    for w in warnings:
        print(w)
    print("Please correct these in the final contributors list at %s." % contributors_file_name)
    print("==================================================================================\n")