9fcf0ea718
Disallow the use of unused imports: - Unnecessary increases the memory footprint of the application - Removes the imports that are required for the examples in the docstring from the file-scope to the example itself. This keeps the files itself clean, and gives a more complete example as it also includes the imports :) ``` fokkodriesprongFan spark % flake8 python | grep -i "imported but unused" python/pyspark/cloudpickle.py:46:1: F401 'functools.partial' imported but unused python/pyspark/cloudpickle.py:55:1: F401 'traceback' imported but unused python/pyspark/heapq3.py:868:5: F401 '_heapq.*' imported but unused python/pyspark/__init__.py:61:1: F401 'pyspark.version.__version__' imported but unused python/pyspark/__init__.py:62:1: F401 'pyspark._globals._NoValue' imported but unused python/pyspark/__init__.py:115:1: F401 'pyspark.sql.SQLContext' imported but unused python/pyspark/__init__.py:115:1: F401 'pyspark.sql.HiveContext' imported but unused python/pyspark/__init__.py:115:1: F401 'pyspark.sql.Row' imported but unused python/pyspark/rdd.py:21:1: F401 're' imported but unused python/pyspark/rdd.py:29:1: F401 'tempfile.NamedTemporaryFile' imported but unused python/pyspark/mllib/regression.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused python/pyspark/mllib/classification.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/feature.py:30:1: F401 'pyspark.mllib.regression.LabeledPoint' imported but unused python/pyspark/mllib/tests/test_linalg.py:18:1: F401 'sys' imported but unused 
python/pyspark/mllib/tests/test_linalg.py:642:5: F401 'pyspark.mllib.tests.test_linalg.*' imported but unused python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.random' imported but unused python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.exp' imported but unused python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused python/pyspark/mllib/tests/test_feature.py:185:5: F401 'pyspark.mllib.tests.test_feature.*' imported but unused python/pyspark/mllib/tests/test_util.py:97:5: F401 'pyspark.mllib.tests.test_util.*' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg._convert_to_vector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseMatrix' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseMatrix' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.MatrixUDT' imported but unused python/pyspark/mllib/tests/test_stat.py:181:5: F401 'pyspark.mllib.tests.test_stat.*' imported but unused python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.time' imported but unused python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.sleep' imported but unused python/pyspark/mllib/tests/test_streaming_algorithms.py:470:5: F401 'pyspark.mllib.tests.test_streaming_algorithms.*' imported but 
unused python/pyspark/mllib/tests/test_algorithms.py:295:5: F401 'pyspark.mllib.tests.test_algorithms.*' imported but unused python/pyspark/tests/test_serializers.py:90:13: F401 'xmlrunner' imported but unused python/pyspark/tests/test_rdd.py:21:1: F401 'sys' imported but unused python/pyspark/tests/test_rdd.py:29:1: F401 'pyspark.resource.ResourceProfile' imported but unused python/pyspark/tests/test_rdd.py:885:5: F401 'pyspark.tests.test_rdd.*' imported but unused python/pyspark/tests/test_readwrite.py:19:1: F401 'sys' imported but unused python/pyspark/tests/test_readwrite.py:22:1: F401 'array.array' imported but unused python/pyspark/tests/test_readwrite.py:309:5: F401 'pyspark.tests.test_readwrite.*' imported but unused python/pyspark/tests/test_join.py:62:5: F401 'pyspark.tests.test_join.*' imported but unused python/pyspark/tests/test_taskcontext.py:19:1: F401 'shutil' imported but unused python/pyspark/tests/test_taskcontext.py:325:5: F401 'pyspark.tests.test_taskcontext.*' imported but unused python/pyspark/tests/test_conf.py:36:5: F401 'pyspark.tests.test_conf.*' imported but unused python/pyspark/tests/test_broadcast.py:148:5: F401 'pyspark.tests.test_broadcast.*' imported but unused python/pyspark/tests/test_daemon.py:76:5: F401 'pyspark.tests.test_daemon.*' imported but unused python/pyspark/tests/test_util.py:77:5: F401 'pyspark.tests.test_util.*' imported but unused python/pyspark/tests/test_pin_thread.py:19:1: F401 'random' imported but unused python/pyspark/tests/test_pin_thread.py:149:5: F401 'pyspark.tests.test_pin_thread.*' imported but unused python/pyspark/tests/test_worker.py:19:1: F401 'sys' imported but unused python/pyspark/tests/test_worker.py:26:5: F401 'resource' imported but unused python/pyspark/tests/test_worker.py:203:5: F401 'pyspark.tests.test_worker.*' imported but unused python/pyspark/tests/test_profiler.py:101:5: F401 'pyspark.tests.test_profiler.*' imported but unused python/pyspark/tests/test_shuffle.py:18:1: F401 'sys' 
imported but unused python/pyspark/tests/test_shuffle.py:171:5: F401 'pyspark.tests.test_shuffle.*' imported but unused python/pyspark/tests/test_rddbarrier.py:43:5: F401 'pyspark.tests.test_rddbarrier.*' imported but unused python/pyspark/tests/test_context.py:129:13: F401 'userlibrary.UserClass' imported but unused python/pyspark/tests/test_context.py:140:13: F401 'userlib.UserClass' imported but unused python/pyspark/tests/test_context.py:310:5: F401 'pyspark.tests.test_context.*' imported but unused python/pyspark/tests/test_appsubmit.py:241:5: F401 'pyspark.tests.test_appsubmit.*' imported but unused python/pyspark/streaming/dstream.py:18:1: F401 'sys' imported but unused python/pyspark/streaming/tests/test_dstream.py:27:1: F401 'pyspark.RDD' imported but unused python/pyspark/streaming/tests/test_dstream.py:647:5: F401 'pyspark.streaming.tests.test_dstream.*' imported but unused python/pyspark/streaming/tests/test_kinesis.py:83:5: F401 'pyspark.streaming.tests.test_kinesis.*' imported but unused python/pyspark/streaming/tests/test_listener.py:152:5: F401 'pyspark.streaming.tests.test_listener.*' imported but unused python/pyspark/streaming/tests/test_context.py:178:5: F401 'pyspark.streaming.tests.test_context.*' imported but unused python/pyspark/testing/utils.py:30:5: F401 'scipy.sparse' imported but unused python/pyspark/testing/utils.py:36:5: F401 'numpy as np' imported but unused python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._TreeEnsembleParams' imported but unused python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._HasVarianceImpurity' imported but unused python/pyspark/ml/regression.py:29:1: F401 'pyspark.ml.wrapper.JavaParams' imported but unused python/pyspark/ml/util.py:19:1: F401 'sys' imported but unused python/pyspark/ml/__init__.py:25:1: F401 'pyspark.ml.pipeline' imported but unused python/pyspark/ml/pipeline.py:18:1: F401 'sys' imported but unused python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.DenseMatrix' 
imported but unused python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.Vectors' imported but unused python/pyspark/ml/tests/test_training_summary.py:18:1: F401 'sys' imported but unused python/pyspark/ml/tests/test_training_summary.py:364:5: F401 'pyspark.ml.tests.test_training_summary.*' imported but unused python/pyspark/ml/tests/test_linalg.py:381:5: F401 'pyspark.ml.tests.test_linalg.*' imported but unused python/pyspark/ml/tests/test_tuning.py:427:9: F401 'pyspark.sql.functions as F' imported but unused python/pyspark/ml/tests/test_tuning.py:757:5: F401 'pyspark.ml.tests.test_tuning.*' imported but unused python/pyspark/ml/tests/test_wrapper.py:120:5: F401 'pyspark.ml.tests.test_wrapper.*' imported but unused python/pyspark/ml/tests/test_feature.py:19:1: F401 'sys' imported but unused python/pyspark/ml/tests/test_feature.py:304:5: F401 'pyspark.ml.tests.test_feature.*' imported but unused python/pyspark/ml/tests/test_image.py:19:1: F401 'py4j' imported but unused python/pyspark/ml/tests/test_image.py:22:1: F401 'pyspark.testing.mlutils.PySparkTestCase' imported but unused python/pyspark/ml/tests/test_image.py:71:5: F401 'pyspark.ml.tests.test_image.*' imported but unused python/pyspark/ml/tests/test_persistence.py:456:5: F401 'pyspark.ml.tests.test_persistence.*' imported but unused python/pyspark/ml/tests/test_evaluation.py:56:5: F401 'pyspark.ml.tests.test_evaluation.*' imported but unused python/pyspark/ml/tests/test_stat.py:43:5: F401 'pyspark.ml.tests.test_stat.*' imported but unused python/pyspark/ml/tests/test_base.py:70:5: F401 'pyspark.ml.tests.test_base.*' imported but unused python/pyspark/ml/tests/test_param.py:20:1: F401 'sys' imported but unused python/pyspark/ml/tests/test_param.py:375:5: F401 'pyspark.ml.tests.test_param.*' imported but unused python/pyspark/ml/tests/test_pipeline.py:62:5: F401 'pyspark.ml.tests.test_pipeline.*' imported but unused python/pyspark/ml/tests/test_algorithms.py:333:5: F401 'pyspark.ml.tests.test_algorithms.*' 
imported but unused python/pyspark/ml/param/__init__.py:18:1: F401 'sys' imported but unused python/pyspark/resource/tests/test_resources.py:17:1: F401 'random' imported but unused python/pyspark/resource/tests/test_resources.py:20:1: F401 'pyspark.resource.ResourceProfile' imported but unused python/pyspark/resource/tests/test_resources.py:75:5: F401 'pyspark.resource.tests.test_resources.*' imported but unused python/pyspark/sql/functions.py:32:1: F401 'pyspark.sql.udf.UserDefinedFunction' imported but unused python/pyspark/sql/functions.py:34:1: F401 'pyspark.sql.pandas.functions.pandas_udf' imported but unused python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.Row' imported but unused python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.StringType' imported but unused python/pyspark/sql/readwriter.py:1084:5: F401 'pyspark.sql.Row' imported but unused python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.IntegerType' imported but unused python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.Row' imported but unused python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.StringType' imported but unused python/pyspark/sql/context.py:27:1: F401 'pyspark.sql.udf.UDFRegistration' imported but unused python/pyspark/sql/streaming.py:1212:5: F401 'pyspark.sql.Row' imported but unused python/pyspark/sql/tests/test_utils.py:55:5: F401 'pyspark.sql.tests.test_utils.*' imported but unused python/pyspark/sql/tests/test_pandas_map.py:18:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.pandas_udf' imported but unused python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused python/pyspark/sql/tests/test_pandas_map.py:119:5: F401 'pyspark.sql.tests.test_pandas_map.*' imported but unused python/pyspark/sql/tests/test_catalog.py:193:5: F401 'pyspark.sql.tests.test_catalog.*' imported but unused 
python/pyspark/sql/tests/test_group.py:39:5: F401 'pyspark.sql.tests.test_group.*' imported but unused python/pyspark/sql/tests/test_session.py:361:5: F401 'pyspark.sql.tests.test_session.*' imported but unused python/pyspark/sql/tests/test_conf.py:49:5: F401 'pyspark.sql.tests.test_conf.*' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:19:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.sum' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:29:5: F401 'pandas.util.testing.assert_series_equal' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:32:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:248:5: F401 'pyspark.sql.tests.test_pandas_cogrouped_map.*' imported but unused python/pyspark/sql/tests/test_udf.py:24:1: F401 'py4j' imported but unused python/pyspark/sql/tests/test_pandas_udf_typehints.py:246:5: F401 'pyspark.sql.tests.test_pandas_udf_typehints.*' imported but unused python/pyspark/sql/tests/test_functions.py:19:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_functions.py:362:9: F401 'pyspark.sql.functions.exists' imported but unused python/pyspark/sql/tests/test_functions.py:387:5: F401 'pyspark.sql.tests.test_functions.*' imported but unused python/pyspark/sql/tests/test_pandas_udf_scalar.py:21:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_pandas_udf_scalar.py:45:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_pandas_udf_window.py:355:5: F401 'pyspark.sql.tests.test_pandas_udf_window.*' imported but unused python/pyspark/sql/tests/test_arrow.py:38:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_pandas_grouped_map.py:20:1: F401 'sys' imported but unused 
python/pyspark/sql/tests/test_pandas_grouped_map.py:38:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_dataframe.py:382:9: F401 'pyspark.sql.DataFrame' imported but unused python/pyspark/sql/avro/functions.py:125:5: F401 'pyspark.sql.Row' imported but unused python/pyspark/sql/pandas/functions.py:19:1: F401 'sys' imported but unused ``` After: ``` fokkodriesprongFan spark % flake8 python | grep -i "imported but unused" fokkodriesprongFan spark % ``` ### What changes were proposed in this pull request? Removing unused imports from the Python files to keep everything nice and tidy. ### Why are the changes needed? Cleaning up of the imports that aren't used, and suppressing the imports that are used as references to other modules, preserving backward compatibility. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Adding the rule to the existing Flake8 checks. Closes #29121 from Fokko/SPARK-32319. Authored-by: Fokko Driesprong <fokko@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
295 lines
9.9 KiB
Python
Executable file
295 lines
9.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
|
|
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
# This file contains helper methods used in creating a release.
|
|
|
|
import re
|
|
import sys
|
|
from subprocess import Popen, PIPE
|
|
|
|
try:
|
|
from jira.client import JIRA # noqa: F401
|
|
# Old versions have JIRAError in exceptions package, new (0.5+) in utils.
|
|
try:
|
|
from jira.exceptions import JIRAError
|
|
except ImportError:
|
|
from jira.utils import JIRAError
|
|
except ImportError:
|
|
print("This tool requires the jira-python library")
|
|
print("Install using 'sudo pip install jira'")
|
|
sys.exit(-1)
|
|
|
|
try:
|
|
from github import Github # noqa: F401
|
|
from github import GithubException
|
|
except ImportError:
|
|
print("This tool requires the PyGithub library")
|
|
print("Install using 'sudo pip install PyGithub'")
|
|
sys.exit(-1)
|
|
|
|
try:
|
|
import unidecode
|
|
except ImportError:
|
|
print("This tool requires the unidecode library to decode obscure github usernames")
|
|
print("Install using 'sudo pip install unidecode'")
|
|
sys.exit(-1)
|
|
|
|
|
|
# Name of the file the contributors list is written to (used by the
# release helper scripts; presumably consumed by a companion
# translate-contributors step — confirm against the other release scripts).
contributors_file_name = "contributors.txt"
|
|
|
|
|
|
# Prompt the user to answer yes or no until they do so
|
|
def yesOrNoPrompt(msg):
    """Prompt the user until they answer 'y' or 'n'.

    Returns True for 'y' and False for 'n'.
    """
    # Loop instead of recursing: the original re-invoked itself for every
    # invalid answer, so a stubborn (or piped) input stream could blow the
    # recursion limit.
    response = input("%s [y/n]: " % msg)
    while response != "y" and response != "n":
        response = input("%s [y/n]: " % msg)
    return response == "y"
|
|
|
|
|
|
# Utility functions run git commands (written with Git 1.8.5)
|
|
def run_cmd(cmd):
    """Run *cmd* (a list of arguments) and return its stdout as text.

    universal_newlines=True makes Popen decode the output to str. Without
    it, Python 3 returns bytes, and callers such as get_commits() — which
    split the result with str markers — would raise TypeError.
    """
    return Popen(cmd, stdout=PIPE, universal_newlines=True).communicate()[0]
|
|
|
|
|
|
def run_cmd_error(cmd):
    """Run *cmd* and return its stderr as text (stdout is discarded).

    universal_newlines=True decodes stderr to str; without it, Python 3
    returns bytes and tag_exists()'s `"error" not in stderr` check would
    raise TypeError.
    """
    return Popen(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()[1]
|
|
|
|
|
|
def get_date(commit_hash):
    """Return the committer date of *commit_hash*, as git formats %cd."""
    cmd = ["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]
    return run_cmd(cmd)
|
|
|
|
|
|
def tag_exists(tag):
    """Return True if git can resolve *tag* locally.

    `git show` writes an error to stderr when the tag is unknown, so an
    error-free stderr means the tag exists.
    """
    return "error" not in run_cmd_error(["git", "show", tag])
|
|
|
|
|
|
# A type-safe representation of a commit
|
|
# A type-safe representation of a commit
class Commit:
    """Record of a single git commit.

    Holds the abbreviated hash, author, title, and — when it could be
    parsed from the commit body — the associated pull request number.
    """

    def __init__(self, _hash, author, title, pr_number=None):
        self._hash = _hash
        self.author = author
        self.title = title
        self.pr_number = pr_number

    def get_hash(self):
        """Return the abbreviated commit hash."""
        return self._hash

    def get_author(self):
        """Return the commit author's name."""
        return self.author

    def get_title(self):
        """Return the commit title (subject line)."""
        return self.title

    def get_pr_number(self):
        """Return the pull request number, or None if unknown."""
        return self.pr_number

    def __str__(self):
        suffix = "(Closes #%s)" % self.pr_number if self.pr_number else ""
        return "%s %s %s %s" % (self._hash, self.author, self.title, suffix)
|
|
|
|
|
|
# Return all commits that belong to the specified tag.
|
|
#
|
|
# Under the hood, this runs a `git log` on that tag and parses the fields
|
|
# from the command output to construct a list of Commit objects. Note that
|
|
# because certain fields reside in the commit description and cannot be parsed
|
|
# through the Github API itself, we need to do some intelligent regex parsing
|
|
# to extract those fields.
|
|
#
|
|
# This is written using Git 1.8.5.
|
|
def get_commits(tag):
    """Return a list of Commit objects for all commits reachable from *tag*.

    Runs `git log` with custom sentinel markers embedded in the pretty
    format so the hash, author, subject, and free-form body can be split
    apart reliably even though they may contain arbitrary text. The PR
    number and github username only exist in the commit body (not in any
    structured git field or Github API), so they are regex-parsed out.
    """
    commit_start_marker = "|=== COMMIT START MARKER ===|"
    commit_end_marker = "|=== COMMIT END MARKER ===|"
    field_end_marker = "|=== COMMIT FIELD END MARKER ===|"
    # %h = abbreviated hash, %an = author name, %s = subject, %b = body
    log_format =\
        commit_start_marker + "%h" +\
        field_end_marker + "%an" +\
        field_end_marker + "%s" +\
        commit_end_marker + "%b"
    output = run_cmd(["git", "log", "--quiet", "--pretty=format:" + log_format, tag])
    commits = []
    # Drop the empty leading fragment produced by split() before the first marker.
    raw_commits = [c for c in output.split(commit_start_marker) if c]
    for commit in raw_commits:
        # Exactly one end marker must be present; anything else means the
        # output is malformed, which is fatal.
        if commit.count(commit_end_marker) != 1:
            print("Commit end marker not found in commit: ")
            for line in commit.split("\n"):
                print(line)
            sys.exit(1)
        # Separate commit digest from the body.
        # From the digest we extract the hash, author and the title;
        # from the body, we extract the PR number and the github username.
        [commit_digest, commit_body] = commit.split(commit_end_marker)
        if commit_digest.count(field_end_marker) != 2:
            sys.exit("Unexpected format in commit: %s" % commit_digest)
        [_hash, author, title] = commit_digest.split(field_end_marker)
        # The PR number and github username is in the commit message
        # itself and cannot be accessed through any Github API.
        pr_number = None
        match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body)
        if match:
            [pr_number, github_username] = match.groups()
            # If the author name is not valid, use the github
            # username so we can translate it properly later.
            if not is_valid_author(author):
                author = github_username
        # Guard against special characters: transliterate any non-ASCII
        # author name to its closest ASCII representation.
        author = str(author)
        author = unidecode.unidecode(author).strip()
        commit = Commit(_hash, author, title, pr_number)
        commits.append(commit)
    return commits
|
|
|
|
# Maintain a mapping for translating issue types to contributions in the release notes
|
|
# This serves an additional function of warning the user against unknown issue types
|
|
# Note: This list is partially derived from this link:
|
|
# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/issuetypes
|
|
# Keep these in lower case
|
|
# Maps lower-case JIRA issue type -> phrase used in the release notes.
known_issue_types = {
    "bug": "bug fixes",
    "build": "build fixes",
    "dependency upgrade": "build fixes",
    "improvement": "improvements",
    "new feature": "new features",
    "documentation": "documentation",
    "test": "test",
    # NOTE(review): "task"/"sub-task" map to the singular "improvement"
    # while "improvement" maps to the plural "improvements" — possibly
    # intentional, but worth confirming.
    "task": "improvement",
    "sub-task": "improvement"
}
|
|
|
|
# Maintain a mapping for translating component names when creating the release notes
|
|
# This serves an additional function of warning the user against unknown components
|
|
# Note: This list is largely derived from this link:
|
|
# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/components
|
|
# Display name shared by all components that roll up into Spark Core.
CORE_COMPONENT = "Core"
# Maps lower-case JIRA component name -> display name for the release notes.
# Keys must stay lower case: lookups go through component.lower().
known_components = {
    "block manager": CORE_COMPONENT,
    "build": CORE_COMPONENT,
    "deploy": CORE_COMPONENT,
    "documentation": CORE_COMPONENT,
    "examples": CORE_COMPONENT,
    "graphx": "GraphX",
    "input/output": CORE_COMPONENT,
    "java api": "Java API",
    "k8s": "Kubernetes",
    "kubernetes": "Kubernetes",
    "mesos": "Mesos",
    "ml": "MLlib",
    "mllib": "MLlib",
    "project infra": "Project Infra",
    "pyspark": "PySpark",
    "shuffle": "Shuffle",
    "spark core": CORE_COMPONENT,
    "spark shell": CORE_COMPONENT,
    "sql": "SQL",
    "streaming": "Streaming",
    "web ui": "Web UI",
    "windows": "Windows",
    "yarn": "YARN"
}
|
|
|
|
|
|
# Translate issue types using a format appropriate for writing contributions
|
|
# If an unknown issue type is encountered, warn the user
|
|
def translate_issue_type(issue_type, issue_id, warnings):
    """Map a JIRA issue type to its release-notes wording.

    Unknown types are returned unchanged (lower-cased) after appending a
    warning referencing *issue_id* to the *warnings* list.
    """
    key = issue_type.lower()
    translated = known_issue_types.get(key)
    if translated is not None:
        return translated
    warnings.append("Unknown issue type \"%s\" (see %s)" % (key, issue_id))
    return key
|
|
|
|
|
|
# Translate component names using a format appropriate for writing contributions
|
|
# If an unknown component is encountered, warn the user
|
|
def translate_component(component, commit_hash, warnings):
    """Map a JIRA component name to its release-notes spelling.

    Unknown components are returned unchanged (lower-cased) after a
    warning referencing *commit_hash* is appended to *warnings*.
    """
    key = component.lower()
    try:
        return known_components[key]
    except KeyError:
        warnings.append("Unknown component \"%s\" (see %s)" % (key, commit_hash))
        return key
|
|
|
|
|
|
# Parse components in the commit message
|
|
# The returned components are already filtered and translated
|
|
def find_components(commit, commit_hash):
    """Extract known component tags (e.g. "[SQL]") from a commit title.

    Returns the translated names of components found in the title,
    filtered to those present in known_components.

    The regex uses a capturing group so findall() yields the tag text
    WITHOUT the surrounding brackets. The previous pattern (r"\\[\\w*\\]")
    kept the brackets, so e.g. "[sql]" could never match the bracket-less
    keys of known_components and this function always returned [].
    """
    tags = re.findall(r"\[(\w+)\]", commit.lower())
    return [translate_component(c, commit_hash, [])
            for c in tags if c in known_components]
|
|
|
|
|
|
# Join a list of strings in a human-readable manner
|
|
# e.g. ["Juice"] -> "Juice"
|
|
# e.g. ["Juice", "baby"] -> "Juice and baby"
|
|
# e.g. ["Juice", "baby", "moon"] -> "Juice, baby, and moon"
|
|
def nice_join(str_list):
    """Join strings for prose, Oxford-comma style.

    e.g. [] -> "", ["a"] -> "a", ["a", "b"] -> "a and b",
    ["a", "b", "c"] -> "a, b, and c".
    """
    items = list(str_list)  # the input is sometimes a set
    if len(items) == 0:
        return ""
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        return "%s and %s" % (items[0], items[1])
    return ", ".join(items[:-1]) + ", and " + items[-1]
|
|
|
|
|
|
# Return the full name of the specified user on Github
|
|
# If the user doesn't exist, return None
|
|
def get_github_name(author, github_client):
    """Look up *author*'s full name on Github.

    Returns None when no client is given or the user does not exist
    (HTTP 404); any other Github error is re-raised.
    """
    if not github_client:
        return None
    try:
        return github_client.get_user(author).name
    except GithubException as e:
        if e.status == 404:  # "not found" -> unknown user
            return None
        raise e
|
|
|
|
|
|
# Return the full name of the specified user on JIRA
|
|
# If the user doesn't exist, return None
|
|
def get_jira_name(author, jira_client):
    """Look up *author*'s display name on JIRA.

    Returns None when no client is given or the user does not exist
    (HTTP 404); any other JIRA error is re-raised.
    """
    if not jira_client:
        return None
    try:
        return jira_client.user(author).displayName
    except JIRAError as e:
        if e.status_code == 404:  # "not found" -> unknown user
            return None
        raise e
|
|
|
|
|
|
# Return whether the given name is in the form <First Name><space><Last Name>
|
|
def is_valid_author(author):
    """Return True when *author* looks like "<First> <Last>" with no digits."""
    if not author:
        return False
    has_space = " " in author
    has_digit = re.search("[0-9]", author) is not None
    return has_space and not has_digit
|
|
|
|
|
|
# Capitalize the first letter of each word in the given author name
|
|
def capitalize_author(author):
    """Capitalize the first letter of each space-separated word in *author*.

    Returns None for an empty or None input. Runs of multiple spaces are
    collapsed to one, because empty fragments from split() are dropped
    before re-joining.
    """
    if not author:
        return None
    capitalized = [word[0].capitalize() + word[1:]
                   for word in author.split(" ") if word]
    return " ".join(capitalized)
|