[SPARK-19002][BUILD][PYTHON] Check pep8 against all Python scripts
## What changes were proposed in this pull request? This PR proposes to check pep8 against all other Python scripts and fix the errors as below: ```bash ./dev/create-release/generate-contributors.py ./dev/create-release/releaseutils.py ./dev/create-release/translate-contributors.py ./dev/lint-python ./python/docs/epytext.py ./examples/src/main/python/mllib/decision_tree_classification_example.py ./examples/src/main/python/mllib/decision_tree_regression_example.py ./examples/src/main/python/mllib/gradient_boosting_classification_example.py ./examples/src/main/python/mllib/gradient_boosting_regression_example.py ./examples/src/main/python/mllib/linear_regression_with_sgd_example.py ./examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py ./examples/src/main/python/mllib/naive_bayes_example.py ./examples/src/main/python/mllib/random_forest_classification_example.py ./examples/src/main/python/mllib/random_forest_regression_example.py ./examples/src/main/python/mllib/svm_with_sgd_example.py ./examples/src/main/python/streaming/network_wordjoinsentiments.py ./sql/hive/src/test/resources/data/scripts/cat.py ./sql/hive/src/test/resources/data/scripts/cat_error.py ./sql/hive/src/test/resources/data/scripts/doubleescapedtab.py ./sql/hive/src/test/resources/data/scripts/dumpdata_script.py ./sql/hive/src/test/resources/data/scripts/escapedcarriagereturn.py ./sql/hive/src/test/resources/data/scripts/escapednewline.py ./sql/hive/src/test/resources/data/scripts/escapedtab.py ./sql/hive/src/test/resources/data/scripts/input20_script.py ./sql/hive/src/test/resources/data/scripts/newline.py ``` ## How was this patch tested? 
- `./python/docs/epytext.py` ```bash cd ./python/docs && make html ``` - pep8 check (Python 2.7 / Python 3.3.6) ``` ./dev/lint-python ``` - `./dev/merge_spark_pr.py` (Python 2.7 only / Python 3.3.6 not working) ```bash python -m doctest -v ./dev/merge_spark_pr.py ``` - `./dev/create-release/releaseutils.py` `./dev/create-release/generate-contributors.py` `./dev/create-release/translate-contributors.py` (Python 2.7 only / Python 3.3.6 not working) ```bash python generate-contributors.py python translate-contributors.py ``` - Examples (Python 2.7 / Python 3.3.6) ```bash ./bin/spark-submit examples/src/main/python/mllib/decision_tree_classification_example.py ./bin/spark-submit examples/src/main/python/mllib/decision_tree_regression_example.py ./bin/spark-submit examples/src/main/python/mllib/gradient_boosting_classification_example.py ./bin/spark-submit examples/src/main/python/mllib/gradient_boosting_regression_example.py ./bin/spark-submit examples/src/main/python/mllib/random_forest_classification_example.py ./bin/spark-submit examples/src/main/python/mllib/random_forest_regression_example.py ``` - Examples (Python 2.7 only / Python 3.3.6 not working) ``` ./bin/spark-submit examples/src/main/python/mllib/linear_regression_with_sgd_example.py ./bin/spark-submit examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py ./bin/spark-submit examples/src/main/python/mllib/naive_bayes_example.py ./bin/spark-submit examples/src/main/python/mllib/svm_with_sgd_example.py ``` - `sql/hive/src/test/resources/data/scripts/*.py` (Python 2.7 / Python 3.3.6 within suggested changes) Manually tested only changed ones. - `./dev/github_jira_sync.py` (Python 2.7 only / Python 3.3.6 not working) Manually tested this after disabling actually adding comments and links. And also via Jenkins tests. Author: hyukjinkwon <gurwls223@gmail.com> Closes #16405 from HyukjinKwon/minor-pep8.
This commit is contained in:
parent
f1330b1d9e
commit
46b2126024
|
@ -33,14 +33,14 @@ PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0")
|
|||
while not tag_exists(RELEASE_TAG):
|
||||
RELEASE_TAG = raw_input("Please provide a valid release tag: ")
|
||||
while not tag_exists(PREVIOUS_RELEASE_TAG):
|
||||
print "Please specify the previous release tag."
|
||||
PREVIOUS_RELEASE_TAG = raw_input(\
|
||||
"For instance, if you are releasing v1.2.0, you should specify v1.1.0: ")
|
||||
print("Please specify the previous release tag.")
|
||||
PREVIOUS_RELEASE_TAG = raw_input(
|
||||
"For instance, if you are releasing v1.2.0, you should specify v1.1.0: ")
|
||||
|
||||
# Gather commits found in the new tag but not in the old tag.
|
||||
# This filters commits based on both the git hash and the PR number.
|
||||
# If either is present in the old tag, then we ignore the commit.
|
||||
print "Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG)
|
||||
print("Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))
|
||||
release_commits = get_commits(RELEASE_TAG)
|
||||
previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG)
|
||||
previous_release_hashes = set()
|
||||
|
@ -62,17 +62,20 @@ if not new_commits:
|
|||
sys.exit("There are no new commits between %s and %s!" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))
|
||||
|
||||
# Prompt the user for confirmation that the commit range is correct
|
||||
print "\n=================================================================================="
|
||||
print "JIRA server: %s" % JIRA_API_BASE
|
||||
print "Release tag: %s" % RELEASE_TAG
|
||||
print "Previous release tag: %s" % PREVIOUS_RELEASE_TAG
|
||||
print "Number of commits in this range: %s" % len(new_commits)
|
||||
print("\n==================================================================================")
|
||||
print("JIRA server: %s" % JIRA_API_BASE)
|
||||
print("Release tag: %s" % RELEASE_TAG)
|
||||
print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
|
||||
print("Number of commits in this range: %s" % len(new_commits))
|
||||
print
|
||||
|
||||
|
||||
def print_indented(_list):
|
||||
for x in _list: print " %s" % x
|
||||
for x in _list:
|
||||
print(" %s" % x)
|
||||
if yesOrNoPrompt("Show all commits?"):
|
||||
print_indented(new_commits)
|
||||
print "==================================================================================\n"
|
||||
print("==================================================================================\n")
|
||||
if not yesOrNoPrompt("Does this look correct?"):
|
||||
sys.exit("Ok, exiting")
|
||||
|
||||
|
@ -82,45 +85,76 @@ maintenance = []
|
|||
reverts = []
|
||||
nojiras = []
|
||||
filtered_commits = []
|
||||
|
||||
|
||||
def is_release(commit_title):
|
||||
return re.findall("\[release\]", commit_title.lower()) or\
|
||||
"preparing spark release" in commit_title.lower() or\
|
||||
"preparing development version" in commit_title.lower() or\
|
||||
"CHANGES.txt" in commit_title
|
||||
return re.findall("\[release\]", commit_title.lower()) or \
|
||||
"preparing spark release" in commit_title.lower() or \
|
||||
"preparing development version" in commit_title.lower() or \
|
||||
"CHANGES.txt" in commit_title
|
||||
|
||||
|
||||
def is_maintenance(commit_title):
|
||||
return "maintenance" in commit_title.lower() or\
|
||||
"manually close" in commit_title.lower()
|
||||
return "maintenance" in commit_title.lower() or \
|
||||
"manually close" in commit_title.lower()
|
||||
|
||||
|
||||
def has_no_jira(commit_title):
|
||||
return not re.findall("SPARK-[0-9]+", commit_title.upper())
|
||||
|
||||
|
||||
def is_revert(commit_title):
|
||||
return "revert" in commit_title.lower()
|
||||
|
||||
|
||||
def is_docs(commit_title):
|
||||
return re.findall("docs*", commit_title.lower()) or\
|
||||
"programming guide" in commit_title.lower()
|
||||
return re.findall("docs*", commit_title.lower()) or \
|
||||
"programming guide" in commit_title.lower()
|
||||
|
||||
|
||||
for c in new_commits:
|
||||
t = c.get_title()
|
||||
if not t: continue
|
||||
elif is_release(t): releases.append(c)
|
||||
elif is_maintenance(t): maintenance.append(c)
|
||||
elif is_revert(t): reverts.append(c)
|
||||
elif is_docs(t): filtered_commits.append(c) # docs may not have JIRA numbers
|
||||
elif has_no_jira(t): nojiras.append(c)
|
||||
else: filtered_commits.append(c)
|
||||
if not t:
|
||||
continue
|
||||
elif is_release(t):
|
||||
releases.append(c)
|
||||
elif is_maintenance(t):
|
||||
maintenance.append(c)
|
||||
elif is_revert(t):
|
||||
reverts.append(c)
|
||||
elif is_docs(t):
|
||||
filtered_commits.append(c) # docs may not have JIRA numbers
|
||||
elif has_no_jira(t):
|
||||
nojiras.append(c)
|
||||
else:
|
||||
filtered_commits.append(c)
|
||||
|
||||
# Warn against ignored commits
|
||||
if releases or maintenance or reverts or nojiras:
|
||||
print "\n=================================================================================="
|
||||
if releases: print "Found %d release commits" % len(releases)
|
||||
if maintenance: print "Found %d maintenance commits" % len(maintenance)
|
||||
if reverts: print "Found %d revert commits" % len(reverts)
|
||||
if nojiras: print "Found %d commits with no JIRA" % len(nojiras)
|
||||
print "* Warning: these commits will be ignored.\n"
|
||||
print("\n==================================================================================")
|
||||
if releases:
|
||||
print("Found %d release commits" % len(releases))
|
||||
if maintenance:
|
||||
print("Found %d maintenance commits" % len(maintenance))
|
||||
if reverts:
|
||||
print("Found %d revert commits" % len(reverts))
|
||||
if nojiras:
|
||||
print("Found %d commits with no JIRA" % len(nojiras))
|
||||
print("* Warning: these commits will be ignored.\n")
|
||||
if yesOrNoPrompt("Show ignored commits?"):
|
||||
if releases: print "Release (%d)" % len(releases); print_indented(releases)
|
||||
if maintenance: print "Maintenance (%d)" % len(maintenance); print_indented(maintenance)
|
||||
if reverts: print "Revert (%d)" % len(reverts); print_indented(reverts)
|
||||
if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
|
||||
print "==================== Warning: the above commits will be ignored ==================\n"
|
||||
if releases:
|
||||
print("Release (%d)" % len(releases))
|
||||
print_indented(releases)
|
||||
if maintenance:
|
||||
print("Maintenance (%d)" % len(maintenance))
|
||||
print_indented(maintenance)
|
||||
if reverts:
|
||||
print("Revert (%d)" % len(reverts))
|
||||
print_indented(reverts)
|
||||
if nojiras:
|
||||
print("No JIRA (%d)" % len(nojiras))
|
||||
print_indented(nojiras)
|
||||
print("==================== Warning: the above commits will be ignored ==================\n")
|
||||
prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits)
|
||||
if not yesOrNoPrompt(prompt_msg):
|
||||
sys.exit("Ok, exiting.")
|
||||
|
@ -147,9 +181,9 @@ invalid_authors = {}
|
|||
# }
|
||||
#
|
||||
author_info = {}
|
||||
jira_options = { "server": JIRA_API_BASE }
|
||||
jira_client = JIRA(options = jira_options)
|
||||
print "\n=========================== Compiling contributor list ==========================="
|
||||
jira_options = {"server": JIRA_API_BASE}
|
||||
jira_client = JIRA(options=jira_options)
|
||||
print("\n=========================== Compiling contributor list ===========================")
|
||||
for commit in filtered_commits:
|
||||
_hash = commit.get_hash()
|
||||
title = commit.get_title()
|
||||
|
@ -168,8 +202,9 @@ for commit in filtered_commits:
|
|||
# Parse components from the commit title, if any
|
||||
commit_components = find_components(title, _hash)
|
||||
# Populate or merge an issue into author_info[author]
|
||||
|
||||
def populate(issue_type, components):
|
||||
components = components or [CORE_COMPONENT] # assume core if no components provided
|
||||
components = components or [CORE_COMPONENT] # assume core if no components provided
|
||||
if author not in author_info:
|
||||
author_info[author] = {}
|
||||
if issue_type not in author_info[author]:
|
||||
|
@ -182,17 +217,17 @@ for commit in filtered_commits:
|
|||
jira_issue = jira_client.issue(issue)
|
||||
jira_type = jira_issue.fields.issuetype.name
|
||||
jira_type = translate_issue_type(jira_type, issue, warnings)
|
||||
jira_components = [translate_component(c.name, _hash, warnings)\
|
||||
for c in jira_issue.fields.components]
|
||||
jira_components = [translate_component(c.name, _hash, warnings)
|
||||
for c in jira_issue.fields.components]
|
||||
all_components = set(jira_components + commit_components)
|
||||
populate(jira_type, all_components)
|
||||
except Exception as e:
|
||||
print "Unexpected error:", e
|
||||
print("Unexpected error:", e)
|
||||
# For docs without an associated JIRA, manually add it ourselves
|
||||
if is_docs(title) and not issues:
|
||||
populate("documentation", commit_components)
|
||||
print " Processed commit %s authored by %s on %s" % (_hash, author, date)
|
||||
print "==================================================================================\n"
|
||||
print(" Processed commit %s authored by %s on %s" % (_hash, author, date))
|
||||
print("==================================================================================\n")
|
||||
|
||||
# Write to contributors file ordered by author names
|
||||
# Each line takes the format " * Author name -- semi-colon delimited contributions"
|
||||
|
@ -215,8 +250,8 @@ for author in authors:
|
|||
# Otherwise, group contributions by issue types instead of modules
|
||||
# e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN
|
||||
else:
|
||||
contributions = ["%s in %s" % (issue_type, nice_join(comps)) \
|
||||
for issue_type, comps in author_info[author].items()]
|
||||
contributions = ["%s in %s" % (issue_type, nice_join(comps))
|
||||
for issue_type, comps in author_info[author].items()]
|
||||
contribution = "; ".join(contributions)
|
||||
# Do not use python's capitalize() on the whole string to preserve case
|
||||
assert contribution
|
||||
|
@ -226,11 +261,11 @@ for author in authors:
|
|||
# E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672
|
||||
if author in invalid_authors and invalid_authors[author]:
|
||||
author = author + "/" + "/".join(invalid_authors[author])
|
||||
#line = " * %s -- %s" % (author, contribution)
|
||||
# line = " * %s -- %s" % (author, contribution)
|
||||
line = author
|
||||
contributors_file.write(line + "\n")
|
||||
contributors_file.close()
|
||||
print "Contributors list is successfully written to %s!" % contributors_file_name
|
||||
print("Contributors list is successfully written to %s!" % contributors_file_name)
|
||||
|
||||
# Prompt the user to translate author names if necessary
|
||||
if invalid_authors:
|
||||
|
@ -241,8 +276,8 @@ if invalid_authors:
|
|||
|
||||
# Log any warnings encountered in the process
|
||||
if warnings:
|
||||
print "\n============ Warnings encountered while creating the contributor list ============"
|
||||
for w in warnings: print w
|
||||
print "Please correct these in the final contributors list at %s." % contributors_file_name
|
||||
print "==================================================================================\n"
|
||||
|
||||
print("\n============ Warnings encountered while creating the contributor list ============")
|
||||
for w in warnings:
|
||||
print(w)
|
||||
print("Please correct these in the final contributors list at %s." % contributors_file_name)
|
||||
print("==================================================================================\n")
|
||||
|
|
|
@ -30,28 +30,29 @@ try:
|
|||
except ImportError:
|
||||
from jira.utils import JIRAError
|
||||
except ImportError:
|
||||
print "This tool requires the jira-python library"
|
||||
print "Install using 'sudo pip install jira'"
|
||||
print("This tool requires the jira-python library")
|
||||
print("Install using 'sudo pip install jira'")
|
||||
sys.exit(-1)
|
||||
|
||||
try:
|
||||
from github import Github
|
||||
from github import GithubException
|
||||
except ImportError:
|
||||
print "This tool requires the PyGithub library"
|
||||
print "Install using 'sudo pip install PyGithub'"
|
||||
print("This tool requires the PyGithub library")
|
||||
print("Install using 'sudo pip install PyGithub'")
|
||||
sys.exit(-1)
|
||||
|
||||
try:
|
||||
import unidecode
|
||||
except ImportError:
|
||||
print "This tool requires the unidecode library to decode obscure github usernames"
|
||||
print "Install using 'sudo pip install unidecode'"
|
||||
print("This tool requires the unidecode library to decode obscure github usernames")
|
||||
print("Install using 'sudo pip install unidecode'")
|
||||
sys.exit(-1)
|
||||
|
||||
# Contributors list file name
|
||||
contributors_file_name = "contributors.txt"
|
||||
|
||||
|
||||
# Prompt the user to answer yes or no until they do so
|
||||
def yesOrNoPrompt(msg):
|
||||
response = raw_input("%s [y/n]: " % msg)
|
||||
|
@ -59,30 +60,50 @@ def yesOrNoPrompt(msg):
|
|||
return yesOrNoPrompt(msg)
|
||||
return response == "y"
|
||||
|
||||
|
||||
# Utility functions run git commands (written with Git 1.8.5)
|
||||
def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0]
|
||||
def run_cmd_error(cmd): return Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()[1]
|
||||
def run_cmd(cmd):
|
||||
return Popen(cmd, stdout=PIPE).communicate()[0]
|
||||
|
||||
|
||||
def run_cmd_error(cmd):
|
||||
return Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()[1]
|
||||
|
||||
|
||||
def get_date(commit_hash):
|
||||
return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash])
|
||||
|
||||
|
||||
def tag_exists(tag):
|
||||
stderr = run_cmd_error(["git", "show", tag])
|
||||
return "error" not in stderr
|
||||
|
||||
|
||||
# A type-safe representation of a commit
|
||||
class Commit:
|
||||
def __init__(self, _hash, author, title, pr_number = None):
|
||||
def __init__(self, _hash, author, title, pr_number=None):
|
||||
self._hash = _hash
|
||||
self.author = author
|
||||
self.title = title
|
||||
self.pr_number = pr_number
|
||||
def get_hash(self): return self._hash
|
||||
def get_author(self): return self.author
|
||||
def get_title(self): return self.title
|
||||
def get_pr_number(self): return self.pr_number
|
||||
|
||||
def get_hash(self):
|
||||
return self._hash
|
||||
|
||||
def get_author(self):
|
||||
return self.author
|
||||
|
||||
def get_title(self):
|
||||
return self.title
|
||||
|
||||
def get_pr_number(self):
|
||||
return self.pr_number
|
||||
|
||||
def __str__(self):
|
||||
closes_pr = "(Closes #%s)" % self.pr_number if self.pr_number else ""
|
||||
return "%s %s %s %s" % (self._hash, self.author, self.title, closes_pr)
|
||||
|
||||
|
||||
# Return all commits that belong to the specified tag.
|
||||
#
|
||||
# Under the hood, this runs a `git log` on that tag and parses the fields
|
||||
|
@ -106,8 +127,9 @@ def get_commits(tag):
|
|||
raw_commits = [c for c in output.split(commit_start_marker) if c]
|
||||
for commit in raw_commits:
|
||||
if commit.count(commit_end_marker) != 1:
|
||||
print "Commit end marker not found in commit: "
|
||||
for line in commit.split("\n"): print line
|
||||
print("Commit end marker not found in commit: ")
|
||||
for line in commit.split("\n"):
|
||||
print(line)
|
||||
sys.exit(1)
|
||||
# Separate commit digest from the body
|
||||
# From the digest we extract the hash, author and the title
|
||||
|
@ -178,6 +200,7 @@ known_components = {
|
|||
"yarn": "YARN"
|
||||
}
|
||||
|
||||
|
||||
# Translate issue types using a format appropriate for writing contributions
|
||||
# If an unknown issue type is encountered, warn the user
|
||||
def translate_issue_type(issue_type, issue_id, warnings):
|
||||
|
@ -188,6 +211,7 @@ def translate_issue_type(issue_type, issue_id, warnings):
|
|||
warnings.append("Unknown issue type \"%s\" (see %s)" % (issue_type, issue_id))
|
||||
return issue_type
|
||||
|
||||
|
||||
# Translate component names using a format appropriate for writing contributions
|
||||
# If an unknown component is encountered, warn the user
|
||||
def translate_component(component, commit_hash, warnings):
|
||||
|
@ -198,20 +222,22 @@ def translate_component(component, commit_hash, warnings):
|
|||
warnings.append("Unknown component \"%s\" (see %s)" % (component, commit_hash))
|
||||
return component
|
||||
|
||||
|
||||
# Parse components in the commit message
|
||||
# The returned components are already filtered and translated
|
||||
def find_components(commit, commit_hash):
|
||||
components = re.findall("\[\w*\]", commit.lower())
|
||||
components = [translate_component(c, commit_hash)\
|
||||
for c in components if c in known_components]
|
||||
components = [translate_component(c, commit_hash)
|
||||
for c in components if c in known_components]
|
||||
return components
|
||||
|
||||
|
||||
# Join a list of strings in a human-readable manner
|
||||
# e.g. ["Juice"] -> "Juice"
|
||||
# e.g. ["Juice", "baby"] -> "Juice and baby"
|
||||
# e.g. ["Juice", "baby", "moon"] -> "Juice, baby, and moon"
|
||||
def nice_join(str_list):
|
||||
str_list = list(str_list) # sometimes it's a set
|
||||
str_list = list(str_list) # sometimes it's a set
|
||||
if not str_list:
|
||||
return ""
|
||||
elif len(str_list) == 1:
|
||||
|
@ -221,6 +247,7 @@ def nice_join(str_list):
|
|||
else:
|
||||
return ", ".join(str_list[:-1]) + ", and " + str_list[-1]
|
||||
|
||||
|
||||
# Return the full name of the specified user on Github
|
||||
# If the user doesn't exist, return None
|
||||
def get_github_name(author, github_client):
|
||||
|
@ -233,6 +260,7 @@ def get_github_name(author, github_client):
|
|||
raise e
|
||||
return None
|
||||
|
||||
|
||||
# Return the full name of the specified user on JIRA
|
||||
# If the user doesn't exist, return None
|
||||
def get_jira_name(author, jira_client):
|
||||
|
@ -245,15 +273,18 @@ def get_jira_name(author, jira_client):
|
|||
raise e
|
||||
return None
|
||||
|
||||
|
||||
# Return whether the given name is in the form <First Name><space><Last Name>
|
||||
def is_valid_author(author):
|
||||
if not author: return False
|
||||
if not author:
|
||||
return False
|
||||
return " " in author and not re.findall("[0-9]", author)
|
||||
|
||||
|
||||
# Capitalize the first letter of each word in the given author name
|
||||
def capitalize_author(author):
|
||||
if not author: return None
|
||||
if not author:
|
||||
return None
|
||||
words = author.split(" ")
|
||||
words = [w[0].capitalize() + w[1:] for w in words if w]
|
||||
return " ".join(words)
|
||||
|
||||
|
|
|
@ -45,8 +45,8 @@ if not GITHUB_API_TOKEN:
|
|||
|
||||
# Write new contributors list to <old_file_name>.final
|
||||
if not os.path.isfile(contributors_file_name):
|
||||
print "Contributors file %s does not exist!" % contributors_file_name
|
||||
print "Have you run ./generate-contributors.py yet?"
|
||||
print("Contributors file %s does not exist!" % contributors_file_name)
|
||||
print("Have you run ./generate-contributors.py yet?")
|
||||
sys.exit(1)
|
||||
contributors_file = open(contributors_file_name, "r")
|
||||
warnings = []
|
||||
|
@ -58,11 +58,11 @@ if len(sys.argv) > 1:
|
|||
if "--non-interactive" in options:
|
||||
INTERACTIVE_MODE = False
|
||||
if INTERACTIVE_MODE:
|
||||
print "Running in interactive mode. To disable this, provide the --non-interactive flag."
|
||||
print("Running in interactive mode. To disable this, provide the --non-interactive flag.")
|
||||
|
||||
# Setup Github and JIRA clients
|
||||
jira_options = { "server": JIRA_API_BASE }
|
||||
jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
|
||||
jira_options = {"server": JIRA_API_BASE}
|
||||
jira_client = JIRA(options=jira_options, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD))
|
||||
github_client = Github(GITHUB_API_TOKEN)
|
||||
|
||||
# Load known author translations that are cached locally
|
||||
|
@ -70,7 +70,8 @@ known_translations = {}
|
|||
known_translations_file_name = "known_translations"
|
||||
known_translations_file = open(known_translations_file_name, "r")
|
||||
for line in known_translations_file:
|
||||
if line.startswith("#"): continue
|
||||
if line.startswith("#"):
|
||||
continue
|
||||
[old_name, new_name] = line.strip("\n").split(" - ")
|
||||
known_translations[old_name] = new_name
|
||||
known_translations_file.close()
|
||||
|
@ -91,6 +92,8 @@ known_translations_file = open(known_translations_file_name, "a")
|
|||
# (NOT_FOUND, "No assignee found for SPARK-1763")
|
||||
# ]
|
||||
NOT_FOUND = "Not found"
|
||||
|
||||
|
||||
def generate_candidates(author, issues):
|
||||
candidates = []
|
||||
# First check for full name of Github user
|
||||
|
@ -121,9 +124,11 @@ def generate_candidates(author, issues):
|
|||
user_name = jira_assignee.name
|
||||
display_name = jira_assignee.displayName
|
||||
if display_name:
|
||||
candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name)))
|
||||
candidates.append(
|
||||
(display_name, "Full name of %s assignee %s" % (issue, user_name)))
|
||||
else:
|
||||
candidates.append((NOT_FOUND, "No full name found for %s assignee %" % (issue, user_name)))
|
||||
candidates.append(
|
||||
(NOT_FOUND, "No full name found for %s assignee %s" % (issue, user_name)))
|
||||
else:
|
||||
candidates.append((NOT_FOUND, "No assignee found for %s" % issue))
|
||||
# Guard against special characters in candidate names
|
||||
|
@ -143,18 +148,18 @@ def generate_candidates(author, issues):
|
|||
# select from this list. Additionally, the user may also choose to enter a custom name.
|
||||
# In non-interactive mode, this script picks the first valid author name from the candidates
|
||||
# If no such name exists, the original name is used (without the JIRA numbers).
|
||||
print "\n========================== Translating contributor list =========================="
|
||||
print("\n========================== Translating contributor list ==========================")
|
||||
lines = contributors_file.readlines()
|
||||
contributions = []
|
||||
for i, line in enumerate(lines):
|
||||
# It is possible that a line in the contributor file only has the github name, e.g. yhuai.
|
||||
# So, we need a strip() to remove the newline.
|
||||
temp_author = line.strip(" * ").split(" -- ")[0].strip()
|
||||
print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines))
|
||||
print("Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)))
|
||||
if not temp_author:
|
||||
error_msg = " ERROR: Expected the following format \" * <author> -- <contributions>\"\n"
|
||||
error_msg += " ERROR: Actual = %s" % line
|
||||
print error_msg
|
||||
print(error_msg)
|
||||
warnings.append(error_msg)
|
||||
contributions.append(line)
|
||||
continue
|
||||
|
@ -175,8 +180,8 @@ for i, line in enumerate(lines):
|
|||
# [3] andrewor14 - Raw Github username
|
||||
# [4] Custom
|
||||
candidate_names = []
|
||||
bad_prompts = [] # Prompts that can't actually be selected; print these first.
|
||||
good_prompts = [] # Prompts that contain valid choices
|
||||
bad_prompts = [] # Prompts that can't actually be selected; print these first.
|
||||
good_prompts = [] # Prompts that contain valid choices
|
||||
for candidate, source in candidates:
|
||||
if candidate == NOT_FOUND:
|
||||
bad_prompts.append(" [X] %s" % source)
|
||||
|
@ -186,13 +191,16 @@ for i, line in enumerate(lines):
|
|||
good_prompts.append(" [%d] %s - %s" % (index, candidate, source))
|
||||
raw_index = len(candidate_names)
|
||||
custom_index = len(candidate_names) + 1
|
||||
for p in bad_prompts: print p
|
||||
if bad_prompts: print " ---"
|
||||
for p in good_prompts: print p
|
||||
for p in bad_prompts:
|
||||
print(p)
|
||||
if bad_prompts:
|
||||
print(" ---")
|
||||
for p in good_prompts:
|
||||
print(p)
|
||||
# In interactive mode, additionally provide "custom" option and await user response
|
||||
if INTERACTIVE_MODE:
|
||||
print " [%d] %s - Raw Github username" % (raw_index, author)
|
||||
print " [%d] Custom" % custom_index
|
||||
print(" [%d] %s - Raw Github username" % (raw_index, author))
|
||||
print(" [%d] Custom" % custom_index)
|
||||
response = raw_input(" Your choice: ")
|
||||
last_index = custom_index
|
||||
while not response.isdigit() or int(response) > last_index:
|
||||
|
@ -204,8 +212,8 @@ for i, line in enumerate(lines):
|
|||
new_author = candidate_names[response]
|
||||
# In non-interactive mode, just pick the first candidate
|
||||
else:
|
||||
valid_candidate_names = [name for name, _ in candidates\
|
||||
if is_valid_author(name) and name != NOT_FOUND]
|
||||
valid_candidate_names = [name for name, _ in candidates
|
||||
if is_valid_author(name) and name != NOT_FOUND]
|
||||
if valid_candidate_names:
|
||||
new_author = valid_candidate_names[0]
|
||||
# Finally, capitalize the author and replace the original one with it
|
||||
|
@ -213,17 +221,20 @@ for i, line in enumerate(lines):
|
|||
if is_valid_author(new_author):
|
||||
new_author = capitalize_author(new_author)
|
||||
else:
|
||||
warnings.append("Unable to find a valid name %s for author %s" % (author, temp_author))
|
||||
print " * Replacing %s with %s" % (author, new_author)
|
||||
# If we are in interactive mode, prompt the user whether we want to remember this new mapping
|
||||
if INTERACTIVE_MODE and\
|
||||
author not in known_translations and\
|
||||
yesOrNoPrompt(" Add mapping %s -> %s to known translations file?" % (author, new_author)):
|
||||
warnings.append(
|
||||
"Unable to find a valid name %s for author %s" % (author, temp_author))
|
||||
print(" * Replacing %s with %s" % (author, new_author))
|
||||
# If we are in interactive mode, prompt the user whether we want to remember this new
|
||||
# mapping
|
||||
if INTERACTIVE_MODE and \
|
||||
author not in known_translations and \
|
||||
yesOrNoPrompt(
|
||||
" Add mapping %s -> %s to known translations file?" % (author, new_author)):
|
||||
known_translations_file.write("%s - %s\n" % (author, new_author))
|
||||
known_translations_file.flush()
|
||||
line = line.replace(temp_author, author)
|
||||
contributions.append(line)
|
||||
print "==================================================================================\n"
|
||||
print("==================================================================================\n")
|
||||
contributors_file.close()
|
||||
known_translations_file.close()
|
||||
|
||||
|
@ -244,12 +255,13 @@ for line in contributions:
|
|||
new_contributors_file.write(line)
|
||||
new_contributors_file.close()
|
||||
|
||||
print "Translated contributors list successfully written to %s!" % new_contributors_file_name
|
||||
print("Translated contributors list successfully written to %s!" % new_contributors_file_name)
|
||||
|
||||
# Log any warnings encountered in the process
|
||||
if warnings:
|
||||
print "\n========== Warnings encountered while translating the contributor list ==========="
|
||||
for w in warnings: print w
|
||||
print "Please manually correct these in the final contributors list at %s." % new_contributors_file_name
|
||||
print "==================================================================================\n"
|
||||
|
||||
print("\n========== Warnings encountered while translating the contributor list ===========")
|
||||
for w in warnings:
|
||||
print(w)
|
||||
print("Please manually correct these in the final contributors list at %s." %
|
||||
new_contributors_file_name)
|
||||
print("==================================================================================\n")
|
||||
|
|
|
@ -27,8 +27,8 @@ import urllib2
|
|||
try:
|
||||
import jira.client
|
||||
except ImportError:
|
||||
print "This tool requires the jira-python library"
|
||||
print "Install using 'sudo pip install jira'"
|
||||
print("This tool requires the jira-python library")
|
||||
print("Install using 'sudo pip install jira'")
|
||||
sys.exit(-1)
|
||||
|
||||
# User facing configs
|
||||
|
@ -48,16 +48,19 @@ MIN_COMMENT_PR = int(os.environ.get("MIN_COMMENT_PR", "1496"))
|
|||
# the state of JIRA's that are tied to PR's we've already looked at.
|
||||
MAX_FILE = ".github-jira-max"
|
||||
|
||||
|
||||
def get_url(url):
|
||||
try:
|
||||
return urllib2.urlopen(url)
|
||||
except urllib2.HTTPError as e:
|
||||
print "Unable to fetch URL, exiting: %s" % url
|
||||
except urllib2.HTTPError:
|
||||
print("Unable to fetch URL, exiting: %s" % url)
|
||||
sys.exit(-1)
|
||||
|
||||
|
||||
def get_json(urllib_response):
|
||||
return json.load(urllib_response)
|
||||
|
||||
|
||||
# Return a list of (JIRA id, JSON dict) tuples:
|
||||
# e.g. [('SPARK-1234', {.. json ..}), ('SPARK-5687', {.. json ..})}
|
||||
def get_jira_prs():
|
||||
|
@ -65,83 +68,86 @@ def get_jira_prs():
|
|||
has_next_page = True
|
||||
page_num = 0
|
||||
while has_next_page:
|
||||
page = get_url(GITHUB_API_BASE + "/pulls?page=%s&per_page=100" % page_num)
|
||||
page_json = get_json(page)
|
||||
page = get_url(GITHUB_API_BASE + "/pulls?page=%s&per_page=100" % page_num)
|
||||
page_json = get_json(page)
|
||||
|
||||
for pull in page_json:
|
||||
jiras = re.findall(JIRA_PROJECT_NAME + "-[0-9]{4,5}", pull['title'])
|
||||
for jira in jiras:
|
||||
result = result + [(jira, pull)]
|
||||
for pull in page_json:
|
||||
jiras = re.findall(JIRA_PROJECT_NAME + "-[0-9]{4,5}", pull['title'])
|
||||
for jira in jiras:
|
||||
result = result + [(jira, pull)]
|
||||
|
||||
# Check if there is another page
|
||||
link_header = filter(lambda k: k.startswith("Link"), page.info().headers)[0]
|
||||
if not "next"in link_header:
|
||||
has_next_page = False
|
||||
else:
|
||||
page_num = page_num + 1
|
||||
# Check if there is another page
|
||||
link_header = filter(lambda k: k.startswith("Link"), page.info().headers)[0]
|
||||
if "next" not in link_header:
|
||||
has_next_page = False
|
||||
else:
|
||||
page_num += 1
|
||||
return result
|
||||
|
||||
|
||||
def set_max_pr(max_val):
|
||||
f = open(MAX_FILE, 'w')
|
||||
f.write("%s" % max_val)
|
||||
f.close()
|
||||
print "Writing largest PR number seen: %s" % max_val
|
||||
print("Writing largest PR number seen: %s" % max_val)
|
||||
|
||||
|
||||
def get_max_pr():
|
||||
if os.path.exists(MAX_FILE):
|
||||
result = int(open(MAX_FILE, 'r').read())
|
||||
print "Read largest PR number previously seen: %s" % result
|
||||
print("Read largest PR number previously seen: %s" % result)
|
||||
return result
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
jira_client = jira.client.JIRA({'server': JIRA_API_BASE},
|
||||
basic_auth=(JIRA_USERNAME, JIRA_PASSWORD))
|
||||
basic_auth=(JIRA_USERNAME, JIRA_PASSWORD))
|
||||
|
||||
jira_prs = get_jira_prs()
|
||||
|
||||
previous_max = get_max_pr()
|
||||
print "Retrieved %s JIRA PR's from Github" % len(jira_prs)
|
||||
print("Retrieved %s JIRA PR's from Github" % len(jira_prs))
|
||||
jira_prs = [(k, v) for k, v in jira_prs if int(v['number']) > previous_max]
|
||||
print "%s PR's remain after excluding visted ones" % len(jira_prs)
|
||||
print("%s PR's remain after excluding visted ones" % len(jira_prs))
|
||||
|
||||
num_updates = 0
|
||||
considered = []
|
||||
for issue, pr in sorted(jira_prs, key=lambda (k, v): int(v['number'])):
|
||||
for issue, pr in sorted(jira_prs, key=lambda kv: int(kv[1]['number'])):
|
||||
if num_updates >= MAX_UPDATES:
|
||||
break
|
||||
break
|
||||
pr_num = int(pr['number'])
|
||||
|
||||
print "Checking issue %s" % issue
|
||||
print("Checking issue %s" % issue)
|
||||
considered = considered + [pr_num]
|
||||
|
||||
url = pr['html_url']
|
||||
title = "[Github] Pull Request #%s (%s)" % (pr['number'], pr['user']['login'])
|
||||
title = "[Github] Pull Request #%s (%s)" % (pr['number'], pr['user']['login'])
|
||||
try:
|
||||
existing_links = map(lambda l: l.raw['object']['url'], jira_client.remote_links(issue))
|
||||
existing_links = map(lambda l: l.raw['object']['url'], jira_client.remote_links(issue))
|
||||
except:
|
||||
print "Failure reading JIRA %s (does it exist?)" % issue
|
||||
print sys.exc_info()[0]
|
||||
continue
|
||||
print("Failure reading JIRA %s (does it exist?)" % issue)
|
||||
print(sys.exc_info()[0])
|
||||
continue
|
||||
|
||||
if url in existing_links:
|
||||
continue
|
||||
|
||||
icon = {"title": "Pull request #%s" % pr['number'],
|
||||
"url16x16": "https://assets-cdn.github.com/favicon.ico"}
|
||||
icon = {"title": "Pull request #%s" % pr['number'],
|
||||
"url16x16": "https://assets-cdn.github.com/favicon.ico"}
|
||||
destination = {"title": title, "url": url, "icon": icon}
|
||||
# For all possible fields see:
|
||||
# https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links
|
||||
# application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"}
|
||||
# https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links
|
||||
# application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"}
|
||||
jira_client.add_remote_link(issue, destination)
|
||||
|
||||
|
||||
comment = "User '%s' has created a pull request for this issue:" % pr['user']['login']
|
||||
comment = comment + ("\n%s" % pr['html_url'])
|
||||
comment += "\n%s" % pr['html_url']
|
||||
if pr_num >= MIN_COMMENT_PR:
|
||||
jira_client.add_comment(issue, comment)
|
||||
|
||||
print "Added link %s <-> PR #%s" % (issue, pr['number'])
|
||||
num_updates = num_updates + 1
|
||||
|
||||
print("Added link %s <-> PR #%s" % (issue, pr['number']))
|
||||
num_updates += 1
|
||||
|
||||
if len(considered) > 0:
|
||||
set_max_pr(max(considered))
|
||||
|
|
|
@ -19,10 +19,8 @@
|
|||
|
||||
SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
|
||||
SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
PATHS_TO_CHECK="./python/pyspark/ ./examples/src/main/python/ ./dev/sparktestsupport"
|
||||
# TODO: fix pep8 errors with the rest of the Python scripts under dev
|
||||
PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/*.py ./dev/run-tests-jenkins.py"
|
||||
PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/pip-sanity-check.py"
|
||||
# Exclude auto-geneated configuration file.
|
||||
PATHS_TO_CHECK="$( cd "$SPARK_ROOT_DIR" && find . -name "*.py" -not -path "*python/docs/conf.py" )"
|
||||
PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt"
|
||||
PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt"
|
||||
PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt"
|
||||
|
|
|
@ -70,22 +70,22 @@ def get_json(url):
|
|||
return json.load(urllib2.urlopen(request))
|
||||
except urllib2.HTTPError as e:
|
||||
if "X-RateLimit-Remaining" in e.headers and e.headers["X-RateLimit-Remaining"] == '0':
|
||||
print "Exceeded the GitHub API rate limit; see the instructions in " + \
|
||||
"dev/merge_spark_pr.py to configure an OAuth token for making authenticated " + \
|
||||
"GitHub requests."
|
||||
print("Exceeded the GitHub API rate limit; see the instructions in " +
|
||||
"dev/merge_spark_pr.py to configure an OAuth token for making authenticated " +
|
||||
"GitHub requests.")
|
||||
else:
|
||||
print "Unable to fetch URL, exiting: %s" % url
|
||||
print("Unable to fetch URL, exiting: %s" % url)
|
||||
sys.exit(-1)
|
||||
|
||||
|
||||
def fail(msg):
|
||||
print msg
|
||||
print(msg)
|
||||
clean_up()
|
||||
sys.exit(-1)
|
||||
|
||||
|
||||
def run_cmd(cmd):
|
||||
print cmd
|
||||
print(cmd)
|
||||
if isinstance(cmd, list):
|
||||
return subprocess.check_output(cmd)
|
||||
else:
|
||||
|
@ -97,14 +97,15 @@ def continue_maybe(prompt):
|
|||
if result.lower() != "y":
|
||||
fail("Okay, exiting")
|
||||
|
||||
|
||||
def clean_up():
|
||||
print "Restoring head pointer to %s" % original_head
|
||||
print("Restoring head pointer to %s" % original_head)
|
||||
run_cmd("git checkout %s" % original_head)
|
||||
|
||||
branches = run_cmd("git branch").replace(" ", "").split("\n")
|
||||
|
||||
for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches):
|
||||
print "Deleting local branch %s" % branch
|
||||
print("Deleting local branch %s" % branch)
|
||||
run_cmd("git branch -D %s" % branch)
|
||||
|
||||
|
||||
|
@ -246,9 +247,9 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
|
|||
|
||||
if cur_status == "Resolved" or cur_status == "Closed":
|
||||
fail("JIRA issue %s already has status '%s'" % (jira_id, cur_status))
|
||||
print ("=== JIRA %s ===" % jira_id)
|
||||
print ("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % (
|
||||
cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id))
|
||||
print("=== JIRA %s ===" % jira_id)
|
||||
print("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" %
|
||||
(cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id))
|
||||
|
||||
versions = asf_jira.project_versions("SPARK")
|
||||
versions = sorted(versions, key=lambda x: x.name, reverse=True)
|
||||
|
@ -282,10 +283,10 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
|
|||
resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0]
|
||||
resolution = filter(lambda r: r.raw['name'] == "Fixed", asf_jira.resolutions())[0]
|
||||
asf_jira.transition_issue(
|
||||
jira_id, resolve["id"], fixVersions = jira_fix_versions,
|
||||
comment = comment, resolution = {'id': resolution.raw['id']})
|
||||
jira_id, resolve["id"], fixVersions=jira_fix_versions,
|
||||
comment=comment, resolution={'id': resolution.raw['id']})
|
||||
|
||||
print "Successfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions)
|
||||
print("Successfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions))
|
||||
|
||||
|
||||
def resolve_jira_issues(title, merge_branches, comment):
|
||||
|
@ -300,23 +301,29 @@ def resolve_jira_issues(title, merge_branches, comment):
|
|||
def standardize_jira_ref(text):
|
||||
"""
|
||||
Standardize the [SPARK-XXXXX] [MODULE] prefix
|
||||
Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK XXX [MLLIB]: Issue" to "[SPARK-XXX][MLLIB] Issue"
|
||||
Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK XXX [MLLIB]: Issue" to
|
||||
"[SPARK-XXX][MLLIB] Issue"
|
||||
|
||||
>>> standardize_jira_ref("[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful")
|
||||
>>> standardize_jira_ref(
|
||||
... "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful")
|
||||
'[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is successful'
|
||||
>>> standardize_jira_ref("[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests")
|
||||
>>> standardize_jira_ref(
|
||||
... "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests")
|
||||
'[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull requests'
|
||||
>>> standardize_jira_ref("[MLlib] Spark 5954: Top by key")
|
||||
'[SPARK-5954][MLLIB] Top by key'
|
||||
>>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl")
|
||||
'[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'
|
||||
>>> standardize_jira_ref("SPARK-1094 Support MiMa for reporting binary compatibility accross versions.")
|
||||
>>> standardize_jira_ref(
|
||||
... "SPARK-1094 Support MiMa for reporting binary compatibility accross versions.")
|
||||
'[SPARK-1094] Support MiMa for reporting binary compatibility accross versions.'
|
||||
>>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark")
|
||||
'[SPARK-1146][WIP] Vagrant support for Spark'
|
||||
>>> standardize_jira_ref("SPARK-1032. If Yarn app fails before registering, app master stays aroun...")
|
||||
>>> standardize_jira_ref(
|
||||
... "SPARK-1032. If Yarn app fails before registering, app master stays aroun...")
|
||||
'[SPARK-1032] If Yarn app fails before registering, app master stays aroun...'
|
||||
>>> standardize_jira_ref("[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.")
|
||||
>>> standardize_jira_ref(
|
||||
... "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.")
|
||||
'[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.'
|
||||
>>> standardize_jira_ref("Additional information for users building from source code")
|
||||
'Additional information for users building from source code'
|
||||
|
@ -350,7 +357,8 @@ def standardize_jira_ref(text):
|
|||
# Assemble full text (JIRA ref(s), module(s), remaining text)
|
||||
clean_text = ''.join(jira_refs).strip() + ''.join(components).strip() + " " + text.strip()
|
||||
|
||||
# Replace multiple spaces with a single space, e.g. if no jira refs and/or components were included
|
||||
# Replace multiple spaces with a single space, e.g. if no jira refs and/or components were
|
||||
# included
|
||||
clean_text = re.sub(r'\s+', ' ', clean_text.strip())
|
||||
|
||||
return clean_text
|
||||
|
@ -385,17 +393,17 @@ def main():
|
|||
# Decide whether to use the modified title or not
|
||||
modified_title = standardize_jira_ref(pr["title"])
|
||||
if modified_title != pr["title"]:
|
||||
print "I've re-written the title as follows to match the standard format:"
|
||||
print "Original: %s" % pr["title"]
|
||||
print "Modified: %s" % modified_title
|
||||
print("I've re-written the title as follows to match the standard format:")
|
||||
print("Original: %s" % pr["title"])
|
||||
print("Modified: %s" % modified_title)
|
||||
result = raw_input("Would you like to use the modified title? (y/n): ")
|
||||
if result.lower() == "y":
|
||||
title = modified_title
|
||||
print "Using modified title:"
|
||||
print("Using modified title:")
|
||||
else:
|
||||
title = pr["title"]
|
||||
print "Using original title:"
|
||||
print title
|
||||
print("Using original title:")
|
||||
print(title)
|
||||
else:
|
||||
title = pr["title"]
|
||||
|
||||
|
@ -414,13 +422,13 @@ def main():
|
|||
merge_hash = merge_commits[0]["commit_id"]
|
||||
message = get_json("%s/commits/%s" % (GITHUB_API_BASE, merge_hash))["commit"]["message"]
|
||||
|
||||
print "Pull request %s has already been merged, assuming you want to backport" % pr_num
|
||||
print("Pull request %s has already been merged, assuming you want to backport" % pr_num)
|
||||
commit_is_downloaded = run_cmd(['git', 'rev-parse', '--quiet', '--verify',
|
||||
"%s^{commit}" % merge_hash]).strip() != ""
|
||||
"%s^{commit}" % merge_hash]).strip() != ""
|
||||
if not commit_is_downloaded:
|
||||
fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num)
|
||||
|
||||
print "Found commit %s:\n%s" % (merge_hash, message)
|
||||
print("Found commit %s:\n%s" % (merge_hash, message))
|
||||
cherry_pick(pr_num, merge_hash, latest_branch)
|
||||
sys.exit(0)
|
||||
|
||||
|
@ -429,9 +437,9 @@ def main():
|
|||
"Continue? (experts only!)"
|
||||
continue_maybe(msg)
|
||||
|
||||
print ("\n=== Pull Request #%s ===" % pr_num)
|
||||
print ("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % (
|
||||
title, pr_repo_desc, target_ref, url))
|
||||
print("\n=== Pull Request #%s ===" % pr_num)
|
||||
print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" %
|
||||
(title, pr_repo_desc, target_ref, url))
|
||||
continue_maybe("Proceed with merging pull request #%s?" % pr_num)
|
||||
|
||||
merged_refs = [target_ref]
|
||||
|
@ -445,14 +453,15 @@ def main():
|
|||
if JIRA_IMPORTED:
|
||||
if JIRA_USERNAME and JIRA_PASSWORD:
|
||||
continue_maybe("Would you like to update an associated JIRA?")
|
||||
jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (pr_num, GITHUB_BASE, pr_num)
|
||||
jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % \
|
||||
(pr_num, GITHUB_BASE, pr_num)
|
||||
resolve_jira_issues(title, merged_refs, jira_comment)
|
||||
else:
|
||||
print "JIRA_USERNAME and JIRA_PASSWORD not set"
|
||||
print "Exiting without trying to close the associated JIRA."
|
||||
print("JIRA_USERNAME and JIRA_PASSWORD not set")
|
||||
print("Exiting without trying to close the associated JIRA.")
|
||||
else:
|
||||
print "Could not find jira-python library. Run 'sudo pip install jira' to install."
|
||||
print "Exiting without trying to close the associated JIRA."
|
||||
print("Could not find jira-python library. Run 'sudo pip install jira' to install.")
|
||||
print("Exiting without trying to close the associated JIRA.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
|
|
@ -44,7 +44,8 @@ if __name__ == "__main__":
|
|||
# Evaluate model on test instances and compute test error
|
||||
predictions = model.predict(testData.map(lambda x: x.features))
|
||||
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
|
||||
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
|
||||
testErr = labelsAndPredictions.filter(
|
||||
lambda lp: lp[0] != lp[1]).count() / float(testData.count())
|
||||
print('Test Error = ' + str(testErr))
|
||||
print('Learned classification tree model:')
|
||||
print(model.toDebugString())
|
||||
|
|
|
@ -44,7 +44,7 @@ if __name__ == "__main__":
|
|||
# Evaluate model on test instances and compute test error
|
||||
predictions = model.predict(testData.map(lambda x: x.features))
|
||||
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
|
||||
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
|
||||
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
|
||||
float(testData.count())
|
||||
print('Test Mean Squared Error = ' + str(testMSE))
|
||||
print('Learned regression tree model:')
|
||||
|
|
|
@ -43,7 +43,8 @@ if __name__ == "__main__":
|
|||
# Evaluate model on test instances and compute test error
|
||||
predictions = model.predict(testData.map(lambda x: x.features))
|
||||
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
|
||||
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
|
||||
testErr = labelsAndPredictions.filter(
|
||||
lambda lp: lp[0] != lp[1]).count() / float(testData.count())
|
||||
print('Test Error = ' + str(testErr))
|
||||
print('Learned classification GBT model:')
|
||||
print(model.toDebugString())
|
||||
|
|
|
@ -43,7 +43,7 @@ if __name__ == "__main__":
|
|||
# Evaluate model on test instances and compute test error
|
||||
predictions = model.predict(testData.map(lambda x: x.features))
|
||||
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
|
||||
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
|
||||
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
|
||||
float(testData.count())
|
||||
print('Test Mean Squared Error = ' + str(testMSE))
|
||||
print('Learned regression GBT model:')
|
||||
|
|
|
@ -44,7 +44,7 @@ if __name__ == "__main__":
|
|||
# Evaluate the model on training data
|
||||
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
|
||||
MSE = valuesAndPreds \
|
||||
.map(lambda (v, p): (v - p)**2) \
|
||||
.map(lambda vp: (vp[0] - vp[1])**2) \
|
||||
.reduce(lambda x, y: x + y) / valuesAndPreds.count()
|
||||
print("Mean Squared Error = " + str(MSE))
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ if __name__ == "__main__":
|
|||
|
||||
# Evaluating the model on training data
|
||||
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
|
||||
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
|
||||
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
|
||||
print("Training Error = " + str(trainErr))
|
||||
|
||||
# Save and load model
|
||||
|
|
|
@ -50,7 +50,7 @@ if __name__ == "__main__":
|
|||
|
||||
# Make prediction and test accuracy.
|
||||
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
|
||||
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
|
||||
accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
|
||||
print('model accuracy {}'.format(accuracy))
|
||||
|
||||
# Save and load model
|
||||
|
@ -59,7 +59,7 @@ if __name__ == "__main__":
|
|||
model.save(sc, output_dir)
|
||||
sameModel = NaiveBayesModel.load(sc, output_dir)
|
||||
predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
|
||||
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
|
||||
accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
|
||||
print('sameModel accuracy {}'.format(accuracy))
|
||||
|
||||
# $example off$
|
||||
|
|
|
@ -45,7 +45,8 @@ if __name__ == "__main__":
|
|||
# Evaluate model on test instances and compute test error
|
||||
predictions = model.predict(testData.map(lambda x: x.features))
|
||||
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
|
||||
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
|
||||
testErr = labelsAndPredictions.filter(
|
||||
lambda lp: lp[0] != lp[1]).count() / float(testData.count())
|
||||
print('Test Error = ' + str(testErr))
|
||||
print('Learned classification forest model:')
|
||||
print(model.toDebugString())
|
||||
|
|
|
@ -45,7 +45,7 @@ if __name__ == "__main__":
|
|||
# Evaluate model on test instances and compute test error
|
||||
predictions = model.predict(testData.map(lambda x: x.features))
|
||||
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
|
||||
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
|
||||
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
|
||||
float(testData.count())
|
||||
print('Test Mean Squared Error = ' + str(testMSE))
|
||||
print('Learned regression forest model:')
|
||||
|
|
|
@ -38,7 +38,7 @@ if __name__ == "__main__":
|
|||
|
||||
# Evaluating the model on training data
|
||||
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
|
||||
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
|
||||
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
|
||||
print("Training Error = " + str(trainErr))
|
||||
|
||||
# Save and load model
|
||||
|
|
|
@ -67,8 +67,8 @@ if __name__ == "__main__":
|
|||
# with the static RDD inside the transform() method and then multiplying
|
||||
# the frequency of the words by its sentiment value
|
||||
happiest_words = word_counts.transform(lambda rdd: word_sentiments.join(rdd)) \
|
||||
.map(lambda (word, tuple): (word, float(tuple[0]) * tuple[1])) \
|
||||
.map(lambda (word, happiness): (happiness, word)) \
|
||||
.map(lambda word_tuples: (word_tuples[0], float(word_tuples[1][0]) * word_tuples[1][1])) \
|
||||
.map(lambda word_happiness: (word_happiness[1], word_happiness[0])) \
|
||||
.transform(lambda rdd: rdd.sortByKey(False))
|
||||
|
||||
happiest_words.foreachRDD(print_happiest_words)
|
||||
|
|
|
@ -9,6 +9,7 @@ RULES = (
|
|||
('pyspark.rdd.RDD', 'RDD'),
|
||||
)
|
||||
|
||||
|
||||
def _convert_epytext(line):
|
||||
"""
|
||||
>>> _convert_epytext("L{A}")
|
||||
|
@ -19,9 +20,11 @@ def _convert_epytext(line):
|
|||
line = re.sub(p, sub, line)
|
||||
return line
|
||||
|
||||
|
||||
def _process_docstring(app, what, name, obj, options, lines):
|
||||
for i in range(len(lines)):
|
||||
lines[i] = _convert_epytext(lines[i])
|
||||
|
||||
|
||||
def setup(app):
|
||||
app.connect("autodoc-process-docstring", _process_docstring)
|
||||
|
|
|
@ -16,14 +16,14 @@
|
|||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
import sys, re
|
||||
import datetime
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import os
|
||||
|
||||
table_name=None
|
||||
if os.environ.has_key('hive_streaming_tablename'):
|
||||
table_name=os.environ['hive_streaming_tablename']
|
||||
table_name = None
|
||||
if os.environ in 'hive_streaming_tablename':
|
||||
table_name = os.environ['hive_streaming_tablename']
|
||||
|
||||
for line in sys.stdin:
|
||||
print line
|
||||
print >> sys.stderr, "dummy"
|
||||
print(line)
|
||||
print("dummy", file=sys.stderr)
|
||||
|
|
|
@ -19,6 +19,6 @@
|
|||
import sys
|
||||
|
||||
for line in sys.stdin:
|
||||
print line
|
||||
print(line)
|
||||
|
||||
sys.exit(1)
|
||||
|
|
|
@ -19,6 +19,5 @@
|
|||
import sys
|
||||
|
||||
for line in sys.stdin:
|
||||
print "1\\\\\\t2"
|
||||
print "1\\\\\\\\t2"
|
||||
|
||||
print("1\\\\\\t2")
|
||||
print("1\\\\\\\\t2")
|
||||
|
|
|
@ -19,9 +19,9 @@
|
|||
import sys
|
||||
|
||||
for i in xrange(50):
|
||||
for j in xrange(5):
|
||||
for k in xrange(20022):
|
||||
print 20000 * i + k
|
||||
for j in xrange(5):
|
||||
for k in xrange(20022):
|
||||
print(20000 * i + k)
|
||||
|
||||
for line in sys.stdin:
|
||||
pass
|
||||
pass
|
||||
|
|
|
@ -19,5 +19,4 @@
|
|||
import sys
|
||||
|
||||
for line in sys.stdin:
|
||||
print "1\\\\r2"
|
||||
|
||||
print("1\\\\r2")
|
||||
|
|
|
@ -19,5 +19,4 @@
|
|||
import sys
|
||||
|
||||
for line in sys.stdin:
|
||||
print "1\\\\n2"
|
||||
|
||||
print("1\\\\n2")
|
||||
|
|
|
@ -19,5 +19,4 @@
|
|||
import sys
|
||||
|
||||
for line in sys.stdin:
|
||||
print "1\\\\t2"
|
||||
|
||||
print("1\\\\t2")
|
||||
|
|
|
@ -21,10 +21,10 @@ import re
|
|||
line = sys.stdin.readline()
|
||||
x = 1
|
||||
while line:
|
||||
tem = sys.stdin.readline()
|
||||
if line == tem:
|
||||
x = x + 1
|
||||
else:
|
||||
print str(x).strip()+'\t'+re.sub('\t','_',line.strip())
|
||||
line = tem
|
||||
x = 1
|
||||
tem = sys.stdin.readline()
|
||||
if line == tem:
|
||||
x += 1
|
||||
else:
|
||||
print(str(x).strip()+'\t'+re.sub('\t', '_', line.strip()))
|
||||
line = tem
|
||||
x = 1
|
||||
|
|
|
@ -19,6 +19,6 @@
|
|||
import sys
|
||||
|
||||
for line in sys.stdin:
|
||||
print "1\\n2"
|
||||
print "1\\r2"
|
||||
print "1\\t2"
|
||||
print("1\\n2")
|
||||
print("1\\r2")
|
||||
print("1\\t2")
|
||||
|
|
Loading…
Reference in a new issue