[SPARK-25238][PYTHON] lint-python: Fix W605 warnings for pycodestyle 2.4

(This change is a subset of the changes needed for the JIRA; see https://github.com/apache/spark/pull/22231)

## What changes were proposed in this pull request?

Use raw strings and simpler regex syntax consistently in Python, which also avoids W605 warnings from pycodestyle about accidentally relying on Python leaving unrecognized escape sequences (such as `\d`) untouched in normal strings. Also, fix a few long lines.
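
For illustration, a minimal before/after sketch of the pattern this change applies; the regex is the one from the `find_components` hunk below, while the sample title string is made up:

```python
import re

# Before: "\[" and "\w" are not recognized escape sequences, so CPython leaves the
# backslashes in place and the regex happens to work, but pycodestyle 2.4 reports
# W605 for each one (and newer Python 3 versions warn about the invalid escapes).
components = re.findall("\[\w*\]", "[core] fix a bug")

# After: a raw string keeps the backslashes literal by construction, silences W605,
# and compiles to exactly the same pattern.
components = re.findall(r"\[\w*\]", "[core] fix a bug")
```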

## How was this patch tested?

Existing tests, plus some manual double-checking that the rewritten regexes behave the same way in Python 2 and 3.
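
For reference, one way to do that double-check by hand; this is only a sketch (not part of the patch), using patterns that appear in the diff and a made-up sample title:

```python
import re

# Unrecognized escapes such as \[, \w, \d and \. pass through normal string
# literals unchanged, so the non-raw and raw forms are byte-for-byte identical
# and compile to the same regex.
assert "\[\w*\]" == r"\[\w*\]"
assert "(\d+)\.(\d+)\.(\d+)" == r"(\d+)\.(\d+)\.(\d+)"

# Hence the rewritten patterns match exactly what they matched before.
assert re.findall(r"\[\w*\]", "[sql][python] some title") == ["[sql]", "[python]"]
```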

Closes #22400 from srowen/SPARK-25238.2.

Authored-by: Sean Owen <sean.owen@databricks.com>
Signed-off-by: hyukjinkwon <gurwls223@apache.org>
Authored by Sean Owen on 2018-09-13 11:19:43 +08:00; committed by hyukjinkwon
parent 6dc5921e66
commit 08c76b5d39
22 changed files with 66 additions and 63 deletions


@@ -67,7 +67,7 @@ print("JIRA server: %s" % JIRA_API_BASE)
print("Release tag: %s" % RELEASE_TAG)
print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
print("Number of commits in this range: %s" % len(new_commits))
print
print("")
def print_indented(_list):
@@ -88,10 +88,10 @@ filtered_commits = []
def is_release(commit_title):
return re.findall("\[release\]", commit_title.lower()) or \
"preparing spark release" in commit_title.lower() or \
"preparing development version" in commit_title.lower() or \
"CHANGES.txt" in commit_title
return ("[release]" in commit_title.lower() or
"preparing spark release" in commit_title.lower() or
"preparing development version" in commit_title.lower() or
"CHANGES.txt" in commit_title)
def is_maintenance(commit_title):


@@ -235,7 +235,7 @@ def translate_component(component, commit_hash, warnings):
# Parse components in the commit message
# The returned components are already filtered and translated
def find_components(commit, commit_hash):
components = re.findall("\[\w*\]", commit.lower())
components = re.findall(r"\[\w*\]", commit.lower())
components = [translate_component(c, commit_hash)
for c in components if c in known_components]
return components


@@ -274,7 +274,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
versions = sorted(versions, key=lambda x: x.name, reverse=True)
versions = filter(lambda x: x.raw['released'] is False, versions)
# Consider only x.y.z versions
versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions)
versions = filter(lambda x: re.match(r'\d+\.\d+\.\d+', x.name), versions)
default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches)
for v in default_fix_versions:
@@ -403,7 +403,7 @@ def standardize_jira_ref(text):
# Extract spark component(s):
# Look for alphanumeric chars, spaces, dashes, periods, and/or commas
pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE)
pattern = re.compile(r'(\[[\w\s,.-]+\])', re.IGNORECASE)
for component in pattern.findall(text):
components.append(component.upper())
text = text.replace(component, '')


@@ -115,7 +115,8 @@ def run_tests(tests_timeout):
os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait()
failure_note_by_errcode = {
1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures
# error to denote run-tests script failures:
1: 'executing the `dev/run-tests` script', # noqa: W605
ERROR_CODES["BLOCK_GENERAL"]: 'some tests',
ERROR_CODES["BLOCK_RAT"]: 'RAT tests',
ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests',


@@ -169,7 +169,7 @@ def determine_java_version(java_exe):
# find raw version string, eg 'java version "1.8.0_25"'
raw_version_str = next(x for x in raw_output_lines if " version " in x)
match = re.search('(\d+)\.(\d+)\.(\d+)', raw_version_str)
match = re.search(r'(\d+)\.(\d+)\.(\d+)', raw_version_str)
major = int(match.group(1))
minor = int(match.group(2))


@@ -773,8 +773,8 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
which is a Dataframe having two fields (FPR, TPR) with
(0.0, 0.0) prepended and (1.0, 1.0) appended to it.
.. seealso:: `Wikipedia reference \
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. seealso:: `Wikipedia reference
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. note:: This ignores instance weights (setting all to 1.0) from
`LogisticRegression.weightCol`. This will change in later Spark


@@ -1202,21 +1202,21 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada
.. note:: Experimental
Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
<a href=http://www.icml2010.org/papers/387.pdf>Lin and Cohen</a>. From the abstract:
`Lin and Cohen <http://www.icml2010.org/papers/387.pdf>`_. From the abstract:
PIC finds a very low-dimensional embedding of a dataset using truncated power
iteration on a normalized pair-wise similarity matrix of the data.
This class is not yet an Estimator/Transformer, use :py:func:`assignClusters` method
to run the PowerIterationClustering algorithm.
.. seealso:: `Wikipedia on Spectral clustering \
<http://en.wikipedia.org/wiki/Spectral_clustering>`_
.. seealso:: `Wikipedia on Spectral clustering
<http://en.wikipedia.org/wiki/Spectral_clustering>`_
>>> data = [(1, 0, 0.5), \
(2, 0, 0.5), (2, 1, 0.7), \
(3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), \
(4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), \
(5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
>>> data = [(1, 0, 0.5),
... (2, 0, 0.5), (2, 1, 0.7),
... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9),
... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1),
... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
>>> df = spark.createDataFrame(data).toDF("src", "dst", "weight")
>>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight")
>>> assignments = pic.assignClusters(df)


@@ -207,8 +207,8 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
distance space. The output will be vectors of configurable dimension. Hash values in the same
dimension are calculated by the same hash function.
.. seealso:: `Stable Distributions \
<https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
.. seealso:: `Stable Distributions
<https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
.. seealso:: `Hashing for Similarity Search: A Survey <https://arxiv.org/abs/1408.2927>`_
>>> from pyspark.ml.linalg import Vectors
@@ -303,7 +303,7 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
"""
r"""
.. note:: Experimental
Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are
@@ -653,8 +653,8 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
The return vector is scaled such that the transform matrix is
unitary (aka scaled DCT-II).
.. seealso:: `More information on Wikipedia \
<https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.
.. seealso:: `More information on Wikipedia
<https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.
>>> from pyspark.ml.linalg import Vectors
>>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
@@ -1353,7 +1353,7 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed,
class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
"""
r"""
.. note:: Experimental
Model produced by :py:class:`MinHashLSH`, where where multiple hash functions are stored. Each
@@ -1362,8 +1362,8 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
:math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)` This hash family is approximately min-wise
independent according to the reference.
.. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear \
permutations." Electronic Journal of Combinatorics 7 (2000): R26.
.. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear
permutations." Electronic Journal of Combinatorics 7 (2000): R26.
.. versionadded:: 2.2.0
"""


@@ -158,7 +158,7 @@ class FPGrowth(JavaEstimator, HasItemsCol, HasPredictionCol,
HasMinSupport, HasNumPartitions, HasMinConfidence,
JavaMLWritable, JavaMLReadable):
"""
r"""
.. note:: Experimental
A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in


@@ -188,8 +188,8 @@ class LinearRegressionModel(JavaModel, JavaPredictionModel, GeneralJavaMLWritabl
@property
@since("2.3.0")
def scale(self):
"""
The value by which \|y - X'w\| is scaled down when loss is "huber", otherwise 1.0.
r"""
The value by which :math:`\|y - X'w\|` is scaled down when loss is "huber", otherwise 1.0.
"""
return self._call_java("scale")
@@ -279,12 +279,12 @@ class LinearRegressionSummary(JavaWrapper):
@property
@since("2.0.0")
def explainedVariance(self):
"""
r"""
Returns the explained variance regression score.
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
.. seealso:: `Wikipedia explain variation \
<http://en.wikipedia.org/wiki/Explained_variation>`_
.. seealso:: `Wikipedia explain variation
<http://en.wikipedia.org/wiki/Explained_variation>`_
.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark
@@ -339,8 +339,8 @@ class LinearRegressionSummary(JavaWrapper):
"""
Returns R^2, the coefficient of determination.
.. seealso:: `Wikipedia coefficient of determination \
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
.. seealso:: `Wikipedia coefficient of determination
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark
@@ -354,8 +354,8 @@ class LinearRegressionSummary(JavaWrapper):
"""
Returns Adjusted R^2, the adjusted coefficient of determination.
.. seealso:: `Wikipedia coefficient of determination, Adjusted R^2 \
<https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_
.. seealso:: `Wikipedia coefficient of determination, Adjusted R^2
<https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_
.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark versions.


@@ -647,7 +647,7 @@ class PowerIterationClustering(object):
@classmethod
@since('1.5.0')
def train(cls, rdd, k, maxIterations=100, initMode="random"):
"""
r"""
:param rdd:
An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
affinity matrix, which is the matrix A in the PIC paper. The


@@ -117,9 +117,9 @@ class RegressionMetrics(JavaModelWrapper):
@property
@since('1.4.0')
def explainedVariance(self):
"""
r"""
Returns the explained variance regression score.
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
"""
return self.call("explainedVariance")


@@ -59,7 +59,7 @@ class VectorTransformer(object):
class Normalizer(VectorTransformer):
"""
r"""
Normalizes samples individually to unit L\ :sup:`p`\ norm
For any 1 <= `p` < float('inf'), normalizes samples using


@@ -2399,7 +2399,7 @@ class RDD(object):
:return: an :class:`RDDBarrier` instance that provides actions within a barrier stage.
.. seealso:: :class:`BarrierTaskContext`
.. seealso:: `SPIP: Barrier Execution Mode \
.. seealso:: `SPIP: Barrier Execution Mode
<http://jira.apache.org/jira/browse/SPARK-24374>`_
.. seealso:: `Design Doc <https://jira.apache.org/jira/browse/SPARK-24582>`_


@@ -54,7 +54,7 @@ atexit.register(lambda: sc.stop())
sqlContext = spark._wrapped
sqlCtx = sqlContext
print("""Welcome to
print(r"""Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/


@@ -283,7 +283,8 @@ def approxCountDistinct(col, rsd=None):
@since(2.1)
def approx_count_distinct(col, rsd=None):
"""Aggregate function: returns a new :class:`Column` for approximate distinct count of column `col`.
"""Aggregate function: returns a new :class:`Column` for approximate distinct count of
column `col`.
:param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
efficient to use :func:`countDistinct`
@@ -346,7 +347,8 @@ def coalesce(*cols):
@since(1.6)
def corr(col1, col2):
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` and ``col2``.
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
and ``col2``.
>>> a = range(20)
>>> b = [2 * x for x in range(20)]
@@ -1688,14 +1690,14 @@ def split(str, pattern):
@ignore_unicode_prefix
@since(1.5)
def regexp_extract(str, pattern, idx):
"""Extract a specific group matched by a Java regex, from the specified string column.
r"""Extract a specific group matched by a Java regex, from the specified string column.
If the regex did not match, or the specified group did not match, an empty string is returned.
>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
>>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
[Row(d=u'100')]
>>> df = spark.createDataFrame([('foo',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
>>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
[Row(d=u'')]
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
@@ -1712,7 +1714,7 @@ def regexp_replace(str, pattern, replacement):
"""Replace all substrings of the specified string value that match regexp with rep.
>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_replace('str', '(\\d+)', '--').alias('d')).collect()
>>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()
[Row(d=u'-----')]
"""
sc = SparkContext._active_spark_context


@@ -350,7 +350,7 @@ class DataFrameReader(OptionUtils):
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
samplingRatio=None, enforceSchema=None, emptyValue=None):
"""Loads a CSV file and returns the result as a :class:`DataFrame`.
r"""Loads a CSV file and returns the result as a :class:`DataFrame`.
This function will go through the input once to determine the input schema if
``inferSchema`` is enabled. To avoid going through the entire data once, disable
@@ -519,8 +519,8 @@ class DataFrameReader(OptionUtils):
If both ``column`` and ``predicates`` are specified, ``column`` will be used.
.. note:: Don't create too many partitions in parallel on a large cluster; \
otherwise Spark might crash your external database systems.
.. note:: Don't create too many partitions in parallel on a large cluster;
otherwise Spark might crash your external database systems.
:param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
:param table: the name of the table
@@ -862,7 +862,7 @@ class DataFrameWriter(OptionUtils):
header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None,
timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None,
charToEscapeQuoteEscaping=None, encoding=None, emptyValue=None):
"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
r"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
:param path: the path in any Hadoop supported file system
:param mode: specifies the behavior of the save operation when data already exists.
@@ -962,8 +962,8 @@ class DataFrameWriter(OptionUtils):
def jdbc(self, url, table, mode=None, properties=None):
"""Saves the content of the :class:`DataFrame` to an external database table via JDBC.
.. note:: Don't create too many partitions in parallel on a large cluster; \
otherwise Spark might crash your external database systems.
.. note:: Don't create too many partitions in parallel on a large cluster;
otherwise Spark might crash your external database systems.
:param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
:param table: Name of the table in the external database.


@@ -565,7 +565,7 @@ class DataStreamReader(OptionUtils):
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
enforceSchema=None, emptyValue=None):
"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
This function will go through the input once to determine the input schema if
``inferSchema`` is enabled. To avoid going through the entire data once, disable


@@ -752,7 +752,7 @@ _all_complex_types = dict((v.typeName(), v)
for v in [ArrayType, MapType, StructType])
_FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
_FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)")
def _parse_datatype_string(s):


@@ -58,8 +58,8 @@ StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2)
StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)
"""
.. note:: The following four storage level constants are deprecated in 2.0, since the records \
will always be serialized in Python.
.. note:: The following four storage level constants are deprecated in 2.0, since the records
will always be serialized in Python.
"""
StorageLevel.MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY
""".. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_ONLY`` instead."""


@@ -80,7 +80,7 @@ class VersionUtils(object):
(2, 3)
"""
m = re.search('^(\d+)\.(\d+)(\..*)?$', sparkVersion)
m = re.search(r'^(\d+)\.(\d+)(\..*)?$', sparkVersion)
if m is not None:
return (int(m.group(1)), int(m.group(2)))
else:


@@ -138,7 +138,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python):
# 2 (or --verbose option is enabled).
decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
skipped_tests = list(filter(
lambda line: re.search('test_.* \(pyspark\..*\) ... skipped ', line),
lambda line: re.search(r'test_.* \(pyspark\..*\) ... skipped ', line),
decoded_lines))
skipped_counts = len(skipped_tests)
if skipped_counts > 0: