[SPARK-25238][PYTHON] lint-python: Fix W605 warnings for pycodestyle 2.4

(This change is a subset of the changes needed for the JIRA; see https://github.com/apache/spark/pull/22231)

## What changes were proposed in this pull request?

Use raw strings and simpler regex syntax consistently in Python, which also avoids W605 warnings from pycodestyle about accidentally relying on Python leaving unrecognized escape sequences (such as `\d`) untouched in normal strings. Also, fix a few long lines.
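
For illustration, a minimal before/after sketch of the pattern this change applies; the regex is the one from the `find_components` hunk below, while the sample title string is made up:

```python
import re

# Before: "\[" and "\w" are not recognized escape sequences, so CPython leaves the
# backslashes in place and the regex happens to work, but pycodestyle 2.4 reports
# W605 for each one (and newer Python 3 versions warn about the invalid escapes).
components = re.findall("\[\w*\]", "[core] fix a bug")

# After: a raw string keeps the backslashes literal by construction, silences W605,
# and compiles to exactly the same pattern.
components = re.findall(r"\[\w*\]", "[core] fix a bug")
```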

## How was this patch tested?

Existing tests, plus some manual double-checking that the rewritten regexes behave the same way in Python 2 and 3.
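
For reference, one way to do that double-check by hand; this is only a sketch (not part of the patch), using patterns that appear in the diff and a made-up sample title:

```python
import re

# Unrecognized escapes such as \[, \w, \d and \. pass through normal string
# literals unchanged, so the non-raw and raw forms are byte-for-byte identical
# and compile to the same regex.
assert "\[\w*\]" == r"\[\w*\]"
assert "(\d+)\.(\d+)\.(\d+)" == r"(\d+)\.(\d+)\.(\d+)"

# Hence the rewritten patterns match exactly what they matched before.
assert re.findall(r"\[\w*\]", "[sql][python] some title") == ["[sql]", "[python]"]
```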

Closes #22400 from srowen/SPARK-25238.2.

Authored-by: Sean Owen <sean.owen@databricks.com>
Signed-off-by: hyukjinkwon <gurwls223@apache.org>
Authored by Sean Owen on 2018-09-13 11:19:43 +08:00; committed by hyukjinkwon
parent 6dc5921e66
commit 08c76b5d39
22 changed files with 66 additions and 63 deletions


@@ -67,7 +67,7 @@ print("JIRA server: %s" % JIRA_API_BASE)
print("Release tag: %s" % RELEASE_TAG)
print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
print("Number of commits in this range: %s" % len(new_commits))
print
print("")
def print_indented(_list):
@@ -88,10 +88,10 @@ filtered_commits = []
def is_release(commit_title):
return re.findall("\[release\]", commit_title.lower()) or \
"preparing spark release" in commit_title.lower() or \
"preparing development version" in commit_title.lower() or \
"CHANGES.txt" in commit_title
return ("[release]" in commit_title.lower() or
"preparing spark release" in commit_title.lower() or
"preparing development version" in commit_title.lower() or
"CHANGES.txt" in commit_title)
def is_maintenance(commit_title):


@@ -235,7 +235,7 @@ def translate_component(component, commit_hash, warnings):
# Parse components in the commit message
# The returned components are already filtered and translated
def find_components(commit, commit_hash):
components = re.findall("\[\w*\]", commit.lower())
components = re.findall(r"\[\w*\]", commit.lower())
components = [translate_component(c, commit_hash)
for c in components if c in known_components]
return components


@@ -274,7 +274,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
versions = sorted(versions, key=lambda x: x.name, reverse=True)
versions = filter(lambda x: x.raw['released'] is False, versions)
# Consider only x.y.z versions
versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions)
versions = filter(lambda x: re.match(r'\d+\.\d+\.\d+', x.name), versions)
default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches)
for v in default_fix_versions:
@@ -403,7 +403,7 @@ def standardize_jira_ref(text):
# Extract spark component(s):
# Look for alphanumeric chars, spaces, dashes, periods, and/or commas
pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE)
pattern = re.compile(r'(\[[\w\s,.-]+\])', re.IGNORECASE)
for component in pattern.findall(text):
components.append(component.upper())
text = text.replace(component, '')


@@ -115,7 +115,8 @@ def run_tests(tests_timeout):
os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait()
failure_note_by_errcode = {
1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures
# error to denote run-tests script failures:
1: 'executing the `dev/run-tests` script', # noqa: W605
ERROR_CODES["BLOCK_GENERAL"]: 'some tests',
ERROR_CODES["BLOCK_RAT"]: 'RAT tests',
ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests',


@@ -169,7 +169,7 @@ def determine_java_version(java_exe):
# find raw version string, eg 'java version "1.8.0_25"'
raw_version_str = next(x for x in raw_output_lines if " version " in x)
match = re.search('(\d+)\.(\d+)\.(\d+)', raw_version_str)
match = re.search(r'(\d+)\.(\d+)\.(\d+)', raw_version_str)
major = int(match.group(1))
minor = int(match.group(2))


@@ -773,8 +773,8 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
which is a Dataframe having two fields (FPR, TPR) with
(0.0, 0.0) prepended and (1.0, 1.0) appended to it.
.. seealso:: `Wikipedia reference \
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. seealso:: `Wikipedia reference
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. note:: This ignores instance weights (setting all to 1.0) from
`LogisticRegression.weightCol`. This will change in later Spark


@@ -1202,21 +1202,21 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada
.. note:: Experimental
Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
<a href=http://www.icml2010.org/papers/387.pdf>Lin and Cohen</a>. From the abstract:
`Lin and Cohen <http://www.icml2010.org/papers/387.pdf>`_. From the abstract:
PIC finds a very low-dimensional embedding of a dataset using truncated power
iteration on a normalized pair-wise similarity matrix of the data.
This class is not yet an Estimator/Transformer, use :py:func:`assignClusters` method
to run the PowerIterationClustering algorithm.
.. seealso:: `Wikipedia on Spectral clustering \
<http://en.wikipedia.org/wiki/Spectral_clustering>`_
.. seealso:: `Wikipedia on Spectral clustering
<http://en.wikipedia.org/wiki/Spectral_clustering>`_
>>> data = [(1, 0, 0.5), \
(2, 0, 0.5), (2, 1, 0.7), \
(3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), \
(4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), \
(5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
>>> data = [(1, 0, 0.5),
... (2, 0, 0.5), (2, 1, 0.7),
... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9),
... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1),
... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
>>> df = spark.createDataFrame(data).toDF("src", "dst", "weight")
>>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight")
>>> assignments = pic.assignClusters(df)


@@ -207,8 +207,8 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
distance space. The output will be vectors of configurable dimension. Hash values in the same
dimension are calculated by the same hash function.
.. seealso:: `Stable Distributions \
<https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
.. seealso:: `Stable Distributions
<https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
.. seealso:: `Hashing for Similarity Search: A Survey <https://arxiv.org/abs/1408.2927>`_
>>> from pyspark.ml.linalg import Vectors
@@ -303,7 +303,7 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
"""
r"""
.. note:: Experimental
Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are
@@ -653,8 +653,8 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
The return vector is scaled such that the transform matrix is
unitary (aka scaled DCT-II).
.. seealso:: `More information on Wikipedia \
<https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.
.. seealso:: `More information on Wikipedia
<https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.
>>> from pyspark.ml.linalg import Vectors
>>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
@@ -1353,7 +1353,7 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed,
class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
"""
r"""
.. note:: Experimental
Model produced by :py:class:`MinHashLSH`, where where multiple hash functions are stored. Each
@@ -1362,8 +1362,8 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
:math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)` This hash family is approximately min-wise
independent according to the reference.
.. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear \
permutations." Electronic Journal of Combinatorics 7 (2000): R26.
.. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear
permutations." Electronic Journal of Combinatorics 7 (2000): R26.
.. versionadded:: 2.2.0
"""


@@ -158,7 +158,7 @@ class FPGrowth(JavaEstimator, HasItemsCol, HasPredictionCol,
HasMinSupport, HasNumPartitions, HasMinConfidence,
JavaMLWritable, JavaMLReadable):
"""
r"""
.. note:: Experimental
A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in


@@ -188,8 +188,8 @@ class LinearRegressionModel(JavaModel, JavaPredictionModel, GeneralJavaMLWritabl
@property
@since("2.3.0")
def scale(self):
"""
The value by which \|y - X'w\| is scaled down when loss is "huber", otherwise 1.0.
r"""
The value by which :math:`\|y - X'w\|` is scaled down when loss is "huber", otherwise 1.0.
"""
return self._call_java("scale")
@@ -279,12 +279,12 @@ class LinearRegressionSummary(JavaWrapper):
@property
@since("2.0.0")
def explainedVariance(self):
"""
r"""
Returns the explained variance regression score.
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
.. seealso:: `Wikipedia explain variation \
<http://en.wikipedia.org/wiki/Explained_variation>`_
.. seealso:: `Wikipedia explain variation
<http://en.wikipedia.org/wiki/Explained_variation>`_
.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark
@@ -339,8 +339,8 @@ class LinearRegressionSummary(JavaWrapper):
"""
Returns R^2, the coefficient of determination.
.. seealso:: `Wikipedia coefficient of determination \
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
.. seealso:: `Wikipedia coefficient of determination
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark
@@ -354,8 +354,8 @@ class LinearRegressionSummary(JavaWrapper):
"""
Returns Adjusted R^2, the adjusted coefficient of determination.
.. seealso:: `Wikipedia coefficient of determination, Adjusted R^2 \
<https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_
.. seealso:: `Wikipedia coefficient of determination, Adjusted R^2
<https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_
.. note:: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark versions.


@@ -647,7 +647,7 @@ class PowerIterationClustering(object):
@classmethod
@since('1.5.0')
def train(cls, rdd, k, maxIterations=100, initMode="random"):
"""
r"""
:param rdd:
An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
affinity matrix, which is the matrix A in the PIC paper. The


@@ -117,9 +117,9 @@ class RegressionMetrics(JavaModelWrapper):
@property
@since('1.4.0')
def explainedVariance(self):
"""
r"""
Returns the explained variance regression score.
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
"""
return self.call("explainedVariance")


@@ -59,7 +59,7 @@ class VectorTransformer(object):
class Normalizer(VectorTransformer):
"""
r"""
Normalizes samples individually to unit L\ :sup:`p`\ norm
For any 1 <= `p` < float('inf'), normalizes samples using


@@ -2399,7 +2399,7 @@ class RDD(object):
:return: an :class:`RDDBarrier` instance that provides actions within a barrier stage.
.. seealso:: :class:`BarrierTaskContext`
.. seealso:: `SPIP: Barrier Execution Mode \
.. seealso:: `SPIP: Barrier Execution Mode
<http://jira.apache.org/jira/browse/SPARK-24374>`_
.. seealso:: `Design Doc <https://jira.apache.org/jira/browse/SPARK-24582>`_


@@ -54,7 +54,7 @@ atexit.register(lambda: sc.stop())
sqlContext = spark._wrapped
sqlCtx = sqlContext
print("""Welcome to
print(r"""Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/


@@ -283,7 +283,8 @@ def approxCountDistinct(col, rsd=None):
@since(2.1)
def approx_count_distinct(col, rsd=None):
"""Aggregate function: returns a new :class:`Column` for approximate distinct count of column `col`.
"""Aggregate function: returns a new :class:`Column` for approximate distinct count of
column `col`.
:param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
efficient to use :func:`countDistinct`
@@ -346,7 +347,8 @@ def coalesce(*cols):
@since(1.6)
def corr(col1, col2):
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` and ``col2``.
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
and ``col2``.
>>> a = range(20)
>>> b = [2 * x for x in range(20)]
@@ -1688,14 +1690,14 @@ def split(str, pattern):
@ignore_unicode_prefix
@since(1.5)
def regexp_extract(str, pattern, idx):
"""Extract a specific group matched by a Java regex, from the specified string column.
r"""Extract a specific group matched by a Java regex, from the specified string column.
If the regex did not match, or the specified group did not match, an empty string is returned.
>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
>>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
[Row(d=u'100')]
>>> df = spark.createDataFrame([('foo',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
>>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
[Row(d=u'')]
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
@@ -1712,7 +1714,7 @@ def regexp_replace(str, pattern, replacement):
"""Replace all substrings of the specified string value that match regexp with rep.
>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_replace('str', '(\\d+)', '--').alias('d')).collect()
>>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()
[Row(d=u'-----')]
"""
sc = SparkContext._active_spark_context


@@ -350,7 +350,7 @@ class DataFrameReader(OptionUtils):
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
samplingRatio=None, enforceSchema=None, emptyValue=None):
"""Loads a CSV file and returns the result as a :class:`DataFrame`.
r"""Loads a CSV file and returns the result as a :class:`DataFrame`.
This function will go through the input once to determine the input schema if
``inferSchema`` is enabled. To avoid going through the entire data once, disable
@@ -519,8 +519,8 @@ class DataFrameReader(OptionUtils):
If both ``column`` and ``predicates`` are specified, ``column`` will be used.
.. note:: Don't create too many partitions in parallel on a large cluster; \
otherwise Spark might crash your external database systems.
.. note:: Don't create too many partitions in parallel on a large cluster;
otherwise Spark might crash your external database systems.
:param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
:param table: the name of the table
@@ -862,7 +862,7 @@ class DataFrameWriter(OptionUtils):
header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None,
timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None,
charToEscapeQuoteEscaping=None, encoding=None, emptyValue=None):
"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
r"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
:param path: the path in any Hadoop supported file system
:param mode: specifies the behavior of the save operation when data already exists.
@@ -962,8 +962,8 @@ class DataFrameWriter(OptionUtils):
def jdbc(self, url, table, mode=None, properties=None):
"""Saves the content of the :class:`DataFrame` to an external database table via JDBC.
.. note:: Don't create too many partitions in parallel on a large cluster; \
otherwise Spark might crash your external database systems.
.. note:: Don't create too many partitions in parallel on a large cluster;
otherwise Spark might crash your external database systems.
:param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
:param table: Name of the table in the external database.


@@ -565,7 +565,7 @@ class DataStreamReader(OptionUtils):
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
enforceSchema=None, emptyValue=None):
"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
This function will go through the input once to determine the input schema if
``inferSchema`` is enabled. To avoid going through the entire data once, disable


@@ -752,7 +752,7 @@ _all_complex_types = dict((v.typeName(), v)
for v in [ArrayType, MapType, StructType])
_FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
_FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)")
def _parse_datatype_string(s):


@@ -58,8 +58,8 @@ StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2)
StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)
"""
.. note:: The following four storage level constants are deprecated in 2.0, since the records \
will always be serialized in Python.
.. note:: The following four storage level constants are deprecated in 2.0, since the records
will always be serialized in Python.
"""
StorageLevel.MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY
""".. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_ONLY`` instead."""


@@ -80,7 +80,7 @@ class VersionUtils(object):
(2, 3)
"""
m = re.search('^(\d+)\.(\d+)(\..*)?$', sparkVersion)
m = re.search(r'^(\d+)\.(\d+)(\..*)?$', sparkVersion)
if m is not None:
return (int(m.group(1)), int(m.group(2)))
else:


@@ -138,7 +138,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python):
# 2 (or --verbose option is enabled).
decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
skipped_tests = list(filter(
lambda line: re.search('test_.* \(pyspark\..*\) ... skipped ', line),
lambda line: re.search(r'test_.* \(pyspark\..*\) ... skipped ', line),
decoded_lines))
skipped_counts = len(skipped_tests)
if skipped_counts > 0: