[SPARK-25238][PYTHON] lint-python: Fix W605 warnings for pycodestyle 2.4
(This change is a subset of the changes needed for the JIRA; see https://github.com/apache/spark/pull/22231) ## What changes were proposed in this pull request? Use raw strings and simpler regex syntax consistently in Python, which also avoids warnings from pycodestyle about accidentally relying Python's non-escaping of non-reserved chars in normal strings. Also, fix a few long lines. ## How was this patch tested? Existing tests, and some manual double-checking of the behavior of regexes in Python 2/3 to be sure. Closes #22400 from srowen/SPARK-25238.2. Authored-by: Sean Owen <sean.owen@databricks.com> Signed-off-by: hyukjinkwon <gurwls223@apache.org>
This commit is contained in:
parent
6dc5921e66
commit
08c76b5d39
|
@ -67,7 +67,7 @@ print("JIRA server: %s" % JIRA_API_BASE)
|
|||
print("Release tag: %s" % RELEASE_TAG)
|
||||
print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
|
||||
print("Number of commits in this range: %s" % len(new_commits))
|
||||
print
|
||||
print("")
|
||||
|
||||
|
||||
def print_indented(_list):
|
||||
|
@ -88,10 +88,10 @@ filtered_commits = []
|
|||
|
||||
|
||||
def is_release(commit_title):
|
||||
return re.findall("\[release\]", commit_title.lower()) or \
|
||||
"preparing spark release" in commit_title.lower() or \
|
||||
"preparing development version" in commit_title.lower() or \
|
||||
"CHANGES.txt" in commit_title
|
||||
return ("[release]" in commit_title.lower() or
|
||||
"preparing spark release" in commit_title.lower() or
|
||||
"preparing development version" in commit_title.lower() or
|
||||
"CHANGES.txt" in commit_title)
|
||||
|
||||
|
||||
def is_maintenance(commit_title):
|
||||
|
|
|
@ -235,7 +235,7 @@ def translate_component(component, commit_hash, warnings):
|
|||
# Parse components in the commit message
|
||||
# The returned components are already filtered and translated
|
||||
def find_components(commit, commit_hash):
|
||||
components = re.findall("\[\w*\]", commit.lower())
|
||||
components = re.findall(r"\[\w*\]", commit.lower())
|
||||
components = [translate_component(c, commit_hash)
|
||||
for c in components if c in known_components]
|
||||
return components
|
||||
|
|
|
@ -274,7 +274,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
|
|||
versions = sorted(versions, key=lambda x: x.name, reverse=True)
|
||||
versions = filter(lambda x: x.raw['released'] is False, versions)
|
||||
# Consider only x.y.z versions
|
||||
versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions)
|
||||
versions = filter(lambda x: re.match(r'\d+\.\d+\.\d+', x.name), versions)
|
||||
|
||||
default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches)
|
||||
for v in default_fix_versions:
|
||||
|
@ -403,7 +403,7 @@ def standardize_jira_ref(text):
|
|||
|
||||
# Extract spark component(s):
|
||||
# Look for alphanumeric chars, spaces, dashes, periods, and/or commas
|
||||
pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE)
|
||||
pattern = re.compile(r'(\[[\w\s,.-]+\])', re.IGNORECASE)
|
||||
for component in pattern.findall(text):
|
||||
components.append(component.upper())
|
||||
text = text.replace(component, '')
|
||||
|
|
|
@ -115,7 +115,8 @@ def run_tests(tests_timeout):
|
|||
os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait()
|
||||
|
||||
failure_note_by_errcode = {
|
||||
1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures
|
||||
# error to denote run-tests script failures:
|
||||
1: 'executing the `dev/run-tests` script', # noqa: W605
|
||||
ERROR_CODES["BLOCK_GENERAL"]: 'some tests',
|
||||
ERROR_CODES["BLOCK_RAT"]: 'RAT tests',
|
||||
ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests',
|
||||
|
|
|
@ -169,7 +169,7 @@ def determine_java_version(java_exe):
|
|||
# find raw version string, eg 'java version "1.8.0_25"'
|
||||
raw_version_str = next(x for x in raw_output_lines if " version " in x)
|
||||
|
||||
match = re.search('(\d+)\.(\d+)\.(\d+)', raw_version_str)
|
||||
match = re.search(r'(\d+)\.(\d+)\.(\d+)', raw_version_str)
|
||||
|
||||
major = int(match.group(1))
|
||||
minor = int(match.group(2))
|
||||
|
|
|
@ -773,7 +773,7 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
|
|||
which is a Dataframe having two fields (FPR, TPR) with
|
||||
(0.0, 0.0) prepended and (1.0, 1.0) appended to it.
|
||||
|
||||
.. seealso:: `Wikipedia reference \
|
||||
.. seealso:: `Wikipedia reference
|
||||
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
|
||||
|
||||
.. note:: This ignores instance weights (setting all to 1.0) from
|
||||
|
|
|
@ -1202,21 +1202,21 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada
|
|||
.. note:: Experimental
|
||||
|
||||
Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
|
||||
<a href=http://www.icml2010.org/papers/387.pdf>Lin and Cohen</a>. From the abstract:
|
||||
`Lin and Cohen <http://www.icml2010.org/papers/387.pdf>`_. From the abstract:
|
||||
PIC finds a very low-dimensional embedding of a dataset using truncated power
|
||||
iteration on a normalized pair-wise similarity matrix of the data.
|
||||
|
||||
This class is not yet an Estimator/Transformer, use :py:func:`assignClusters` method
|
||||
to run the PowerIterationClustering algorithm.
|
||||
|
||||
.. seealso:: `Wikipedia on Spectral clustering \
|
||||
.. seealso:: `Wikipedia on Spectral clustering
|
||||
<http://en.wikipedia.org/wiki/Spectral_clustering>`_
|
||||
|
||||
>>> data = [(1, 0, 0.5), \
|
||||
(2, 0, 0.5), (2, 1, 0.7), \
|
||||
(3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), \
|
||||
(4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), \
|
||||
(5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
|
||||
>>> data = [(1, 0, 0.5),
|
||||
... (2, 0, 0.5), (2, 1, 0.7),
|
||||
... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9),
|
||||
... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1),
|
||||
... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
|
||||
>>> df = spark.createDataFrame(data).toDF("src", "dst", "weight")
|
||||
>>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight")
|
||||
>>> assignments = pic.assignClusters(df)
|
||||
|
|
|
@ -207,7 +207,7 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
|
|||
distance space. The output will be vectors of configurable dimension. Hash values in the same
|
||||
dimension are calculated by the same hash function.
|
||||
|
||||
.. seealso:: `Stable Distributions \
|
||||
.. seealso:: `Stable Distributions
|
||||
<https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
|
||||
.. seealso:: `Hashing for Similarity Search: A Survey <https://arxiv.org/abs/1408.2927>`_
|
||||
|
||||
|
@ -303,7 +303,7 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
|
|||
|
||||
|
||||
class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
|
||||
"""
|
||||
r"""
|
||||
.. note:: Experimental
|
||||
|
||||
Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are
|
||||
|
@ -653,7 +653,7 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
|
|||
The return vector is scaled such that the transform matrix is
|
||||
unitary (aka scaled DCT-II).
|
||||
|
||||
.. seealso:: `More information on Wikipedia \
|
||||
.. seealso:: `More information on Wikipedia
|
||||
<https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.
|
||||
|
||||
>>> from pyspark.ml.linalg import Vectors
|
||||
|
@ -1353,7 +1353,7 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed,
|
|||
|
||||
|
||||
class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
|
||||
"""
|
||||
r"""
|
||||
.. note:: Experimental
|
||||
|
||||
Model produced by :py:class:`MinHashLSH`, where where multiple hash functions are stored. Each
|
||||
|
@ -1362,7 +1362,7 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
|
|||
:math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)` This hash family is approximately min-wise
|
||||
independent according to the reference.
|
||||
|
||||
.. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear \
|
||||
.. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear
|
||||
permutations." Electronic Journal of Combinatorics 7 (2000): R26.
|
||||
|
||||
.. versionadded:: 2.2.0
|
||||
|
|
|
@ -158,7 +158,7 @@ class FPGrowth(JavaEstimator, HasItemsCol, HasPredictionCol,
|
|||
HasMinSupport, HasNumPartitions, HasMinConfidence,
|
||||
JavaMLWritable, JavaMLReadable):
|
||||
|
||||
"""
|
||||
r"""
|
||||
.. note:: Experimental
|
||||
|
||||
A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
|
||||
|
|
|
@ -188,8 +188,8 @@ class LinearRegressionModel(JavaModel, JavaPredictionModel, GeneralJavaMLWritabl
|
|||
@property
|
||||
@since("2.3.0")
|
||||
def scale(self):
|
||||
"""
|
||||
The value by which \|y - X'w\| is scaled down when loss is "huber", otherwise 1.0.
|
||||
r"""
|
||||
The value by which :math:`\|y - X'w\|` is scaled down when loss is "huber", otherwise 1.0.
|
||||
"""
|
||||
return self._call_java("scale")
|
||||
|
||||
|
@ -279,11 +279,11 @@ class LinearRegressionSummary(JavaWrapper):
|
|||
@property
|
||||
@since("2.0.0")
|
||||
def explainedVariance(self):
|
||||
"""
|
||||
r"""
|
||||
Returns the explained variance regression score.
|
||||
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
|
||||
explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
|
||||
|
||||
.. seealso:: `Wikipedia explain variation \
|
||||
.. seealso:: `Wikipedia explain variation
|
||||
<http://en.wikipedia.org/wiki/Explained_variation>`_
|
||||
|
||||
.. note:: This ignores instance weights (setting all to 1.0) from
|
||||
|
@ -339,7 +339,7 @@ class LinearRegressionSummary(JavaWrapper):
|
|||
"""
|
||||
Returns R^2, the coefficient of determination.
|
||||
|
||||
.. seealso:: `Wikipedia coefficient of determination \
|
||||
.. seealso:: `Wikipedia coefficient of determination
|
||||
<http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
|
||||
|
||||
.. note:: This ignores instance weights (setting all to 1.0) from
|
||||
|
@ -354,7 +354,7 @@ class LinearRegressionSummary(JavaWrapper):
|
|||
"""
|
||||
Returns Adjusted R^2, the adjusted coefficient of determination.
|
||||
|
||||
.. seealso:: `Wikipedia coefficient of determination, Adjusted R^2 \
|
||||
.. seealso:: `Wikipedia coefficient of determination, Adjusted R^2
|
||||
<https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_
|
||||
|
||||
.. note:: This ignores instance weights (setting all to 1.0) from
|
||||
|
|
|
@ -647,7 +647,7 @@ class PowerIterationClustering(object):
|
|||
@classmethod
|
||||
@since('1.5.0')
|
||||
def train(cls, rdd, k, maxIterations=100, initMode="random"):
|
||||
"""
|
||||
r"""
|
||||
:param rdd:
|
||||
An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
|
||||
affinity matrix, which is the matrix A in the PIC paper. The
|
||||
|
|
|
@ -117,9 +117,9 @@ class RegressionMetrics(JavaModelWrapper):
|
|||
@property
|
||||
@since('1.4.0')
|
||||
def explainedVariance(self):
|
||||
"""
|
||||
r"""
|
||||
Returns the explained variance regression score.
|
||||
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
|
||||
explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
|
||||
"""
|
||||
return self.call("explainedVariance")
|
||||
|
||||
|
|
|
@ -59,7 +59,7 @@ class VectorTransformer(object):
|
|||
|
||||
|
||||
class Normalizer(VectorTransformer):
|
||||
"""
|
||||
r"""
|
||||
Normalizes samples individually to unit L\ :sup:`p`\ norm
|
||||
|
||||
For any 1 <= `p` < float('inf'), normalizes samples using
|
||||
|
|
|
@ -2399,7 +2399,7 @@ class RDD(object):
|
|||
:return: an :class:`RDDBarrier` instance that provides actions within a barrier stage.
|
||||
|
||||
.. seealso:: :class:`BarrierTaskContext`
|
||||
.. seealso:: `SPIP: Barrier Execution Mode \
|
||||
.. seealso:: `SPIP: Barrier Execution Mode
|
||||
<http://jira.apache.org/jira/browse/SPARK-24374>`_
|
||||
.. seealso:: `Design Doc <https://jira.apache.org/jira/browse/SPARK-24582>`_
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ atexit.register(lambda: sc.stop())
|
|||
sqlContext = spark._wrapped
|
||||
sqlCtx = sqlContext
|
||||
|
||||
print("""Welcome to
|
||||
print(r"""Welcome to
|
||||
____ __
|
||||
/ __/__ ___ _____/ /__
|
||||
_\ \/ _ \/ _ `/ __/ '_/
|
||||
|
|
|
@ -283,7 +283,8 @@ def approxCountDistinct(col, rsd=None):
|
|||
|
||||
@since(2.1)
|
||||
def approx_count_distinct(col, rsd=None):
|
||||
"""Aggregate function: returns a new :class:`Column` for approximate distinct count of column `col`.
|
||||
"""Aggregate function: returns a new :class:`Column` for approximate distinct count of
|
||||
column `col`.
|
||||
|
||||
:param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
|
||||
efficient to use :func:`countDistinct`
|
||||
|
@ -346,7 +347,8 @@ def coalesce(*cols):
|
|||
|
||||
@since(1.6)
|
||||
def corr(col1, col2):
|
||||
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` and ``col2``.
|
||||
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
|
||||
and ``col2``.
|
||||
|
||||
>>> a = range(20)
|
||||
>>> b = [2 * x for x in range(20)]
|
||||
|
@ -1688,14 +1690,14 @@ def split(str, pattern):
|
|||
@ignore_unicode_prefix
|
||||
@since(1.5)
|
||||
def regexp_extract(str, pattern, idx):
|
||||
"""Extract a specific group matched by a Java regex, from the specified string column.
|
||||
r"""Extract a specific group matched by a Java regex, from the specified string column.
|
||||
If the regex did not match, or the specified group did not match, an empty string is returned.
|
||||
|
||||
>>> df = spark.createDataFrame([('100-200',)], ['str'])
|
||||
>>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
|
||||
>>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
|
||||
[Row(d=u'100')]
|
||||
>>> df = spark.createDataFrame([('foo',)], ['str'])
|
||||
>>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
|
||||
>>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
|
||||
[Row(d=u'')]
|
||||
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
|
||||
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
|
||||
|
@ -1712,7 +1714,7 @@ def regexp_replace(str, pattern, replacement):
|
|||
"""Replace all substrings of the specified string value that match regexp with rep.
|
||||
|
||||
>>> df = spark.createDataFrame([('100-200',)], ['str'])
|
||||
>>> df.select(regexp_replace('str', '(\\d+)', '--').alias('d')).collect()
|
||||
>>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()
|
||||
[Row(d=u'-----')]
|
||||
"""
|
||||
sc = SparkContext._active_spark_context
|
||||
|
|
|
@ -350,7 +350,7 @@ class DataFrameReader(OptionUtils):
|
|||
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
|
||||
columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
|
||||
samplingRatio=None, enforceSchema=None, emptyValue=None):
|
||||
"""Loads a CSV file and returns the result as a :class:`DataFrame`.
|
||||
r"""Loads a CSV file and returns the result as a :class:`DataFrame`.
|
||||
|
||||
This function will go through the input once to determine the input schema if
|
||||
``inferSchema`` is enabled. To avoid going through the entire data once, disable
|
||||
|
@ -519,7 +519,7 @@ class DataFrameReader(OptionUtils):
|
|||
|
||||
If both ``column`` and ``predicates`` are specified, ``column`` will be used.
|
||||
|
||||
.. note:: Don't create too many partitions in parallel on a large cluster; \
|
||||
.. note:: Don't create too many partitions in parallel on a large cluster;
|
||||
otherwise Spark might crash your external database systems.
|
||||
|
||||
:param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
|
||||
|
@ -862,7 +862,7 @@ class DataFrameWriter(OptionUtils):
|
|||
header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None,
|
||||
timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None,
|
||||
charToEscapeQuoteEscaping=None, encoding=None, emptyValue=None):
|
||||
"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
|
||||
r"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
|
||||
|
||||
:param path: the path in any Hadoop supported file system
|
||||
:param mode: specifies the behavior of the save operation when data already exists.
|
||||
|
@ -962,7 +962,7 @@ class DataFrameWriter(OptionUtils):
|
|||
def jdbc(self, url, table, mode=None, properties=None):
|
||||
"""Saves the content of the :class:`DataFrame` to an external database table via JDBC.
|
||||
|
||||
.. note:: Don't create too many partitions in parallel on a large cluster; \
|
||||
.. note:: Don't create too many partitions in parallel on a large cluster;
|
||||
otherwise Spark might crash your external database systems.
|
||||
|
||||
:param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
|
||||
|
|
|
@ -565,7 +565,7 @@ class DataStreamReader(OptionUtils):
|
|||
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
|
||||
columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
|
||||
enforceSchema=None, emptyValue=None):
|
||||
"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
|
||||
r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
|
||||
|
||||
This function will go through the input once to determine the input schema if
|
||||
``inferSchema`` is enabled. To avoid going through the entire data once, disable
|
||||
|
|
|
@ -752,7 +752,7 @@ _all_complex_types = dict((v.typeName(), v)
|
|||
for v in [ArrayType, MapType, StructType])
|
||||
|
||||
|
||||
_FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
|
||||
_FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)")
|
||||
|
||||
|
||||
def _parse_datatype_string(s):
|
||||
|
|
|
@ -58,8 +58,8 @@ StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2)
|
|||
StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)
|
||||
|
||||
"""
|
||||
.. note:: The following four storage level constants are deprecated in 2.0, since the records \
|
||||
will always be serialized in Python.
|
||||
.. note:: The following four storage level constants are deprecated in 2.0, since the records
|
||||
will always be serialized in Python.
|
||||
"""
|
||||
StorageLevel.MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY
|
||||
""".. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_ONLY`` instead."""
|
||||
|
|
|
@ -80,7 +80,7 @@ class VersionUtils(object):
|
|||
(2, 3)
|
||||
|
||||
"""
|
||||
m = re.search('^(\d+)\.(\d+)(\..*)?$', sparkVersion)
|
||||
m = re.search(r'^(\d+)\.(\d+)(\..*)?$', sparkVersion)
|
||||
if m is not None:
|
||||
return (int(m.group(1)), int(m.group(2)))
|
||||
else:
|
||||
|
|
|
@ -138,7 +138,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python):
|
|||
# 2 (or --verbose option is enabled).
|
||||
decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
|
||||
skipped_tests = list(filter(
|
||||
lambda line: re.search('test_.* \(pyspark\..*\) ... skipped ', line),
|
||||
lambda line: re.search(r'test_.* \(pyspark\..*\) ... skipped ', line),
|
||||
decoded_lines))
|
||||
skipped_counts = len(skipped_tests)
|
||||
if skipped_counts > 0:
|
||||
|
|
Loading…
Reference in a new issue