[SPARK-25238][PYTHON] lint-python: Fix W605 warnings for pycodestyle 2.4

(This change is a subset of the changes needed for the JIRA; see https://github.com/apache/spark/pull/22231)

## What changes were proposed in this pull request?

Use raw strings and simpler regex syntax consistently in Python, which also avoids warnings from pycodestyle about accidentally relying on Python's non-escaping of non-reserved chars in normal strings. Also, fix a few long lines.
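
For illustration only (not part of the patch), a minimal sketch of the W605 situation: in a normal string literal, escapes like `\[` or `\d` are not recognized by Python, so they are passed through to `re` unchanged; pycodestyle 2.4 flags that as W605 (and Python 3.6+ warns about it), while the raw-string spelling is explicit and matches identically. The sample text below is hypothetical.

```python
import re

# Hypothetical sample text, chosen just to exercise the bracketed-tag pattern.
text = "[PYTHON][ML] Fix W605 warnings"

# Non-raw pattern: "\[" and "\w" are invalid string escapes that Python happens
# to leave untouched; pycodestyle 2.4 reports W605 on this line.
legacy = re.findall("\[\w*\]", text)

# Raw string: the backslashes reach the regex engine verbatim, no warning.
preferred = re.findall(r"\[\w*\]", text)

assert legacy == preferred == ["[PYTHON]", "[ML]"]
```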

## How was this patch tested?

Existing tests, and some manual double-checking of the behavior of regexes in Python 2/3 to be sure.
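
As a rough sketch of that manual check (illustrative only, not committed anywhere): since Python 2 and 3 both leave unrecognized escapes in normal strings untouched, the raw and non-raw spellings compare equal, so switching to raw strings cannot change what these regexes match.

```python
# The non-raw literals only worked because "\d", "\.", "\(" etc. are not string
# escapes; the raw-string spellings are character-for-character the same pattern.
assert '(\d+)\.(\d+)\.(\d+)' == r'(\d+)\.(\d+)\.(\d+)'
assert 'test_.* \(pyspark\..*\) ... skipped ' == r'test_.* \(pyspark\..*\) ... skipped '
print("raw and non-raw spellings are identical patterns")
```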

Closes #22400 from srowen/SPARK-25238.2.

Authored-by: Sean Owen <sean.owen@databricks.com>
Signed-off-by: hyukjinkwon <gurwls223@apache.org>
Sean Owen 2018-09-13 11:19:43 +08:00 committed by hyukjinkwon
parent 6dc5921e66
commit 08c76b5d39
22 changed files with 66 additions and 63 deletions

@@ -67,7 +67,7 @@ print("JIRA server: %s" % JIRA_API_BASE)
 print("Release tag: %s" % RELEASE_TAG)
 print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
 print("Number of commits in this range: %s" % len(new_commits))
-print
+print("")
 def print_indented(_list):
@@ -88,10 +88,10 @@ filtered_commits = []
 def is_release(commit_title):
-    return re.findall("\[release\]", commit_title.lower()) or \
-        "preparing spark release" in commit_title.lower() or \
-        "preparing development version" in commit_title.lower() or \
-        "CHANGES.txt" in commit_title
+    return ("[release]" in commit_title.lower() or
+            "preparing spark release" in commit_title.lower() or
+            "preparing development version" in commit_title.lower() or
+            "CHANGES.txt" in commit_title)
 def is_maintenance(commit_title):

@@ -235,7 +235,7 @@ def translate_component(component, commit_hash, warnings):
 # Parse components in the commit message
 # The returned components are already filtered and translated
 def find_components(commit, commit_hash):
-    components = re.findall("\[\w*\]", commit.lower())
+    components = re.findall(r"\[\w*\]", commit.lower())
     components = [translate_component(c, commit_hash)
                   for c in components if c in known_components]
     return components

@@ -274,7 +274,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
     versions = sorted(versions, key=lambda x: x.name, reverse=True)
     versions = filter(lambda x: x.raw['released'] is False, versions)
     # Consider only x.y.z versions
-    versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions)
+    versions = filter(lambda x: re.match(r'\d+\.\d+\.\d+', x.name), versions)
     default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches)
     for v in default_fix_versions:
@@ -403,7 +403,7 @@ def standardize_jira_ref(text):
     # Extract spark component(s):
     # Look for alphanumeric chars, spaces, dashes, periods, and/or commas
-    pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE)
+    pattern = re.compile(r'(\[[\w\s,.-]+\])', re.IGNORECASE)
     for component in pattern.findall(text):
         components.append(component.upper())
         text = text.replace(component, '')

@@ -115,7 +115,8 @@ def run_tests(tests_timeout):
         os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait()
     failure_note_by_errcode = {
-        1: 'executing the `dev/run-tests` script',  # error to denote run-tests script failures
+        # error to denote run-tests script failures:
+        1: 'executing the `dev/run-tests` script',  # noqa: W605
         ERROR_CODES["BLOCK_GENERAL"]: 'some tests',
         ERROR_CODES["BLOCK_RAT"]: 'RAT tests',
         ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests',

@@ -169,7 +169,7 @@ def determine_java_version(java_exe):
     # find raw version string, eg 'java version "1.8.0_25"'
     raw_version_str = next(x for x in raw_output_lines if " version " in x)
-    match = re.search('(\d+)\.(\d+)\.(\d+)', raw_version_str)
+    match = re.search(r'(\d+)\.(\d+)\.(\d+)', raw_version_str)
     major = int(match.group(1))
     minor = int(match.group(2))

@@ -773,7 +773,7 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
         which is a Dataframe having two fields (FPR, TPR) with
         (0.0, 0.0) prepended and (1.0, 1.0) appended to it.
-        .. seealso:: `Wikipedia reference \
+        .. seealso:: `Wikipedia reference
             <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
         .. note:: This ignores instance weights (setting all to 1.0) from

@@ -1202,21 +1202,21 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada
     .. note:: Experimental
     Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
-    <a href=http://www.icml2010.org/papers/387.pdf>Lin and Cohen</a>. From the abstract:
+    `Lin and Cohen <http://www.icml2010.org/papers/387.pdf>`_. From the abstract:
     PIC finds a very low-dimensional embedding of a dataset using truncated power
     iteration on a normalized pair-wise similarity matrix of the data.
     This class is not yet an Estimator/Transformer, use :py:func:`assignClusters` method
     to run the PowerIterationClustering algorithm.
-    .. seealso:: `Wikipedia on Spectral clustering \
+    .. seealso:: `Wikipedia on Spectral clustering
         <http://en.wikipedia.org/wiki/Spectral_clustering>`_
-    >>> data = [(1, 0, 0.5), \
-                (2, 0, 0.5), (2, 1, 0.7), \
-                (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), \
-                (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), \
-                (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
+    >>> data = [(1, 0, 0.5),
+    ...         (2, 0, 0.5), (2, 1, 0.7),
+    ...         (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9),
+    ...         (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1),
+    ...         (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)]
     >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight")
     >>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight")
     >>> assignments = pic.assignClusters(df)

@@ -207,7 +207,7 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
     distance space. The output will be vectors of configurable dimension. Hash values in the same
     dimension are calculated by the same hash function.
-    .. seealso:: `Stable Distributions \
+    .. seealso:: `Stable Distributions
         <https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions>`_
     .. seealso:: `Hashing for Similarity Search: A Survey <https://arxiv.org/abs/1408.2927>`_
@@ -303,7 +303,7 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
 class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
-    """
+    r"""
     .. note:: Experimental
     Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are
@@ -653,7 +653,7 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
     The return vector is scaled such that the transform matrix is
     unitary (aka scaled DCT-II).
-    .. seealso:: `More information on Wikipedia \
+    .. seealso:: `More information on Wikipedia
         <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.
     >>> from pyspark.ml.linalg import Vectors
@@ -1353,7 +1353,7 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed,
 class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
-    """
+    r"""
     .. note:: Experimental
     Model produced by :py:class:`MinHashLSH`, where where multiple hash functions are stored. Each
@@ -1362,7 +1362,7 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable):
     :math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)` This hash family is approximately min-wise
     independent according to the reference.
-    .. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear \
+    .. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear
         permutations." Electronic Journal of Combinatorics 7 (2000): R26.
     .. versionadded:: 2.2.0

@@ -158,7 +158,7 @@ class FPGrowth(JavaEstimator, HasItemsCol, HasPredictionCol,
                HasMinSupport, HasNumPartitions, HasMinConfidence,
                JavaMLWritable, JavaMLReadable):
-    """
+    r"""
     .. note:: Experimental
     A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in

@@ -188,8 +188,8 @@ class LinearRegressionModel(JavaModel, JavaPredictionModel, GeneralJavaMLWritabl
     @property
     @since("2.3.0")
     def scale(self):
-        """
-        The value by which \|y - X'w\| is scaled down when loss is "huber", otherwise 1.0.
+        r"""
+        The value by which :math:`\|y - X'w\|` is scaled down when loss is "huber", otherwise 1.0.
         """
         return self._call_java("scale")
@@ -279,11 +279,11 @@ class LinearRegressionSummary(JavaWrapper):
     @property
     @since("2.0.0")
     def explainedVariance(self):
-        """
+        r"""
         Returns the explained variance regression score.
-        explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
-        .. seealso:: `Wikipedia explain variation \
+        explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
+        .. seealso:: `Wikipedia explain variation
            <http://en.wikipedia.org/wiki/Explained_variation>`_
         .. note:: This ignores instance weights (setting all to 1.0) from
@@ -339,7 +339,7 @@ class LinearRegressionSummary(JavaWrapper):
         """
         Returns R^2, the coefficient of determination.
-        .. seealso:: `Wikipedia coefficient of determination \
+        .. seealso:: `Wikipedia coefficient of determination
            <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
         .. note:: This ignores instance weights (setting all to 1.0) from
@@ -354,7 +354,7 @@ class LinearRegressionSummary(JavaWrapper):
         """
        Returns Adjusted R^2, the adjusted coefficient of determination.
-        .. seealso:: `Wikipedia coefficient of determination, Adjusted R^2 \
+        .. seealso:: `Wikipedia coefficient of determination, Adjusted R^2
            <https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2>`_
         .. note:: This ignores instance weights (setting all to 1.0) from

@@ -647,7 +647,7 @@ class PowerIterationClustering(object):
     @classmethod
     @since('1.5.0')
     def train(cls, rdd, k, maxIterations=100, initMode="random"):
-        """
+        r"""
        :param rdd:
          An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
          affinity matrix, which is the matrix A in the PIC paper. The

@@ -117,9 +117,9 @@ class RegressionMetrics(JavaModelWrapper):
     @property
     @since('1.4.0')
     def explainedVariance(self):
-        """
+        r"""
         Returns the explained variance regression score.
-        explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
+        explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}`
         """
         return self.call("explainedVariance")

@@ -59,7 +59,7 @@ class VectorTransformer(object):
 class Normalizer(VectorTransformer):
-    """
+    r"""
     Normalizes samples individually to unit L\ :sup:`p`\ norm
     For any 1 <= `p` < float('inf'), normalizes samples using

@@ -2399,7 +2399,7 @@ class RDD(object):
         :return: an :class:`RDDBarrier` instance that provides actions within a barrier stage.
         .. seealso:: :class:`BarrierTaskContext`
-        .. seealso:: `SPIP: Barrier Execution Mode \
+        .. seealso:: `SPIP: Barrier Execution Mode
            <http://jira.apache.org/jira/browse/SPARK-24374>`_
         .. seealso:: `Design Doc <https://jira.apache.org/jira/browse/SPARK-24582>`_

@@ -54,7 +54,7 @@ atexit.register(lambda: sc.stop())
 sqlContext = spark._wrapped
 sqlCtx = sqlContext
-print("""Welcome to
+print(r"""Welcome to
       ____              __
      / __/__  ___ _____/ /__
     _\ \/ _ \/ _ `/ __/ '_/

@@ -283,7 +283,8 @@ def approxCountDistinct(col, rsd=None):
 @since(2.1)
 def approx_count_distinct(col, rsd=None):
-    """Aggregate function: returns a new :class:`Column` for approximate distinct count of column `col`.
+    """Aggregate function: returns a new :class:`Column` for approximate distinct count of
+    column `col`.
     :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
         efficient to use :func:`countDistinct`
@@ -346,7 +347,8 @@ def coalesce(*cols):
 @since(1.6)
 def corr(col1, col2):
-    """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` and ``col2``.
+    """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
+    and ``col2``.
     >>> a = range(20)
     >>> b = [2 * x for x in range(20)]
@@ -1688,14 +1690,14 @@ def split(str, pattern):
 @ignore_unicode_prefix
 @since(1.5)
 def regexp_extract(str, pattern, idx):
-    """Extract a specific group matched by a Java regex, from the specified string column.
+    r"""Extract a specific group matched by a Java regex, from the specified string column.
     If the regex did not match, or the specified group did not match, an empty string is returned.
     >>> df = spark.createDataFrame([('100-200',)], ['str'])
-    >>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
+    >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
     [Row(d=u'100')]
     >>> df = spark.createDataFrame([('foo',)], ['str'])
-    >>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
+    >>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
     [Row(d=u'')]
     >>> df = spark.createDataFrame([('aaaac',)], ['str'])
     >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
@@ -1712,7 +1714,7 @@ def regexp_replace(str, pattern, replacement):
     """Replace all substrings of the specified string value that match regexp with rep.
     >>> df = spark.createDataFrame([('100-200',)], ['str'])
-    >>> df.select(regexp_replace('str', '(\\d+)', '--').alias('d')).collect()
+    >>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()
     [Row(d=u'-----')]
     """
     sc = SparkContext._active_spark_context

@@ -350,7 +350,7 @@ class DataFrameReader(OptionUtils):
             maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
             columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
             samplingRatio=None, enforceSchema=None, emptyValue=None):
-        """Loads a CSV file and returns the result as a :class:`DataFrame`.
+        r"""Loads a CSV file and returns the result as a :class:`DataFrame`.
         This function will go through the input once to determine the input schema if
         ``inferSchema`` is enabled. To avoid going through the entire data once, disable
@@ -519,7 +519,7 @@ class DataFrameReader(OptionUtils):
         If both ``column`` and ``predicates`` are specified, ``column`` will be used.
-        .. note:: Don't create too many partitions in parallel on a large cluster; \
+        .. note:: Don't create too many partitions in parallel on a large cluster;
            otherwise Spark might crash your external database systems.
         :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
@@ -862,7 +862,7 @@ class DataFrameWriter(OptionUtils):
            header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None,
            timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None,
            charToEscapeQuoteEscaping=None, encoding=None, emptyValue=None):
-        """Saves the content of the :class:`DataFrame` in CSV format at the specified path.
+        r"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
         :param path: the path in any Hadoop supported file system
         :param mode: specifies the behavior of the save operation when data already exists.
@@ -962,7 +962,7 @@ class DataFrameWriter(OptionUtils):
     def jdbc(self, url, table, mode=None, properties=None):
         """Saves the content of the :class:`DataFrame` to an external database table via JDBC.
-        .. note:: Don't create too many partitions in parallel on a large cluster; \
+        .. note:: Don't create too many partitions in parallel on a large cluster;
            otherwise Spark might crash your external database systems.
         :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``

@@ -565,7 +565,7 @@ class DataStreamReader(OptionUtils):
            maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
            columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
            enforceSchema=None, emptyValue=None):
-        """Loads a CSV file stream and returns the result as a :class:`DataFrame`.
+        r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
         This function will go through the input once to determine the input schema if
         ``inferSchema`` is enabled. To avoid going through the entire data once, disable

@@ -752,7 +752,7 @@ _all_complex_types = dict((v.typeName(), v)
                           for v in [ArrayType, MapType, StructType])
-_FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
+_FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)")
 def _parse_datatype_string(s):

@@ -58,8 +58,8 @@ StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2)
 StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1)
 """
-.. note:: The following four storage level constants are deprecated in 2.0, since the records \
+.. note:: The following four storage level constants are deprecated in 2.0, since the records
    will always be serialized in Python.
 """
 StorageLevel.MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY
 """.. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_ONLY`` instead."""

@@ -80,7 +80,7 @@ class VersionUtils(object):
         (2, 3)
         """
-        m = re.search('^(\d+)\.(\d+)(\..*)?$', sparkVersion)
+        m = re.search(r'^(\d+)\.(\d+)(\..*)?$', sparkVersion)
         if m is not None:
             return (int(m.group(1)), int(m.group(2)))
         else:

@@ -138,7 +138,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python):
     # 2 (or --verbose option is enabled).
     decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
     skipped_tests = list(filter(
-        lambda line: re.search('test_.* \(pyspark\..*\) ... skipped ', line),
+        lambda line: re.search(r'test_.* \(pyspark\..*\) ... skipped ', line),
        decoded_lines))
     skipped_counts = len(skipped_tests)
     if skipped_counts > 0: