2015-05-19 17:23:28 -04:00
|
|
|
#
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
|
|
# this work for additional information regarding copyright ownership.
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
# (the "License"); you may not use this file except in compliance with
|
|
|
|
# the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
|
|
|
|
from py4j.java_gateway import JavaClass
|
|
|
|
|
2015-05-21 02:05:54 -04:00
|
|
|
from pyspark.sql import since
|
2015-05-19 17:23:28 -04:00
|
|
|
from pyspark.sql.column import _to_seq
|
|
|
|
from pyspark.sql.types import *
|
|
|
|
|
|
|
|
__all__ = ["DataFrameReader", "DataFrameWriter"]
|
|
|
|
|
|
|
|
|
|
|
|
class DataFrameReader(object):
    """
    Interface used to load a :class:`DataFrame` from external storage systems
    (e.g. file systems, key-value stores, etc). Use :func:`SQLContext.read`
    to access this.

    .. note:: Experimental

    .. versionadded:: 1.4
    """

    def __init__(self, sqlContext):
        self._jreader = sqlContext._ssql_ctx.read()
        self._sqlContext = sqlContext

    def _df(self, jdf):
        """Wrap a Java DataFrame handle into a Python :class:`DataFrame`."""
        # Imported locally to avoid a circular import with pyspark.sql.dataframe.
        from pyspark.sql.dataframe import DataFrame
        return DataFrame(jdf, self._sqlContext)

    @since(1.4)
    def format(self, source):
        """Specifies the input data source format.

        :param source: string, name of the data source, e.g. 'json', 'parquet'.

        >>> df = sqlContext.read.format('json').load('python/test_support/sql/people.json')
        >>> df.dtypes
        [('age', 'bigint'), ('name', 'string')]

        """
        self._jreader = self._jreader.format(source)
        return self

    @since(1.4)
    def schema(self, schema):
        """Specifies the input schema.

        Some data sources (e.g. JSON) can infer the input schema automatically from data.
        By specifying the schema here, the underlying data source can skip the schema
        inference step, and thus speed up data loading.

        :param schema: a StructType object
        """
        if not isinstance(schema, StructType):
            raise TypeError("schema should be StructType")
        jschema = self._sqlContext._ssql_ctx.parseDataType(schema.json())
        self._jreader = self._jreader.schema(jschema)
        return self

    @since(1.4)
    def options(self, **options):
        """Adds input options for the underlying data source.
        """
        for k in options:
            self._jreader = self._jreader.option(k, options[k])
        return self

    @since(1.4)
    def load(self, path=None, format=None, schema=None, **options):
        """Loads data from a data source and returns it as a :class:`DataFrame`.

        :param path: optional string for file-system backed data sources.
        :param format: optional string for format of the data source. Default to 'parquet'.
        :param schema: optional :class:`StructType` for the input schema.
        :param options: all other string options

        >>> df = sqlContext.read.load('python/test_support/sql/parquet_partitioned')
        >>> df.dtypes
        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
        """
        if format is not None:
            self.format(format)
        if schema is not None:
            self.schema(schema)
        self.options(**options)
        if path is not None:
            return self._df(self._jreader.load(path))
        else:
            return self._df(self._jreader.load())

    @since(1.4)
    def json(self, path, schema=None):
        """
        Loads a JSON file (one object per line) and returns the result as
        a :class:`DataFrame`.

        If the ``schema`` parameter is not specified, this function goes
        through the input once to determine the input schema.

        :param path: string, path to the JSON dataset.
        :param schema: an optional :class:`StructType` for the input schema.

        >>> df = sqlContext.read.json('python/test_support/sql/people.json')
        >>> df.dtypes
        [('age', 'bigint'), ('name', 'string')]

        """
        if schema is not None:
            self.schema(schema)
        return self._df(self._jreader.json(path))

    @since(1.4)
    def table(self, tableName):
        """Returns the specified table as a :class:`DataFrame`.

        :param tableName: string, name of the table.

        >>> df = sqlContext.read.parquet('python/test_support/sql/parquet_partitioned')
        >>> df.registerTempTable('tmpTable')
        >>> sqlContext.read.table('tmpTable').dtypes
        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
        """
        return self._df(self._jreader.table(tableName))

    @since(1.4)
    def parquet(self, *path):
        """Loads a Parquet file, returning the result as a :class:`DataFrame`.

        >>> df = sqlContext.read.parquet('python/test_support/sql/parquet_partitioned')
        >>> df.dtypes
        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
        """
        return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, path)))

    @since(1.4)
    def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPartitions=None,
             predicates=None, properties=None):
        """
        Construct a :class:`DataFrame` representing the database table accessible
        via JDBC URL `url` named `table` and connection `properties`.

        The `column` parameter could be used to partition the table, then it will
        be retrieved in parallel based on the parameters passed to this function.

        The `predicates` parameter gives a list expressions suitable for inclusion
        in WHERE clauses; each one defines one partition of the :class:`DataFrame`.

        .. note:: Don't create too many partitions in parallel on a large cluster;
            otherwise Spark might crash your external database systems.

        :param url: a JDBC URL
        :param table: name of table
        :param column: the column used to partition
        :param lowerBound: the lower bound of partition column
        :param upperBound: the upper bound of the partition column
        :param numPartitions: the number of partitions
        :param predicates: a list of expressions
        :param properties: JDBC database connection arguments, a list of arbitrary string
                           tag/value. Normally at least a "user" and "password" property
                           should be included.
        :return: a DataFrame
        """
        # A mutable default argument ({}) would be shared across all calls;
        # default to None and create a fresh dict per call instead.
        if properties is None:
            properties = {}
        jprop = JavaClass("java.util.Properties", self._sqlContext._sc._gateway._gateway_client)()
        for k in properties:
            jprop.setProperty(k, properties[k])
        if column is not None:
            if numPartitions is None:
                numPartitions = self._sqlContext._sc.defaultParallelism
            return self._df(self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound),
                                               int(numPartitions), jprop))
        if predicates is not None:
            arr = self._sqlContext._sc._jvm.PythonUtils.toArray(predicates)
            return self._df(self._jreader.jdbc(url, table, arr, jprop))
        return self._df(self._jreader.jdbc(url, table, jprop))
|
|
|
|
|
|
|
|
|
|
|
|
class DataFrameWriter(object):
    """
    Interface used to write a :class:`DataFrame` to external storage systems
    (e.g. file systems, key-value stores, etc). Use :func:`DataFrame.write`
    to access this.

    .. note:: Experimental

    .. versionadded:: 1.4
    """

    def __init__(self, df):
        self._df = df
        self._sqlContext = df.sql_ctx
        self._jwrite = df._jdf.write()

    @since(1.4)
    def mode(self, saveMode):
        """Specifies the behavior when data or table already exists.

        Options include:

        * `append`: Append contents of this :class:`DataFrame` to existing data.
        * `overwrite`: Overwrite existing data.
        * `error`: Throw an exception if data already exists.
        * `ignore`: Silently ignore this operation if data already exists.

        >>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
        """
        self._jwrite = self._jwrite.mode(saveMode)
        return self

    @since(1.4)
    def format(self, source):
        """Specifies the underlying output data source.

        :param source: string, name of the data source, e.g. 'json', 'parquet'.

        >>> df.write.format('json').save(os.path.join(tempfile.mkdtemp(), 'data'))
        """
        self._jwrite = self._jwrite.format(source)
        return self

    @since(1.4)
    def options(self, **options):
        """Adds output options for the underlying data source.
        """
        for k in options:
            self._jwrite = self._jwrite.option(k, options[k])
        return self

    @since(1.4)
    def partitionBy(self, *cols):
        """Partitions the output by the given columns on the file system.

        If specified, the output is laid out on the file system similar
        to Hive's partitioning scheme.

        :param cols: name of columns

        >>> df.write.partitionBy('year', 'month').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
        """
        # Accept both partitionBy('a', 'b') and partitionBy(['a', 'b']).
        if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
            cols = cols[0]
        self._jwrite = self._jwrite.partitionBy(_to_seq(self._sqlContext._sc, cols))
        return self

    @since(1.4)
    def save(self, path=None, format=None, mode="error", **options):
        """Saves the contents of the :class:`DataFrame` to a data source.

        The data source is specified by the ``format`` and a set of ``options``.
        If ``format`` is not specified, the default data source configured by
        ``spark.sql.sources.default`` will be used.

        :param path: the path in a Hadoop supported file system
        :param format: the format used to save
        :param mode: specifies the behavior of the save operation when data already exists.

            * ``append``: Append contents of this :class:`DataFrame` to existing data.
            * ``overwrite``: Overwrite existing data.
            * ``ignore``: Silently ignore this operation if data already exists.
            * ``error`` (default case): Throw an exception if data already exists.
        :param options: all other string options

        >>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
        """
        self.mode(mode).options(**options)
        if format is not None:
            self.format(format)
        if path is None:
            self._jwrite.save()
        else:
            self._jwrite.save(path)

    @since(1.4)
    def insertInto(self, tableName, overwrite=False):
        """Inserts the content of the :class:`DataFrame` to the specified table.

        It requires that the schema of the class:`DataFrame` is the same as the
        schema of the table.

        Optionally overwriting any existing data.
        """
        self._jwrite.mode("overwrite" if overwrite else "append").insertInto(tableName)

    @since(1.4)
    def saveAsTable(self, name, format=None, mode="error", **options):
        """Saves the content of the :class:`DataFrame` as the specified table.

        In the case the table already exists, behavior of this function depends on the
        save mode, specified by the `mode` function (default to throwing an exception).
        When `mode` is `Overwrite`, the schema of the :class:`DataFrame` does not need
        to be the same as that of the existing table.

        * `append`: Append contents of this :class:`DataFrame` to existing data.
        * `overwrite`: Overwrite existing data.
        * `error`: Throw an exception if data already exists.
        * `ignore`: Silently ignore this operation if data already exists.

        :param name: the table name
        :param format: the format used to save
        :param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
        :param options: all other string options
        """
        self.mode(mode).options(**options)
        if format is not None:
            self.format(format)
        self._jwrite.saveAsTable(name)

    @since(1.4)
    def json(self, path, mode="error"):
        """Saves the content of the :class:`DataFrame` in JSON format at the specified path.

        :param path: the path in any Hadoop supported file system
        :param mode: specifies the behavior of the save operation when data already exists.

            * ``append``: Append contents of this :class:`DataFrame` to existing data.
            * ``overwrite``: Overwrite existing data.
            * ``ignore``: Silently ignore this operation if data already exists.
            * ``error`` (default case): Throw an exception if data already exists.

        >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))
        """
        self._jwrite.mode(mode).json(path)

    @since(1.4)
    def parquet(self, path, mode="error"):
        """Saves the content of the :class:`DataFrame` in Parquet format at the specified path.

        :param path: the path in any Hadoop supported file system
        :param mode: specifies the behavior of the save operation when data already exists.

            * ``append``: Append contents of this :class:`DataFrame` to existing data.
            * ``overwrite``: Overwrite existing data.
            * ``ignore``: Silently ignore this operation if data already exists.
            * ``error`` (default case): Throw an exception if data already exists.

        >>> df.write.parquet(os.path.join(tempfile.mkdtemp(), 'data'))
        """
        self._jwrite.mode(mode).parquet(path)

    @since(1.4)
    def jdbc(self, url, table, mode="error", properties=None):
        """Saves the content of the :class:`DataFrame` to a external database table via JDBC.

        .. note:: Don't create too many partitions in parallel on a large cluster;\
            otherwise Spark might crash your external database systems.

        :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
        :param table: Name of the table in the external database.
        :param mode: specifies the behavior of the save operation when data already exists.

            * ``append``: Append contents of this :class:`DataFrame` to existing data.
            * ``overwrite``: Overwrite existing data.
            * ``ignore``: Silently ignore this operation if data already exists.
            * ``error`` (default case): Throw an exception if data already exists.
        :param properties: JDBC database connection arguments, a list of
                           arbitrary string tag/value. Normally at least a
                           "user" and "password" property should be included.
        """
        # A mutable default argument ({}) would be shared across all calls;
        # default to None and create a fresh dict per call instead.
        if properties is None:
            properties = {}
        jprop = JavaClass("java.util.Properties", self._sqlContext._sc._gateway._gateway_client)()
        for k in properties:
            jprop.setProperty(k, properties[k])
        self._jwrite.mode(mode).jdbc(url, table, jprop)
|
|
|
|
|
|
|
|
|
|
|
|
def _test():
    """Run this module's doctests against a local SparkContext.

    Exits the process with a non-zero status if any doctest fails.
    """
    import doctest
    import os
    import sys
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    import pyspark.sql.readwriter

    # The doctest fixtures use paths relative to the Spark source root.
    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')

    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['df'] = globs['sqlContext'].read.parquet('python/test_support/sql/parquet_partitioned')

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        # sys.exit is always available; the bare exit() builtin comes from the
        # site module and is absent when Python runs with -S.
        sys.exit(-1)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the doctest suite when this file is executed directly.
    _test()
|