[SPARK-10380][SQL] Fix confusing documentation examples for astype/drop_duplicates.

## What changes were proposed in this pull request?
We have seen users getting confused by the documentation for `astype` and `drop_duplicates`, because the examples in their docstrings do not use these functions (but instead use the functions they alias). This patch simply removes all examples for these functions and documents them as aliases.
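The root cause is plain attribute aliasing, which makes the alias share the original function's docstring verbatim, so `help()` on the alias shows examples calling a differently named function. A minimal illustration with hypothetical functions:

```python
def cast(x):
    """Converts x to a string, e.g. cast(1) returns '1'."""
    return str(x)

# Plain aliasing: `astype` is the very same function object as `cast`,
# so help(astype) prints a docstring whose examples call `cast`.
astype = cast
assert astype.__doc__ == cast.__doc__
```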

## How was this patch tested?
Existing PySpark unit tests.

Closes #11543.

Author: Reynold Xin <rxin@databricks.com>

Closes #11698 from rxin/SPARK-10380.
Authored by Reynold Xin on 2016-03-14 19:25:49 -07:00; committed by Michael Armbrust
parent 4bf4609795
commit 8e0b030606
3 changed files with 37 additions and 7 deletions

python/pyspark/__init__.py

@@ -37,6 +37,8 @@ Public classes:
 """
+import types
+
 from pyspark.conf import SparkConf
 from pyspark.context import SparkContext
 from pyspark.rdd import RDD
@@ -64,6 +66,24 @@ def since(version):
     return deco


+def copy_func(f, name=None, sinceversion=None, doc=None):
+    """
+    Returns a function with same code, globals, defaults, closure, and
+    name (or provide a new name).
+    """
+    # See
+    # http://stackoverflow.com/questions/6527633/how-can-i-make-a-deepcopy-of-a-function-in-python
+    fn = types.FunctionType(f.__code__, f.__globals__, name or f.__name__, f.__defaults__,
+                            f.__closure__)
+    # in case f was given attrs (note this dict is a shallow copy):
+    fn.__dict__.update(f.__dict__)
+    if doc is not None:
+        fn.__doc__ = doc
+    if sinceversion is not None:
+        fn = since(sinceversion)(fn)
+    return fn
+
+
 # for back compatibility
 from pyspark.sql import SQLContext, HiveContext, Row
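As a sanity check, here is a small standalone sketch (hypothetical functions, with the `since` decorator omitted) of what `copy_func` enables: a distinct function object that behaves identically to the original but carries its own docstring:

```python
import types


def copy_func(f, name=None, doc=None):
    # Rebuild a function from f's code, globals, defaults, and closure.
    fn = types.FunctionType(f.__code__, f.__globals__, name or f.__name__,
                            f.__defaults__, f.__closure__)
    fn.__dict__.update(f.__dict__)  # shallow copy of any attributes set on f
    if doc is not None:
        fn.__doc__ = doc
    return fn


def cast(x):
    """Converts x to a string."""
    return str(x)


astype = copy_func(cast, doc="astype is an alias for cast.")
assert astype(42) == cast(42)           # same behavior
assert astype.__doc__ != cast.__doc__   # but its own documentation
```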

python/pyspark/sql/column.py

@@ -22,7 +22,7 @@ if sys.version >= '3':
     basestring = str
     long = int

-from pyspark import since
+from pyspark import copy_func, since
 from pyspark.context import SparkContext
 from pyspark.rdd import ignore_unicode_prefix
 from pyspark.sql.types import *
@@ -337,7 +337,7 @@ class Column(object):
             raise TypeError("unexpected type: %s" % type(dataType))
         return Column(jc)

-    astype = cast
+    astype = copy_func(cast, sinceversion=1.4, doc=":func:`astype` is an alias for :func:`cast`.")

     @since(1.3)
     def between(self, lowerBound, upperBound):
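With this change, the alias still behaves exactly like `cast`; a hedged usage sketch (assumes a local Spark installation; the app name and sample data are illustrative):

```python
from pyspark import SparkContext
from pyspark.sql import Column, SQLContext

sc = SparkContext("local", "astype-demo")  # illustrative app name
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame([(1, "a"), (2, "b")], ["age", "name"])

# The alias's docstring is now just a short pointer to cast().
print(Column.astype.__doc__)

# Behavior is unchanged: both produce the same cast column.
assert df.select(df.age.cast("string")).schema == \
    df.select(df.age.astype("string")).schema

sc.stop()
```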

python/pyspark/sql/dataframe.py

@@ -26,7 +26,7 @@ if sys.version >= '3':
 else:
     from itertools import imap as map

-from pyspark import since
+from pyspark import copy_func, since
 from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix
 from pyspark.serializers import BatchedSerializer, PickleSerializer, UTF8Deserializer
 from pyspark.storagelevel import StorageLevel
@@ -829,8 +829,6 @@ class DataFrame(object):
             raise TypeError("condition should be string or Column")
         return DataFrame(jdf, self.sql_ctx)

-    where = filter
-
     @ignore_unicode_prefix
     @since(1.3)
     def groupBy(self, *cols):
@@ -1361,8 +1359,20 @@ class DataFrame(object):
     # Pandas compatibility
     ##########################################################################################

-    groupby = groupBy
-    drop_duplicates = dropDuplicates
+    groupby = copy_func(
+        groupBy,
+        sinceversion=1.4,
+        doc=":func:`groupby` is an alias for :func:`groupBy`.")
+
+    drop_duplicates = copy_func(
+        dropDuplicates,
+        sinceversion=1.4,
+        doc=":func:`drop_duplicates` is an alias for :func:`dropDuplicates`.")
+
+    where = copy_func(
+        filter,
+        sinceversion=1.3,
+        doc=":func:`where` is an alias for :func:`filter`.")


 def _to_scala_map(sc, jm):
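Likewise for the DataFrame aliases; a short end-to-end sketch under the same assumptions (local Spark installation, illustrative app name and data):

```python
from pyspark import SparkContext
from pyspark.sql import DataFrame, SQLContext

sc = SparkContext("local", "alias-demo")  # illustrative app name
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame([(1, "a"), (1, "a"), (2, "b")], ["id", "val"])

# Each alias is now a separate function object with a one-line docstring
# pointing at the canonical method.
print(DataFrame.groupby.__doc__)
print(DataFrame.drop_duplicates.__doc__)
print(DataFrame.where.__doc__)

# Behavior is unchanged.
assert df.drop_duplicates().count() == df.dropDuplicates().count()
assert df.where(df.id == 1).count() == df.filter(df.id == 1).count()

sc.stop()
```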