[MINOR][PYSPARK][DOC] Fix wrongly formatted examples in PySpark documentation

## What changes were proposed in this pull request?

This PR fixes wrongly formatted examples in PySpark documentation as below:

- **`SparkSession`**

  - **Before**

    ![2016-07-06 11 34 41](https://cloud.githubusercontent.com/assets/6477701/16605847/ae939526-436d-11e6-8ab8-6ad578362425.png)

  - **After**

    ![2016-07-06 11 33 56](https://cloud.githubusercontent.com/assets/6477701/16605845/ace9ee78-436d-11e6-8923-b76d4fc3e7c3.png)

- **`Builder`**

  - **Before**
    ![2016-07-06 11 34 44](https://cloud.githubusercontent.com/assets/6477701/16605844/aba60dbc-436d-11e6-990a-c87bc0281c6b.png)

  - **After**
    ![2016-07-06 1 26 37](https://cloud.githubusercontent.com/assets/6477701/16607562/586704c0-437d-11e6-9483-e0af93d8f74e.png)

This PR also fixes several similar instances across the documentation in `sql` PySpark module.

## How was this patch tested?

N/A

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #14063 from HyukjinKwon/minor-pyspark-builder.
This commit is contained in:
hyukjinkwon 2016-07-06 10:45:51 -07:00 committed by Reynold Xin
parent b1310425b3
commit 4e14199ff7
6 changed files with 26 additions and 23 deletions

View file

@ -571,14 +571,14 @@ class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader):
>>> import math
>>> def genCircle(r, n):
... points = []
... for i in range(0, n):
... theta = 2.0 * math.pi * i / n
... points.append((r * math.cos(theta), r * math.sin(theta)))
... return points
... points = []
... for i in range(0, n):
... theta = 2.0 * math.pi * i / n
... points.append((r * math.cos(theta), r * math.sin(theta)))
... return points
>>> def sim(x, y):
... dist2 = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])
... return math.exp(-dist2 / 2.0)
... dist2 = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])
... return math.exp(-dist2 / 2.0)
>>> r1 = 1.0
>>> n1 = 10
>>> r2 = 4.0

View file

@ -1045,10 +1045,10 @@ class DataFrame(object):
:func:`drop_duplicates` is an alias for :func:`dropDuplicates`.
>>> from pyspark.sql import Row
>>> df = sc.parallelize([ \
Row(name='Alice', age=5, height=80), \
Row(name='Alice', age=5, height=80), \
Row(name='Alice', age=10, height=80)]).toDF()
>>> df = sc.parallelize([ \\
... Row(name='Alice', age=5, height=80), \\
... Row(name='Alice', age=5, height=80), \\
... Row(name='Alice', age=10, height=80)]).toDF()
>>> df.dropDuplicates().show()
+---+------+-----+
|age|height| name|

View file

@ -1550,8 +1550,8 @@ def translate(srcCol, matching, replace):
The translate will happen when any character in the string matching with the character
in the `matching`.
>>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123")\
.alias('r')).collect()
>>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \\
... .alias('r')).collect()
[Row(r=u'1a2s3ae')]
"""
sc = SparkContext._active_spark_context
@ -1670,8 +1670,8 @@ def get_json_object(col, path):
>>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
>>> df = spark.createDataFrame(data, ("key", "jstring"))
>>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \
get_json_object(df.jstring, '$.f2').alias("c1") ).collect()
>>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\
... get_json_object(df.jstring, '$.f2').alias("c1") ).collect()
[Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)]
"""
sc = SparkContext._active_spark_context

View file

@ -179,10 +179,12 @@ class GroupedData(object):
:param values: List of values that will be translated to columns in the output DataFrame.
# Compute the sum of earnings for each year by course with each course as a separate column
>>> df4.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").collect()
[Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)]
# Or without specifying column values (less efficient)
>>> df4.groupBy("year").pivot("course").sum("earnings").collect()
[Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)]
"""

View file

@ -66,12 +66,11 @@ class SparkSession(object):
tables, execute SQL over tables, cache tables, and read parquet files.
To create a SparkSession, use the following builder pattern:
>>> spark = SparkSession.builder \
.master("local") \
.appName("Word Count") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()
>>> spark = SparkSession.builder \\
... .master("local") \\
... .appName("Word Count") \\
... .config("spark.some.config.option", "some-value") \\
... .getOrCreate()
"""
class Builder(object):
@ -87,11 +86,13 @@ class SparkSession(object):
both :class:`SparkConf` and :class:`SparkSession`'s own configuration.
For an existing SparkConf, use `conf` parameter.
>>> from pyspark.conf import SparkConf
>>> SparkSession.builder.config(conf=SparkConf())
<pyspark.sql.session...
For a (key, value) pair, you can omit parameter names.
>>> SparkSession.builder.config("spark.some.config.option", "some-value")
<pyspark.sql.session...

View file

@ -486,8 +486,8 @@ class StructType(DataType):
DataType object.
>>> struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
>>> struct2 = StructType([StructField("f1", StringType(), True),\
StructField("f2", StringType(), True, None)])
>>> struct2 = StructType([StructField("f1", StringType(), True), \\
... StructField("f2", StringType(), True, None)])
>>> struct1 == struct2
True
>>> struct1 = StructType().add(StructField("f1", StringType(), True))