From ce97834dc0cc55eece0e909a4061ca6f2123f60d Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 26 Aug 2015 22:19:11 -0700
Subject: [PATCH] [SPARK-9964] [PYSPARK] [SQL] PySpark DataFrameReader accept
 RDD of String for JSON

PySpark DataFrameReader should could accept an RDD of Strings (like the Scala version does) for JSON, rather than only taking a path.
If this PR is merged, it should be duplicated to cover the other input types (not just JSON).

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #8444 from yanboliang/spark-9964.
---
 python/pyspark/sql/readwriter.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 78247c8fa7..3fa6895880 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -15,8 +15,14 @@
 # limitations under the License.
 #
 
+import sys
+
+if sys.version >= '3':
+    basestring = unicode = str
+
 from py4j.java_gateway import JavaClass
 
+from pyspark import RDD
 from pyspark.sql import since
 from pyspark.sql.column import _to_seq
 from pyspark.sql.types import *
@@ -125,23 +131,33 @@ class DataFrameReader(object):
     @since(1.4)
     def json(self, path, schema=None):
         """
-        Loads a JSON file (one object per line) and returns the result as
-        a :class`DataFrame`.
+        Loads a JSON file (one object per line) or an RDD of Strings storing JSON objects
+        (one object per record) and returns the result as a :class`DataFrame`.
 
         If the ``schema`` parameter is not specified, this function goes
         through the input once to determine the input schema.
 
-        :param path: string, path to the JSON dataset.
+        :param path: string represents path to the JSON dataset,
+                     or RDD of Strings storing JSON objects.
         :param schema: an optional :class:`StructType` for the input schema.
 
-        >>> df = sqlContext.read.json('python/test_support/sql/people.json')
-        >>> df.dtypes
+        >>> df1 = sqlContext.read.json('python/test_support/sql/people.json')
+        >>> df1.dtypes
+        [('age', 'bigint'), ('name', 'string')]
+        >>> rdd = sc.textFile('python/test_support/sql/people.json')
+        >>> df2 = sqlContext.read.json(rdd)
+        >>> df2.dtypes
         [('age', 'bigint'), ('name', 'string')]
 
         """
         if schema is not None:
             self.schema(schema)
-        return self._df(self._jreader.json(path))
+        if isinstance(path, basestring):
+            return self._df(self._jreader.json(path))
+        elif isinstance(path, RDD):
+            return self._df(self._jreader.json(path._jrdd))
+        else:
+            raise TypeError("path can be only string or RDD")
 
     @since(1.4)
     def table(self, tableName):