[SPARK-5872] [SQL] create a sqlCtx in pyspark shell

The sqlCtx will be a HiveContext if Hive is built into the assembly jar, or a SQLContext if not.

It also skips the Hive tests in pyspark.sql.tests if Hive is not available.

Author: Davies Liu <davies@databricks.com>

Closes #4659 from davies/sqlctx and squashes the following commits:

0e6629a [Davies Liu] sqlCtx in pyspark
This commit is contained in:
Davies Liu 2015-02-17 15:44:37 -08:00 committed by Michael Armbrust
parent 3df85dccbc
commit 4d4cc760fa
2 changed files with 22 additions and 3 deletions

View file

@ -31,8 +31,12 @@ if sys.version_info[0] != 2:
import atexit
import os
import platform
import py4j
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark.storagelevel import StorageLevel
# this is the deprecated equivalent of ADD_JARS
@ -46,6 +50,13 @@ if os.environ.get("SPARK_EXECUTOR_URI"):
sc = SparkContext(appName="PySparkShell", pyFiles=add_files)
atexit.register(lambda: sc.stop())
try:
    # Probe for Hive support: instantiating HiveConf fails when the Hive
    # classes are not on the JVM classpath (i.e. Spark was built without
    # the Hive profile).
    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    sqlCtx = HiveContext(sc)
except (py4j.protocol.Py4JError, TypeError):
    # Py4JError: the JVM call itself failed.
    # TypeError: the class is absent, so sc._jvm resolves the dotted path
    # to a py4j JavaPackage, which is not callable — catching only
    # Py4JError would crash the shell instead of falling back.
    sqlCtx = SQLContext(sc)
print("""Welcome to
____ __
/ __/__ ___ _____/ /__
@ -57,7 +68,7 @@ print("Using Python version %s (%s, %s)" % (
platform.python_version(),
platform.python_build()[0],
platform.python_build()[1]))
print("SparkContext available as sc.")
print("SparkContext available as sc, %s available as sqlCtx." % sqlCtx.__class__.__name__)
if add_files is not None:
print("Warning: ADD_FILES environment variable is deprecated, use --py-files argument instead")

View file

@ -25,6 +25,8 @@ import pydoc
import shutil
import tempfile
import py4j
if sys.version_info[:2] <= (2, 6):
try:
import unittest2 as unittest
@ -329,9 +331,12 @@ class HiveContextSQLTests(ReusedPySparkTestCase):
def setUpClass(cls):
ReusedPySparkTestCase.setUpClass()
cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
try:
cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
except py4j.protocol.Py4JError:
cls.sqlCtx = None
return
os.unlink(cls.tempdir.name)
print "type", type(cls.sc)
print "type", type(cls.sc._jsc)
_scala_HiveContext =\
cls.sc._jvm.org.apache.spark.sql.hive.test.TestHiveContext(cls.sc._jsc.sc())
cls.sqlCtx = HiveContext(cls.sc, _scala_HiveContext)
@ -344,6 +349,9 @@ class HiveContextSQLTests(ReusedPySparkTestCase):
shutil.rmtree(cls.tempdir.name, ignore_errors=True)
def test_save_and_load_table(self):
if self.sqlCtx is None:
return # no hive available, skipped
df = self.df
tmpPath = tempfile.mkdtemp()
shutil.rmtree(tmpPath)