diff --git a/docs/_data/menu-ml.yaml b/docs/_data/menu-ml.yaml index 8e366f7f02..8b43c75f51 100644 --- a/docs/_data/menu-ml.yaml +++ b/docs/_data/menu-ml.yaml @@ -1,7 +1,7 @@ - text: Basic statistics url: ml-statistics.html - text: Data sources - url: ml-datasource + url: ml-datasource.html - text: Pipelines url: ml-pipeline.html - text: Extracting, transforming and selecting features diff --git a/docs/ml-datasource.md b/docs/ml-datasource.md index 71bec9c798..5dc2d057a9 100644 --- a/docs/ml-datasource.md +++ b/docs/ml-datasource.md @@ -63,7 +63,7 @@ scala> df.select("image.origin", "image.width", "image.height").show(truncate=fa
[`ImageDataSource`](api/java/org/apache/spark/ml/source/image/ImageDataSource.html) -implements Spark SQL data source API for loading image data as DataFrame. +implements Spark SQL data source API for loading image data as a DataFrame. {% highlight java %} Dataset imagesDF = spark.read().format("image").option("dropInvalid", true).load("data/mllib/images/origin/kittens"); @@ -83,7 +83,7 @@ Will output:
-In PySpark we provide Spark SQL data source API for loading image data as DataFrame. +In PySpark we provide Spark SQL data source API for loading image data as a DataFrame. {% highlight python %} >>> df = spark.read.format("image").option("dropInvalid", true).load("data/mllib/images/origin/kittens") @@ -100,7 +100,7 @@ In PySpark we provide Spark SQL data source API for loading image data as DataFr
-In SparkR we provide Spark SQL data source API for loading image data as DataFrame. +In SparkR we provide Spark SQL data source API for loading image data as a DataFrame. {% highlight r %} > df = read.df("data/mllib/images/origin/kittens", "image") @@ -120,4 +120,118 @@ In SparkR we provide Spark SQL data source API for loading image data as DataFra
+ + + +## LIBSVM data source + +The `LIBSVM` data source is used to load 'libsvm' type files from a directory. +The loaded DataFrame has two columns: `label` containing labels stored as doubles and `features` containing feature vectors stored as Vectors. +The schemas of the columns are: + - label: `DoubleType` (represents the instance label) + - features: `VectorUDT` (represents the feature vector) + +
+
+[`LibSVMDataSource`](api/scala/index.html#org.apache.spark.ml.source.libsvm.LibSVMDataSource) +implements a Spark SQL data source API for loading `LIBSVM` data as a DataFrame. + +{% highlight scala %} +scala> val df = spark.read.format("libsvm").option("numFeatures", "780").load("data/mllib/sample_libsvm_data.txt") +df: org.apache.spark.sql.DataFrame = [label: double, features: vector] + +scala> df.show(10) ++-----+--------------------+ +|label| features| ++-----+--------------------+ +| 0.0|(780,[127,128,129...| +| 1.0|(780,[158,159,160...| +| 1.0|(780,[124,125,126...| +| 1.0|(780,[152,153,154...| +| 1.0|(780,[151,152,153...| +| 0.0|(780,[129,130,131...| +| 1.0|(780,[158,159,160...| +| 1.0|(780,[99,100,101,...| +| 0.0|(780,[154,155,156...| +| 0.0|(780,[127,128,129...| ++-----+--------------------+ +only showing top 10 rows +{% endhighlight %} +
+ +
+[`LibSVMDataSource`](api/java/org/apache/spark/ml/source/libsvm/LibSVMDataSource.html) +implements a Spark SQL data source API for loading `LIBSVM` data as a DataFrame. + +{% highlight java %} +Dataset df = spark.read().format("libsvm").option("numFeatures", "780").load("data/mllib/sample_libsvm_data.txt"); +df.show(10); +/* +Will output: ++-----+--------------------+ +|label| features| ++-----+--------------------+ +| 0.0|(780,[127,128,129...| +| 1.0|(780,[158,159,160...| +| 1.0|(780,[124,125,126...| +| 1.0|(780,[152,153,154...| +| 1.0|(780,[151,152,153...| +| 0.0|(780,[129,130,131...| +| 1.0|(780,[158,159,160...| +| 1.0|(780,[99,100,101,...| +| 0.0|(780,[154,155,156...| +| 0.0|(780,[127,128,129...| ++-----+--------------------+ +only showing top 10 rows +*/ +{% endhighlight %} +
+ +
+In PySpark we provide Spark SQL data source API for loading `LIBSVM` data as a DataFrame. + +{% highlight python %} +>>> df = spark.read.format("libsvm").option("numFeatures", "780").load("data/mllib/sample_libsvm_data.txt") +>>> df.show(10) ++-----+--------------------+ +|label| features| ++-----+--------------------+ +| 0.0|(780,[127,128,129...| +| 1.0|(780,[158,159,160...| +| 1.0|(780,[124,125,126...| +| 1.0|(780,[152,153,154...| +| 1.0|(780,[151,152,153...| +| 0.0|(780,[129,130,131...| +| 1.0|(780,[158,159,160...| +| 1.0|(780,[99,100,101,...| +| 0.0|(780,[154,155,156...| +| 0.0|(780,[127,128,129...| ++-----+--------------------+ +only showing top 10 rows +{% endhighlight %} +
+ +
+In SparkR we provide Spark SQL data source API for loading `LIBSVM` data as a DataFrame. + +{% highlight r %} +> df = read.df("data/mllib/sample_libsvm_data.txt", "libsvm") +> head(select(df, df$label, df$features), 10) + + label features +1 0 +2 1 +3 1 +4 1 +5 1 +6 0 +7 1 +8 1 +9 0 +10 0 + +{% endhighlight %} +
+ +