[SPARK-7242] added python api for freqItems in DataFrames
Adds the Python API for DataFrame's `freqItems`, and addresses review comments from the previous PR. rxin Author: Burak Yavuz <brkyvz@gmail.com> Closes #5859 from brkyvz/df-freq-py2 and squashes the following commits: f9aa9ce [Burak Yavuz] addressed comments v0.1 4b25056 [Burak Yavuz] added python api for freqItems
This commit is contained in:
parent
b79aeb95b4
commit
2e0f3579f1
|
@ -889,6 +889,26 @@ class DataFrame(object):
|
|||
raise ValueError("col2 should be a string.")
|
||||
return self._jdf.stat().cov(col1, col2)
|
||||
|
||||
def freqItems(self, cols, support=None):
|
||||
"""
|
||||
Finding frequent items for columns, possibly with false positives. Using the
|
||||
frequent element count algorithm described in
|
||||
"http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou".
|
||||
:func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.
|
||||
|
||||
:param cols: Names of the columns to calculate frequent items for as a list or tuple of
|
||||
strings.
|
||||
:param support: The frequency with which to consider an item 'frequent'. Default is 1%.
|
||||
The support must be greater than 1e-4.
|
||||
"""
|
||||
if isinstance(cols, tuple):
|
||||
cols = list(cols)
|
||||
if not isinstance(cols, list):
|
||||
raise ValueError("cols must be a list or tuple of column names as strings.")
|
||||
if not support:
|
||||
support = 0.01
|
||||
return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx)
|
||||
|
||||
@ignore_unicode_prefix
|
||||
def withColumn(self, colName, col):
|
||||
"""Returns a new :class:`DataFrame` by adding a column.
|
||||
|
@ -1344,6 +1364,11 @@ class DataFrameStatFunctions(object):
|
|||
|
||||
cov.__doc__ = DataFrame.cov.__doc__
|
||||
|
||||
def freqItems(self, cols, support=None):
|
||||
return self.df.freqItems(cols, support)
|
||||
|
||||
freqItems.__doc__ = DataFrame.freqItems.__doc__
|
||||
|
||||
|
||||
def _test():
|
||||
import doctest
|
||||
|
|
|
@ -375,6 +375,13 @@ class SQLTests(ReusedPySparkTestCase):
|
|||
self.assertEqual(self.testData, df.select(df.key, df.value).collect())
|
||||
self.assertEqual([Row(value='1')], df.where(df.key == 1).select(df.value).collect())
|
||||
|
||||
def test_freqItems(self):
|
||||
vals = [Row(a=1, b=-2.0) if i % 2 == 0 else Row(a=i, b=i * 1.0) for i in range(100)]
|
||||
df = self.sc.parallelize(vals).toDF()
|
||||
items = df.stat.freqItems(("a", "b"), 0.4).collect()[0]
|
||||
self.assertTrue(1 in items[0])
|
||||
self.assertTrue(-2.0 in items[1])
|
||||
|
||||
def test_aggregator(self):
|
||||
df = self.df
|
||||
g = df.groupBy()
|
||||
|
|
|
@ -43,7 +43,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
|
|||
}
|
||||
|
||||
/**
|
||||
* Runs `freqItems` with a default `support` of 1%.
|
||||
* Finding frequent items for columns, possibly with false positives. Using the
|
||||
* frequent element count algorithm described in
|
||||
* [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
|
||||
* Uses a `default` support of 1%.
|
||||
*
|
||||
* @param cols the names of the columns to search frequent items in.
|
||||
* @return A Local DataFrame with the Array of frequent items for each column.
|
||||
|
@ -55,14 +58,14 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
|
|||
/**
 * Python friendly implementation for `freqItems`.
 *
 * @param cols the names of the columns to search frequent items in.
 * @param support the minimum frequency for an item to be considered `frequent`.
 * @return A Local DataFrame with the Array of frequent items for each column.
 */
def freqItems(cols: Seq[String], support: Double): DataFrame = {
  FrequentItems.singlePassFreqItems(df, cols, support)
}
|
||||
|
||||
/**
 * Python friendly implementation for `freqItems` with a default `support` of 1%.
 *
 * @param cols the names of the columns to search frequent items in.
 * @return A Local DataFrame with the Array of frequent items for each column.
 */
def freqItems(cols: Seq[String]): DataFrame = {
  // 0.01 mirrors the documented 1% default support.
  FrequentItems.singlePassFreqItems(df, cols, 0.01)
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue