2020-01-27 09:58:45 -05:00
|
|
|
---
|
|
|
|
layout: global
|
|
|
|
title: DISTRIBUTE BY Clause
|
|
|
|
displayTitle: DISTRIBUTE BY Clause
|
|
|
|
license: |
|
|
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
|
|
this work for additional information regarding copyright ownership.
|
|
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
(the "License"); you may not use this file except in compliance with
|
|
|
|
the License. You may obtain a copy of the License at
|
2020-03-11 19:52:40 -04:00
|
|
|
|
2020-01-27 09:58:45 -05:00
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
2020-03-11 19:52:40 -04:00
|
|
|
|
2020-01-27 09:58:45 -05:00
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
---
|
|
|
|
The <code>DISTRIBUTE BY</code> clause is used to repartition the data based
|
2020-01-29 09:41:40 -05:00
|
|
|
on the input expressions. Unlike the [CLUSTER BY](sql-ref-syntax-qry-select-clusterby.html)
|
2020-03-11 19:52:40 -04:00
|
|
|
clause, this does not sort the data within each partition.
|
2020-01-27 09:58:45 -05:00
|
|
|
|
|
|
|
### Syntax
|
|
|
|
{% highlight sql %}
|
|
|
|
DISTRIBUTE BY { expression [ , ... ] }
|
|
|
|
{% endhighlight %}
|
|
|
|
|
|
|
|
### Parameters
|
|
|
|
<dl>
|
|
|
|
<dt><code><em>expression</em></code></dt>
|
|
|
|
<dd>
|
|
|
|
Specifies a combination of one or more values, operators, and SQL functions that results in a value.
|
|
|
|
</dd>
|
|
|
|
</dl>
|
|
|
|
|
|
|
|
### Examples
|
|
|
|
{% highlight sql %}
|
|
|
|
CREATE TABLE person (name STRING, age INT);
|
|
|
|
INSERT INTO person VALUES
|
2020-03-11 19:52:40 -04:00
|
|
|
('Zen Hui', 25),
|
|
|
|
('Anil B', 18),
|
|
|
|
('Shone S', 16),
|
2020-01-27 09:58:45 -05:00
|
|
|
('Mike A', 25),
|
2020-03-11 19:52:40 -04:00
|
|
|
('John A', 18),
|
2020-01-27 09:58:45 -05:00
|
|
|
('Jack N', 16);
|
|
|
|
|
|
|
|
-- Reduce the number of shuffle partitions to 2 to illustrate the behavior of `DISTRIBUTE BY`.
|
|
|
|
-- It's easier to see the clustering and sorting behavior with fewer partitions.
|
|
|
|
SET spark.sql.shuffle.partitions = 2;
|
2020-03-11 19:52:40 -04:00
|
|
|
|
2020-01-27 09:58:45 -05:00
|
|
|
-- Select the rows with no ordering. Please note that without any sort directive, the result
|
2020-03-11 19:52:40 -04:00
|
|
|
-- of the query is not deterministic. It's included here just to contrast it with the
|
2020-01-27 09:58:45 -05:00
|
|
|
-- behavior of `DISTRIBUTE BY`. The query below produces rows where age values are not
|
|
|
|
-- clustered together.
|
|
|
|
SELECT age, name FROM person;
|
|
|
|
|
|
|
|
+---+-------+
|
|
|
|
|age|name   |
|
|
|
|
+---+-------+
|
|
|
|
|16 |Shone S|
|
|
|
|
|25 |Zen Hui|
|
|
|
|
|16 |Jack N |
|
|
|
|
|25 |Mike A |
|
|
|
|
|18 |John A |
|
|
|
|
|18 |Anil B |
|
|
|
|
+---+-------+
|
|
|
|
|
|
|
|
-- Produces rows clustered by age. Persons with the same age are clustered together.
|
|
|
|
-- Unlike the `CLUSTER BY` clause, the rows are not sorted within a partition.
|
|
|
|
SELECT age, name FROM person DISTRIBUTE BY age;
|
|
|
|
|
|
|
|
+---+-------+
|
|
|
|
|age|name   |
|
|
|
|
+---+-------+
|
|
|
|
|25 |Zen Hui|
|
|
|
|
|25 |Mike A |
|
|
|
|
|18 |John A |
|
|
|
|
|18 |Anil B |
|
|
|
|
|16 |Shone S|
|
|
|
|
|16 |Jack N |
|
|
|
|
+---+-------+
|
|
|
|
{% endhighlight %}
|
2020-01-29 09:41:40 -05:00
|
|
|
|
|
|
|
### Related Clauses
|
|
|
|
- [SELECT Main](sql-ref-syntax-qry-select.html)
|
|
|
|
- [WHERE Clause](sql-ref-syntax-qry-select-where.html)
|
|
|
|
- [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html)
|
|
|
|
- [HAVING Clause](sql-ref-syntax-qry-select-having.html)
|
|
|
|
- [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html)
|
|
|
|
- [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html)
|
|
|
|
- [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html)
|
2020-02-16 10:55:03 -05:00
|
|
|
- [LIMIT Clause](sql-ref-syntax-qry-select-limit.html)
|