spark-instrumented-optimizer/sql/gen-sql-functions-docs.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import itertools
import os
import re
from collections import namedtuple

# To avoid adding a new direct dependency, we import markdown from within mkdocs.
from mkdocs.structure.pages import markdown

from pyspark.java_gateway import launch_gateway


# Python-side record describing one built-in SQL function. The fields are
# filled from the JVM-side info object's getName(), getUsage(), getExamples()
# and getGroup() values in `_list_grouped_function_infos`.
ExpressionInfo = namedtuple("ExpressionInfo", "name usage examples group")

# Names of the function groups that get documented. Functions whose group is
# not in this set are filtered out in `_list_grouped_function_infos`.
groups = {
    "agg_funcs", "array_funcs", "datetime_funcs",
    "json_funcs", "map_funcs", "window_funcs",
}


def _list_grouped_function_infos(jvm):
    """
    Collects the built-in function infos exposed by the JVM and buckets them
    by group.

    Only functions whose group is listed in the module-level `groups` set are
    kept. Every `_FUNC_` placeholder in the usage and examples text is
    replaced with the function name.

    :param jvm: JVM view used to reach `PythonSQLUtils.listBuiltinFunctionInfos()`.
    :return: a list of `(group, infos)` tuples ordered by group, where `infos`
        is a list of `ExpressionInfo` sorted by name.
    """

    def by_group(info):
        return info.group

    collected = []
    for jinfo in jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos():
        if jinfo.getGroup() not in groups:
            continue

        func_name = jinfo.getName()
        usage_text = jinfo.getUsage()
        # The usage may be missing (None); only substitute when it is present.
        if usage_text is not None:
            usage_text = usage_text.replace("_FUNC_", func_name)
        examples_text = jinfo.getExamples().replace("_FUNC_", func_name)

        collected.append(ExpressionInfo(
            name=func_name,
            usage=usage_text,
            examples=examples_text,
            group=jinfo.getGroup()))

    # `itertools.groupby` only merges adjacent items, so order by group first.
    ordered = sorted(collected, key=by_group)
    return [
        (group_name, sorted(members, key=lambda info: info.name))
        for group_name, members in itertools.groupby(ordered, key=by_group)
    ]


# TODO(SPARK-31499): Add a column that describes the arguments and their types
def _make_pretty_usage(infos):
    """
    Makes the usage description pretty and returns a formatted string.

    Each element of `infos` must provide `name` and `usage` attributes.
    Elements whose `usage` is None are skipped, since there is nothing to
    render for them. The function name is matched literally, so names made of
    regex metacharacters (e.g. `+` or `*`) are handled as well.

    :param infos: iterable of expression infos that belong to one group.
    :return: an HTML table as a string, terminated by a newline.

    Expected input:

        func(*) - ...

        func(expr[, expr...]) - ...

    Expected output:
    <table class="table">
      <thead>
        <tr>
          <th style="width:25%">Function</th>
          <th>Description</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>func(*)</td>
          <td>...</td>
        </tr>
        <tr>
          <td>func(expr[, expr...])</td>
          <td>...</td>
        </tr>
      </tbody>
      ...
    </table>

    """

    result = [
        "<table class=\"table\">",
        "  <thead>",
        "    <tr>",
        "      <th style=\"width:25%\">Function</th>",
        "      <th>Description</th>",
        "    </tr>",
        "  </thead>",
        "  <tbody>",
    ]

    for info in infos:
        # The usage text is optional; without it there is no row to emit.
        if info.usage is None:
            continue

        # Extracts (signature, description) pairs from `info.usage`.
        # Expected formats are as follows;
        #  - `_FUNC_(...) - description`, or
        #  - `_FUNC_ - description`
        # The name is escaped so that it is matched as literal text rather than
        # being interpreted as part of the regular expression.
        signature_pattern = r"(%s.*) - " % re.escape(info.name)
        # `re.split` with a capturing group yields
        # [prefix, sig1, desc1, sig2, desc2, ...]; drop the prefix and then
        # consume the remainder two items at a time.
        usages = iter(re.split(signature_pattern, info.usage.strip())[1:])
        for sig, description in zip(usages, usages):
            result.append("    <tr>")
            result.append("      <td>%s</td>" % sig)
            result.append("      <td>%s</td>" % description.strip())
            result.append("    </tr>")

    result.append("  </tbody>")
    result.append("</table>\n")
    return "\n".join(result)


def _make_pretty_examples(jspark, infos):
    """
    Makes the examples description pretty and returns a formatted string if `infos`
    has any `examples` starting with the example prefix. Otherwise, returns None.

    Every line of an examples text that starts with the query prefix
    (`      > `) is executed through `jspark.sql(...)`, and the query is emitted
    together with the `showString` rendering of its result. Each executed query
    is also printed to stdout.

    :param jspark: JVM-side SparkSession used to run the example queries.
    :param infos: iterable of expression infos that belong to one group.
    :return: the HTML produced by `markdown.markdown` for the collected
        examples, or None when no info carries an examples section.

    Expected input:

        Examples:
          > SELECT func(col)...;
           ...
          > SELECT func(col)...;
           ...

    Expected output:
    <div class="codehilite"><pre><span></span>
      <span class="c1">-- func</span>
      <span class="k">SELECT</span>
      ...
    </pre></div>

    """

    examples_prefix = "\n    Examples:"
    query_prefix = "      > "

    sections = []
    for info in infos:
        if not info.examples.startswith(examples_prefix):
            continue

        output = ["-- %s" % info.name]
        for line in info.examples.split("\n"):
            if not line.startswith(query_prefix):
                continue
            # Remove exactly the prefix and then any extra leading spaces.
            # `str.lstrip(query_prefix)` would treat the prefix as a set of
            # characters and could also eat a leading '>' of the query itself.
            query = line[len(query_prefix):].lstrip(" ")
            print("    %s" % query)
            query_output = jspark.sql(query).showString(20, 20, False)
            output.append(query)
            output.append(query_output)
        sections.append("\n".join(output))

    if not sections:
        return None

    # Every section starts on its own line inside the fenced SQL block.
    pretty_output = "".join("\n" + section for section in sections)
    return markdown.markdown(
        "```sql%s```" % pretty_output, extensions=['codehilite', 'fenced_code'])


def generate_functions_table_html(jvm, html_output_dir):
    """
    Writes one HTML usage table per function group.

    The function infos are listed via `jvm`, and every group is rendered into
    `generated-<group>-table.html` under `html_output_dir`, where underscores
    in the group name are replaced with hyphens.

    :param jvm: JVM view used to list the built-in function infos.
    :param html_output_dir: directory that receives the generated files.

    Expected output:

    <table class="table">
      <thead>
        <tr>
          <th style="width:25%">Function</th>
          <th>Description</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>func(*)</td>
          <td>...</td>
        </tr>
        <tr>
          <td>func(expr[, expr...])</td>
          <td>...</td>
        </tr>
      </tbody>
      ...
    </table>

    """
    for group_name, group_infos in _list_grouped_function_infos(jvm):
        table_markup = _make_pretty_usage(group_infos)
        out_path = "%s/generated-%s-table.html" % (
            html_output_dir, group_name.replace("_", "-"))
        with open(out_path, 'w') as out_file:
            out_file.write(table_markup)


def generate_functions_examples_html(jvm, jspark, html_output_dir):
    """
    Writes one HTML examples file per function group that has examples.

    The function infos are listed via `jvm`, their example queries are run
    through `jspark`, and the rendered result of every group is stored in
    `generated-<group>-examples.html` under `html_output_dir`, where
    underscores in the group name are replaced with hyphens. Groups without
    any examples produce no file.

    :param jvm: JVM view used to list the built-in function infos.
    :param jspark: JVM-side SparkSession used to run the example queries.
    :param html_output_dir: directory that receives the generated files.

    Expected output:

    <div class="codehilite"><pre><span></span>
      <span class="c1">-- func</span>
      <span class="k">SELECT</span>
      ...
    </pre></div>

    """
    print("Running SQL examples to generate formatted output.")
    for group_name, group_infos in _list_grouped_function_infos(jvm):
        rendered = _make_pretty_examples(jspark, group_infos)
        if rendered is None:
            # Nothing to write for a group without examples.
            continue
        out_path = "%s/generated-%s-examples.html" % (
            html_output_dir, group_name.replace("_", "-"))
        with open(out_path, 'w') as out_file:
            out_file.write(rendered)


if __name__ == "__main__":
    # Script entry point: start a JVM gateway, obtain a SparkSession on it, and
    # generate both the usage tables and the examples pages.
    jvm = launch_gateway().jvm
    jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate()
    jspark.sparkContext().setLogLevel("ERROR")  # Make it less noisy.
    # The output goes to `docs` under the parent of this script's directory.
    spark_root_dir = os.path.dirname(os.path.dirname(__file__))
    html_output_dir = os.path.join(spark_root_dir, "docs")
    generate_functions_table_html(jvm, html_output_dir)
    generate_functions_examples_html(jvm, jspark, html_output_dir)
[SPARK-31429][SQL][DOC] Automatically generates a SQL document for built-in functions ### What changes were proposed in this pull request? This PR intends to add a Python script to generates a SQL document for built-in functions and the document in SQL references. ### Why are the changes needed? To make SQL references complete. ### Does this PR introduce any user-facing change? Yes; ![a](https://user-images.githubusercontent.com/692303/79406712-c39e1b80-7fd2-11ea-8b85-9f9cbb6efed3.png) ![b](https://user-images.githubusercontent.com/692303/79320526-eb46a280-7f44-11ea-8639-90b1fb2b8848.png) ![c](https://user-images.githubusercontent.com/692303/79320707-3365c500-7f45-11ea-9984-69ffe800fb87.png) ### How was this patch tested? Manually checked and added tests. Closes #28224 from maropu/SPARK-31429. Lead-authored-by: Takeshi Yamamuro <yamamuro@apache.org> Co-authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2020-04-20 21:55:13 -04:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`import itertools`
			`import os`
			`import re`
			`from collections import namedtuple`

			`# To avoid adding a new direct dependency, we import markdown from within mkdocs.`
			`from mkdocs.structure.pages import markdown`

			`from pyspark.java_gateway import launch_gateway`


			`ExpressionInfo = namedtuple("ExpressionInfo", "name usage examples group")`

			`groups = {`
			`"agg_funcs", "array_funcs", "datetime_funcs",`
			`"json_funcs", "map_funcs", "window_funcs",`
			`}`


			`def _list_grouped_function_infos(jvm):`
			`"""`
			`Returns a list of function information grouped by each group value via JVM.`
			`Sorts wrapped expression infos in each group by name and returns them.`
			`"""`

			`jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos()`
			`infos = []`

			`for jinfo in filter(lambda x: x.getGroup() in groups, jinfos):`
			`name = jinfo.getName()`
			`usage = jinfo.getUsage()`
			`usage = usage.replace("_FUNC_", name) if usage is not None else usage`
			`infos.append(ExpressionInfo(`
			`name=name,`
			`usage=usage,`
			`examples=jinfo.getExamples().replace("_FUNC_", name),`
			`group=jinfo.getGroup()))`

			`# Groups expression info by each group value`
			`grouped_infos = itertools.groupby(sorted(infos, key=lambda x: x.group), key=lambda x: x.group)`
			`# Then, sort expression infos in each group by name`
			`return [(k, sorted(g, key=lambda x: x.name)) for k, g in grouped_infos]`


			`# TODO(SPARK-31499): Needs to add a column to describe arguments and their types`
			`def _make_pretty_usage(infos):`
			`"""`
			`Makes the usage description pretty and returns a formatted string.`

			`Expected input:`

			`func(*) - ...`

			`func(expr[, expr...]) - ...`

			`Expected output:`
			`<table class="table">`
			`<thead>`
			`<tr>`
			`<th style="width:25%">Function</th>`
			`<th>Description</th>`
			`</tr>`
			`</thead>`
			`<tbody>`
			`<tr>`
			`<td>func(*)</td>`
			`<td>...</td>`
			`</tr>`
			`<tr>`
			`<td>func(expr[, expr...])</td>`
			`<td>...</td>`
			`</tr>`
			`</tbody>`
			`...`
			`</table>`

			`"""`

			`result = []`
			`result.append("<table class=\"table\">")`
			`result.append(" <thead>")`
			`result.append(" <tr>")`
			`result.append(" <th style=\"width:25%\">Function</th>")`
			`result.append(" <th>Description</th>")`
			`result.append(" </tr>")`
			`result.append(" </thead>")`
			`result.append(" <tbody>")`

			`for info in infos:`
[SPARK-31562][SQL] Update ExpressionDescription for substring, current_date, and current_timestamp ### What changes were proposed in this pull request? This PR intends to add entries for substring, current_date, and current_timestamp in the SQL built-in function documents. Specifically, the entries are as follows; - SELECT current_date; - SELECT current_timestamp; - SELECT substring('abcd' FROM 1); - SELECT substring('abcd' FROM 1 FOR 2); ### Why are the changes needed? To make the SQL (built-in functions) references complete. ### Does this PR introduce any user-facing change? <img width="1040" alt="Screen Shot 2020-04-25 at 16 51 07" src="https://user-images.githubusercontent.com/692303/80274851-6ca5ee00-8718-11ea-9a35-9ae82008cb4b.png"> <img width="974" alt="Screen Shot 2020-04-25 at 17 24 24" src="https://user-images.githubusercontent.com/692303/80275032-a88d8300-8719-11ea-92ec-95b80169ae28.png"> <img width="862" alt="Screen Shot 2020-04-25 at 17 27 48" src="https://user-images.githubusercontent.com/692303/80275114-36696e00-871a-11ea-8e39-02e93eabb92f.png"> ### How was this patch tested? Added test examples. Closes #28342 from maropu/SPARK-31562. Authored-by: Takeshi Yamamuro <yamamuro@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org> 2020-04-26 14:46:52 -04:00			# Extracts (signature, description) pairs from `info.usage`.
			`# Expected formats are as follows;`
			# - `_FUNC_(...) - description`, or
			# - `_FUNC_ - description`
			`usages = iter(re.split(r"(%s.*) - " % info.name, info.usage.strip())[1:])`
[SPARK-31429][SQL][DOC] Automatically generates a SQL document for built-in functions ### What changes were proposed in this pull request? This PR intends to add a Python script to generates a SQL document for built-in functions and the document in SQL references. ### Why are the changes needed? To make SQL references complete. ### Does this PR introduce any user-facing change? Yes; ![a](https://user-images.githubusercontent.com/692303/79406712-c39e1b80-7fd2-11ea-8b85-9f9cbb6efed3.png) ![b](https://user-images.githubusercontent.com/692303/79320526-eb46a280-7f44-11ea-8639-90b1fb2b8848.png) ![c](https://user-images.githubusercontent.com/692303/79320707-3365c500-7f45-11ea-9984-69ffe800fb87.png) ### How was this patch tested? Manually checked and added tests. Closes #28224 from maropu/SPARK-31429. Lead-authored-by: Takeshi Yamamuro <yamamuro@apache.org> Co-authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2020-04-20 21:55:13 -04:00			`for (sig, description) in zip(usages, usages):`
			`result.append(" <tr>")`
			`result.append(" <td>%s</td>" % sig)`
			`result.append(" <td>%s</td>" % description.strip())`
			`result.append(" </tr>")`

			`result.append(" </tbody>")`
			`result.append("</table>\n")`
			`return "\n".join(result)`


			`def _make_pretty_examples(jspark, infos):`
			`"""`
			Makes the examples description pretty and returns a formatted string if `infos`
			has any `examples` starting with the example prefix. Otherwise, returns None.

			`Expected input:`

			`Examples:`
			`> SELECT func(col)...;`
			`...`
			`> SELECT func(col)...;`
			`...`

			`Expected output:`
			`<div class="codehilite"><pre><span></span>`
			`<span class="c1">-- func</span>`
			`<span class="k">SELECT</span>`
			`...`
			`</pre></div>`
			```

			`"""`

			`pretty_output = ""`
			`for info in infos:`
			`if info.examples.startswith("\n Examples:"):`
			`output = []`
			`output.append("-- %s" % info.name)`
			`query_examples = filter(lambda x: x.startswith(" > "), info.examples.split("\n"))`
			`for query_example in query_examples:`
			`query = query_example.lstrip(" > ")`
			`print(" %s" % query)`
			`query_output = jspark.sql(query).showString(20, 20, False)`
			`output.append(query)`
			`output.append(query_output)`
			`pretty_output += "\n" + "\n".join(output)`
			`if pretty_output != "":`
			`return markdown.markdown(`
			"```sql%s```" % pretty_output, extensions=['codehilite', 'fenced_code'])


			`def generate_functions_table_html(jvm, html_output_dir):`
			`"""`
			`Generates a HTML file after listing the function information. The output file`
			is created under `html_output_dir`.

			`Expected output:`

			`<table class="table">`
			`<thead>`
			`<tr>`
			`<th style="width:25%">Function</th>`
			`<th>Description</th>`
			`</tr>`
			`</thead>`
			`<tbody>`
			`<tr>`
			`<td>func(*)</td>`
			`<td>...</td>`
			`</tr>`
			`<tr>`
			`<td>func(expr[, expr...])</td>`
			`<td>...</td>`
			`</tr>`
			`</tbody>`
			`...`
			`</table>`

			`"""`
			`for key, infos in _list_grouped_function_infos(jvm):`
			`function_table = _make_pretty_usage(infos)`
			`key = key.replace("_", "-")`
			`with open("%s/generated-%s-table.html" % (html_output_dir, key), 'w') as table_html:`
			`table_html.write(function_table)`


			`def generate_functions_examples_html(jvm, jspark, html_output_dir):`
			`"""`
			`Generates a HTML file after listing and executing the function information.`
			The output file is created under `html_output_dir`.

			`Expected output:`

			`<div class="codehilite"><pre><span></span>`
			`<span class="c1">-- func</span>`
			`<span class="k">SELECT</span>`
			`...`
			`</pre></div>`

			`"""`
			`print("Running SQL examples to generate formatted output.")`
			`for key, infos in _list_grouped_function_infos(jvm):`
			`examples = _make_pretty_examples(jspark, infos)`
			`key = key.replace("_", "-")`
			`if examples is not None:`
			`with open("%s/generated-%s-examples.html" % (`
			`html_output_dir, key), 'w') as examples_html:`
			`examples_html.write(examples)`


			`if __name__ == "__main__":`
			`jvm = launch_gateway().jvm`
			`jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate()`
			`jspark.sparkContext().setLogLevel("ERROR") # Make it less noisy.`
			`spark_root_dir = os.path.dirname(os.path.dirname(__file__))`
			`html_output_dir = os.path.join(spark_root_dir, "docs")`
			`generate_functions_table_html(jvm, html_output_dir)`
			`generate_functions_examples_html(jvm, jspark, html_output_dir)`