#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import itertools
import os
import re
from collections import namedtuple

# To avoid adding a new direct dependency, we import markdown from within mkdocs.
from mkdocs.structure.pages import markdown

from pyspark.java_gateway import launch_gateway


ExpressionInfo = namedtuple("ExpressionInfo", "name usage examples group")

groups = {
    "agg_funcs", "array_funcs", "datetime_funcs",
    "json_funcs", "map_funcs", "window_funcs",
}


def _list_grouped_function_infos(jvm):
    """
    Returns a list of function information grouped by each group value via JVM.
    Sorts wrapped expression infos in each group by name and returns them.
    """

    jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos()
    infos = []

    for jinfo in filter(lambda x: x.getGroup() in groups, jinfos):
        name = jinfo.getName()
        usage = jinfo.getUsage()
        usage = usage.replace("_FUNC_", name) if usage is not None else usage
        infos.append(ExpressionInfo(
            name=name,
            usage=usage,
            examples=jinfo.getExamples().replace("_FUNC_", name),
            group=jinfo.getGroup()))

    # Groups expression infos by each group value
    grouped_infos = itertools.groupby(sorted(infos, key=lambda x: x.group), key=lambda x: x.group)
    # Then, sorts expression infos in each group by name
    return [(k, sorted(g, key=lambda x: x.name)) for k, g in grouped_infos]
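
# Illustrative shape of the value returned by `_list_grouped_function_infos`
# (the function names below are examples only, not an exhaustive list):
#
#   [("agg_funcs", [ExpressionInfo(name="any", usage="any(expr) - ...", ...), ...]),
#    ("array_funcs", [...]),
#    ...]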


# TODO(SPARK-31499): Needs to add a column to describe arguments and their types
def _make_pretty_usage(infos):
    """
    Makes the usage description pretty and returns a formatted string.

    Expected input:

        func(*) - ...

        func(expr[, expr...]) - ...

    Expected output:

        <table class="table">
          <thead>
            <tr>
              <th style="width:25%">Function</th>
              <th>Description</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td>func(*)</td>
              <td>...</td>
            </tr>
            <tr>
              <td>func(expr[, expr...])</td>
              <td>...</td>
            </tr>
          </tbody>
        </table>
    """

    result = []
    result.append("<table class=\"table\">")
    result.append("  <thead>")
    result.append("    <tr>")
    result.append("      <th style=\"width:25%\">Function</th>")
    result.append("      <th>Description</th>")
    result.append("    </tr>")
    result.append("  </thead>")
    result.append("  <tbody>")
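
    # `re.split` with a capturing group keeps each matched signature in the
    # output list, so after dropping the leading empty chunk with `[1:]`,
    # `zip(usages, usages)` pairs up consecutive elements of the single
    # iterator into (signature, description) tuples. Illustratively, a usage
    # `count(*) - Returns the total number of rows.` splits into
    # ["count(*)", "Returns the total number of rows."].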
    for info in infos:
        # Extracts (signature, description) pairs from `info.usage`, e.g.,
        # the signature is `func(expr)` and the description is `...` in
        # a usage `func(expr) - ...`.
        usages = iter(re.split(r"(%s\(.*\)) - " % info.name, info.usage.strip())[1:])
        for (sig, description) in zip(usages, usages):
            result.append("    <tr>")
            result.append("      <td>%s</td>" % sig)
            result.append("      <td>%s</td>" % description.strip())
            result.append("    </tr>")

    result.append("  </tbody>")
    result.append("</table>\n")
    return "\n".join(result)


def _make_pretty_examples(jspark, infos):
    """
    Makes the examples description pretty and returns a formatted string if `infos`
    has any `examples` starting with the example prefix. Otherwise, returns None.

    Expected input:

        Examples:
          > SELECT func(col)...;
           ...
          > SELECT func(col)...;
           ...

    Expected output:

        <div class="codehilite"><pre><span></span>
          <span class="c1">-- func</span>
          <span class="k">SELECT</span>
          ...
        </pre></div>
    """

    pretty_output = ""

    for info in infos:
        if info.examples.startswith("\n    Examples:"):
            output = []
            output.append("-- %s" % info.name)
            query_examples = filter(lambda x: x.startswith("      > "), info.examples.split("\n"))
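            # Note: `lstrip` below strips a leading run of the characters
            # " " and ">" rather than a fixed prefix; this is safe because the
            # example queries themselves begin with "SELECT".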
            for query_example in query_examples:
                query = query_example.lstrip("      > ")
                print("    %s" % query)
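                # `showString(20, 20, False)` renders up to 20 rows, truncating
                # cells to 20 characters, in the non-vertical (tabular) layout,
                # i.e. the same text that `df.show()` prints.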
                query_output = jspark.sql(query).showString(20, 20, False)
                output.append(query)
                output.append(query_output)
            pretty_output += "\n" + "\n".join(output)

    if pretty_output != "":
        return markdown.markdown(
            "```sql%s```" % pretty_output, extensions=['codehilite', 'fenced_code'])


def generate_functions_table_html(jvm, html_output_dir):
    """
    Generates an HTML file after listing the function information. The output
    file is created under `html_output_dir`.

    Expected output:

        <table class="table">
          <thead>
            <tr>
              <th style="width:25%">Function</th>
              <th>Description</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td>func(*)</td>
              <td>...</td>
            </tr>
            <tr>
              <td>func(expr[, expr...])</td>
              <td>...</td>
            </tr>
          </tbody>
        </table>
    """
    for key, infos in _list_grouped_function_infos(jvm):
        function_table = _make_pretty_usage(infos)
        key = key.replace("_", "-")
        with open("%s/generated-%s-table.html" % (html_output_dir, key), 'w') as table_html:
            table_html.write(function_table)


def generate_functions_examples_html(jvm, jspark, html_output_dir):
    """
    Generates an HTML file after listing and executing the function
    information. The output file is created under `html_output_dir`.

    Expected output:

        <div class="codehilite"><pre><span></span>
          <span class="c1">-- func</span>
          <span class="k">SELECT</span>
          ...
        </pre></div>
    """
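    # One fragment file per function group; `key` is slugified first, e.g.
    # "agg_funcs" becomes "generated-agg-funcs-examples.html".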
    print("Running SQL examples to generate formatted output.")
    for key, infos in _list_grouped_function_infos(jvm):
        examples = _make_pretty_examples(jspark, infos)
        key = key.replace("_", "-")
        if examples is not None:
            with open("%s/generated-%s-examples.html" % (
                    html_output_dir, key), 'w') as examples_html:
                examples_html.write(examples)


if __name__ == "__main__":
    jvm = launch_gateway().jvm
    jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate()
    jspark.sparkContext().setLogLevel("ERROR")  # Make it less noisy.
    spark_root_dir = os.path.dirname(os.path.dirname(__file__))
    html_output_dir = os.path.join(spark_root_dir, "docs")
    generate_functions_table_html(jvm, html_output_dir)
    generate_functions_examples_html(jvm, jspark, html_output_dir)