[SPARK-35587][PYTHON][DOCS] Initial porting of Koalas documentation

### What changes were proposed in this pull request?

This PR proposes to port the Koalas documentation into the PySpark documentation as an initial step.
It ports the documentation almost as is, except for these differences (illustrated in the sketch after the list):

- Renamed import from `databricks.koalas` to `pyspark.pandas`.
- Renamed `to_koalas` -> `to_pandas_on_spark`
- Renamed `(Series|DataFrame).koalas` -> `(Series|DataFrame).pandas_on_spark`
- Added a `ps_` prefix in the RST file names of Koalas documentation
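
For example, a minimal before/after sketch of the renames (illustrative only; assumes a running Spark session):

```python
# Before (Koalas):
#   import databricks.koalas as ks
#   psdf = spark_df.to_koalas()

# After (pandas APIs on Spark):
from pyspark.sql import SparkSession
import pyspark.pandas as ps  # renamed import

spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])

psdf = spark_df.to_pandas_on_spark()  # renamed from to_koalas()
print(psdf.head())
```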

Other than that,

- Excluded `python/docs/build/html` from the linter
- Fixed GA dependency installation

### Why are the changes needed?

To document pandas APIs on Spark.

### Does this PR introduce _any_ user-facing change?

Yes, it adds new documentation.

### How was this patch tested?

Manually built the docs and checked the output.

Closes #32726 from HyukjinKwon/SPARK-35587.

Authored-by: Hyukjin Kwon <gurwls223@apache.org>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
Hyukjin Kwon committed on 2021-06-04 11:11:09 +09:00
parent 745bd090f7
commit 3d158f9c91
32 changed files with 17966 additions and 21 deletions


@ -215,7 +215,7 @@ jobs:
# Ubuntu 20.04. See also SPARK-33162.
- name: Install Python packages (Python 3.6)
run: |
python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner plotly>=4.8
python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner 'plotly>=4.8'
python3.6 -m pip list
- name: List Python packages (Python 3.9)
run: |
@ -384,6 +384,7 @@ jobs:
# Jinja2 3.0.0+ causes error when building with Sphinx.
# See also https://issues.apache.org/jira/browse/SPARK-35375.
python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0'
python3.6 -m pip install sphinx_plotly_directive 'pyarrow<3.0.0' pandas 'plotly>=4.8'
apt-get update -y
apt-get install -y ruby ruby-dev
Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"


@ -20,5 +20,5 @@ exclude=*/target/*,python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/
[flake8]
select = E901,E999,F821,F822,F823,F401,F405,B006
exclude = */target/*,python/pyspark/cloudpickle/*.py,shared.py*,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*,python/out,python/pyspark/sql/pandas/functions.pyi,python/pyspark/sql/column.pyi,python/pyspark/worker.pyi,python/pyspark/java_gateway.pyi
exclude = python/docs/build/html/*,*/target/*,python/pyspark/cloudpickle/*.py,shared.py*,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*,python/out,python/pyspark/sql/pandas/functions.pyi,python/pyspark/sql/column.pyi,python/pyspark/worker.pyi,python/pyspark/java_gateway.pyi
max-line-length = 100


@ -116,6 +116,15 @@ if not (ENV['SKIP_API'] == '1')
if not (ENV['SKIP_PYTHONDOC'] == '1')
# Build Sphinx docs for Python
puts "Moving to project root and building API docs."
cd("..")
puts "Running 'build/sbt clean package -Phive' from " + pwd + "; this may take a few minutes..."
system("build/sbt clean package -Phive") || raise("PySpark doc generation failed")
puts "Moving back into docs dir."
cd("docs")
puts "Moving to python/docs directory and building sphinx."
cd("../python/docs")
system("make html") || raise("Python doc generation failed")
@ -153,15 +162,18 @@ if not (ENV['SKIP_API'] == '1')
if not (ENV['SKIP_SQLDOC'] == '1')
# Build SQL API docs
puts "Moving to project root and building API docs."
curr_dir = pwd
cd("..")
if ENV['SKIP_PYTHONDOC'] == '1'
# SQL documentation build requires the full build to run queries.
# If the build was not done in PySpark documentation generation, we should build it here.
puts "Moving to project root and building API docs."
cd("..")
puts "Running 'build/sbt clean package -Phive' from " + pwd + "; this may take a few minutes..."
system("build/sbt clean package -Phive") || raise("SQL doc generation failed")
puts "Running 'build/sbt clean package -Phive' from " + pwd + "; this may take a few minutes..."
system("build/sbt clean package -Phive") || raise("SQL doc generation failed")
puts "Moving back into docs dir."
cd("docs")
puts "Moving back into docs dir."
cd("docs")
end
puts "Moving to SQL directory and building docs."
cd("../sql")


@ -49,8 +49,17 @@ extensions = [
# For ipython directive in reStructuredText files. It is generated by the notebook.
'IPython.sphinxext.ipython_console_highlighting',
'numpydoc', # handle NumPy documentation formatted docstrings.
'sphinx_plotly_directive', # For visualize plot result
]
# plotly plot directive
plotly_include_source = True
plotly_html_show_formats = False
plotly_html_show_source_link = False
plotly_pre_code = """import numpy as np
import pandas as pd
import pyspark.pandas as ps"""
numpydoc_show_class_members = False
# Links used globally in the RST files.


@ -20,9 +20,17 @@ Development
===========
.. toctree::
:maxdepth: 2
:maxdepth: 2
contributing
testing
debugging
setting_ide
contributing
testing
debugging
setting_ide
For pandas APIs on Spark:
.. toctree::
:maxdepth: 2
ps_contributing
ps_design


@ -0,0 +1,192 @@
==================
Contributing Guide
==================
.. contents:: Table of contents:
:depth: 1
:local:
Types of Contributions
======================
The largest amount of work consists simply of implementing the pandas API using Spark's built-in functions, which is usually straightforward. But there are many different forms of contributions in addition to writing code:
1. Use the project and provide feedback, by creating new tickets or commenting on existing relevant tickets.
2. Review existing pull requests.
3. Improve the project's documentation.
4. Write blog posts or tutorial articles evangelizing Koalas and help new users learn Koalas.
5. Give a talk about Koalas at your local meetup or a conference.
Step-by-step Guide For Code Contributions
=========================================
1. Read and understand the `Design Principles <design.rst>`_ for the project. Contributions should follow these principles.
2. Signaling your work: If you are working on something, comment on the relevant ticket that you are doing so to avoid multiple people taking on the same work at the same time. It is also a good practice to signal that your work has stalled or you have moved on and want somebody else to take over.
3. Understand what the functionality is in pandas or in Spark.
4. Implement the functionality, with test cases providing close to 100% statement coverage. Document the functionality.
5. Run existing and new test cases to make sure they still pass. Also run `dev/reformat` script to reformat Python files by using `Black <https://github.com/psf/black>`_, and run the linter `dev/lint-python`.
6. Build the docs (`make html` in `docs` directory) and verify the docs related to your change look OK.
7. Submit a pull request, and be responsive to code review feedback from other community members.
That's it. Your contribution, once merged, will be available in the next release.
Environment Setup
=================
Conda
-----
If you are using Conda, you can set up the Koalas installation and development environment as follows.
.. code-block:: bash
# Python 3.6+ is required
conda create --name koalas-dev-env python=3.6
conda activate koalas-dev-env
conda install -c conda-forge pyspark=2.4
pip install -r requirements-dev.txt
pip install -e . # installs koalas from current checkout
Once set up, make sure you switch to `koalas-dev-env` before development:
.. code-block:: bash
conda activate koalas-dev-env
pip
---
With Python 3.6+, pip can be used as below to install and set up the development environment.
.. code-block:: bash
pip install pyspark==2.4
pip install -r requirements-dev.txt
pip install -e . # installs koalas from current checkout
Running Tests
=============
There is a script `./dev/pytest` which is exactly the same as `pytest` but with some default settings that make it easy to run Koalas tests.
To run all the tests, similar to our CI pipeline:
.. code-block:: bash
# Run all unittest and doctest
./dev/pytest
To run a specific test file:
.. code-block:: bash
# Run unittest
./dev/pytest -k test_dataframe.py
# Run doctest
./dev/pytest -k series.py --doctest-modules databricks
To run a specific doctest/unittest:
.. code-block:: bash
# Run unittest
./dev/pytest -k "DataFrameTest and test_Dataframe"
# Run doctest
./dev/pytest -k DataFrame.corr --doctest-modules databricks
Note that `-k` is used for simplicity although it takes an expression. You can use `--verbose` to check what to filter. See `pytest --help` for more details.
Building Documentation
======================
To build documentation via Sphinx:
.. code-block:: bash
cd docs && make clean html
It generates HTML files under the `docs/build/html` directory. Open `docs/build/html/index.html` to check whether the documentation was built properly.
Coding Conventions
==================
We follow `PEP 8 <https://www.python.org/dev/peps/pep-0008/>`_ with one exception: lines can be up to 100 characters in length, not 79.
Doctest Conventions
===================
When writing doctests, the doctests in pandas are usually converted into Koalas to make sure the same code works in Koalas.
In general, doctests should be grouped logically by separating them with a blank line.
For instance, the first block contains the statements for preparation, the second block uses the function with a specific argument,
and the third block uses another argument. As an example, please refer to `DataFrame.rsub <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rsub.html#pandas.DataFrame.rsub>`_ in pandas.
These blocks should be consistently separated in Koalas, and more doctests should be added if the coverage of the doctests or the number of examples is not enough, even if they differ from pandas'.
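For instance, a hypothetical docstring sketch of this grouping (the function body and expected outputs are illustrative and elided):

.. code-block:: python

    def rsub(self, other):
        """
        Get reverse subtraction of the DataFrame and ``other``, element-wise.

        Examples
        --------
        A block preparing the data:

        >>> psdf = ps.DataFrame({'angles': [0, 3, 4]})

        A block calling the function with a specific argument:

        >>> psdf.rsub(1)  # doctest: +SKIP

        A separate block for another argument:

        >>> psdf.rsub(10)  # doctest: +SKIP
        """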
Release Guide
=============
Release Cadence
---------------
Koalas 1.8.0 is the last minor release because Koalas will be officially included in PySpark.
There will be only maintenance releases afterwards. Users are expected to directly use PySpark with Apache Spark 3.2+.
Release Instructions
--------------------
Only project maintainers can do the following to publish a release.
1. Make sure version is set correctly in `pyspark.pandas/version.py`.
2. Make sure the build is green.
3. Create a new release on GitHub. Tag it as the same version as the setup.py. If the version is "0.1.0", tag the commit as "v0.1.0".
4. Upload the package to PyPI:
.. code-block:: bash
rm -rf dist/koalas*
python setup.py sdist bdist_wheel
export package_version=$(python setup.py --version)
echo $package_version
python3 -m pip install --user --upgrade twine
# for test
python3 -m twine upload --repository-url https://test.pypi.org/legacy/ dist/koalas-$package_version-py3-none-any.whl dist/koalas-$package_version.tar.gz
# for release
python3 -m twine upload --repository-url https://upload.pypi.org/legacy/ dist/koalas-$package_version-py3-none-any.whl dist/koalas-$package_version.tar.gz
5. Verify the uploaded package can be installed and executed. One unofficial tip is to run the doctests of Koalas within a Python interpreter after installing it.
.. code-block:: python
import os
from pytest import main
import databricks
test_path = os.path.abspath(os.path.dirname(databricks.__file__))
main(['-k', '-to_delta -read_delta', '--verbose', '--showlocals', '--doctest-modules', test_path])
Note that this way might require additional settings, for instance, environment variables.


@ -0,0 +1,85 @@
=================
Design Principles
=================
.. currentmodule:: pyspark.pandas
This section outlines design principles guiding the Koalas project.
Be Pythonic
-----------
Koalas targets Python data scientists. We want to stick to the convention that users are already familiar with as much as possible. Here are some examples:
- Function names and parameters use snake_case, rather than CamelCase. This is different from PySpark's design. For example, Koalas has `to_pandas()`, whereas PySpark has `toPandas()` for converting a DataFrame into a pandas DataFrame. In limited cases, to maintain compatibility with Spark, we also provide Spark's variant as an alias.
- Koalas respects to the largest extent the conventions of the Python numerical ecosystem, and allows the use of NumPy types, etc. that can be supported by Spark.
- Koalas docs' style and infrastructure simply follow the rest of the PyData projects'.
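For example, a small sketch of the naming difference (assuming a running Spark session):

.. code-block:: python

    import pyspark.pandas as ps

    psdf = ps.DataFrame({'a': [1, 2, 3]})

    # Koalas follows pandas-style snake_case:
    pdf = psdf.to_pandas()

    # The underlying PySpark DataFrame uses camelCase for the same conversion:
    sdf = psdf.to_spark()
    pdf_again = sdf.toPandas()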
Unify small data (pandas) API and big data (Spark) API, but pandas first
------------------------------------------------------------------------
The Koalas DataFrame is meant to provide the best of pandas and Spark under a single API, with easy and clear conversions between each API when necessary. When Spark and pandas have similar APIs with subtle differences, the principle is to honor the contract of the pandas API first.
There are different classes of functions:
1. Functions that are found in both Spark and pandas under the same name (`count`, `dtypes`, `head`). The return value is the same as the return type in pandas (and not Spark's).
2. Functions that are found in Spark but that have a clear equivalent in pandas, e.g. `alias` and `rename`. These functions will be implemented as the alias of the pandas function, but should be marked that they are aliases of the same functions. They are provided so that existing users of PySpark can get the benefits of Koalas without having to adapt their code.
3. Functions that are only found in pandas. When these functions are appropriate for distributed datasets, they should become available in Koalas.
4. Functions that are only found in Spark that are essential to controlling the distributed nature of the computations, e.g. `cache`. These functions should be available in Koalas.
We are still debating whether data transformation functions only available in Spark should be added to Koalas, e.g. `select`. We would love to hear your feedback on that.
Return Koalas data structure for big data, and pandas data structure for small data
-----------------------------------------------------------------------------------
Often developers face the question whether a particular function should return a Koalas DataFrame/Series, or a pandas DataFrame/Series. The principle is: if the returned object can be large, use a Koalas DataFrame/Series. If the data is bound to be small, use a pandas DataFrame/Series. For example, `DataFrame.dtypes` returns a pandas Series, because the number of columns in a DataFrame is bounded and small, whereas `DataFrame.head()` or `Series.unique()` returns a Koalas DataFrame/Series, because the resulting object can be large.
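A rough sketch of this principle (the types in the comments are approximate):

.. code-block:: python

    import pyspark.pandas as ps

    psdf = ps.DataFrame({'a': [1, 2, 2], 'b': ['x', 'y', 'z']})

    # Bounded, small result -> a pandas object
    print(type(psdf.dtypes))         # pandas Series

    # Potentially large results -> Koalas objects
    print(type(psdf.head(2)))        # Koalas DataFrame
    print(type(psdf['b'].unique()))  # Koalas Series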
Provide discoverable APIs for common data science tasks
-------------------------------------------------------
At the risk of overgeneralization, there are two API design approaches: the first focuses on providing APIs for common tasks; the second starts with abstractions, and enables users to accomplish their tasks by composing primitives. While the world is not black and white, pandas takes more of the former approach, while Spark has taken more of the latter.
One example is value counts (counting by some key column), one of the most common operations in data science. pandas' `DataFrame.value_counts` returns the result in sorted order, which in 90% of the cases is what users prefer when exploring data, whereas Spark's does not sort, which is more desirable when building data pipelines, as users can accomplish the pandas behavior by adding an explicit `orderBy`.
Similar to pandas, Koalas should also lean more towards the former, providing discoverable APIs for common data science tasks. In most cases, this principle is well taken care of by simply implementing pandas' APIs. However, there will be circumstances in which pandas' APIs don't address a specific need, e.g. plotting for big data.
Provide well documented APIs, with examples
-------------------------------------------
All functions and parameters should be documented. Most functions should be documented with examples, because examples are easier to understand than a blob of text explaining what the function does.
A recommended way to add documentation is to start with the docstring of the corresponding function in PySpark or pandas, and adapt it for Koalas. If you are adding a new function, also add it to the API reference doc index page in `docs/source/reference` directory. The examples in docstring also improve our test coverage.
Guardrails to prevent users from shooting themselves in the foot
----------------------------------------------------------------
Certain operations in pandas are prohibitively expensive as data scales, and we don't want to give users the illusion that they can rely on such operations in Koalas. That is to say, methods implemented in Koalas should be safe to perform by default on large datasets. As a result, the following capabilities are not implemented in Koalas:
1. Capabilities that are fundamentally not parallelizable: e.g. imperatively looping over each element
2. Capabilities that require materializing the entire working set in a single node's memory. This is why we do not implement `pandas.DataFrame.to_xarray <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_xarray.html>`_. Another example is that the `_repr_html_` call caps the total number of records shown to a maximum of 1000, to prevent users from blowing up their driver node simply by typing the name of the DataFrame in a notebook.
A few exceptions, however, exist. One common pattern with "big data science" is that while the initial dataset is large, the working set becomes smaller as the analysis goes deeper. For example, data scientists often perform aggregation on datasets and want to then convert the aggregated dataset to some local data structure. To help data scientists, we offer the following:
- :func:`DataFrame.to_pandas`: returns a pandas DataFrame (Koalas only)
- :func:`DataFrame.to_numpy`: returns a NumPy array (works with both pandas and Koalas)
Note that it is clear from the names that these functions return some local data structure that would require materializing data in a single node's memory. For these functions, we also explicitly document them with a warning note that the resulting data structure must be small.
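A minimal sketch of that pattern, where the working set shrinks before being converted to local data structures (the column names are arbitrary):

.. code-block:: python

    import pyspark.pandas as ps

    psdf = ps.DataFrame({'group': ['a', 'b', 'a', 'b'], 'value': [1, 2, 3, 4]})

    # Aggregate first so that the result is small ...
    agg = psdf.groupby('group').sum()

    # ... then materialize it locally on the driver.
    local_pdf = agg.to_pandas()   # pandas DataFrame
    local_arr = agg.to_numpy()    # NumPy array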
Be a lean API layer and move fast
---------------------------------
Koalas is designed as an API overlay layer on top of Spark. The project should be lightweight, and most functions should be implemented as wrappers
around Spark or pandas; the Koalas library is generally designed to be used only on Spark's driver side.
Koalas does not accept heavyweight implementations, e.g. execution engine changes.
This approach enables us to move fast. For the foreseeable future, we aim to make monthly releases. If we find a critical bug, we will make a new release as soon as the bug fix is available.
High test coverage
------------------
Koalas should be well tested. The project keeps its test coverage above 90% across the entire codebase, and close to 100% for critical parts. Pull requests will not be accepted unless they have close to 100% statement coverage from the codecov report.


@ -26,7 +26,17 @@ There are more guides shared with other languages such as
at `the Spark documentation <https://spark.apache.org/docs/latest/index.html#where-to-go-from-here>`_.
.. toctree::
:maxdepth: 2
:maxdepth: 2
install
quickstart
For pandas APIs on Spark:
.. toctree::
:maxdepth: 2
ps_install
ps_10mins
ps_videos_blogs
install
quickstart

File diff suppressed because one or more lines are too long


@ -0,0 +1,145 @@
============
Installation
============
Koalas requires PySpark so please make sure your PySpark is available.
To install Koalas, you can use:
- `Conda <https://anaconda.org/conda-forge/koalas>`__
- `PyPI <https://pypi.org/project/koalas>`__
- `Installation from source <../development/ps_contributing.rst#environment-setup>`__
To install PySpark, you can use:
- `Installation with the official release channel <https://spark.apache.org/downloads.html>`__
- `Conda <https://anaconda.org/conda-forge/pyspark>`__
- `PyPI <https://pypi.org/project/pyspark>`__
- `Installation from source <https://github.com/apache/spark#building-spark>`__
Python version support
----------------------
Officially Python 3.5 to 3.8.
.. note::
Koalas support for Python 3.5 is deprecated and will be dropped in a future release.
At that point, existing Python 3.5 workflows that use Koalas will continue to work without
modification, but Python 3.5 users will no longer get access to the latest Koalas features
and bugfixes. We recommend that you upgrade to Python 3.6 or newer.
Installing Koalas
-----------------
Installing with Conda
~~~~~~~~~~~~~~~~~~~~~~
First you will need `Conda <http://conda.pydata.org/docs/>`__ to be installed.
After that, we should create a new conda environment. A conda environment is similar to a
virtualenv that allows you to specify a specific version of Python and a set of libraries.
Run the following commands from a terminal window::
conda create --name koalas-dev-env
This will create a minimal environment with only Python installed in it.
To put yourself inside this environment run::
conda activate koalas-dev-env
The final step required is to install Koalas. This can be done with the
following command::
conda install -c conda-forge koalas
To install a specific Koalas version::
conda install -c conda-forge koalas=1.3.0
Installing from PyPI
~~~~~~~~~~~~~~~~~~~~
Koalas can be installed via pip from
`PyPI <https://pypi.org/project/koalas>`__::
pip install koalas
Installing from source
~~~~~~~~~~~~~~~~~~~~~~
See the `Contribution Guide <../development/ps_contributing.rst#environment-setup>`__ for complete instructions.
Installing PySpark
------------------
Installing with the official release channel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You can install PySpark by downloading a release in `the official release channel <https://spark.apache.org/downloads.html>`__.
Once you download the release, un-tar it first as below::
tar xzvf spark-2.4.4-bin-hadoop2.7.tgz
After that, make sure to set the ``SPARK_HOME`` environment variable to point to the directory you untarred::
cd spark-2.4.4-bin-hadoop2.7
export SPARK_HOME=`pwd`
Also, make sure your ``PYTHONPATH`` can find the PySpark and Py4J libraries under ``$SPARK_HOME/python/lib``::
export PYTHONPATH=$(ZIPS=("$SPARK_HOME"/python/lib/*.zip); IFS=:; echo "${ZIPS[*]}"):$PYTHONPATH
Installing with Conda
~~~~~~~~~~~~~~~~~~~~~~
PySpark can be installed via `Conda <https://anaconda.org/conda-forge/pyspark>`__::
conda install -c conda-forge pyspark
Installing with PyPI
~~~~~~~~~~~~~~~~~~~~~~
PySpark can be installed via pip from `PyPI <https://pypi.org/project/pyspark>`__::
pip install pyspark
Installing from source
~~~~~~~~~~~~~~~~~~~~~~
To install PySpark from source, refer to `Building Spark <https://github.com/apache/spark#building-spark>`__.
Likewise, make sure you set the ``SPARK_HOME`` environment variable to the git-cloned directory, and that your
``PYTHONPATH`` environment variable can find the PySpark and Py4J libraries under ``$SPARK_HOME/python/lib``::
export PYTHONPATH=$(ZIPS=("$SPARK_HOME"/python/lib/*.zip); IFS=:; echo "${ZIPS[*]}"):$PYTHONPATH
Dependencies
------------
============= ================
Package Required version
============= ================
`pandas` >=0.23.2
`pyspark` >=2.4.0
`pyarrow` >=0.10
`numpy` >=1.14
============= ================
Optional dependencies
~~~~~~~~~~~~~~~~~~~~~
============= ================
Package Required version
============= ================
`mlflow` >=1.0
`plotly` >=4.8
`matplotlib` >=3.0.0,<3.3.0
============= ================


@ -0,0 +1,130 @@
======================
Koalas Talks and Blogs
======================
Blog Posts
----------
- `Interoperability between Koalas and Apache Spark (Aug 11, 2020) <https://databricks.com/blog/2020/08/11/interoperability-between-koalas-and-apache-spark.html>`_
- `Introducing Koalas 1.0 (Jun 24, 2020) <https://databricks.com/blog/2020/06/24/introducing-koalas-1-0.html>`_
- `10 Minutes from pandas to Koalas on Apache Spark (Mar 31, 2020) <https://databricks.com/blog/2020/03/31/10-minutes-from-pandas-to-koalas-on-apache-spark.html>`_
- `Guest Blog: How Virgin Hyperloop One Reduced Processing Time from Hours to Minutes with Koalas (Aug 22, 2019) <https://databricks.com/blog/2019/08/22/guest-blog-how-virgin-hyperloop-one-reduced-processing-time-from-hours-to-minutes-with-koalas.html>`_
- `Koalas: Easy Transition from pandas to Apache Spark (Apr 24, 2019) <https://databricks.com/blog/2019/04/24/koalas-easy-transition-from-pandas-to-apache-spark.html>`_
Data + AI Summit 2020 EUROPE (Nov 18-19, 2020)
----------------------------------------------
Project Zen: Making Spark Pythonic
==================================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/-vJLTEOdLvA" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
Koalas: Interoperability Between Koalas and Apache Spark
========================================================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/eI0Wh2Epo0Q" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
Spark + AI Summit 2020 (Jun 24, 2020)
-------------------------------------
Introducing Apache Spark 3.0: A retrospective of the Last 10 Years, and a Look Forward to the Next 10 Years to Come.
====================================================================================================================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/OLJKIogf2nU?start=555" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
Koalas: Making an Easy Transition from Pandas to Apache Spark
=============================================================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/G_-9VbyHcx8" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
Koalas: Pandas on Apache Spark
==============================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/iUpBSHoqzLM" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
Webinar @ Databricks (Mar 27, 2020)
-----------------------------------
Reducing Time-To-Insight for Virgin Hyperloop's Data
====================================================
.. raw:: html
<iframe width="560" height="315" src="https://player.vimeo.com/video/397032070" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>
PyData New York 2019 (Nov 4, 2019)
----------------------------------
Pandas vs Koalas: The Ultimate Showdown
=======================================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/xcGEQUURAuk?start=1470" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
Spark + AI Summit Europe 2019 (Oct 16, 2019)
--------------------------------------------
New Developments in the Open Source Ecosystem: Apache Spark 3.0, Delta Lake, and Koalas
=======================================================================================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/scM_WQMhB3A?start=1470" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
Koalas: Making an Easy Transition from Pandas to Apache Spark
=============================================================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/Wfj2Vuse7as" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
Koalas: Pandas on Apache Spark
==============================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/NpAMbzerAp0" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
PyBay 2019 (Aug 17, 2019)
-------------------------
Koalas Easy Transition from pandas to Apache Spark
==================================================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/cMDLoGkidEE?v=xcGEQUURAuk?start=1470" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
Spark + AI Summit 2019 (Apr 24, 2019)
-------------------------------------
Official Announcement of Koalas Open Source Project
===================================================
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/Shzb15DZ9Qg" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>


@ -38,6 +38,15 @@ Spark SQL is a Spark module for structured data processing. It provides
a programming abstraction called DataFrame and can also act as distributed
SQL query engine.
**pandas APIs on Spark**
pandas APIs on Spark allow you to scale your pandas workload out.
With this package, you can:
* Be immediately productive with Spark, with no learning curve, if you are already familiar with pandas.
* Have a single codebase that works both with pandas (tests, smaller datasets) and with Spark (distributed datasets).
* Switch between the pandas API and PySpark API contexts easily and without any overhead, as sketched below.
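A minimal sketch of that workflow (the file path is illustrative; assumes a running Spark session):

.. code-block:: python

    import pyspark.pandas as ps

    psdf = ps.read_csv("data.csv")         # pandas-style API, executed by Spark
    sdf = psdf.to_spark()                  # switch to the PySpark API
    psdf_again = sdf.to_pandas_on_spark()  # and back again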
**Streaming**
Running on top of Spark, the streaming feature in Apache Spark enables powerful


@ -32,3 +32,18 @@ This page lists an overview of all public PySpark modules, classes, functions an
pyspark.mllib
pyspark
pyspark.resource
For pandas APIs on Spark:
.. toctree::
:maxdepth: 2
ps_io
ps_general_functions
ps_series
ps_frame
ps_indexing
ps_window
ps_groupby
ps_ml
ps_extensions


@ -0,0 +1,21 @@
.. _api.extensions:
==========
Extensions
==========
.. currentmodule:: pyspark.pandas.extensions
Accessors
---------
Accessors can be written and registered with Koalas DataFrames, Series, and
Index objects. Accessors allow developers to extend the functionality of
Koalas objects seamlessly by writing arbitrary classes and methods which are
then wrapped in one of the following decorators.
.. autosummary::
:toctree: api/
register_dataframe_accessor
register_series_accessor
register_index_accessor
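For illustration, a hypothetical accessor registered with one of the decorators listed above (the accessor name and columns are made up):

.. code-block:: python

    import pyspark.pandas as ps
    from pyspark.pandas.extensions import register_dataframe_accessor

    @register_dataframe_accessor("geo")
    class GeoAccessor:
        def __init__(self, psdf):
            self._psdf = psdf

        @property
        def center(self):
            # Mean of two assumed columns, 'lat' and 'lon'.
            return self._psdf[["lat", "lon"]].mean()

    psdf = ps.DataFrame({"lat": [0.0, 10.0], "lon": [5.0, 15.0]})
    print(psdf.geo.center)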


@ -0,0 +1,327 @@
.. _api.dataframe:
=========
DataFrame
=========
.. currentmodule:: pyspark.pandas
Constructor
-----------
.. autosummary::
:toctree: api/
DataFrame
Attributes and underlying data
------------------------------
.. autosummary::
:toctree: api/
DataFrame.index
DataFrame.columns
DataFrame.empty
.. autosummary::
:toctree: api/
DataFrame.dtypes
DataFrame.shape
DataFrame.axes
DataFrame.ndim
DataFrame.size
DataFrame.select_dtypes
DataFrame.values
Conversion
----------
.. autosummary::
:toctree: api/
DataFrame.copy
DataFrame.isna
DataFrame.astype
DataFrame.isnull
DataFrame.notna
DataFrame.notnull
DataFrame.pad
DataFrame.bool
Indexing, iteration
-------------------
.. autosummary::
:toctree: api/
DataFrame.at
DataFrame.iat
DataFrame.head
DataFrame.idxmax
DataFrame.idxmin
DataFrame.loc
DataFrame.iloc
DataFrame.items
DataFrame.iteritems
DataFrame.iterrows
DataFrame.itertuples
DataFrame.keys
DataFrame.pop
DataFrame.tail
DataFrame.xs
DataFrame.get
DataFrame.where
DataFrame.mask
DataFrame.query
Binary operator functions
-------------------------
.. autosummary::
:toctree: api/
DataFrame.add
DataFrame.radd
DataFrame.div
DataFrame.rdiv
DataFrame.truediv
DataFrame.rtruediv
DataFrame.mul
DataFrame.rmul
DataFrame.sub
DataFrame.rsub
DataFrame.pow
DataFrame.rpow
DataFrame.mod
DataFrame.rmod
DataFrame.floordiv
DataFrame.rfloordiv
DataFrame.lt
DataFrame.gt
DataFrame.le
DataFrame.ge
DataFrame.ne
DataFrame.eq
DataFrame.dot
Function application, GroupBy & Window
--------------------------------------
.. autosummary::
:toctree: api/
DataFrame.apply
DataFrame.applymap
DataFrame.pipe
DataFrame.agg
DataFrame.aggregate
DataFrame.groupby
DataFrame.rolling
DataFrame.expanding
DataFrame.transform
.. _api.dataframe.stats:
Computations / Descriptive Stats
--------------------------------
.. autosummary::
:toctree: api/
DataFrame.abs
DataFrame.all
DataFrame.any
DataFrame.clip
DataFrame.corr
DataFrame.count
DataFrame.describe
DataFrame.kurt
DataFrame.kurtosis
DataFrame.mad
DataFrame.max
DataFrame.mean
DataFrame.min
DataFrame.median
DataFrame.pct_change
DataFrame.prod
DataFrame.product
DataFrame.quantile
DataFrame.nunique
DataFrame.sem
DataFrame.skew
DataFrame.sum
DataFrame.std
DataFrame.var
DataFrame.cummin
DataFrame.cummax
DataFrame.cumsum
DataFrame.cumprod
DataFrame.round
DataFrame.diff
DataFrame.eval
Reindexing / Selection / Label manipulation
-------------------------------------------
.. autosummary::
:toctree: api/
DataFrame.add_prefix
DataFrame.add_suffix
DataFrame.align
DataFrame.at_time
DataFrame.between_time
DataFrame.drop
DataFrame.droplevel
DataFrame.drop_duplicates
DataFrame.duplicated
DataFrame.equals
DataFrame.filter
DataFrame.first
DataFrame.head
DataFrame.last
DataFrame.rename
DataFrame.rename_axis
DataFrame.reset_index
DataFrame.set_index
DataFrame.swapaxes
DataFrame.swaplevel
DataFrame.take
DataFrame.isin
DataFrame.sample
DataFrame.truncate
.. _api.dataframe.missing:
Missing data handling
---------------------
.. autosummary::
:toctree: api/
DataFrame.backfill
DataFrame.dropna
DataFrame.fillna
DataFrame.replace
DataFrame.bfill
DataFrame.ffill
Reshaping, sorting, transposing
-------------------------------
.. autosummary::
:toctree: api/
DataFrame.pivot_table
DataFrame.pivot
DataFrame.sort_index
DataFrame.sort_values
DataFrame.nlargest
DataFrame.nsmallest
DataFrame.stack
DataFrame.unstack
DataFrame.melt
DataFrame.explode
DataFrame.squeeze
DataFrame.T
DataFrame.transpose
DataFrame.reindex
DataFrame.reindex_like
DataFrame.rank
Combining / joining / merging
-----------------------------
.. autosummary::
:toctree: api/
DataFrame.append
DataFrame.assign
DataFrame.merge
DataFrame.join
DataFrame.update
DataFrame.insert
Time series-related
-------------------
.. autosummary::
:toctree: api/
DataFrame.shift
DataFrame.first_valid_index
DataFrame.last_valid_index
Serialization / IO / Conversion
-------------------------------
.. autosummary::
:toctree: api/
DataFrame.from_records
DataFrame.info
DataFrame.to_table
DataFrame.to_delta
DataFrame.to_parquet
DataFrame.to_spark_io
DataFrame.to_csv
DataFrame.to_pandas
DataFrame.to_html
DataFrame.to_numpy
DataFrame.to_pandas_on_spark
DataFrame.to_spark
DataFrame.to_string
DataFrame.to_json
DataFrame.to_dict
DataFrame.to_excel
DataFrame.to_clipboard
DataFrame.to_markdown
DataFrame.to_records
DataFrame.to_latex
DataFrame.style
Spark-related
-------------
``DataFrame.spark`` provides features that do not exist in pandas but do exist
in Spark. These can be accessed by ``DataFrame.spark.<function/property>``.
.. autosummary::
:toctree: api/
DataFrame.spark.schema
DataFrame.spark.print_schema
DataFrame.spark.frame
DataFrame.spark.cache
DataFrame.spark.persist
DataFrame.spark.hint
DataFrame.spark.to_table
DataFrame.spark.to_spark_io
DataFrame.spark.explain
DataFrame.spark.apply
DataFrame.spark.repartition
DataFrame.spark.coalesce
DataFrame.spark.checkpoint
DataFrame.spark.local_checkpoint
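A short sketch of calling a few of the methods listed above (illustrative only):

.. code-block:: python

    import pyspark.pandas as ps

    psdf = ps.DataFrame({'id': [1, 2, 3]})

    psdf.spark.print_schema()  # print the underlying Spark schema
    psdf.spark.explain()       # show the Spark execution plan
    sdf = psdf.spark.frame()   # access the underlying Spark DataFrame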
.. _api.dataframe.plot:
Plotting
--------
``DataFrame.plot`` is both a callable method and a namespace attribute for
specific plotting methods of the form ``DataFrame.plot.<kind>``.
.. autosummary::
:toctree: api/
DataFrame.plot
DataFrame.plot.area
DataFrame.plot.barh
DataFrame.plot.bar
DataFrame.plot.hist
DataFrame.plot.line
DataFrame.plot.pie
DataFrame.plot.scatter
DataFrame.plot.density
DataFrame.hist
DataFrame.kde
Koalas-specific
---------------
``DataFrame.pandas_on_spark`` provides Koalas-specific features that exist only in Koalas.
These can be accessed by ``DataFrame.pandas_on_spark.<function/property>``.
.. autosummary::
:toctree: api/
DataFrame.pandas_on_spark.attach_id_column
DataFrame.pandas_on_spark.apply_batch
DataFrame.pandas_on_spark.transform_batch


@ -0,0 +1,49 @@
.. _api.general_functions:
=================
General functions
=================
.. currentmodule:: pyspark.pandas
Working with options
--------------------
.. autosummary::
:toctree: api/
reset_option
get_option
set_option
option_context
Data manipulations and SQL
--------------------------
.. autosummary::
:toctree: api/
melt
merge
get_dummies
concat
sql
broadcast
Top-level missing data
----------------------
.. autosummary::
:toctree: api/
to_numeric
isna
isnull
notna
notnull
Top-level dealing with datetimelike
-----------------------------------
.. autosummary::
:toctree: api/
to_datetime
date_range


@ -0,0 +1,88 @@
.. _api.groupby:
=======
GroupBy
=======
.. currentmodule:: pyspark.pandas
GroupBy objects are returned by groupby calls: :func:`DataFrame.groupby`, :func:`Series.groupby`, etc.
.. currentmodule:: pyspark.pandas.groupby
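For example, a minimal sketch of a groupby aggregation (the column names are arbitrary):

.. code-block:: python

    import pyspark.pandas as ps

    psdf = ps.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 3]})

    grouped = psdf.groupby('key')  # a DataFrameGroupBy object
    print(grouped.sum())           # aggregate per group

    print(psdf.groupby('key')['val'].nlargest(1))  # a SeriesGroupBy method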
Indexing, iteration
-------------------
.. autosummary::
:toctree: api/
GroupBy.get_group
Function application
--------------------
.. autosummary::
:toctree: api/
GroupBy.apply
GroupBy.transform
The following methods are available only for `DataFrameGroupBy` objects.
.. autosummary::
:toctree: api/
DataFrameGroupBy.agg
DataFrameGroupBy.aggregate
Computations / Descriptive Stats
--------------------------------
.. autosummary::
:toctree: api/
GroupBy.all
GroupBy.any
GroupBy.count
GroupBy.cumcount
GroupBy.cummax
GroupBy.cummin
GroupBy.cumprod
GroupBy.cumsum
GroupBy.filter
GroupBy.first
GroupBy.last
GroupBy.max
GroupBy.mean
GroupBy.median
GroupBy.min
GroupBy.rank
GroupBy.std
GroupBy.sum
GroupBy.var
GroupBy.nunique
GroupBy.size
GroupBy.diff
GroupBy.idxmax
GroupBy.idxmin
GroupBy.fillna
GroupBy.bfill
GroupBy.ffill
GroupBy.head
GroupBy.backfill
GroupBy.shift
GroupBy.tail
The following methods are available only for `DataFrameGroupBy` objects.
.. autosummary::
:toctree: api/
DataFrameGroupBy.describe
The following methods are available only for `SeriesGroupBy` objects.
.. autosummary::
:toctree: api/
SeriesGroupBy.nsmallest
SeriesGroupBy.nlargest
SeriesGroupBy.value_counts
SeriesGroupBy.unique


@ -0,0 +1,359 @@
.. _api.indexing:
=============
Index objects
=============
Index
-----
.. currentmodule:: pyspark.pandas
.. autosummary::
:toctree: api/
Index
Properties
~~~~~~~~~~
.. autosummary::
:toctree: api/
Index.is_monotonic
Index.is_monotonic_increasing
Index.is_monotonic_decreasing
Index.is_unique
Index.has_duplicates
Index.hasnans
Index.dtype
Index.inferred_type
Index.is_all_dates
Index.shape
Index.name
Index.names
Index.ndim
Index.size
Index.nlevels
Index.empty
Index.T
Index.values
Modifying and computations
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Index.all
Index.any
Index.argmin
Index.argmax
Index.copy
Index.delete
Index.equals
Index.factorize
Index.identical
Index.insert
Index.is_boolean
Index.is_categorical
Index.is_floating
Index.is_integer
Index.is_interval
Index.is_numeric
Index.is_object
Index.drop
Index.drop_duplicates
Index.min
Index.max
Index.rename
Index.repeat
Index.take
Index.unique
Index.nunique
Index.value_counts
Compatibility with MultiIndex
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Index.set_names
Index.droplevel
Missing Values
~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Index.fillna
Index.dropna
Index.isna
Index.notna
Conversion
~~~~~~~~~~
.. autosummary::
:toctree: api/
Index.astype
Index.item
Index.to_list
Index.to_series
Index.to_frame
Index.view
Index.to_numpy
Spark-related
-------------
``Index.spark`` provides features that do not exist in pandas but do exist
in Spark. These can be accessed by ``Index.spark.<function/property>``.
.. autosummary::
:toctree: api/
Index.spark.data_type
Index.spark.column
Index.spark.transform
Sorting
~~~~~~~
.. autosummary::
:toctree: api/
Index.sort_values
Time-specific operations
~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Index.shift
Combining / joining / set operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Index.append
Index.intersection
Index.union
Index.difference
Index.symmetric_difference
Selecting
~~~~~~~~~
.. autosummary::
:toctree: api/
Index.asof
Index.isin
.. _api.numeric:
Numeric Index
-------------
.. autosummary::
:toctree: api/
Int64Index
Float64Index
.. _api.categorical:
CategoricalIndex
----------------
.. autosummary::
:toctree: api/
CategoricalIndex
Categorical components
~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
CategoricalIndex.codes
CategoricalIndex.categories
CategoricalIndex.ordered
.. _api.multiindex:
MultiIndex
----------
.. autosummary::
:toctree: api/
MultiIndex
MultiIndex Constructors
~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
MultiIndex.from_arrays
MultiIndex.from_tuples
MultiIndex.from_product
MultiIndex.from_frame
MultiIndex Properties
~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
MultiIndex.has_duplicates
MultiIndex.hasnans
MultiIndex.inferred_type
MultiIndex.is_all_dates
MultiIndex.shape
MultiIndex.names
MultiIndex.ndim
MultiIndex.empty
MultiIndex.T
MultiIndex.size
MultiIndex.nlevels
MultiIndex.levshape
MultiIndex.values
MultiIndex components
~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
MultiIndex.swaplevel
MultiIndex.droplevel
MultiIndex Missing Values
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
MultiIndex.fillna
MultiIndex.dropna
MultiIndex Modifying and computations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
MultiIndex.equals
MultiIndex.identical
MultiIndex.insert
MultiIndex.drop
MultiIndex.copy
MultiIndex.delete
MultiIndex.rename
MultiIndex.repeat
MultiIndex.take
MultiIndex.unique
MultiIndex.min
MultiIndex.max
MultiIndex.value_counts
MultiIndex Combining / joining / set operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
MultiIndex.append
MultiIndex.intersection
MultiIndex.union
MultiIndex.difference
MultiIndex.symmetric_difference
MultiIndex Conversion
~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
MultiIndex.astype
MultiIndex.item
MultiIndex.to_list
MultiIndex.to_series
MultiIndex.to_frame
MultiIndex.view
MultiIndex.to_numpy
MultiIndex Spark-related
------------------------
``MultiIndex.spark`` provides features that do not exist in pandas but do exist
in Spark. These can be accessed by ``MultiIndex.spark.<function/property>``.
.. autosummary::
:toctree: api/
MultiIndex.spark.data_type
MultiIndex.spark.column
MultiIndex.spark.transform
MultiIndex Sorting
~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
MultiIndex.sort_values
.. _api.datetimes:
DatetimeIndex
-------------
.. autosummary::
:toctree: api/
DatetimeIndex
Time/date components
~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
DatetimeIndex.year
DatetimeIndex.month
DatetimeIndex.day
DatetimeIndex.hour
DatetimeIndex.minute
DatetimeIndex.second
DatetimeIndex.microsecond
DatetimeIndex.week
DatetimeIndex.weekofyear
DatetimeIndex.dayofweek
DatetimeIndex.day_of_week
DatetimeIndex.weekday
DatetimeIndex.dayofyear
DatetimeIndex.day_of_year
DatetimeIndex.quarter
DatetimeIndex.is_month_start
DatetimeIndex.is_month_end
DatetimeIndex.is_quarter_start
DatetimeIndex.is_quarter_end
DatetimeIndex.is_year_start
DatetimeIndex.is_year_end
DatetimeIndex.is_leap_year
DatetimeIndex.daysinmonth
DatetimeIndex.days_in_month
Selecting
~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
DatetimeIndex.indexer_between_time
DatetimeIndex.indexer_at_time
Time-specific operations
~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
DatetimeIndex.normalize
DatetimeIndex.strftime
DatetimeIndex.round
DatetimeIndex.floor
DatetimeIndex.ceil
DatetimeIndex.month_name
DatetimeIndex.day_name


@ -0,0 +1,103 @@
.. _api.io:
============
Input/Output
============
.. currentmodule:: pyspark.pandas
Data Generator
--------------
.. autosummary::
:toctree: api/
range
Spark Metastore Table
---------------------
.. autosummary::
:toctree: api/
read_table
DataFrame.to_table
Delta Lake
----------
.. autosummary::
:toctree: api/
read_delta
DataFrame.to_delta
Parquet
-------
.. autosummary::
:toctree: api/
read_parquet
DataFrame.to_parquet
ORC
-------
.. autosummary::
:toctree: api/
read_orc
DataFrame.to_orc
Generic Spark I/O
-----------------
.. autosummary::
:toctree: api/
read_spark_io
DataFrame.to_spark_io
Flat File / CSV
---------------
.. autosummary::
:toctree: api/
read_csv
DataFrame.to_csv
Clipboard
---------
.. autosummary::
:toctree: api/
read_clipboard
DataFrame.to_clipboard
Excel
-----
.. autosummary::
:toctree: api/
read_excel
DataFrame.to_excel
JSON
----
.. autosummary::
:toctree: api/
read_json
DataFrame.to_json
HTML
----
.. autosummary::
:toctree: api/
read_html
DataFrame.to_html
SQL
---
.. autosummary::
:toctree: api/
read_sql_table
read_sql_query
read_sql


@ -0,0 +1,28 @@
.. _api.ml:
==========================
Machine Learning utilities
==========================
.. currentmodule:: pyspark.pandas.mlflow
MLflow
------
Arbitrary MLflow models can be used with Koalas DataFrames,
provided they implement the 'pyfunc' flavor. This is the case
for most frameworks supported by MLflow (scikit-learn, pytorch,
tensorflow, ...). See comprehensive examples in
:func:`load_model` for more information.
.. note::
The MLflow package must be installed in order to use this module.
If MLflow is not installed in your environment already, you
can install it with the following command:
**pip install koalas[mlflow]**
.. autosummary::
:toctree: api/
PythonModelWrapper
load_model


@ -0,0 +1,454 @@
.. _api.series:
======
Series
======
.. currentmodule:: pyspark.pandas
Constructor
-----------
.. autosummary::
:toctree: api/
Series
Attributes
----------
.. autosummary::
:toctree: api/
Series.index
Series.dtype
Series.dtypes
Series.ndim
Series.name
Series.shape
Series.axes
Series.size
Series.empty
Series.T
Series.hasnans
Series.values
Conversion
----------
.. autosummary::
:toctree: api/
Series.astype
Series.copy
Series.bool
Indexing, iteration
-------------------
.. autosummary::
:toctree: api/
Series.at
Series.iat
Series.loc
Series.iloc
Series.keys
Series.pop
Series.items
Series.iteritems
Series.item
Series.xs
Series.get
Binary operator functions
-------------------------
.. autosummary::
:toctree: api/
Series.add
Series.div
Series.mul
Series.radd
Series.rdiv
Series.rmul
Series.rsub
Series.rtruediv
Series.sub
Series.truediv
Series.pow
Series.rpow
Series.mod
Series.rmod
Series.floordiv
Series.rfloordiv
Series.divmod
Series.rdivmod
Series.combine_first
Series.lt
Series.gt
Series.le
Series.ge
Series.ne
Series.eq
Series.product
Series.dot
Function application, GroupBy & Window
--------------------------------------
.. autosummary::
:toctree: api/
Series.apply
Series.agg
Series.aggregate
Series.transform
Series.map
Series.groupby
Series.rolling
Series.expanding
Series.pipe
.. _api.series.stats:
Computations / Descriptive Stats
--------------------------------
.. autosummary::
:toctree: api/
Series.abs
Series.all
Series.any
Series.between
Series.clip
Series.corr
Series.count
Series.cummax
Series.cummin
Series.cumsum
Series.cumprod
Series.describe
Series.filter
Series.kurt
Series.mad
Series.max
Series.mean
Series.min
Series.mode
Series.nlargest
Series.nsmallest
Series.pct_change
Series.prod
Series.nunique
Series.is_unique
Series.quantile
Series.rank
Series.sem
Series.skew
Series.std
Series.sum
Series.median
Series.var
Series.kurtosis
Series.unique
Series.value_counts
Series.round
Series.diff
Series.is_monotonic
Series.is_monotonic_increasing
Series.is_monotonic_decreasing
Reindexing / Selection / Label manipulation
-------------------------------------------
.. autosummary::
:toctree: api/
Series.align
Series.drop
Series.droplevel
Series.drop_duplicates
Series.equals
Series.add_prefix
Series.add_suffix
Series.first
Series.head
Series.idxmax
Series.idxmin
Series.isin
Series.last
Series.rename
Series.rename_axis
Series.reindex
Series.reindex_like
Series.reset_index
Series.sample
Series.swaplevel
Series.swapaxes
Series.take
Series.tail
Series.where
Series.mask
Series.truncate
Missing data handling
---------------------
.. autosummary::
:toctree: api/
Series.backfill
Series.bfill
Series.isna
Series.isnull
Series.notna
Series.notnull
Series.pad
Series.dropna
Series.fillna
Reshaping, sorting, transposing
-------------------------------
.. autosummary::
:toctree: api/
Series.argsort
Series.argmin
Series.argmax
Series.sort_index
Series.sort_values
Series.unstack
Series.explode
Series.repeat
Series.squeeze
Series.factorize
Combining / joining / merging
-----------------------------
.. autosummary::
:toctree: api/
Series.append
Series.compare
Series.replace
Series.update
Time series-related
-------------------
.. autosummary::
:toctree: api/
Series.asof
Series.shift
Series.first_valid_index
Series.last_valid_index
Series.at_time
Series.between_time
Spark-related
-------------
``Series.spark`` provides features that do not exist in pandas but do exist
in Spark. These can be accessed by ``Series.spark.<function/property>``.
.. autosummary::
:toctree: api/
Series.spark.data_type
Series.spark.nullable
Series.spark.column
Series.spark.transform
Series.spark.apply
Accessors
---------
Koalas provides dtype-specific methods under various accessors.
These are separate namespaces within :class:`Series` that only apply
to specific data types.
=========== ===========================
Data Type Accessor
=========== ===========================
Datetime :ref:`dt <api.series.dt>`
String :ref:`str <api.series.str>`
Categorical :ref:`cat <api.series.cat>`
=========== ===========================
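A short sketch of using the accessors listed above (illustrative values):

.. code-block:: python

    import pyspark.pandas as ps

    s = ps.Series(['spark', 'pandas', 'koalas'])
    print(s.str.upper())        # string accessor

    dates = ps.to_datetime(ps.Series(['2021-06-04', '2021-06-05']))
    print(dates.dt.day_name())  # datetime accessor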
.. _api.series.dt:
Date Time Handling
------------------
``Series.dt`` can be used to access the values of the series as
datetimelike and return several properties.
These can be accessed like ``Series.dt.<property>``.
Datetime Properties
~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.dt.date
Series.dt.year
Series.dt.month
Series.dt.day
Series.dt.hour
Series.dt.minute
Series.dt.second
Series.dt.microsecond
Series.dt.week
Series.dt.weekofyear
Series.dt.dayofweek
Series.dt.weekday
Series.dt.dayofyear
Series.dt.quarter
Series.dt.is_month_start
Series.dt.is_month_end
Series.dt.is_quarter_start
Series.dt.is_quarter_end
Series.dt.is_year_start
Series.dt.is_year_end
Series.dt.is_leap_year
Series.dt.daysinmonth
Series.dt.days_in_month
Datetime Methods
~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.dt.normalize
Series.dt.strftime
Series.dt.round
Series.dt.floor
Series.dt.ceil
Series.dt.month_name
Series.dt.day_name
.. _api.series.str:
String Handling
---------------
``Series.str`` can be used to access the values of the series as
strings and apply several methods to it. These can be accessed
like ``Series.str.<function/property>``.
.. autosummary::
:toctree: api/
Series.str.capitalize
Series.str.cat
Series.str.center
Series.str.contains
Series.str.count
Series.str.decode
Series.str.encode
Series.str.endswith
Series.str.extract
Series.str.extractall
Series.str.find
Series.str.findall
Series.str.get
Series.str.get_dummies
Series.str.index
Series.str.isalnum
Series.str.isalpha
Series.str.isdigit
Series.str.isspace
Series.str.islower
Series.str.isupper
Series.str.istitle
Series.str.isnumeric
Series.str.isdecimal
Series.str.join
Series.str.len
Series.str.ljust
Series.str.lower
Series.str.lstrip
Series.str.match
Series.str.normalize
Series.str.pad
Series.str.partition
Series.str.repeat
Series.str.replace
Series.str.rfind
Series.str.rindex
Series.str.rjust
Series.str.rpartition
Series.str.rsplit
Series.str.rstrip
Series.str.slice
Series.str.slice_replace
Series.str.split
Series.str.startswith
Series.str.strip
Series.str.swapcase
Series.str.title
Series.str.translate
Series.str.upper
Series.str.wrap
Series.str.zfill
.. _api.series.cat:
Categorical accessor
--------------------
Categorical-dtype specific methods and attributes are available under
the ``Series.cat`` accessor.
.. autosummary::
:toctree: api/
Series.cat.categories
Series.cat.ordered
Series.cat.codes
.. _api.series.plot:
Plotting
-------------------------------
``Series.plot`` is both a callable method and a namespace attribute for
specific plotting methods of the form ``Series.plot.<kind>``.
.. autosummary::
:toctree: api/
Series.plot
Series.plot.area
Series.plot.bar
Series.plot.barh
Series.plot.box
Series.plot.density
Series.plot.hist
Series.plot.line
Series.plot.pie
Series.plot.kde
Series.hist
Serialization / IO / Conversion
-------------------------------
.. autosummary::
:toctree: api/
Series.to_pandas
Series.to_numpy
Series.to_list
Series.to_string
Series.to_dict
Series.to_clipboard
Series.to_latex
Series.to_markdown
Series.to_json
Series.to_csv
Series.to_excel
Series.to_frame
Koalas-specific
---------------
``Series.pandas_on_spark`` provides Koalas-specific features that exist only in Koalas.
These can be accessed by ``Series.pandas_on_spark.<function/property>``.
.. autosummary::
:toctree: api/
Series.pandas_on_spark.transform_batch


@ -0,0 +1,31 @@
======
Window
======
.. currentmodule:: pyspark.pandas.window
Rolling objects are returned by ``.rolling`` calls: :func:`pandas_on_spark.DataFrame.rolling`, :func:`pandas_on_spark.Series.rolling`, etc.
Expanding objects are returned by ``.expanding`` calls: :func:`pandas_on_spark.DataFrame.expanding`, :func:`pandas_on_spark.Series.expanding`, etc.
Standard moving window functions
--------------------------------
.. autosummary::
:toctree: api/
Rolling.count
Rolling.sum
Rolling.min
Rolling.max
Rolling.mean
Standard expanding window functions
-----------------------------------
.. autosummary::
:toctree: api/
Expanding.count
Expanding.sum
Expanding.min
Expanding.max
Expanding.mean
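A minimal sketch of the rolling and expanding windows listed above (illustrative values):

.. code-block:: python

    import pyspark.pandas as ps

    s = ps.Series([1, 2, 3, 4, 5])

    print(s.rolling(2).mean())   # moving average over a window of size 2
    print(s.expanding(2).sum())  # cumulative sum once at least 2 values exist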


@ -32,7 +32,21 @@ at `the Spark documentation <https://spark.apache.org/docs/latest/index.html#whe
PySpark specific user guide is as follows:
.. toctree::
:maxdepth: 2
:maxdepth: 2
python_packaging
arrow_pandas
python_packaging
arrow_pandas
For pandas APIs on Spark:
.. toctree::
:maxdepth: 2
ps_options
ps_pandas_pyspark
ps_transform_apply
ps_types
ps_typehints
ps_from_to_dbms
ps_best_practices
ps_faq


@ -0,0 +1,313 @@
==============
Best Practices
==============
Leverage PySpark APIs
---------------------
Koalas uses Spark under the hood; therefore, many features and performance optimizations are available
in Koalas as well. Leverage and combine these cutting-edge features with Koalas.
Existing Spark contexts and Spark sessions are used out of the box in Koalas. If you already have your own
configured Spark context or session running, Koalas uses it.
If there is no Spark context or session running in your environment (e.g., an ordinary Python interpreter),
such configurations can be set for the ``SparkContext`` and/or ``SparkSession``.
Once a Spark context and/or session is created, Koalas can use it automatically.
For example, if you want to configure the executor memory in Spark, you can do as below:
.. code-block:: python
from pyspark import SparkConf, SparkContext
conf = SparkConf()
conf.set('spark.executor.memory', '2g')
# Koalas automatically uses this Spark context with the configurations set.
SparkContext(conf=conf)
import pyspark.pandas as ks
...
Another common configuration might be Arrow optimization in PySpark. As it is a SQL configuration,
it can be set on the Spark session as below:
.. code-block:: python
from pyspark.sql import SparkSession
builder = SparkSession.builder.appName("Koalas")
builder = builder.config("spark.sql.execution.arrow.enabled", "true")
# Koalas automatically uses this Spark session with the configurations set.
builder.getOrCreate()
import pyspark.pandas as ks
...
All Spark features such as the history server, web UI, and deployment modes can be used as they are with Koalas.
If you are interested in performance tuning, please see also `Tuning Spark <https://spark.apache.org/docs/latest/tuning.html>`_.
Check execution plans
---------------------
Expensive operations can be predicted by leveraging the PySpark API `DataFrame.spark.explain()`
before the actual computation, since Koalas is based on lazy execution. For example, see below.
.. code-block:: python
>>> import pyspark.pandas as ks
>>> kdf = ks.DataFrame({'id': range(10)})
>>> kdf = kdf[kdf.id > 5]
>>> kdf.spark.explain()
== Physical Plan ==
*(1) Filter (id#1L > 5)
+- *(1) Scan ExistingRDD[__index_level_0__#0L,id#1L]
Whenever you are not sure about such cases, you can check the actual execution plans and
foresee the expensive cases.
Even though Koalas tries its best to optimize and reduce such shuffle operations by leveraging Spark
optimizers, it is best to avoid shuffling on the application side whenever possible.
Use checkpoint
--------------
After a bunch of operations on Koalas objects, the underlying Spark planner can slow down due to a huge and complex plan.
If the Spark plan becomes huge or planning takes a long time, ``DataFrame.spark.checkpoint()``
or ``DataFrame.spark.local_checkpoint()`` would be helpful.
.. code-block:: python
>>> import pyspark.pandas as ks
>>> kdf = ks.DataFrame({'id': range(10)})
>>> kdf = kdf[kdf.id > 5]
>>> kdf['id'] = kdf['id'] + (10 * kdf['id'] + kdf['id'])
>>> kdf = kdf.groupby('id').head(2)
>>> kdf.spark.explain()
== Physical Plan ==
*(3) Project [__index_level_0__#0L, id#31L]
+- *(3) Filter (isnotnull(__row_number__#44) AND (__row_number__#44 <= 2))
+- Window [row_number() windowspecdefinition(__groupkey_0__#36L, __natural_order__#16L ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS __row_number__#44], [__groupkey_0__#36L], [__natural_order__#16L ASC NULLS FIRST]
+- *(2) Sort [__groupkey_0__#36L ASC NULLS FIRST, __natural_order__#16L ASC NULLS FIRST], false, 0
+- Exchange hashpartitioning(__groupkey_0__#36L, 200), true, [id=#33]
+- *(1) Project [__index_level_0__#0L, (id#1L + ((id#1L * 10) + id#1L)) AS __groupkey_0__#36L, (id#1L + ((id#1L * 10) + id#1L)) AS id#31L, __natural_order__#16L]
+- *(1) Project [__index_level_0__#0L, id#1L, monotonically_increasing_id() AS __natural_order__#16L]
+- *(1) Filter (id#1L > 5)
+- *(1) Scan ExistingRDD[__index_level_0__#0L,id#1L]
>>> kdf = kdf.spark.local_checkpoint() # or kdf.spark.checkpoint()
>>> kdf.spark.explain()
== Physical Plan ==
*(1) Project [__index_level_0__#0L, id#31L]
+- *(1) Scan ExistingRDD[__index_level_0__#0L,id#31L,__natural_order__#59L]
As you can see, the previous Spark plan is dropped and execution starts from a simple plan.
The result of the previous DataFrame is stored in the configured file system when calling ``DataFrame.spark.checkpoint()``,
or in the executor when calling ``DataFrame.spark.local_checkpoint()``.
Avoid shuffling
---------------
Some operations such as ``sort_values`` are more difficult to do in a parallel or distributed
environment than in memory on a single machine because they need to send data to other nodes,
and exchange the data across multiple nodes via networks. See the example below.
.. code-block:: python
>>> import pyspark.pandas as ks
>>> kdf = ks.DataFrame({'id': range(10)}).sort_values(by="id")
>>> kdf.spark.explain()
== Physical Plan ==
*(2) Sort [id#9L ASC NULLS LAST], true, 0
+- Exchange rangepartitioning(id#9L ASC NULLS LAST, 200), true, [id=#18]
+- *(1) Scan ExistingRDD[__index_level_0__#8L,id#9L]
As you can see, it requires an ``Exchange``, which performs a shuffle, and that is likely to be expensive.
Avoid computation on single partition
-------------------------------------
Another common case is computation on a single partition. Currently, some APIs such as
`DataFrame.rank <https://koalas.readthedocs.io/en/latest/reference/api/pyspark.pandas.DataFrame.rank.html>`_
use PySpark's Window without specifying a partition specification. This moves all the data into a single
partition on a single machine and could cause serious performance degradation.
Such APIs should be avoided for very large datasets.
.. code-block:: python
>>> import pyspark.pandas as ks
>>> kdf = ks.DataFrame({'id': range(10)})
>>> kdf.rank().spark.explain()
== Physical Plan ==
*(4) Project [__index_level_0__#16L, id#24]
+- Window [avg(cast(_w0#26 as bigint)) windowspecdefinition(id#17L, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS id#24], [id#17L]
+- *(3) Project [__index_level_0__#16L, _w0#26, id#17L]
+- Window [row_number() windowspecdefinition(id#17L ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _w0#26], [id#17L ASC NULLS FIRST]
+- *(2) Sort [id#17L ASC NULLS FIRST], false, 0
+- Exchange SinglePartition, true, [id=#48]
+- *(1) Scan ExistingRDD[__index_level_0__#16L,id#17L]
Instead, use
`GroupBy.rank <https://koalas.readthedocs.io/en/latest/reference/api/pyspark.pandas.groupby.GroupBy.rank.html>`_
as it is less expensive because data can be distributed and computed for each group.
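For instance, a minimal sketch (the data and column names here are only illustrative) is as below:

.. code-block:: python

    >>> import pyspark.pandas as ks
    >>> kdf = ks.DataFrame({'id': [1, 1, 2, 2], 'val': [3, 4, 5, 6]})
    >>> # Ranks are computed within each 'id' group, so the data can stay distributed
    >>> # instead of being moved into a single partition.
    >>> kdf.groupby('id').rank().sort_index()
       val
    0  1.0
    1  2.0
    2  1.0
    3  2.0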
Avoid reserved column names
---------------------------
Columns with leading ``__`` and trailing ``__`` are reserved in Koalas. To handle internal behaviors such as the index,
Koalas uses some internal columns. Therefore, using such column names is discouraged and not guaranteed to work.
Do not use duplicated column names
----------------------------------
Using duplicated column names is disallowed because Spark SQL does not allow it in general. Koalas inherits
this behavior. For instance, see below:
.. code-block:: python
>>> import pyspark.pandas as ks
>>> kdf = ks.DataFrame({'a': [1, 2], 'b':[3, 4]})
>>> kdf.columns = ["a", "a"]
...
Reference 'a' is ambiguous, could be: a, a.;
Additionally, it is strongly discouraged to use case-sensitive column names. Koalas disallows them by default.
.. code-block:: python
>>> import pyspark.pandas as ks
>>> kdf = ks.DataFrame({'a': [1, 2], 'A':[3, 4]})
...
Reference 'a' is ambiguous, could be: a, a.;
However, you can turn on ``spark.sql.caseSensitive`` in the Spark configuration to enable it, at your own risk.
.. code-block:: python
>>> from pyspark.sql import SparkSession
>>> builder = SparkSession.builder.appName("Koalas")
>>> builder = builder.config("spark.sql.caseSensitive", "true")
>>> builder.getOrCreate()
>>> import pyspark.pandas as ks
>>> kdf = ks.DataFrame({'a': [1, 2], 'A':[3, 4]})
>>> kdf
a A
0 1 3
1 2 4
Specify the index column in conversion from Spark DataFrame to Koalas DataFrame
-------------------------------------------------------------------------------
When a Koalas DataFrame is converted from a Spark DataFrame, it loses the index information, which results in using
the default index in the Koalas DataFrame. The default index is generally inefficient compared to explicitly specifying
the index column. Specify the index column whenever possible.
See `working with PySpark <pandas_pyspark.rst#pyspark>`_.
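For example, a minimal sketch (assuming an existing Spark DataFrame ``sdf`` that has an ``id`` column) is as below:

.. code-block:: python

    >>> # Reuse the existing 'id' column as the index instead of letting
    >>> # Koalas attach an inefficient default index.
    >>> kdf = sdf.to_koalas(index_col='id')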
Use ``distributed`` or ``distributed-sequence`` default index
-------------------------------------------------------------
One common issue Koalas users face is slow performance due to the default index. Koalas attaches
a default index when the index is unknown, for example, when a Spark DataFrame is directly converted to a Koalas DataFrame.
This default index is ``sequence``, which requires the computation on a single partition and is discouraged. If you plan
to handle large data in production, make it distributed by configuring the default index to ``distributed`` or
``distributed-sequence``.
See `Default Index Type <options.rst#default-index-type>`_ for more details about configuring the default index.
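For example, a minimal sketch of configuring it is as below:

.. code-block:: python

    >>> import pyspark.pandas as ks
    >>> # Make the default index distributed to avoid computation on a single partition.
    >>> ks.set_option('compute.default_index_type', 'distributed-sequence')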
Reduce the operations on different DataFrame/Series
---------------------------------------------------
Koalas disallows operations on different DataFrames (or Series) by default to prevent expensive operations.
Such operations internally perform a join, which can be expensive in general and is discouraged. Whenever possible,
they should be avoided.
See `Operations on different DataFrames <options.rst#operations-on-different-dataframes>`_ for more details.
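If you do need such operations, a minimal sketch of enabling them explicitly (and restoring the default afterwards) is as below:

.. code-block:: python

    >>> import pyspark.pandas as ks
    >>> # Explicitly allow operations between different DataFrames/Series,
    >>> # accepting that they may trigger an expensive join under the hood.
    >>> ks.set_option('compute.ops_on_diff_frames', True)
    >>> ...
    >>> ks.reset_option('compute.ops_on_diff_frames')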
Use Koalas APIs directly whenever possible
------------------------------------------
Although Koalas has most of the pandas-equivalent APIs, there are several APIs not implemented yet or explicitly unsupported.
As an example, Koalas does not implement ``__iter__()`` to prevent users from collecting all data into the client (driver) side from the whole cluster.
Unfortunately, many external APIs, such as the Python built-in functions ``min``, ``max``, ``sum``, etc., require the given argument to be iterable.
In the case of pandas, this works properly out of the box as below:
.. code-block:: python
>>> import pandas as pd
>>> max(pd.Series([1, 2, 3]))
3
>>> min(pd.Series([1, 2, 3]))
1
>>> sum(pd.Series([1, 2, 3]))
6
A pandas dataset lives on a single machine, and is naturally iterable locally within the same machine.
However, a Koalas dataset lives across multiple machines, and it is computed in a distributed manner.
It is difficult to make it locally iterable, and it is very likely that users collect the entire data into the client side without knowing it.
Therefore, it is best to stick to using Koalas APIs.
The examples above can be converted as below:
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.Series([1, 2, 3]).max()
3
>>> ks.Series([1, 2, 3]).min()
1
>>> ks.Series([1, 2, 3]).sum()
6
Another common pattern from pandas users might be to rely on list comprehensions or generator expressions.
However, these also assume the dataset is locally iterable under the hood.
Therefore, they work seamlessly in pandas as below:
.. code-block:: python
>>> import pandas as pd
>>> data = []
>>> countries = ['London', 'New York', 'Helsinki']
>>> pser = pd.Series([20., 21., 12.], index=countries)
>>> for temperature in pser:
... assert temperature > 0
... if temperature > 1000:
... temperature = None
... data.append(temperature ** 2)
...
>>> pd.Series(data, index=countries)
London 400.0
New York 441.0
Helsinki 144.0
dtype: float64
However, this does not work for Koalas, for the same reason as above.
The example above can also be changed to use Koalas APIs directly as below:
.. code-block:: python
>>> import pyspark.pandas as ks
>>> import numpy as np
>>> countries = ['London', 'New York', 'Helsinki']
>>> kser = ks.Series([20., 21., 12.], index=countries)
>>> def square(temperature) -> np.float64:
... assert temperature > 0
... if temperature > 1000:
... temperature = None
... return temperature ** 2
...
>>> kser.apply(square)
London 400.0
New York 441.0
Helsinki 144.0
dtype: float64

View file

@ -0,0 +1,86 @@
===
FAQ
===
What's the project's status?
----------------------------
Koalas 1.0.0 was released, and it is much more stable now.
You might still face the following differences:
- Most pandas-equivalent APIs are implemented, but some may still be missing.
  Please create a GitHub issue if your favorite function is not yet supported.
  We also document all APIs that are not yet supported in the `missing directory <https://github.com/databricks/koalas/tree/master/databricks/koalas/missing>`_.
- Some behaviors may be different, in particular in the treatment of nulls: pandas uses
  Not a Number (NaN) special constants to indicate missing values, while Spark has a
  special flag on each value to indicate missing values. We would love to hear from you
  if you come across any discrepancies.
- Because Spark is lazy in nature, some operations like creating new columns only get
performed when Spark needs to print or write the dataframe.
Is it Koalas or koalas?
-----------------------
It's Koalas. Unlike pandas, we use upper case here.
Should I use PySpark's DataFrame API or Koalas?
-----------------------------------------------
If you are already familiar with pandas and want to leverage Spark for big data, we recommend
using Koalas. If you are learning Spark from the ground up, we recommend you start with PySpark's API.
Does Koalas support Structured Streaming?
-----------------------------------------
No, Koalas does not officially support Structured Streaming.
As a workaround, you can use Koalas APIs with `foreachBatch` in Structured Streaming, which allows batch APIs:
.. code-block:: python
>>> def func(batch_df, batch_id):
... koalas_df = ks.DataFrame(batch_df)
... koalas_df['a'] = 1
... print(koalas_df)
>>> spark.readStream.format("rate").load().writeStream.foreachBatch(func).start()
timestamp value a
0 2020-02-21 09:49:37.574 4 1
timestamp value a
0 2020-02-21 09:49:38.574 5 1
...
How can I request support for a method?
---------------------------------------
File a GitHub issue: https://github.com/databricks/koalas/issues
Databricks customers are also welcome to file a support ticket to request a new feature.
How is Koalas different from Dask?
----------------------------------
Different projects have different focuses. Spark is already deployed in virtually every
organization, and often is the primary interface to the massive amount of data stored in data lakes.
Koalas was inspired by Dask, and aims to make the transition from pandas to Spark easy for data
scientists.
How can I contribute to Koalas?
-------------------------------
See `Contributing Guide <https://koalas.readthedocs.io/en/latest/development/contributing.html>`_.
Why a new project (instead of putting this in Apache Spark itself)?
-------------------------------------------------------------------
Two reasons:
1. We want a venue in which we can rapidly iterate and make new releases. The overhead of making a
   release as a separate project is minuscule (on the order of minutes). A release on Spark takes a
   lot longer (on the order of days).
2. Koalas takes a different approach that might contradict Spark's API design principles, and those
principles cannot be changed lightly given the large user base of Spark. A new, separate project
provides an opportunity for us to experiment with new design principles.

View file

@ -0,0 +1,107 @@
====================
From/to other DBMSes
====================
.. currentmodule:: pyspark.pandas
The APIs interacting with other DBMSes in Koalas are slightly different from the ones in pandas
because Koalas leverages JDBC APIs in PySpark to read and write from/to other DBMSes.
The APIs to read/write from/to external DBMSes are as follows:
.. autosummary::
read_sql_table
read_sql_query
read_sql
..
TODO: we should implement and document `DataFrame.to_sql`.
Koalas needs a canonical JDBC URL for ``con``, and is able to take extra keyword arguments for `the options in PySpark JDBC APIs <https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html>`_:
.. code-block:: python
ks.read_sql(..., dbtable="...", driver="", keytab="", ...)
Reading and writing DataFrames
------------------------------
In the example below, you will read and write a table in SQLite.
Firstly, create the ``example`` database as below via Python's SQLite library. This will be read into Koalas later:
.. code-block:: python
import sqlite3
con = sqlite3.connect('example.db')
cur = con.cursor()
# Create table
cur.execute(
'''CREATE TABLE stocks
(date text, trans text, symbol text, qty real, price real)''')
# Insert a row of data
cur.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")
# Save (commit) the changes
con.commit()
con.close()
Koalas requires a JDBC driver to read, so the driver for your particular database must be on Spark's classpath. For the SQLite JDBC driver, you can download it, for example, as below:
.. code-block:: bash
curl -O https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.34.0/sqlite-jdbc-3.34.0.jar
After that, you should add the driver to your Spark session first. Once you add it, Koalas will automatically detect the Spark session and leverage it.
.. code-block:: python
import os
from pyspark.sql import SparkSession
(SparkSession.builder
.master("local")
.appName("SQLite JDBC")
.config(
"spark.jars",
"{}/sqlite-jdbc-3.34.0.jar".format(os.getcwd()))
.config(
"spark.driver.extraClassPath",
"{}/sqlite-jdbc-3.34.0.jar".format(os.getcwd()))
.getOrCreate())
Now, you're ready to read the table:
.. code-block:: python
import pyspark.pandas as ks
df = ks.read_sql("stocks", con="jdbc:sqlite:{}/example.db".format(os.getcwd()))
df
.. code-block:: text
date trans symbol qty price
0 2006-01-05 BUY RHAT 100.0 35.14
You can also write it back to the ``stocks`` table as below:
..
TODO: switch to use DataFrame.to_sql in the example
.. code-block:: python
df.price += 1
df.to_spark_io(
format="jdbc", mode="append",
dbtable="stocks", url="jdbc:sqlite:{}/example.db".format(os.getcwd()))
ks.read_sql("stocks", con="jdbc:sqlite:{}/example.db".format(os.getcwd()))
.. code-block:: text
date trans symbol qty price
0 2006-01-05 BUY RHAT 100.0 35.14
1 2006-01-05 BUY RHAT 100.0 36.14

View file

@ -0,0 +1,274 @@
====================
Options and settings
====================
.. currentmodule:: pyspark.pandas
Koalas has an options system that lets you customize some aspects of its behaviour,
display-related options being those the user is most likely to adjust.
Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``).
You can get/set options directly as attributes of the top-level ``options`` attribute:
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.options.display.max_rows
1000
>>> ks.options.display.max_rows = 10
>>> ks.options.display.max_rows
10
The API is composed of 3 relevant functions, available directly from the ``koalas``
namespace:
* :func:`get_option` / :func:`set_option` - get/set the value of a single option.
* :func:`reset_option` - reset one or more options to their default value.
**Note:** Developers can check out `pyspark.pandas/config.py <https://github.com/databricks/koalas/blob/master/databricks/koalas/config.py>`_ for more information.
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.get_option("display.max_rows")
1000
>>> ks.set_option("display.max_rows", 101)
>>> ks.get_option("display.max_rows")
101
Getting and setting options
---------------------------
As described above, :func:`get_option` and :func:`set_option`
are available from the koalas namespace. To change an option, call
``set_option('option name', new_value)``.
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.get_option('compute.max_rows')
1000
>>> ks.set_option('compute.max_rows', 2000)
>>> ks.get_option('compute.max_rows')
2000
All options also have a default value, and you can use ``reset_option`` to revert to it:
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.reset_option("display.max_rows")
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.get_option("display.max_rows")
1000
>>> ks.set_option("display.max_rows", 999)
>>> ks.get_option("display.max_rows")
999
>>> ks.reset_option("display.max_rows")
>>> ks.get_option("display.max_rows")
1000
The ``option_context`` context manager has been exposed through
the top-level API, allowing you to execute code with given option values. Option values
are restored automatically when you exit the `with` block:
.. code-block:: python
>>> with ks.option_context("display.max_rows", 10, "compute.max_rows", 5):
... print(ks.get_option("display.max_rows"))
... print(ks.get_option("compute.max_rows"))
10
5
>>> print(ks.get_option("display.max_rows"))
>>> print(ks.get_option("compute.max_rows"))
1000
1000
Operations on different DataFrames
----------------------------------
Koalas disallows operations on different DataFrames (or Series) by default to prevent expensive
operations. It internally performs a join operation, which can be expensive in general.
Such operations can be enabled by setting `compute.ops_on_diff_frames` to `True`.
See the examples below.
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.set_option('compute.ops_on_diff_frames', True)
>>> kdf1 = ks.range(5)
>>> kdf2 = ks.DataFrame({'id': [5, 4, 3]})
>>> (kdf1 - kdf2).sort_index()
id
0 -5.0
1 -3.0
2 -1.0
3 NaN
4 NaN
>>> ks.reset_option('compute.ops_on_diff_frames')
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.set_option('compute.ops_on_diff_frames', True)
>>> kdf = ks.range(5)
>>> kser_a = ks.Series([1, 2, 3, 4])
>>> # 'kser_a' is not from 'kdf' DataFrame. So it is considered as a Series not from 'kdf'.
>>> kdf['new_col'] = kser_a
>>> kdf
id new_col
0 0 1.0
1 1 2.0
3 3 4.0
2 2 3.0
4 4 NaN
>>> ks.reset_option('compute.ops_on_diff_frames')
Default Index type
------------------
In Koalas, the default index is used in several cases, for instance,
when a Spark DataFrame is converted into a Koalas DataFrame. In this case, Koalas internally attaches a
default index to the Koalas DataFrame.
There are several types of the default index that can be configured by `compute.default_index_type` as below:
**sequence**: It implements a sequence that increases one by one, using PySpark's Window function without
specifying a partition. Therefore, it can end up with the whole dataset in a single partition on a single node.
This index type should be avoided when the data is large. This is the default. See the example below:
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.set_option('compute.default_index_type', 'sequence')
>>> kdf = ks.range(3)
>>> ks.reset_option('compute.default_index_type')
>>> kdf.index
Int64Index([0, 1, 2], dtype='int64')
This is conceptually equivalent to the PySpark example as below:
.. code-block:: python
>>> from pyspark.sql import functions as F, Window
>>> import pyspark.pandas as ks
>>> spark_df = ks.range(3).to_spark()
>>> sequential_index = F.row_number().over(
... Window.orderBy(F.monotonically_increasing_id().asc())) - 1
>>> spark_df.select(sequential_index).rdd.map(lambda r: r[0]).collect()
[0, 1, 2]
**distributed-sequence**: It implements a sequence that increases one by one, using a group-by and
group-map approach in a distributed manner. It still generates a sequential index globally.
If the default index must be a sequence in a large dataset, this
index type has to be used.
Note that if more data are added to the data source after creating this index,
the sequential index is not guaranteed to be preserved. See the example below:
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.set_option('compute.default_index_type', 'distributed-sequence')
>>> kdf = ks.range(3)
>>> ks.reset_option('compute.default_index_type')
>>> kdf.index
Int64Index([0, 1, 2], dtype='int64')
This is conceptually equivalent to the PySpark example as below:
.. code-block:: python
>>> import pyspark.pandas as ks
>>> spark_df = ks.range(3).to_spark()
>>> spark_df.rdd.zipWithIndex().map(lambda p: p[1]).collect()
[0, 1, 2]
**distributed**: It implements a monotonically increasing sequence simply by using
PySpark's `monotonically_increasing_id` function in a fully distributed manner. The
values are non-deterministic. If the index does not have to be a sequence that increases
one by one, this index should be used. Performance-wise, this index has almost no
penalty compared to other index types. See the example below:
.. code-block:: python
>>> import pyspark.pandas as ks
>>> ks.set_option('compute.default_index_type', 'distributed')
>>> kdf = ks.range(3)
>>> ks.reset_option('compute.default_index_type')
>>> kdf.index
Int64Index([25769803776, 60129542144, 94489280512], dtype='int64')
This is conceptually equivalent to the PySpark example as below:
.. code-block:: python
>>> from pyspark.sql import functions as F
>>> import pyspark.pandas as ks
>>> spark_df = ks.range(3).to_spark()
>>> spark_df.select(F.monotonically_increasing_id()) \
... .rdd.map(lambda r: r[0]).collect()
[25769803776, 60129542144, 94489280512]
.. warning::
    It is very unlikely that this type of index can be used for computing on two
    different DataFrames because it is not guaranteed to have the same index values in the two DataFrames.
    If you use this default index and turn on `compute.ops_on_diff_frames`, the result
    of operations between two different DataFrames will likely be unexpected
    due to the non-deterministic index values.
Available options
-----------------
=============================== ============== =====================================================
Option Default Description
=============================== ============== =====================================================
display.max_rows 1000 This sets the maximum number of rows Koalas should
output when printing out various output. For example,
this value determines the number of rows to be shown
at the repr() in a dataframe. Set `None` to unlimit
the input length. Default is 1000.
compute.max_rows 1000 'compute.max_rows' sets the limit of the current
Koalas DataFrame. Set `None` to unlimit the input
length. When the limit is set, it is executed by the
shortcut by collecting the data into the driver, and
then using the pandas API. If the limit is unset, the
operation is executed by PySpark. Default is 1000.
compute.shortcut_limit 1000 'compute.shortcut_limit' sets the limit for a
shortcut. It computes specified number of rows and
use its schema. When the dataframe length is larger
than this limit, Koalas uses PySpark to compute.
compute.ops_on_diff_frames False This determines whether or not to operate between two
different dataframes. For example, 'combine_frames'
function internally performs a join operation which
can be expensive in general. So, if
`compute.ops_on_diff_frames` variable is not True,
that method throws an exception.
compute.default_index_type 'sequence' This sets the default index type: sequence,
distributed and distributed-sequence.
compute.ordered_head False 'compute.ordered_head' sets whether or not to operate
head with natural ordering. Koalas does not guarantee
the row ordering so `head` could return some rows
from distributed partitions. If
'compute.ordered_head' is set to True, Koalas
performs natural ordering beforehand, but it will
cause a performance overhead.
plotting.max_rows 1000 'plotting.max_rows' sets the visual limit on top-n-
based plots such as `plot.bar` and `plot.pie`. If it
is set to 1000, the first 1000 data points will be
used for plotting. Default is 1000.
plotting.sample_ratio None 'plotting.sample_ratio' sets the proportion of data
that will be plotted for sample-based plots such as
`plot.line` and `plot.area`. This option defaults to
'plotting.max_rows' option.
plotting.backend 'plotly' Backend to use for plotting. Default is plotly.
Supports any package that has a top-level `.plot`
method. Known options are: [matplotlib, plotly].
=============================== ============== =====================================================

View file

@ -0,0 +1,118 @@
===============================
Working with pandas and PySpark
===============================
.. currentmodule:: pyspark.pandas
Users coming from pandas and/or PySpark sometimes face API compatibility issues when they
work with Koalas. Since Koalas does not target 100% compatibility with either pandas or
PySpark, users need to work around incompatibilities to port their pandas and/or PySpark code, or
get familiar with Koalas in this area. This page aims to describe how.
pandas
------
pandas users can access the full pandas API by calling :func:`DataFrame.to_pandas`.
A Koalas DataFrame and a pandas DataFrame are similar. However, the former is distributed
and the latter is on a single machine. When converting to each other, the data is
transferred between multiple machines and the single client machine.
For example, if you need to call ``pandas_df.values`` of pandas DataFrame, you can do
as below:
.. code-block:: python
>>> import pyspark.pandas as ks
>>>
>>> kdf = ks.range(10)
>>> pdf = kdf.to_pandas()
>>> pdf.values
array([[0],
[1],
[2],
[3],
[4],
[5],
[6],
[7],
[8],
[9]])
A pandas DataFrame can be converted to a Koalas DataFrame easily as below:
.. code-block:: python
>>> ks.from_pandas(pdf)
id
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
Note that converting a Koalas DataFrame to pandas requires collecting all the data into the client machine; therefore,
if possible, it is recommended to use Koalas or PySpark APIs instead.
PySpark
-------
PySpark users can access the full PySpark API by calling :func:`DataFrame.to_spark`.
Koalas DataFrame and Spark DataFrame are virtually interchangeable.
For example, if you need to call ``spark_df.filter(...)`` of Spark DataFrame, you can do
as below:
.. code-block:: python
>>> import pyspark.pandas as ks
>>>
>>> kdf = ks.range(10)
>>> sdf = kdf.to_spark().filter("id > 5")
>>> sdf.show()
+---+
| id|
+---+
| 6|
| 7|
| 8|
| 9|
+---+
A Spark DataFrame can be converted to a Koalas DataFrame easily as below:
.. code-block:: python
>>> sdf.to_koalas()
id
0 6
1 7
2 8
3 9
However, note that a new default index is created when a Koalas DataFrame is created from
a Spark DataFrame. See `Default Index Type <options.rst#default-index-type>`_. In order to avoid this overhead, specify the column
to use as an index when possible.
.. code-block:: python
>>> # Create a Koalas DataFrame with an explicit index.
... kdf = ks.DataFrame({'id': range(10)}, index=range(10))
>>> # Keep the explicit index.
... sdf = kdf.to_spark(index_col='index')
>>> # Call Spark APIs
... sdf = sdf.filter("id > 5")
>>> # Uses the explicit index to avoid to create default index.
... sdf.to_koalas(index_col='index')
id
index
6 6
7 7
8 8
9 9

View file

@ -0,0 +1,121 @@
==============================
Transform and apply a function
==============================
.. NOTE: the images are stored at https://github.com/databricks/koalas/issues/1443. Feel free to edit and/or add.
.. currentmodule:: pyspark.pandas
There are many APIs that allow users to apply a function against a Koalas DataFrame, such as
:func:`DataFrame.transform`, :func:`DataFrame.apply`, :func:`DataFrame.koalas.transform_batch`,
:func:`DataFrame.koalas.apply_batch`, :func:`Series.koalas.transform_batch`, etc. Each has a distinct
purpose and works differently internally. This section describes the differences among
them, which often confuse users.
``transform`` and ``apply``
---------------------------
The main difference between :func:`DataFrame.transform` and :func:`DataFrame.apply` is that the former requires
the function to return the same length as its input, while the latter does not require this. See the example below:
.. code-block:: python
>>> kdf = ks.DataFrame({'a': [1,2,3], 'b':[4,5,6]})
>>> def pandas_plus(pser):
... return pser + 1 # should always return the same length as input.
...
>>> kdf.transform(pandas_plus)
.. code-block:: python
>>> kdf = ks.DataFrame({'a': [1,2,3], 'b':[5,6,7]})
>>> def pandas_plus(pser):
... return pser[pser % 2 == 1] # allows an arbitrary length
...
>>> kdf.apply(pandas_plus)
In this case, each function takes a pandas Series, and Koalas computes the functions in a distributed manner as below.
.. image:: https://user-images.githubusercontent.com/6477701/80076790-a1cf0680-8587-11ea-8b08-8dc694071ba0.png
:alt: transform and apply
:align: center
:width: 550
In the case of the 'columns' axis, the function takes each row as a pandas Series.
.. code-block:: python
>>> kdf = ks.DataFrame({'a': [1,2,3], 'b':[4,5,6]})
>>> def pandas_plus(pser):
... return sum(pser) # allows an arbitrary length
...
>>> kdf.apply(pandas_plus, axis='columns')
The example above calculates the summation of each row as a pandas Series. See below:
.. image:: https://user-images.githubusercontent.com/6477701/80076898-c2975c00-8587-11ea-9b2c-69c9729e9294.png
:alt: apply axis
:align: center
:width: 600
In the examples above, the type hints were omitted for simplicity, but using them is encouraged to avoid a performance penalty.
Please refer to the API documentation.
``koalas.transform_batch`` and ``koalas.apply_batch``
-----------------------------------------------------
In :func:`DataFrame.koalas.transform_batch`, :func:`DataFrame.koalas.apply_batch`, :func:`Series.koalas.transform_batch`, etc., the ``batch``
postfix means each chunk in a Koalas DataFrame or Series. The APIs slice the Koalas DataFrame or Series, and
then apply the given function with a pandas DataFrame or Series as input and output. See the examples below:
.. code-block:: python
>>> kdf = ks.DataFrame({'a': [1,2,3], 'b':[4,5,6]})
>>> def pandas_plus(pdf):
... return pdf + 1 # should always return the same length as input.
...
>>> kdf.koalas.transform_batch(pandas_plus)
.. code-block:: python
>>> kdf = ks.DataFrame({'a': [1,2,3], 'b':[4,5,6]})
>>> def pandas_plus(pdf):
... return pdf[pdf.a > 1] # allow arbitrary length
...
>>> kdf.koalas.apply_batch(pandas_plus)
The functions in both examples take a pandas DataFrame as a chunk of the Koalas DataFrame, and output a pandas DataFrame.
Koalas combines the pandas DataFrames into a Koalas DataFrame.
Note that :func:`DataFrame.koalas.transform_batch` has a length restriction - the lengths of input and output should be
the same - whereas :func:`DataFrame.koalas.apply_batch` does not. However, it is important to know that
when :func:`DataFrame.koalas.transform_batch` returns a Series, the output belongs to the same DataFrame, and
you can avoid a shuffle from operations between different DataFrames. In the case of :func:`DataFrame.koalas.apply_batch`, its output is always
treated as belonging to a new, different DataFrame. See also
`Operations on different DataFrames <options.rst#operations-on-different-dataframes>`_ for more details.
.. image:: https://user-images.githubusercontent.com/6477701/80076779-9f6cac80-8587-11ea-8c92-07d7b992733b.png
:alt: koalas.transform_batch and koalas.apply_batch in Frame
:align: center
:width: 650
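For example, a minimal sketch of this pattern (the function returns a pandas Series of the same length, so the result can be assigned back without a shuffle) is as below:

.. code-block:: python

    >>> kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    >>> def pandas_plus(pdf):
    ...     return pdf.b + 1  # returns a pandas Series of the same length as the input
    ...
    >>> # The output belongs to the same DataFrame, so no join/shuffle is required.
    >>> kdf['c'] = kdf.koalas.transform_batch(pandas_plus)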
:func:`Series.koalas.transform_batch` is also similar to :func:`DataFrame.koalas.transform_batch`; however, it takes
a pandas Series as a chunk of the Koalas Series.
.. code-block:: python
>>> kdf = ks.DataFrame({'a': [1,2,3], 'b':[4,5,6]})
>>> def pandas_plus(pser):
... return pser + 1 # should always return the same length as input.
...
>>> kdf.a.koalas.transform_batch(pandas_plus)
Under the hood, the Koalas Series is split into multiple pandas Series (as batches), and each function computes on them as below:
.. image:: https://user-images.githubusercontent.com/6477701/80076795-a3003380-8587-11ea-8b73-186e4047f8c0.png
:alt: koalas.transform_batch in Series
:width: 350
:align: center
There are more details, such as the type inference and how to prevent its performance penalty. Please refer to the API documentation.

View file

@ -0,0 +1,137 @@
====================
Type Hints In Koalas
====================
.. currentmodule:: pyspark.pandas
Koalas, by default, infers the schema by taking some top records from the output,
in particular, when you use APIs that allow users to apply a function against a Koalas DataFrame
such as :func:`DataFrame.transform`, :func:`DataFrame.apply`, :func:`DataFrame.koalas.transform_batch`,
:func:`DataFrame.koalas.apply_batch`, :func:`Series.koalas.transform_batch`, etc.
However, this is potentially expensive. If there are several expensive operations such as a shuffle
in the upstream of the execution plan, Koalas will end up executing the Spark job twice, once
for schema inference, and once for processing the actual data with the schema.
To avoid this, Koalas has its own type hinting style to specify the schema and skip
schema inference. Koalas understands the type hints specified in the return type and converts them
into a Spark schema for the pandas UDFs used internally. The way of type hinting has evolved over
time.
This chapter covers the recommended way and the supported ways in detail.
.. note::
The variadic generics support is experimental and unstable in Koalas.
The way of typing can change between minor releases without a warning.
See also `PEP 646 <https://www.python.org/dev/peps/pep-0646/>`_ for variadic generics in Python.
Koalas DataFrame and Pandas DataFrame
-------------------------------------
In early Koalas versions, specifying a type hint in the function was introduced in order to use
it as a Spark schema. As an example, you can specify the return type hint as below by using the Koalas
:class:`DataFrame`.
.. code-block:: python
>>> def pandas_div(pdf) -> ks.DataFrame[float, float]:
... # pdf is a pandas DataFrame.
... return pdf[['B', 'C']] / pdf[['B', 'C']]
...
>>> df = ks.DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 2, 3], 'C': [4, 6, 5]})
>>> df.groupby('A').apply(pandas_div)
The function ``pandas_div`` actually takes and outputs a pandas DataFrame instead of a Koalas :class:`DataFrame`.
However, Koalas had to force users to set the mismatched type hints.
From Koalas 1.0 with Python 3.7+, you can now specify the type hints by using pandas instances.
.. code-block:: python
>>> def pandas_div(pdf) -> pd.DataFrame[float, float]:
... # pdf is a pandas DataFrame.
... return pdf[['B', 'C']] / pdf[['B', 'C']]
...
>>> df = ks.DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 2, 3], 'C': [4, 6, 5]})
>>> df.groupby('A').apply(pandas_div)
Likewise, a pandas Series can also be used as a type hint:
.. code-block:: python
>>> def sqrt(x) -> pd.Series[float]:
... return np.sqrt(x)
...
>>> df = ks.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
>>> df.apply(sqrt, axis=0)
Currently, both Koalas and pandas instances can be used to specify the type hints; however, Koalas
plans to move gradually towards using pandas instances only as the stability becomes proven.
Type Hinting with Names
-----------------------
In Koalas 1.0, a new style of type hinting was introduced to overcome the limitations in the existing type
hinting, especially for DataFrame. When you use a DataFrame as the return type hint, for example,
``DataFrame[int, int]``, there is no way to specify the names of each Series. In the old way, Koalas just generates
the column names as ``c#``, and this easily leads users to lose or forget the Series mappings. See the example below:
.. code-block:: python
>>> def transform(pdf) -> pd.DataFrame[int, int]:
... pdf['A'] = pdf.id + 1
... return pdf
...
>>> ks.range(5).koalas.apply_batch(transform)
.. code-block:: bash
c0 c1
0 0 1
1 1 2
2 2 3
3 3 4
4 4 5
The new style of type hinting in Koalas is similar to the regular Python type hints on variables. The Series name
is specified as a string, and the type is specified after a colon. The following example shows a simple case with
the Series names, ``id`` and ``A``, and ``int`` types respectively.
.. code-block:: python
>>> def transform(pdf) -> pd.DataFrame["id": int, "A": int]:
... pdf['A'] = pdf.id + 1
... return pdf
...
>>> ks.range(5).koalas.apply_batch(transform)
.. code-block:: bash
id A
0 0 1
1 1 2
2 2 3
3 3 4
4 4 5
In addition, Koalas also dynamically supports ``dtype`` instances and the column index in pandas so that users can
programmatically generate the return type and schema.
.. code-block:: python
>>> def transform(pdf) -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
... return pdf + 1
...
>>> kdf.koalas.apply_batch(transform)
Likewise, ``dtype`` instances from a pandas DataFrame can be used alone to let Koalas generate column names.
.. code-block:: python
>>> def transform(pdf) -> pd.DataFrame[pdf.dtypes]:
... return pdf + 1
...
>>> kdf.koalas.apply_batch(transform)

View file

@ -0,0 +1,228 @@
======================
Type Support In Koalas
======================
.. currentmodule:: pyspark.pandas
In this chapter, we will briefly show you how data types change when converting Koalas DataFrame from/to PySpark DataFrame or pandas DataFrame.
Type casting between PySpark and Koalas
---------------------------------------
When converting a Koalas DataFrame from/to a PySpark DataFrame, the data types are automatically cast to the appropriate type.
The example below shows how data types are cast from a PySpark DataFrame to a Koalas DataFrame.
.. code-block:: python
# 1. Create a PySpark DataFrame
>>> sdf = spark.createDataFrame([
... (1, Decimal(1.0), 1., 1., 1, 1, 1, datetime(2020, 10, 27), "1", True, datetime(2020, 10, 27)),
... ], 'tinyint tinyint, decimal decimal, float float, double double, integer integer, long long, short short, timestamp timestamp, string string, boolean boolean, date date')
# 2. Check the PySpark data types
>>> sdf
DataFrame[tinyint: tinyint, decimal: decimal(10,0), float: float, double: double, integer: int, long: bigint, short: smallint, timestamp: timestamp, string: string, boolean: boolean, date: date]
# 3. Convert PySpark DataFrame to Koalas DataFrame
>>> kdf = sdf.to_koalas()
# 4. Check the Koalas data types
>>> kdf.dtypes
tinyint int8
decimal object
float float32
double float64
integer int32
long int64
short int16
timestamp datetime64[ns]
string object
boolean bool
date object
dtype: object
The example below shows how data types are cast from a Koalas DataFrame to a PySpark DataFrame.
.. code-block:: python
# 1. Create a Koalas DataFrame
>>> kdf = ks.DataFrame({"int8": [1], "bool": [True], "float32": [1.0], "float64": [1.0], "int32": [1], "int64": [1], "int16": [1], "datetime": [datetime.datetime(2020, 10, 27)], "object_string": ["1"], "object_decimal": [decimal.Decimal("1.1")], "object_date": [datetime.date(2020, 10, 27)]})
# 2. Type casting by using `astype`
>>> kdf['int8'] = kdf['int8'].astype('int8')
>>> kdf['int16'] = kdf['int16'].astype('int16')
>>> kdf['int32'] = kdf['int32'].astype('int32')
>>> kdf['float32'] = kdf['float32'].astype('float32')
# 3. Check the Koalas data types
>>> kdf.dtypes
int8 int8
bool bool
float32 float32
float64 float64
int32 int32
int64 int64
int16 int16
datetime datetime64[ns]
object_string object
object_decimal object
object_date object
dtype: object
# 4. Convert Koalas DataFrame to PySpark DataFrame
>>> sdf = kdf.to_spark()
# 5. Check the PySpark data types
>>> sdf
DataFrame[int8: tinyint, bool: boolean, float32: float, float64: double, int32: int, int64: bigint, int16: smallint, datetime: timestamp, object_string: string, object_decimal: decimal(2,1), object_date: date]
Type casting between pandas and Koalas
--------------------------------------
When converting a Koalas DataFrame to a pandas DataFrame, the data types are basically the same as in pandas.
.. code-block:: python
# Convert Koalas DataFrame to pandas DataFrame
>>> pdf = kdf.to_pandas()
# Check the pandas data types
>>> pdf.dtypes
int8 int8
bool bool
float32 float32
float64 float64
int32 int32
int64 int64
int16 int16
datetime datetime64[ns]
object_string object
object_decimal object
object_date object
dtype: object
However, there are several data types only provided by pandas.
.. code-block:: python
    # pd.Categorical type is not supported in Koalas yet.
>>> ks.Series([pd.Categorical([1, 2, 3])])
Traceback (most recent call last):
...
pyarrow.lib.ArrowInvalid: Could not convert [1, 2, 3]
Categories (3, int64): [1, 2, 3] with type Categorical: did not recognize Python value type when inferring an Arrow data type
The pandas-specific data types below are not currently supported in Koalas, but are planned to be supported:
* pd.Timedelta
* pd.Categorical
* pd.CategoricalDtype
The pandas-specific data types below are not planned to be supported in Koalas yet:
* pd.SparseDtype
* pd.DatetimeTZDtype
* pd.UInt*Dtype
* pd.BooleanDtype
* pd.StringDtype
Internal type mapping
---------------------
The table below shows which NumPy data types are matched to which PySpark data types internally in Koalas.
============= =======================
NumPy PySpark
============= =======================
np.character BinaryType
np.bytes\_ BinaryType
np.string\_ BinaryType
np.int8 ByteType
np.byte ByteType
np.int16 ShortType
np.int32 IntegerType
np.int64 LongType
np.int LongType
np.float32 FloatType
np.float DoubleType
np.float64 DoubleType
np.str StringType
np.unicode\_ StringType
np.bool BooleanType
np.datetime64 TimestampType
np.ndarray ArrayType(StringType())
============= =======================
The table below shows which Python data types are matched to which PySpark data types internally in Koalas.
================= ===================
Python PySpark
================= ===================
bytes BinaryType
int LongType
float DoubleType
str StringType
bool BooleanType
datetime.datetime TimestampType
datetime.date DateType
decimal.Decimal DecimalType(38, 18)
================= ===================
For the decimal type, Koalas uses Spark's system default precision and scale.
You can check this mapping by using the `as_spark_type` function.
.. code-block:: python
>>> import typing
>>> import numpy as np
>>> from pyspark.pandas.typedef import as_spark_type
>>> as_spark_type(int)
LongType
>>> as_spark_type(np.int32)
IntegerType
>>> as_spark_type(typing.List[float])
ArrayType(DoubleType,true)
You can also check the underlying PySpark data type of a `Series` or the schema of a `DataFrame` by using the Spark accessor.
.. code-block:: python
>>> ks.Series([0.3, 0.1, 0.8]).spark.data_type
DoubleType
>>> ks.Series(["welcome", "to", "Koalas"]).spark.data_type
StringType
>>> ks.Series([[False, True, False]]).spark.data_type
ArrayType(BooleanType,true)
>>> ks.DataFrame({"d": [0.3, 0.1, 0.8], "s": ["welcome", "to", "Koalas"], "b": [False, True, False]}).spark.print_schema()
root
|-- d: double (nullable = false)
|-- s: string (nullable = false)
|-- b: boolean (nullable = false)
.. note::
    Koalas currently does not support multiple types of data in a single column.
.. code-block:: python
>>> ks.Series([1, "A"])
Traceback (most recent call last):
...
TypeError: an integer is required (got type str)

View file

@ -752,7 +752,7 @@ class PandasOnSparkSeriesMethods(object):
See Also
--------
DataFrame.pandas_on_spark.apply_batch : Similar but it takes pandas DataFrame as its
internal batch.
internal batch.
Examples
--------