spark-instrumented-optimizer/python/pyspark/pandas/strings.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
String functions on pandas-on-Spark Series
"""
from typing import Union, TYPE_CHECKING, cast, Optional, List

import numpy as np

from pyspark.sql.types import StringType, BinaryType, ArrayType, LongType, MapType
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType

from pyspark.pandas.spark import functions as SF

if TYPE_CHECKING:
    import pyspark.pandas as ps  # noqa: F401 (SPARK-34943)


class StringMethods(object):
    """String methods for pandas-on-Spark Series"""

    def __init__(self, series: "ps.Series"):
        if not isinstance(series.spark.data_type, (StringType, BinaryType, ArrayType)):
            raise ValueError("Cannot call StringMethods on type {}".format(series.spark.data_type))
        self._data = series

    # Methods
    def capitalize(self) -> "ps.Series":
        """
        Convert Strings in the series to be capitalized.

        Examples
        --------
        >>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
        >>> s
        0                 lower
        1              CAPITALS
        2    this is a sentence
        3              SwApCaSe
        dtype: object

        >>> s.str.capitalize()
        0                 Lower
        1              Capitals
        2    This is a sentence
        3              Swapcase
        dtype: object
        """

        def pandas_capitalize(s) -> "ps.Series[str]":
            return s.str.capitalize()

        return self._data.koalas.transform_batch(pandas_capitalize)

    def title(self) -> "ps.Series":
        """
        Convert Strings in the series to be titlecase.

        Examples
        --------
        >>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
        >>> s
        0                 lower
        1              CAPITALS
        2    this is a sentence
        3              SwApCaSe
        dtype: object

        >>> s.str.title()
        0                 Lower
        1              Capitals
        2    This Is A Sentence
        3              Swapcase
        dtype: object
        """

        def pandas_title(s) -> "ps.Series[str]":
            return s.str.title()

        return self._data.koalas.transform_batch(pandas_title)

    def lower(self) -> "ps.Series":
        """
        Convert strings in the Series/Index to all lowercase.

        Examples
        --------
        >>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
        >>> s
        0                 lower
        1              CAPITALS
        2    this is a sentence
        3              SwApCaSe
        dtype: object

        >>> s.str.lower()
        0                 lower
        1              capitals
        2    this is a sentence
        3              swapcase
        dtype: object
        """
        return self._data.spark.transform(F.lower)

    def upper(self) -> "ps.Series":
        """
        Convert strings in the Series/Index to all uppercase.

        Examples
        --------
        >>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
        >>> s
        0                 lower
        1              CAPITALS
        2    this is a sentence
        3              SwApCaSe
        dtype: object

        >>> s.str.upper()
        0                 LOWER
        1              CAPITALS
        2    THIS IS A SENTENCE
        3              SWAPCASE
        dtype: object
        """
        return self._data.spark.transform(F.upper)

    def swapcase(self) -> "ps.Series":
        """
        Convert strings in the Series/Index to be swapcased.

        Examples
        --------
        >>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
        >>> s
        0                 lower
        1              CAPITALS
        2    this is a sentence
        3              SwApCaSe
        dtype: object

        >>> s.str.swapcase()
        0                 LOWER
        1              capitals
        2    THIS IS A SENTENCE
        3              sWaPcAsE
        dtype: object
        """

        def pandas_swapcase(s) -> "ps.Series[str]":
            return s.str.swapcase()

        return self._data.koalas.transform_batch(pandas_swapcase)

    def startswith(self, pattern, na=None) -> "ps.Series":
        """
        Test if the start of each string element matches a pattern.

        Equivalent to :func:`str.startswith`.

        Parameters
        ----------
        pattern : str
            Character sequence. Regular expressions are not accepted.
        na : object, default None
            Object shown if element is not a string. NaN converted to None.

        Returns
        -------
        Series of bool or object
            pandas-on-Spark Series of booleans indicating whether the given pattern
            matches the start of each string element.

        Examples
        --------
        >>> s = ps.Series(['bat', 'Bear', 'cat', np.nan])
        >>> s
        0     bat
        1    Bear
        2     cat
        3    None
        dtype: object

        >>> s.str.startswith('b')
        0     True
        1    False
        2    False
        3     None
        dtype: object

        Specifying na to be False instead of None.

        >>> s.str.startswith('b', na=False)
        0     True
        1    False
        2    False
        3    False
        dtype: bool
        """

        def pandas_startswith(s) -> "ps.Series[bool]":
            return s.str.startswith(pattern, na)

        return self._data.koalas.transform_batch(pandas_startswith)

    def endswith(self, pattern, na=None) -> "ps.Series":
        """
        Test if the end of each string element matches a pattern.

        Equivalent to :func:`str.endswith`.

        Parameters
        ----------
        pattern : str
            Character sequence. Regular expressions are not accepted.
        na : object, default None
            Object shown if element is not a string. NaN converted to None.

        Returns
        -------
        Series of bool or object
            pandas-on-Spark Series of booleans indicating whether the given pattern
            matches the end of each string element.

        Examples
        --------
        >>> s = ps.Series(['bat', 'Bear', 'cat', np.nan])
        >>> s
        0     bat
        1    Bear
        2     cat
        3    None
        dtype: object

        >>> s.str.endswith('t')
        0     True
        1    False
        2     True
        3     None
        dtype: object

        Specifying na to be False instead of None.

        >>> s.str.endswith('t', na=False)
        0     True
        1    False
        2     True
        3    False
        dtype: bool
        """

        def pandas_endswith(s) -> "ps.Series[bool]":
            return s.str.endswith(pattern, na)

        return self._data.koalas.transform_batch(pandas_endswith)

    def strip(self, to_strip=None) -> "ps.Series":
        """
        Remove leading and trailing characters.

        Strip whitespaces (including newlines) or a set of specified
        characters from each string in the Series/Index from left and
        right sides. Equivalent to :func:`str.strip`.

        Parameters
        ----------
        to_strip : str
            Specifying the set of characters to be removed. All combinations
            of this set of characters will be stripped. If None then
            whitespaces are removed.

        Returns
        -------
        Series of objects

        Examples
        --------
        >>> s = ps.Series(['1. Ant.', '2. Bee!\\t', None])
        >>> s
        0      1. Ant.
        1    2. Bee!\\t
        2         None
        dtype: object

        >>> s.str.strip()
        0    1. Ant.
        1    2. Bee!
        2       None
        dtype: object

        >>> s.str.strip('12.')
        0        Ant
        1     Bee!\\t
        2       None
        dtype: object

        >>> s.str.strip('.!\\t')
        0    1. Ant
        1    2. Bee
        2      None
        dtype: object
        """

        def pandas_strip(s) -> "ps.Series[str]":
            return s.str.strip(to_strip)

        return self._data.koalas.transform_batch(pandas_strip)

    def lstrip(self, to_strip=None) -> "ps.Series":
        """
        Remove leading characters.

        Strip whitespaces (including newlines) or a set of specified
        characters from each string in the Series/Index from left side.
        Equivalent to :func:`str.lstrip`.

        Parameters
        ----------
        to_strip : str
            Specifying the set of characters to be removed. All combinations
            of this set of characters will be stripped. If None then
            whitespaces are removed.

        Returns
        -------
        Series of object

        Examples
        --------
        >>> s = ps.Series(['1. Ant.', '2. Bee!\\t', None])
        >>> s
        0      1. Ant.
        1    2. Bee!\\t
        2         None
        dtype: object

        >>> s.str.lstrip('12.')
        0       Ant.
        1     Bee!\\t
        2       None
        dtype: object
        """

        def pandas_lstrip(s) -> "ps.Series[str]":
            return s.str.lstrip(to_strip)

        return self._data.koalas.transform_batch(pandas_lstrip)

    def rstrip(self, to_strip=None) -> "ps.Series":
        """
        Remove trailing characters.

        Strip whitespaces (including newlines) or a set of specified
        characters from each string in the Series/Index from right side.
        Equivalent to :func:`str.rstrip`.

        Parameters
        ----------
        to_strip : str
            Specifying the set of characters to be removed. All combinations
            of this set of characters will be stripped. If None then
            whitespaces are removed.

        Returns
        -------
        Series of object

        Examples
        --------
        >>> s = ps.Series(['1. Ant.', '2. Bee!\\t', None])
        >>> s
        0      1. Ant.
        1    2. Bee!\\t
        2         None
        dtype: object

        >>> s.str.rstrip('.!\\t')
        0    1. Ant
        1    2. Bee
        2      None
        dtype: object
        """

        def pandas_rstrip(s) -> "ps.Series[str]":
            return s.str.rstrip(to_strip)

        return self._data.koalas.transform_batch(pandas_rstrip)

    def get(self, i) -> "ps.Series":
        """
        Extract element from each string or string list/tuple in the Series
        at the specified position.

        Parameters
        ----------
        i : int
            Position of element to extract.

        Returns
        -------
        Series of objects

        Examples
        --------
        >>> s1 = ps.Series(["String", "123"])
        >>> s1
        0    String
        1       123
        dtype: object

        >>> s1.str.get(1)
        0    t
        1    2
        dtype: object

        >>> s1.str.get(-1)
        0    g
        1    3
        dtype: object

        >>> s2 = ps.Series([["a", "b", "c"], ["x", "y"]])
        >>> s2
        0    [a, b, c]
        1       [x, y]
        dtype: object

        >>> s2.str.get(0)
        0    a
        1    x
        dtype: object

        >>> s2.str.get(2)
        0       c
        1    None
        dtype: object
        """

        def pandas_get(s) -> "ps.Series[str]":
            return s.str.get(i)

        return self._data.koalas.transform_batch(pandas_get)

    def isalnum(self) -> "ps.Series":
        """
        Check whether all characters in each string are alphanumeric.

        This is equivalent to running the Python string method
        :func:`str.isalnum` for each element of the Series/Index.
        If a string has zero characters, False is returned for that check.

        Examples
        --------
        >>> s1 = ps.Series(['one', 'one1', '1', ''])

        >>> s1.str.isalnum()
        0     True
        1     True
        2     True
        3    False
        dtype: bool

        Note that checks against characters mixed with any additional
        punctuation or whitespace will evaluate to false for an alphanumeric
        check.

        >>> s2 = ps.Series(['A B', '1.5', '3,000'])
        >>> s2.str.isalnum()
        0    False
        1    False
        2    False
        dtype: bool
        """

        def pandas_isalnum(s) -> "ps.Series[bool]":
            return s.str.isalnum()

        return self._data.koalas.transform_batch(pandas_isalnum)

    def isalpha(self) -> "ps.Series":
        """
        Check whether all characters in each string are alphabetic.

        This is equivalent to running the Python string method
        :func:`str.isalpha` for each element of the Series/Index.
        If a string has zero characters, False is returned for that check.

        Examples
        --------
        >>> s1 = ps.Series(['one', 'one1', '1', ''])

        >>> s1.str.isalpha()
        0     True
        1    False
        2    False
        3    False
        dtype: bool
        """

        def pandas_isalpha(s) -> "ps.Series[bool]":
            return s.str.isalpha()

        return self._data.koalas.transform_batch(pandas_isalpha)

    def isdigit(self) -> "ps.Series":
        """
        Check whether all characters in each string are digits.

        This is equivalent to running the Python string method
        :func:`str.isdigit` for each element of the Series/Index.
        If a string has zero characters, False is returned for that check.

        Examples
        --------
        >>> s = ps.Series(['23', '³', '⅕', ''])

        The s.str.isdecimal method checks for characters used to form numbers
        in base 10.

        >>> s.str.isdecimal()
        0     True
        1    False
        2    False
        3    False
        dtype: bool

        The s.str.isdigit method is the same as s.str.isdecimal but also
        includes special digits, like superscripted and subscripted digits in
        unicode.

        >>> s.str.isdigit()
        0     True
        1     True
        2    False
        3    False
        dtype: bool

        The s.str.isnumeric method is the same as s.str.isdigit but also
        includes other characters that can represent quantities such as unicode
        fractions.

        >>> s.str.isnumeric()
        0     True
        1     True
        2     True
        3    False
        dtype: bool
        """

        def pandas_isdigit(s) -> "ps.Series[bool]":
            return s.str.isdigit()

        return self._data.koalas.transform_batch(pandas_isdigit)

    def isspace(self) -> "ps.Series":
        """
        Check whether all characters in each string are whitespaces.

        This is equivalent to running the Python string method
        :func:`str.isspace` for each element of the Series/Index.
        If a string has zero characters, False is returned for that check.

        Examples
        --------
        >>> s = ps.Series([' ', '\\t\\r\\n ', ''])
        >>> s.str.isspace()
        0     True
        1     True
        2    False
        dtype: bool
        """

        def pandas_isspace(s) -> "ps.Series[bool]":
            return s.str.isspace()

        return self._data.koalas.transform_batch(pandas_isspace)

    def islower(self) -> "ps.Series":
        """
        Check whether all characters in each string are lowercase.

        This is equivalent to running the Python string method
        :func:`str.islower` for each element of the Series/Index.
        If a string has zero characters, False is returned for that check.

        Examples
        --------
        >>> s = ps.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
        >>> s.str.islower()
        0     True
        1    False
        2    False
        3    False
        dtype: bool
        """

        def pandas_isspace(s) -> "ps.Series[bool]":
            return s.str.islower()

        return self._data.koalas.transform_batch(pandas_isspace)

    def isupper(self) -> "ps.Series":
        """
        Check whether all characters in each string are uppercase.

        This is equivalent to running the Python string method
        :func:`str.isupper` for each element of the Series/Index.
        If a string has zero characters, False is returned for that check.

        Examples
        --------
        >>> s = ps.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
        >>> s.str.isupper()
        0    False
        1    False
        2     True
        3    False
        dtype: bool
        """

        def pandas_isspace(s) -> "ps.Series[bool]":
            return s.str.isupper()

        return self._data.koalas.transform_batch(pandas_isspace)

    def istitle(self) -> "ps.Series":
        """
        Check whether all characters in each string are titlecase.

        This is equivalent to running the Python string method
        :func:`str.istitle` for each element of the Series/Index.
        If a string has zero characters, False is returned for that check.

        Examples
        --------
        >>> s = ps.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

        The s.str.istitle method checks for whether all words are in title
        case (whether only the first letter of each word is capitalized).
        Words are assumed to be as any sequence of non-numeric characters
        separated by whitespace characters.

        >>> s.str.istitle()
        0    False
        1     True
        2    False
        3    False
        dtype: bool
        """

        def pandas_istitle(s) -> "ps.Series[bool]":
            return s.str.istitle()

        return self._data.koalas.transform_batch(pandas_istitle)

    def isnumeric(self) -> "ps.Series":
        """
        Check whether all characters in each string are numeric.

        This is equivalent to running the Python string method
        :func:`str.isnumeric` for each element of the Series/Index.
        If a string has zero characters, False is returned for that check.

        Examples
        --------
        >>> s1 = ps.Series(['one', 'one1', '1', ''])
        >>> s1.str.isnumeric()
        0    False
        1    False
        2     True
        3    False
        dtype: bool

        >>> s2 = ps.Series(['23', '³', '⅕', ''])

        The s2.str.isdecimal method checks for characters used to form numbers
        in base 10.

        >>> s2.str.isdecimal()
        0     True
        1    False
        2    False
        3    False
        dtype: bool

        The s2.str.isdigit method is the same as s2.str.isdecimal but also
        includes special digits, like superscripted and subscripted digits in
        unicode.

        >>> s2.str.isdigit()
        0     True
        1     True
        2    False
        3    False
        dtype: bool

        The s2.str.isnumeric method is the same as s2.str.isdigit but also
        includes other characters that can represent quantities such as unicode
        fractions.

        >>> s2.str.isnumeric()
        0     True
        1     True
        2     True
        3    False
        dtype: bool
        """

        def pandas_isnumeric(s) -> "ps.Series[bool]":
            return s.str.isnumeric()

        return self._data.koalas.transform_batch(pandas_isnumeric)

    def isdecimal(self) -> "ps.Series":
        """
        Check whether all characters in each string are decimals.

        This is equivalent to running the Python string method
        :func:`str.isdecimal` for each element of the Series/Index.
        If a string has zero characters, False is returned for that check.

        Examples
        --------
        >>> s = ps.Series(['23', '³', '⅕', ''])

        The s.str.isdecimal method checks for characters used to form numbers
        in base 10.

        >>> s.str.isdecimal()
        0     True
        1    False
        2    False
        3    False
        dtype: bool

        The s.str.isdigit method is the same as s.str.isdecimal but also
        includes special digits, like superscripted and subscripted digits in
        unicode.

        >>> s.str.isdigit()
        0     True
        1     True
        2    False
        3    False
        dtype: bool

        The s.str.isnumeric method is the same as s.str.isdigit but also
        includes other characters that can represent quantities such as unicode
        fractions.

        >>> s.str.isnumeric()
        0     True
        1     True
        2     True
        3    False
        dtype: bool
        """

        def pandas_isdecimal(s) -> "ps.Series[bool]":
            return s.str.isdecimal()

        return self._data.koalas.transform_batch(pandas_isdecimal)

    def cat(self, others=None, sep=None, na_rep=None, join=None) -> "ps.Series":
        """
        Not supported.
        """
        raise NotImplementedError()

    def center(self, width, fillchar=" ") -> "ps.Series":
        """
        Filling left and right side of strings in the Series/Index with an
        additional character. Equivalent to :func:`str.center`.

        Parameters
        ----------
        width : int
            Minimum width of resulting string; additional characters will be
            filled with fillchar.
        fillchar : str
            Additional character for filling, default is whitespace.

        Returns
        -------
        Series of objects

        Examples
        --------
        >>> s = ps.Series(["caribou", "tiger"])
        >>> s
        0    caribou
        1      tiger
        dtype: object

        >>> s.str.center(width=10, fillchar='-')
        0    -caribou--
        1    --tiger---
        dtype: object
        """

        def pandas_center(s) -> "ps.Series[str]":
            return s.str.center(width, fillchar)

        return self._data.koalas.transform_batch(pandas_center)

    def contains(self, pat, case=True, flags=0, na=None, regex=True) -> "ps.Series":
        """
        Test if pattern or regex is contained within a string of a Series.

        Return boolean Series based on whether a given pattern or regex is
        contained within a string of a Series.

        Analogous to :func:`match`, but less strict, relying on
        :func:`re.search` instead of :func:`re.match`.

        Parameters
        ----------
        pat : str
            Character sequence or regular expression.
        case : bool, default True
            If True, case sensitive.
        flags : int, default 0 (no flags)
            Flags to pass through to the re module, e.g. re.IGNORECASE.
        na : default None
            Fill value for missing values. NaN converted to None.
        regex : bool, default True
            If True, assumes the pat is a regular expression.
            If False, treats the pat as a literal string.


        Returns
        -------
        Series of boolean values or object
            A Series of boolean values indicating whether the given pattern is
            contained within the string of each element of the Series.

        Examples
        --------
        Returning a Series of booleans using only a literal pattern.

        >>> s1 = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
        >>> s1.str.contains('og', regex=False)
        0    False
        1     True
        2    False
        3    False
        4     None
        dtype: object

        Specifying case sensitivity using case.

        >>> s1.str.contains('oG', case=True, regex=True)
        0    False
        1    False
        2    False
        3    False
        4     None
        dtype: object

        Specifying na to be False instead of NaN replaces NaN values with
        False. If Series does not contain NaN values the resultant dtype will
        be bool, otherwise, an object dtype.

        >>> s1.str.contains('og', na=False, regex=True)
        0    False
        1     True
        2    False
        3    False
        4    False
        dtype: bool

        Returning ‘house’ or ‘dog’ when either expression occurs in a string.

        >>> s1.str.contains('house|dog', regex=True)
        0    False
        1     True
        2     True
        3    False
        4     None
        dtype: object

        Ignoring case sensitivity using flags with regex.

        >>> import re
        >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
        0    False
        1    False
        2     True
        3    False
        4     None
        dtype: object

        Returning any digit using regular expression.

        >>> s1.str.contains('[0-9]', regex=True)
        0    False
        1    False
        2    False
        3     True
        4     None
        dtype: object

        Ensure pat is a not a literal pattern when regex is set to True.
        Note in the following example one might expect only s2[1] and s2[3]
        to return True. However, ‘.0’ as a regex matches any character followed
        by a 0.

        >>> s2 = ps.Series(['40','40.0','41','41.0','35'])
        >>> s2.str.contains('.0', regex=True)
        0     True
        1     True
        2    False
        3     True
        4    False
        dtype: bool
        """

        def pandas_contains(s) -> "ps.Series[bool]":
            return s.str.contains(pat, case, flags, na, regex)

        return self._data.koalas.transform_batch(pandas_contains)

    def count(self, pat, flags=0) -> "ps.Series":
        """
        Count occurrences of pattern in each string of the Series.

        This function is used to count the number of times a particular regex
        pattern is repeated in each of the string elements of the Series.

        Parameters
        ----------
        pat : str
            Valid regular expression.
        flags : int, default 0 (no flags)
            Flags for the re module.

        Returns
        -------
        Series of int
            A Series containing the integer counts of pattern matches.

        Examples
        --------
        >>> s = ps.Series(['A', 'B', 'Aaba', 'Baca', np.NaN, 'CABA', 'cat'])
        >>> s.str.count('a')
        0    0.0
        1    0.0
        2    2.0
        3    2.0
        4    NaN
        5    0.0
        6    1.0
        dtype: float64

        Escape '$' to find the literal dollar sign.

        >>> s = ps.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
        >>> s.str.count('\\$')
        0    1
        1    0
        2    1
        3    2
        4    2
        5    0
        dtype: int64
        """

        def pandas_count(s) -> "ps.Series[int]":
            return s.str.count(pat, flags)

        return self._data.koalas.transform_batch(pandas_count)

    def decode(self, encoding, errors="strict") -> "ps.Series":
        """
        Not supported.
        """
        raise NotImplementedError()

    def encode(self, encoding, errors="strict") -> "ps.Series":
        """
        Not supported.
        """
        raise NotImplementedError()

    def extract(self, pat, flags=0, expand=True) -> "ps.Series":
        """
        Not supported.
        """
        raise NotImplementedError()

    def extractall(self, pat, flags=0) -> "ps.Series":
        """
        Not supported.
        """
        raise NotImplementedError()

    def find(self, sub, start=0, end=None) -> "ps.Series":
        """
        Return lowest indexes in each strings in the Series where the
        substring is fully contained between [start:end].

        Return -1 on failure. Equivalent to standard :func:`str.find`.

        Parameters
        ----------
        sub : str
            Substring being searched.
        start : int
            Left edge index.
        end : int
            Right edge index.

        Returns
        -------
        Series of int
            Series of lowest matching indexes.

        Examples
        --------
        >>> s = ps.Series(['apple', 'oranges', 'bananas'])

        >>> s.str.find('a')
        0    0
        1    2
        2    1
        dtype: int64

        >>> s.str.find('a', start=2)
        0   -1
        1    2
        2    3
        dtype: int64

        >>> s.str.find('a', end=1)
        0    0
        1   -1
        2   -1
        dtype: int64

        >>> s.str.find('a', start=2, end=2)
        0   -1
        1   -1
        2   -1
        dtype: int64
        """

        def pandas_find(s) -> "ps.Series[int]":
            return s.str.find(sub, start, end)

        return self._data.koalas.transform_batch(pandas_find)

    def findall(self, pat, flags=0) -> "ps.Series":
        """
        Find all occurrences of pattern or regular expression in the Series.

        Equivalent to applying :func:`re.findall` to all the elements in
        the Series.

        Parameters
        ----------
        pat : str
            Pattern or regular expression.
        flags : int, default 0 (no flags)
            `re` module flags, e.g. `re.IGNORECASE`.

        Returns
        -------
        Series of object
            All non-overlapping matches of pattern or regular expression in
            each string of this Series.

        Examples
        --------
        >>> s = ps.Series(['Lion', 'Monkey', 'Rabbit'])

        The search for the pattern ‘Monkey’ returns one match:

        >>> s.str.findall('Monkey')
        0          []
        1    [Monkey]
        2          []
        dtype: object

        On the other hand, the search for the pattern ‘MONKEY’ doesn’t return
        any match:

        >>> s.str.findall('MONKEY')
        0    []
        1    []
        2    []
        dtype: object

        Flags can be added to the pattern or regular expression. For instance,
        to find the pattern ‘MONKEY’ ignoring the case:

        >>> import re
        >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
        0          []
        1    [Monkey]
        2          []
        dtype: object

        When the pattern matches more than one string in the Series, all
        matches are returned:

        >>> s.str.findall('on')
        0    [on]
        1    [on]
        2      []
        dtype: object

        Regular expressions are supported too. For instance, the search for all
        the strings ending with the word ‘on’ is shown next:

        >>> s.str.findall('on$')
        0    [on]
        1      []
        2      []
        dtype: object

        If the pattern is found more than once in the same string, then a list
        of multiple strings is returned:

        >>> s.str.findall('b')
        0        []
        1        []
        2    [b, b]
        dtype: object
        """
        # type hint does not support to specify array type yet.
        pudf = pandas_udf(
            lambda s: s.str.findall(pat, flags),
            returnType=ArrayType(StringType(), containsNull=True),
            functionType=PandasUDFType.SCALAR,
        )
        return self._data._with_new_scol(scol=pudf(self._data.spark.column))

    def index(self, sub, start=0, end=None) -> "ps.Series":
        """
        Return lowest indexes in each strings where the substring is fully
        contained between [start:end].

        This is the same as :func:`str.find` except instead of returning -1,
        it raises a ValueError when the substring is not found. Equivalent to
        standard :func:`str.index`.

        Parameters
        ----------
        sub : str
            Substring being searched.
        start : int
            Left edge index.
        end : int
            Right edge index.

        Returns
        -------
        Series of int
            Series of lowest matching indexes.

        Examples
        --------
        >>> s = ps.Series(['apple', 'oranges', 'bananas'])

        >>> s.str.index('a')
        0    0
        1    2
        2    1
        dtype: int64

        The following expression throws an exception:

        >>> s.str.index('a', start=2) # doctest: +SKIP
        """

        def pandas_index(s) -> "ps.Series[np.int64]":
            return s.str.index(sub, start, end)

        return self._data.koalas.transform_batch(pandas_index)

    def join(self, sep) -> "ps.Series":
        """
        Join lists contained as elements in the Series with passed delimiter.

        If the elements of a Series are lists themselves, join the content of
        these lists using the delimiter passed to the function. This function
        is an equivalent to calling :func:`str.join` on the lists.

        Parameters
        ----------
        sep : str
            Delimiter to use between list entries.

        Returns
        -------
        Series of object
            Series with list entries concatenated by intervening occurrences of
            the delimiter.

        See Also
        --------
        str.split : Split strings around given separator/delimiter.
        str.rsplit : Splits string around given separator/delimiter,
            starting from the right.

        Examples
        --------
        Example with a list that contains a None element.

        >>> s = ps.Series([['lion', 'elephant', 'zebra'],
        ...                ['cat', None, 'dog']])
        >>> s
        0    [lion, elephant, zebra]
        1           [cat, None, dog]
        dtype: object

        Join all lists using a ‘-‘. The list containing None will produce None.

        >>> s.str.join('-')
        0    lion-elephant-zebra
        1                   None
        dtype: object
        """

        def pandas_join(s) -> "ps.Series[str]":
            return s.str.join(sep)

        return self._data.koalas.transform_batch(pandas_join)

    def len(self) -> "ps.Series":
        """
        Computes the length of each element in the Series.

        The element may be a sequence (such as a string, tuple or list).

        Returns
        -------
        Series of int
            A Series of integer values indicating the length of each element in
            the Series.

        Examples
        --------
        Returns the length (number of characters) in a string. Returns the
        number of entries for lists or tuples.

        >>> s1 = ps.Series(['dog', 'monkey'])
        >>> s1.str.len()
        0    3
        1    6
        dtype: int64

        >>> s2 = ps.Series([["a", "b", "c"], []])
        >>> s2.str.len()
        0    3
        1    0
        dtype: int64
        """
        if isinstance(self._data.spark.data_type, (ArrayType, MapType)):
            return self._data.spark.transform(lambda c: F.size(c).cast(LongType()))
        else:
            return self._data.spark.transform(lambda c: F.length(c).cast(LongType()))

    def ljust(self, width, fillchar=" ") -> "ps.Series":
        """
        Filling right side of strings in the Series with an additional
        character. Equivalent to :func:`str.ljust`.

        Parameters
        ----------
        width : int
            Minimum width of resulting string; additional characters will be
            filled with `fillchar`.
        fillchar : str
            Additional character for filling, default is whitespace.

        Returns
        -------
        Series of object

        Examples
        --------
        >>> s = ps.Series(["caribou", "tiger"])
        >>> s
        0    caribou
        1      tiger
        dtype: object

        >>> s.str.ljust(width=10, fillchar='-')
        0    caribou---
        1    tiger-----
        dtype: object
        """

        def pandas_ljust(s) -> "ps.Series[str]":
            return s.str.ljust(width, fillchar)

        return self._data.koalas.transform_batch(pandas_ljust)

    def match(self, pat, case=True, flags=0, na=np.NaN) -> "ps.Series":
        """
        Determine if each string matches a regular expression.

        Analogous to :func:`contains`, but more strict, relying on
        :func:`re.match` instead of :func:`re.search`.

        Parameters
        ----------
        pat : str
            Character sequence or regular expression.
        case : bool, default True
            If True, case sensitive.
        flags : int, default 0 (no flags)
            Flags to pass through to the re module, e.g. re.IGNORECASE.
        na : default NaN
            Fill value for missing values.

        Returns
        -------
        Series of boolean values or object
            A Series of boolean values indicating whether the given pattern can
            be matched in the string of each element of the Series.

        Examples
        --------
        >>> s = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
        >>> s.str.match('dog')
        0    False
        1     True
        2    False
        3    False
        4     None
        dtype: object

        >>> s.str.match('mouse|dog', case=False)
        0     True
        1     True
        2    False
        3    False
        4     None
        dtype: object

        >>> s.str.match('.+and.+', na=True)
        0    False
        1    False
        2     True
        3    False
        4     True
        dtype: bool

        >>> import re
        >>> s.str.match('MOUSE', flags=re.IGNORECASE)
        0     True
        1    False
        2    False
        3    False
        4     None
        dtype: object
        """

        def pandas_match(s) -> "ps.Series[bool]":
            return s.str.match(pat, case, flags, na)

        return self._data.koalas.transform_batch(pandas_match)

    def normalize(self, form) -> "ps.Series":
        """
        Return the Unicode normal form for the strings in the Series.

        For more information on the forms, see the
        :func:`unicodedata.normalize`.

        Parameters
        ----------
        form : {‘NFC’, ‘NFKC’, ‘NFD’, ‘NFKD’}
            Unicode form.

        Returns
        -------
        Series of objects
            A Series of normalized strings.
        """

        def pandas_normalize(s) -> "ps.Series[str]":
            return s.str.normalize(form)

        return self._data.koalas.transform_batch(pandas_normalize)

    def pad(self, width, side="left", fillchar=" ") -> "ps.Series":
        """
        Pad strings in the Series up to width.

        Parameters
        ----------
        width : int
            Minimum width of resulting string; additional characters will be
            filled with character defined in `fillchar`.
        side : {‘left’, ‘right’, ‘both’}, default ‘left’
            Side from which to fill resulting string.
        fillchar : str, default ' '
            Additional character for filling, default is whitespace.

        Returns
        -------
        Series of object
            Returns Series with minimum number of char in object.

        Examples
        --------
        >>> s = ps.Series(["caribou", "tiger"])
        >>> s
        0    caribou
        1      tiger
        dtype: object

        >>> s.str.pad(width=10)
        0       caribou
        1         tiger
        dtype: object

        >>> s.str.pad(width=10, side='right', fillchar='-')
        0    caribou---
        1    tiger-----
        dtype: object

        >>> s.str.pad(width=10, side='both', fillchar='-')
        0    -caribou--
        1    --tiger---
        dtype: object
        """

        def pandas_pad(s) -> "ps.Series[str]":
            return s.str.pad(width, side, fillchar)

        return self._data.koalas.transform_batch(pandas_pad)

    def partition(self, sep=" ", expand=True) -> "ps.Series":
        """
        Not supported.
        """
        raise NotImplementedError()

    def repeat(self, repeats) -> "ps.Series":
        """
        Duplicate each string in the Series.

        Parameters
        ----------
        repeats : int
            Repeat the string given number of times (int). Sequence of int
            is not supported.

        Returns
        -------
        Series of object
            Series or Index of repeated string objects specified by input
            parameter repeats.

        Examples
        --------
        >>> s = ps.Series(['a', 'b', 'c'])
        >>> s
        0    a
        1    b
        2    c
        dtype: object

        Single int repeats string in Series

        >>> s.str.repeat(repeats=2)
        0    aa
        1    bb
        2    cc
        dtype: object
        """
        if not isinstance(repeats, int):
            raise ValueError("repeats expects an int parameter")
        return self._data.spark.transform(lambda c: SF.repeat(col=c, n=repeats))

    def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True) -> "ps.Series":
        """
        Replace occurrences of pattern/regex in the Series with some other
        string. Equivalent to :func:`str.replace` or :func:`re.sub`.

        Parameters
        ----------
        pat : str or compiled regex
            String can be a character sequence or regular expression.
        repl : str or callable
            Replacement string or a callable. The callable is passed the regex
            match object and must return a replacement string to be used. See
            :func:`re.sub`.
        n : int, default -1 (all)
            Number of replacements to make from start.
        case : boolean, default None
            If True, case sensitive (the default if pat is a string).
            Set to False for case insensitive.
            Cannot be set if pat is a compiled regex.
        flags: int, default 0 (no flags)
            re module flags, e.g. re.IGNORECASE.
            Cannot be set if pat is a compiled regex.
        regex : boolean, default True
            If True, assumes the passed-in pattern is a regular expression.
            If False, treats the pattern as a literal string.
            Cannot be set to False if pat is a compile regex or repl is a
            callable.

        Returns
        -------
        Series of object
            A copy of the string with all matching occurrences of pat replaced
            by repl.

        Examples
        --------
        When pat is a string and regex is True (the default), the given pat is
        compiled as a regex. When repl is a string, it replaces matching regex
        patterns as with :func:`re.sub`. NaN value(s) in the Series are changed
        to None:

        >>> ps.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
        0     bao
        1     baz
        2    None
        dtype: object

        When pat is a string and regex is False, every pat is replaced with
        repl as with :func:`str.replace`:

        >>> ps.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
        0     bao
        1     fuz
        2    None
        dtype: object

        When repl is a callable, it is called on every pat using
        :func:`re.sub`. The callable should expect one positional argument (a
        regex object) and return a string.

        Reverse every lowercase alphabetic word:

        >>> repl = lambda m: m.group(0)[::-1]
        >>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
        0    oof 123
        1    rab zab
        2       None
        dtype: object

        Using regex groups (extract second group and swap case):

        >>> pat = r"(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
        >>> repl = lambda m: m.group('two').swapcase()
        >>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
        0    tWO
        1    bAR
        dtype: object

        Using a compiled regex with flags:

        >>> import re
        >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
        >>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
        0     foo
        1     bar
        2    None
        dtype: object
        """

        def pandas_replace(s) -> "ps.Series[str]":
            return s.str.replace(pat, repl, n=n, case=case, flags=flags, regex=regex)

        return self._data.koalas.transform_batch(pandas_replace)

    def rfind(self, sub, start=0, end=None) -> "ps.Series":
        """
        Return highest indexes in each strings in the Series where the
        substring is fully contained between [start:end].

        Return -1 on failure. Equivalent to standard :func:`str.rfind`.

        Parameters
        ----------
        sub : str
            Substring being searched.
        start : int
            Left edge index.
        end : int
            Right edge index.

        Returns
        -------
        Series of int
            Series of highest matching indexes.

        Examples
        --------
        >>> s = ps.Series(['apple', 'oranges', 'bananas'])

        >>> s.str.rfind('a')
        0    0
        1    2
        2    5
        dtype: int64

        >>> s.str.rfind('a', start=2)
        0   -1
        1    2
        2    5
        dtype: int64

        >>> s.str.rfind('a', end=1)
        0    0
        1   -1
        2   -1
        dtype: int64

        >>> s.str.rfind('a', start=2, end=2)
        0   -1
        1   -1
        2   -1
        dtype: int64
        """

        def pandas_rfind(s) -> "ps.Series[int]":
            return s.str.rfind(sub, start, end)

        return self._data.koalas.transform_batch(pandas_rfind)

    def rindex(self, sub, start=0, end=None) -> "ps.Series":
        """
        Return highest indexes in each strings where the substring is fully
        contained between [start:end].

        This is the same as :func:`str.rfind` except instead of returning -1,
        it raises a ValueError when the substring is not found. Equivalent to
        standard :func:`str.rindex`.

        Parameters
        ----------
        sub : str
            Substring being searched.
        start : int
            Left edge index.
        end : int
            Right edge index.

        Returns
        -------
        Series of int
            Series of highest matching indexes.

        Examples
        --------
        >>> s = ps.Series(['apple', 'oranges', 'bananas'])

        >>> s.str.rindex('a')
        0    0
        1    2
        2    5
        dtype: int64

        The following expression throws an exception:

        >>> s.str.rindex('a', start=2) # doctest: +SKIP
        """

        def pandas_rindex(s) -> "ps.Series[np.int64]":
            return s.str.rindex(sub, start, end)

        return self._data.koalas.transform_batch(pandas_rindex)

    def rjust(self, width, fillchar=" ") -> "ps.Series":
        """
        Filling left side of strings in the Series with an additional
        character. Equivalent to :func:`str.rjust`.

        Parameters
        ----------
        width : int
            Minimum width of resulting string; additional characters will be
            filled with `fillchar`.
        fillchar : str
            Additional character for filling, default is whitespace.

        Returns
        -------
        Series of object

        Examples
        --------
        >>> s = ps.Series(["caribou", "tiger"])
        >>> s
        0    caribou
        1      tiger
        dtype: object

        >>> s.str.rjust(width=10)
        0       caribou
        1         tiger
        dtype: object

        >>> s.str.rjust(width=10, fillchar='-')
        0    ---caribou
        1    -----tiger
        dtype: object
        """

        def pandas_rjust(s) -> "ps.Series[str]":
            return s.str.rjust(width, fillchar)

        return self._data.koalas.transform_batch(pandas_rjust)

    def rpartition(self, sep=" ", expand=True) -> "ps.Series":
        """
        Not supported.
        """
        raise NotImplementedError()

    def slice(self, start=None, stop=None, step=None) -> "ps.Series":
        """
        Slice substrings from each element in the Series.

        Parameters
        ----------
        start : int, optional
            Start position for slice operation.
        stop : int, optional
            Stop position for slice operation.
        step : int, optional
            Step size for slice operation.

        Returns
        -------
        Series of object
            Series from sliced substrings from original string objects.

        Examples
        --------
        >>> s = ps.Series(["koala", "fox", "chameleon"])
        >>> s
        0        koala
        1          fox
        2    chameleon
        dtype: object

        >>> s.str.slice(start=1)
        0        oala
        1          ox
        2    hameleon
        dtype: object

        >>> s.str.slice(stop=2)
        0    ko
        1    fo
        2    ch
        dtype: object

        >>> s.str.slice(step=2)
        0      kaa
        1       fx
        2    caeen
        dtype: object

        >>> s.str.slice(start=0, stop=5, step=3)
        0    kl
        1     f
        2    cm
        dtype: object
        """

        def pandas_slice(s) -> "ps.Series[str]":
            return s.str.slice(start, stop, step)

        return self._data.koalas.transform_batch(pandas_slice)

    def slice_replace(self, start=None, stop=None, repl=None) -> "ps.Series":
        """
        Slice substrings from each element in the Series.

        Parameters
        ----------
        start : int, optional
            Start position for slice operation. If not specified (None), the
            slice is unbounded on the left, i.e. slice from the start of the
            string.
        stop : int, optional
            Stop position for slice operation. If not specified (None), the
            slice is unbounded on the right, i.e. slice until the end of the
            string.
        repl : str, optional
            String for replacement. If not specified (None), the sliced region
            is replaced with an empty string.

        Returns
        -------
        Series of object
            Series from sliced substrings from original string objects.

        Examples
        --------
        >>> s = ps.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
        >>> s
        0        a
        1       ab
        2      abc
        3     abdc
        4    abcde
        dtype: object

        Specify just start, meaning replace start until the end of the string
        with repl.

        >>> s.str.slice_replace(1, repl='X')
        0    aX
        1    aX
        2    aX
        3    aX
        4    aX
        dtype: object

        Specify just stop, meaning the start of the string to stop is replaced
        with repl, and the rest of the string is included.

        >>> s.str.slice_replace(stop=2, repl='X')
        0       X
        1       X
        2      Xc
        3     Xdc
        4    Xcde
        dtype: object

        Specify start and stop, meaning the slice from start to stop is
        replaced with repl. Everything before or after start and stop is
        included as is.

        >>> s.str.slice_replace(start=1, stop=3, repl='X')
        0      aX
        1      aX
        2      aX
        3     aXc
        4    aXde
        dtype: object
        """

        def pandas_slice_replace(s) -> "ps.Series[str]":
            return s.str.slice_replace(start, stop, repl)

        return self._data.koalas.transform_batch(pandas_slice_replace)

    def split(self, pat=None, n=-1, expand=False) -> Union["ps.Series", "ps.DataFrame"]:
        """
        Split strings around given separator/delimiter.

        Splits the string in the Series from the beginning, at the specified
        delimiter string. Equivalent to :func:`str.split`.

        Parameters
        ----------
        pat : str, optional
            String or regular expression to split on. If not specified, split
            on whitespace.
        n : int, default -1 (all)
            Limit number of splits in output. None, 0 and -1 will be
            interpreted as return all splits.
        expand : bool, default False
            Expand the splitted strings into separate columns.

            * If ``True``, `n` must be a positive integer, and return DataFrame expanding
              dimensionality.
            * If ``False``, return Series, containing lists of strings.

        Returns
        -------
        Series, DataFrame
            Type matches caller unless `expand=True` (see Notes).

        See Also
        --------
        str.rsplit : Splits string around given separator/delimiter,
            starting from the right.
        str.join : Join lists contained as elements in the Series/Index
            with passed delimiter.

        Notes
        -----
        The handling of the `n` keyword depends on the number of found splits:

        - If found splits > `n`,  make first `n` splits only
        - If found splits <= `n`, make all splits
        - If for a certain row the number of found splits < `n`,
          append `None` for padding up to `n` if ``expand=True``

        If using ``expand=True``, Series callers return DataFrame objects with `n + 1` columns.

        .. note:: Even if `n` is much larger than found splits, the number of columns does NOT
            shrink unlike pandas.

        Examples
        --------
        >>> s = ps.Series(["this is a regular sentence",
        ...                "https://docs.python.org/3/tutorial/index.html",
        ...                np.nan])

        In the default setting, the string is split by whitespace.

        >>> s.str.split()
        0                   [this, is, a, regular, sentence]
        1    [https://docs.python.org/3/tutorial/index.html]
        2                                               None
        dtype: object

        Without the n parameter, the outputs of rsplit and split are identical.

        >>> s.str.rsplit()
        0                   [this, is, a, regular, sentence]
        1    [https://docs.python.org/3/tutorial/index.html]
        2                                               None
        dtype: object

        The n parameter can be used to limit the number of splits on the
        delimiter. The outputs of split and rsplit are different.

        >>> s.str.split(n=2)
        0                     [this, is, a regular sentence]
        1    [https://docs.python.org/3/tutorial/index.html]
        2                                               None
        dtype: object

        >>> s.str.rsplit(n=2)
        0                     [this is a, regular, sentence]
        1    [https://docs.python.org/3/tutorial/index.html]
        2                                               None
        dtype: object

        The pat parameter can be used to split by other characters.

        >>> s.str.split(pat = "/")
        0                         [this is a regular sentence]
        1    [https:, , docs.python.org, 3, tutorial, index...
        2                                                 None
        dtype: object

        When using ``expand=True``, the split elements will expand out into
        separate columns. If NaN is present, it is propagated throughout
        the columns during the split.

        >>> s.str.split(n=4, expand=True)
                                                       0     1     2        3         4
        0                                           this    is     a  regular  sentence
        1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
        2                                           None  None  None     None      None

        For slightly more complex use cases like splitting the html document name
        from a url, a combination of parameter settings can be used.

        >>> s.str.rsplit("/", n=1, expand=True)
                                            0           1
        0          this is a regular sentence        None
        1  https://docs.python.org/3/tutorial  index.html
        2                                None        None

        Remember to escape special characters when explicitly using regular
        expressions.

        >>> s = ps.Series(["1+1=2"])
        >>> s.str.split(r"\\+|=", n=2, expand=True)
           0  1  2
        0  1  1  2
        """
        from pyspark.pandas.frame import DataFrame

        if expand and n <= 0:
            raise NotImplementedError("expand=True is currently only supported with n > 0.")

        # type hint does not support to specify array type yet.
        pudf = pandas_udf(
            lambda s: s.str.split(pat, n),
            returnType=ArrayType(StringType(), containsNull=True),
            functionType=PandasUDFType.SCALAR,
        )
        kser = self._data._with_new_scol(pudf(self._data.spark.column), dtype=self._data.dtype)

        if expand:
            kdf = kser.to_frame()
            scol = kdf._internal.data_spark_columns[0]
            spark_columns = [scol[i].alias(str(i)) for i in range(n + 1)]
            column_labels = [(i,) for i in range(n + 1)]
            internal = kdf._internal.with_new_columns(
                spark_columns,
                column_labels=cast(Optional[List], column_labels),
                data_dtypes=([self._data.dtype] * len(column_labels)),
            )
            return DataFrame(internal)
        else:
            return kser

    def rsplit(self, pat=None, n=-1, expand=False) -> Union["ps.Series", "ps.DataFrame"]:
        """
        Split strings around given separator/delimiter.

        Splits the string in the Series from the end, at the specified
        delimiter string. Equivalent to :func:`str.rsplit`.

        Parameters
        ----------
        pat : str, optional
            String or regular expression to split on. If not specified, split
            on whitespace.
        n : int, default -1 (all)
            Limit number of splits in output. None, 0 and -1 will be
            interpreted as return all splits.
        expand : bool, default False
            Expand the splitted strings into separate columns.

            * If ``True``, `n` must be a positive integer, and return DataFrame expanding
              dimensionality.
            * If ``False``, return Series, containing lists of strings.

        Returns
        -------
        Series, DataFrame
            Type matches caller unless `expand=True` (see Notes).

        See Also
        --------
        str.split : Split strings around given separator/delimiter.
        str.join : Join lists contained as elements in the Series/Index
            with passed delimiter.

        Notes
        -----
        The handling of the `n` keyword depends on the number of found splits:

        - If found splits > `n`,  make first `n` splits only
        - If found splits <= `n`, make all splits
        - If for a certain row the number of found splits < `n`,
          append `None` for padding up to `n` if ``expand=True``

        If using ``expand=True``, Series callers return DataFrame objects with `n + 1` columns.

        .. note:: Even if `n` is much larger than found splits, the number of columns does NOT
            shrink unlike pandas.

        Examples
        --------
        >>> s = ps.Series(["this is a regular sentence",
        ...                "https://docs.python.org/3/tutorial/index.html",
        ...                np.nan])

        In the default setting, the string is split by whitespace.

        >>> s.str.split()
        0                   [this, is, a, regular, sentence]
        1    [https://docs.python.org/3/tutorial/index.html]
        2                                               None
        dtype: object

        Without the n parameter, the outputs of rsplit and split are identical.

        >>> s.str.rsplit()
        0                   [this, is, a, regular, sentence]
        1    [https://docs.python.org/3/tutorial/index.html]
        2                                               None
        dtype: object

        The n parameter can be used to limit the number of splits on the
        delimiter. The outputs of split and rsplit are different.

        >>> s.str.split(n=2)
        0                     [this, is, a regular sentence]
        1    [https://docs.python.org/3/tutorial/index.html]
        2                                               None
        dtype: object

        >>> s.str.rsplit(n=2)
        0                     [this is a, regular, sentence]
        1    [https://docs.python.org/3/tutorial/index.html]
        2                                               None
        dtype: object

        When using ``expand=True``, the split elements will expand out into
        separate columns. If NaN is present, it is propagated throughout
        the columns during the split.

        >>> s.str.split(n=4, expand=True)
                                                       0     1     2        3         4
        0                                           this    is     a  regular  sentence
        1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
        2                                           None  None  None     None      None

        For slightly more complex use cases like splitting the html document name
        from a url, a combination of parameter settings can be used.

        >>> s.str.rsplit("/", n=1, expand=True)
                                            0           1
        0          this is a regular sentence        None
        1  https://docs.python.org/3/tutorial  index.html
        2                                None        None

        Remember to escape special characters when explicitly using regular
        expressions.

        >>> s = ps.Series(["1+1=2"])
        >>> s.str.split(r"\\+|=", n=2, expand=True)
           0  1  2
        0  1  1  2
        """
        from pyspark.pandas.frame import DataFrame

        if expand and n <= 0:
            raise NotImplementedError("expand=True is currently only supported with n > 0.")

        # type hint does not support to specify array type yet.
        pudf = pandas_udf(
            lambda s: s.str.rsplit(pat, n),
            returnType=ArrayType(StringType(), containsNull=True),
            functionType=PandasUDFType.SCALAR,
        )
        kser = self._data._with_new_scol(pudf(self._data.spark.column), dtype=self._data.dtype)

        if expand:
            kdf = kser.to_frame()
            scol = kdf._internal.data_spark_columns[0]
            spark_columns = [scol[i].alias(str(i)) for i in range(n + 1)]
            column_labels = [(i,) for i in range(n + 1)]
            internal = kdf._internal.with_new_columns(
                spark_columns,
                column_labels=cast(Optional[List], column_labels),
                data_dtypes=([self._data.dtype] * len(column_labels)),
            )
            return DataFrame(internal)
        else:
            return kser

    def translate(self, table) -> "ps.Series":
        """
        Map all characters in the string through the given mapping table.
        Equivalent to standard :func:`str.translate`.

        Parameters
        ----------
        table : dict
            Table is a mapping of Unicode ordinals to Unicode ordinals,
            strings, or None. Unmapped characters are left untouched.
            Characters mapped to None are deleted. :func:`str.maketrans` is a
            helper function for making translation tables.

        Returns
        -------
        Series of object
            Series with translated strings.

        Examples
        --------
        >>> s = ps.Series(["dog", "cat", "bird"])
        >>> m = str.maketrans({'a': 'X', 'i': 'Y', 'o': None})
        >>> s.str.translate(m)
        0      dg
        1     cXt
        2    bYrd
        dtype: object
        """

        def pandas_translate(s) -> "ps.Series[str]":
            return s.str.translate(table)

        return self._data.koalas.transform_batch(pandas_translate)

    def wrap(self, width, **kwargs) -> "ps.Series":
        """
        Wrap long strings in the Series to be formatted in paragraphs with
        length less than a given width.

        This method has the same keyword parameters and defaults as
        :class:`textwrap.TextWrapper`.

        Parameters
        ----------
        width : int
            Maximum line-width. Lines separated with newline char.
        expand_tabs : bool, optional
            If true, tab characters will be expanded to spaces (default: True).
        replace_whitespace : bool, optional
            If true, each whitespace character remaining after tab expansion
            will be replaced by a single space (default: True).
        drop_whitespace : bool, optional
            If true, whitespace that, after wrapping, happens to end up at the
            beginning or end of a line is dropped (default: True).
        break_long_words : bool, optional
            If true, then words longer than width will be broken in order to
            ensure that no lines are longer than width. If it is false, long
            words will not be broken, and some lines may be longer than width
            (default: True).
        break_on_hyphens : bool, optional
            If true, wrapping will occur preferably on whitespace and right
            after hyphens in compound words, as it is customary in English.
            If false, only whitespaces will be considered as potentially good
            places for line breaks, but you need to set break_long_words to
            false if you want truly insecable words (default: True).

        Returns
        -------
        Series of object
            Series with wrapped strings.

        Examples
        --------
        >>> s = ps.Series(['line to be wrapped', 'another line to be wrapped'])
        >>> s.str.wrap(12)
        0             line to be\\nwrapped
        1    another line\\nto be\\nwrapped
        dtype: object
        """

        def pandas_wrap(s) -> "ps.Series[str]":
            return s.str.wrap(width, **kwargs)

        return self._data.koalas.transform_batch(pandas_wrap)

    def zfill(self, width) -> "ps.Series":
        """
        Pad strings in the Series by prepending ‘0’ characters.

        Strings in the Series are padded with ‘0’ characters on the left of the
        string to reach a total string length width. Strings in the Series with
        length greater or equal to width are unchanged.

        Differs from :func:`str.zfill` which has special handling for ‘+’/’-‘
        in the string.

        Parameters
        ----------
        width : int
            Minimum length of resulting string; strings with length less than
            width be prepended with ‘0’ characters.

        Returns
        -------
        Series of object
            Series with '0' left-padded strings.

        Examples
        --------
        >>> s = ps.Series(['-1', '1', '1000', np.nan])
        >>> s
        0      -1
        1       1
        2    1000
        3    None
        dtype: object

        Note that NaN is not a string, therefore it is converted to NaN. The
        minus sign in '-1' is treated as a regular character and the zero is
        added to the left of it (:func:`str.zfill` would have moved it to the
        left). 1000 remains unchanged as it is longer than width.

        >>> s.str.zfill(3)
        0     0-1
        1     001
        2    1000
        3    None
        dtype: object
        """

        def pandas_zfill(s) -> "ps.Series[str]":
            return s.str.zfill(width)

        return self._data.koalas.transform_batch(pandas_zfill)

    def get_dummies(self, sep="|"):
        """
        Not supported.
        """
        raise NotImplementedError()


def _test():
    import os
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.pandas.strings

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.pandas.strings.__dict__.copy()
    globs["ps"] = pyspark.pandas
    spark = (
        SparkSession.builder.master("local[4]")
        .appName("pyspark.pandas.strings tests")
        .getOrCreate()
    )
    (failure_count, test_count) = doctest.testmod(
        pyspark.pandas.strings,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()