# archive.git - Gitblit

"""Accessors for arrow-backed data."""
 
from __future__ import annotations
 
from abc import (
    ABCMeta,
    abstractmethod,
)
from typing import (
    TYPE_CHECKING,
    cast,
)
 
from pandas.compat import (
    pa_version_under10p1,
    pa_version_under11p0,
)
 
from pandas.core.dtypes.common import is_list_like
 
if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc
 
    from pandas.core.dtypes.dtypes import ArrowDtype
 
if TYPE_CHECKING:
    from collections.abc import Iterator
 
    from pandas import (
        DataFrame,
        Series,
    )
 
 
class ArrowAccessor(metaclass=ABCMeta):
    """
    Abstract base for accessors that operate on ``ArrowDtype``-backed data.

    Subclasses supply the error-message template through ``__init__`` and
    implement ``_is_valid_pyarrow_dtype`` to declare which pyarrow types
    they accept.
    """

    @abstractmethod
    def __init__(self, data, validation_msg: str) -> None:
        # The message template must be stored before validating, because
        # ``_validate`` formats it when the dtype is rejected.
        self._data = data
        self._validation_msg = validation_msg
        self._validate(data)

    @abstractmethod
    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        """Return whether ``pyarrow_dtype`` is supported by this accessor."""

    def _validate(self, data):
        """
        Check that ``data.dtype`` is an ``ArrowDtype`` this accessor accepts.

        Raises
        ------
        AttributeError
            If pyarrow is too old, the dtype is not an ``ArrowDtype``, or
            ``_is_valid_pyarrow_dtype`` rejects the underlying pyarrow type.
            AttributeError is used so that ``inspect`` can handle Series
            for which the accessor does not apply.
        """
        dtype = data.dtype
        # The checks short-circuit left to right: ``ArrowDtype`` is only
        # looked up when pyarrow is recent enough, and ``pyarrow_dtype`` is
        # only read once the dtype is known to be an ArrowDtype.
        supported = (
            not pa_version_under10p1
            and isinstance(dtype, ArrowDtype)
            and self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype)
        )
        if not supported:
            raise AttributeError(self._validation_msg.format(dtype=dtype))

    @property
    def _pa_array(self):
        """The ``_pa_array`` attribute of the array backing the wrapped data."""
        backing_array = self._data.array
        return backing_array._pa_array
 
 
class ListAccessor(ArrowAccessor):
    """
    Accessor object for list data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow list data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg="Can only use the '.list' accessor with "
            "'list[pyarrow]' dtype, not {dtype}.",
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        # Accept pyarrow list, fixed-size list and large list types.
        return (
            pa.types.is_list(pyarrow_dtype)
            or pa.types.is_fixed_size_list(pyarrow_dtype)
            or pa.types.is_large_list(pyarrow_dtype)
        )

    def _wrap_rowwise(self, values) -> Series:
        """
        Wrap a per-row pyarrow result in a Series aligned with the input.

        The index and name of the original Series are carried over so that
        the result lines up row by row with the data it was computed from.
        """
        from pandas import Series

        return Series(
            values,
            dtype=ArrowDtype(values.type),
            index=self._data.index,
            name=self._data.name,
        )

    def len(self) -> Series:
        """
        Return the length of each list in the Series.

        The result keeps the index and name of the original Series.

        Returns
        -------
        pandas.Series
            The length of each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.len()
        0    3
        1    1
        dtype: int32[pyarrow]
        """
        value_lengths = pc.list_value_length(self._pa_array)
        return self._wrap_rowwise(value_lengths)

    def __getitem__(self, key: int | slice) -> Series:
        """
        Index or slice lists in the Series.

        The result keeps the index and name of the original Series.

        Parameters
        ----------
        key : int | slice
            Index or slice of indices to access from each list.

        Returns
        -------
        pandas.Series
            The list at requested index.

        Raises
        ------
        NotImplementedError
            If ``key`` is a slice and the installed pyarrow is older
            than 11.0.
        ValueError
            If ``key`` is neither an int nor a slice.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list[0]
        0    1
        1    3
        dtype: int64[pyarrow]
        """
        if isinstance(key, int):
            # TODO: Support negative key but pyarrow does not allow
            # element index to be an array.
            # if key < 0:
            #     key = pc.add(key, pc.list_value_length(self._pa_array))
            element = pc.list_element(self._pa_array, key)
            return self._wrap_rowwise(element)
        elif isinstance(key, slice):
            if pa_version_under11p0:
                raise NotImplementedError(
                    f"List slice not supported by pyarrow {pa.__version__}."
                )

            # TODO: Support negative start/stop/step, ideally this would be added
            # upstream in pyarrow.
            start, stop, step = key.start, key.stop, key.step
            if start is None:
                # TODO: When adding negative step support
                #  this should be set to the last element of the array
                #  when step is negative.
                start = 0
            if step is None:
                step = 1
            sliced = pc.list_slice(self._pa_array, start, stop, step)
            return self._wrap_rowwise(sliced)
        else:
            raise ValueError(f"key must be an int or slice, got {type(key).__name__}")

    def __iter__(self) -> Iterator:
        # Defining __getitem__ would otherwise make the accessor iterable
        # through the legacy sequence protocol; refuse explicitly instead.
        raise TypeError(f"'{type(self).__name__}' object is not iterable")

    def flatten(self) -> Series:
        """
        Flatten list values.

        The result is built without an explicit index, so it does not carry
        over the index of the original Series.

        Returns
        -------
        pandas.Series
            The data from all lists in the series flattened.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.flatten()
        0    1
        1    2
        2    3
        3    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        flattened = pc.list_flatten(self._pa_array)
        return Series(flattened, dtype=ArrowDtype(flattened.type))
 
 
class StructAccessor(ArrowAccessor):
    """
    Accessor object for structured data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg=(
                "Can only use the '.struct' accessor with 'struct[pyarrow]' "
                "dtype, not {dtype}."
            ),
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        return pa.types.is_struct(pyarrow_dtype)

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

        # Iterating the struct type yields its child fields in order.
        pa_type = self._data.dtype.pyarrow_dtype
        types = [ArrowDtype(struct.type) for struct in pa_type]
        names = [struct.name for struct in pa_type]
        return Series(types, index=Index(names))

    def field(
        self,
        name_or_index: list[str]
        | list[bytes]
        | list[int]
        | pc.Expression
        | bytes
        | str
        | int,
    ) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | bytes | int | expression | list
            Name or index of the child field to extract.

            For list-like inputs, this will index into a nested
            struct.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        Raises
        ------
        ValueError
            If `name_or_index` is an empty list-like, or is not an int,
            str, bytes, :class:`pyarrow.compute.Expression`, or list of
            those.

        See Also
        --------
        Series.struct.explode : Return all child fields as a DataFrame.

        Notes
        -----
        The name of the resulting Series will be set using the following
        rules:

        - For string, bytes, or integer `name_or_index`, the Series name is
          set to the selected field's name.
        - For a :class:`pyarrow.compute.Expression`, this is set to
          the string form of the expression.
        - For list-like `name_or_index`, the name will be set to the
          name of the final field selected.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]

        Or an expression

        >>> import pyarrow.compute as pc
        >>> s.struct.field(pc.field("project"))
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        For nested struct types, you can pass a list of values to index
        multiple levels:

        >>> version_type = pa.struct([
        ...     ("major", pa.int64()),
        ...     ("minor", pa.int64()),
        ... ])
        >>> s = pd.Series(
        ...     [
        ...         {"version": {"major": 1, "minor": 5}, "project": "pandas"},
        ...         {"version": {"major": 2, "minor": 1}, "project": "pandas"},
        ...         {"version": {"major": 1, "minor": 26}, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", version_type), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.field(["version", "minor"])
        0     5
        1     1
        2    26
        Name: minor, dtype: int64[pyarrow]
        >>> s.struct.field([0, 0])
        0    1
        1    2
        2    1
        Name: major, dtype: int64[pyarrow]
        """
        from pandas import Series

        def get_name(
            level_name_or_index: list[str]
            | list[bytes]
            | list[int]
            | pc.Expression
            | bytes
            | str
            | int,
            data: pa.ChunkedArray,
        ):
            # Work out the name to give the resulting Series. ``data`` only
            # needs a ``.type`` attribute exposing the struct type.
            if isinstance(level_name_or_index, int):
                name = data.type.field(level_name_or_index).name
            elif isinstance(level_name_or_index, (str, bytes)):
                name = level_name_or_index
            elif isinstance(level_name_or_index, pc.Expression):
                name = str(level_name_or_index)
            elif is_list_like(level_name_or_index):
                # For nested input like [2, 1, 2], walk down the struct one
                # level at a time. The name of the last field reached is
                # used as the name of the resulting Series.
                # we need the cast, otherwise mypy complains about
                # getting ints, bytes, or str here, which isn't possible.
                path = list(cast(list, level_name_or_index))
                if not path:
                    # Without this check ``name`` would never be assigned
                    # and the caller would see an UnboundLocalError.
                    raise ValueError("name_or_index must not be an empty list")
                selected = data
                for level in path:
                    if isinstance(level, int):
                        # Resolve positions directly; going through the
                        # field name is ambiguous when names are repeated.
                        selected = selected.type.field(level)
                    else:
                        level_name = get_name(level, selected)
                        level_index = selected.type.get_field_index(level_name)
                        selected = selected.type.field(level_index)
                    name = selected.name
            else:
                raise ValueError(
                    "name_or_index must be an int, str, bytes, "
                    "pyarrow.compute.Expression, or list of those"
                )
            return name

        pa_arr = self._pa_array
        # Resolve the name first so that unsupported input is rejected
        # before the pyarrow kernel is called.
        name = get_name(name_or_index, pa_arr)
        field_arr = pc.struct_field(pa_arr, name_or_index)

        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._data.index,
            name=name,
        )

    def explode(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.explode()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

        # One Series per child field (selected by position), joined
        # side by side as the columns of the result.
        pa_type = self._pa_array.type
        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )