Source code for bolster.stats

"""Basic statistics and data frame helpers.

Simple functions for common data manipulation tasks:
- add_totals/drop_totals: manage row/column totals in DataFrames
- top_n: truncate DataFrames to top N rows with 'others' aggregation
- fix_datetime_tz_columns: strip timezone info from datetime columns

Plus distribution fitting in the distributions submodule.
"""

from typing import AnyStr

import pandas as pd



[docs]
def add_totals(
    df: pd.DataFrame,
    column_total: AnyStr = "total",
    row_total: AnyStr = "total",
    inplace=True,
):
    """Append a totals row and a totals column to a dataframe.

    A row labelled ``column_total`` holding the per-column sums is appended
    first; a column labelled ``row_total`` holding the per-row sums is added
    afterwards, so the cell where the two meet contains the grand total.
    Only numeric data is summed (``numeric_only=True``).

    With ``inplace=True`` (the default) ``df`` itself is modified and
    returned; with ``inplace=False`` a deep copy is modified and returned and
    ``df`` is left untouched.

    >>> add_totals(pd.DataFrame([[0,1,2],[3,4,5]]))
           0  1  2  total
    0      0  1  2      3
    1      3  4  5     12
    total  3  5  7     15

    >>> add_totals(pd.DataFrame([[0,1,2],[3,4,5]]),'ctot', 'rtot')
          0  1  2  rtot
    0     0  1  2     3
    1     3  4  5    12
    ctot  3  5  7    15


    >>> df = pd.DataFrame([[0,1,2],[3,4,5]])
    >>> add_totals(df, inplace=False)
           0  1  2  total
    0      0  1  2      3
    1      3  4  5     12
    total  3  5  7     15

    >>> df
       0  1  2
    0  0  1  2
    1  3  4  5

    """
    frame = df if inplace else df.copy(deep=True)

    # Row of per-column sums goes in first ...
    per_column_sums = frame.sum(axis=0, numeric_only=True)
    frame.loc[column_total] = per_column_sums

    # ... so that the per-row sums also cover it, yielding the grand total.
    per_row_sums = frame.sum(axis=1, numeric_only=True)
    frame.loc[:, row_total] = per_row_sums

    return frame




[docs]
def drop_totals(
    df: pd.DataFrame,
    column_total: AnyStr = "total",
    row_total: AnyStr = "total",
    inplace=True,
) -> pd.DataFrame:
    """Remove Row and Column totals from a dataframe (in place).

    This is the inverse of ``add_totals`` and uses the same label
    convention: ``column_total`` is the index label of the row holding the
    per-column sums, and ``row_total`` is the label of the column holding the
    per-row sums. Labels that are not present are silently skipped.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame from which to remove totals.
    column_total : AnyStr, optional
        Index label of the row of column totals, by default "total".
    row_total : AnyStr, optional
        Column label of the column of row totals, by default "total".
    inplace : bool, optional
        If True (default), ``df`` itself is modified and returned. If False,
        a deep copy is modified and returned and ``df`` is left untouched.

    Returns:
    --------
    pd.DataFrame
        The DataFrame with totals removed.

    Examples:
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'total': [5, 7, 9]})
    >>> df.loc['total'] = [6, 15, 21]
    >>> drop_totals(df)
       A  B
    0  1  4
    1  2  5
    2  3  6
    """
    if not inplace:
        df = df.copy(deep=True)

    # ``drop`` must be called with inplace=True: the non-inplace form returns
    # a new frame, which would leave the caller's object unchanged and make
    # the ``inplace`` flag meaningless.
    if column_total in df.index:
        df.drop(index=[column_total], inplace=True)

    if row_total in df.columns:
        df.drop(columns=[row_total], inplace=True)

    return df




[docs]
def fix_datetime_tz_columns(df: pd.DataFrame, inplace=True) -> pd.DataFrame:
    """Strip Timezone information from relevant datetime columns in a dataframe.

    Columns are chosen with ``select_dtypes`` using the
    ``"datetime64[ns, UTC]"`` and ``"datetimetz"`` selectors; each selected
    column is replaced by the result of ``.dt.tz_localize(None)``. All other
    columns are left as they are.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame whose timezone-aware datetime columns should be made
        timezone-naive.
    inplace : bool, optional
        If True (default), ``df`` itself is modified and returned. If False,
        a deep copy is modified and returned and ``df`` is left untouched.

    Returns:
    --------
    pd.DataFrame
        The DataFrame with the selected columns stripped of timezone info.

    """
    frame = df if inplace else df.copy(deep=True)

    tz_aware = frame.select_dtypes(include=["datetime64[ns, UTC]", "datetimetz"])
    for name in tz_aware.columns:
        naive = frame[name].dt.tz_localize(None)
        frame[name] = naive

    return frame




[docs]
def top_n(df: pd.DataFrame, n: int, others: AnyStr = "others") -> pd.DataFrame:
    """Keep the first 'n' rows and collapse every later row into one summed row.

    Rows are taken positionally (``iloc``); no sorting is performed. The
    collapsed row contains the sum of the numeric columns of the remaining
    rows and is appended after the kept rows.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to truncate.
    n : int
        The number of top rows to keep.
    others : AnyStr, optional
        Index label given to the aggregated row, by default "others".

    Returns:
    --------
    pd.DataFrame
        The truncated DataFrame with an 'others' row. If ``n`` is greater
        than or equal to the number of rows, ``df`` itself is returned
        unchanged.

    Examples:
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3, 4, 5], 'B': [5, 4, 3, 2, 1]})
    >>> top_n(df, 3) # doctest: +NORMALIZE_WHITESPACE
           A  B
    0      1  5
    1      2  4
    2      3  3
    others 9  3
    """
    # Nothing to aggregate when the frame already fits within n rows.
    if len(df) <= n:
        return df

    head = df.iloc[:n]
    remainder = df.iloc[n:].sum(numeric_only=True)

    if not isinstance(remainder, (pd.Series, pd.DataFrame)):
        # Scalar result: wrap it so it can be turned into a one-row frame.
        remainder = pd.Series(remainder, name=others)
    else:
        remainder.name = others

    # The series name becomes the index label of the transposed one-row frame.
    others_row = remainder.to_frame().T
    return pd.concat([head, others_row])