Source code for bolster.stats

"""Basic statistics and data frame helpers.

Simple functions for common data manipulation tasks:
- add_totals/drop_totals: manage row/column totals in DataFrames
- top_n: truncate DataFrames to top N rows with 'others' aggregation
- fix_datetime_tz_columns: strip timezone info from datetime columns

Plus distribution fitting in the distributions submodule.
"""

from typing import AnyStr

import pandas as pd


def add_totals(
    df: pd.DataFrame,
    column_total: AnyStr = "total",
    row_total: AnyStr = "total",
    inplace=True,
):
    """Append a totals row and a totals column to a dataframe.

    The per-column sums are written first, as a new row, and the per-row sums
    are computed afterwards, so the cell where the two meet holds the grand
    total. Only numeric data contributes to the sums.

    Parameters
    ----------
    df : pd.DataFrame
        The frame to extend.
    column_total : AnyStr, optional
        Index label of the new row holding the per-column sums,
        by default "total".
    row_total : AnyStr, optional
        Column label of the new column holding the per-row sums,
        by default "total".
    inplace : bool, optional
        When True (the default) ``df`` itself is extended; otherwise a deep
        copy is extended and ``df`` is left untouched.

    Returns
    -------
    pd.DataFrame
        The extended frame (``df`` itself when ``inplace`` is True).

    Examples
    --------
    >>> add_totals(pd.DataFrame([[0,1,2],[3,4,5]]))
           0  1  2  total
    0      0  1  2      3
    1      3  4  5     12
    total  3  5  7     15

    >>> add_totals(pd.DataFrame([[0,1,2],[3,4,5]]),'ctot', 'rtot')
          0  1  2  rtot
    0     0  1  2     3
    1     3  4  5    12
    ctot  3  5  7    15

    >>> df = pd.DataFrame([[0,1,2],[3,4,5]])
    >>> add_totals(df, inplace=False)
           0  1  2  total
    0      0  1  2      3
    1      3  4  5     12
    total  3  5  7     15
    >>> df
       0  1  2
    0  0  1  2
    1  3  4  5
    """
    frame = df if inplace else df.copy(deep=True)

    # Bottom row first: the row sums taken next must see it so that the
    # right-hand column ends with the grand total.
    per_column = frame.sum(numeric_only=True, axis=0)
    frame.loc[column_total] = per_column

    per_row = frame.sum(numeric_only=True, axis=1)
    frame.loc[:, row_total] = per_row

    return frame
def drop_totals(
    df: pd.DataFrame,
    column_total: AnyStr = "total",
    row_total: AnyStr = "total",
    inplace=True,
) -> pd.DataFrame:
    """Remove Row and Column totals from a dataframe (in place).

    Labels that are not present are silently skipped, so calling this on a
    frame without totals is a no-op.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame from which to remove totals.
    column_total : AnyStr, optional
        Label of the totals *column* to drop, by default "total".
    row_total : AnyStr, optional
        Label of the totals *row* (index entry) to drop, by default "total".
    inplace : bool, optional
        When True (the default) ``df`` itself is modified and returned;
        otherwise a deep copy is modified and ``df`` is left untouched.

    Returns
    -------
    pd.DataFrame
        The DataFrame with totals removed (``df`` itself when ``inplace`` is
        True).

    Notes
    -----
    ``add_totals`` uses ``column_total`` as the label of the added *row* and
    ``row_total`` as the label of the added *column*, i.e. the opposite
    mapping to this function. With the shared default "total" this makes no
    difference; when the two labels differ, pass them swapped here to undo an
    ``add_totals`` call.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'total': [5, 7, 9]})
    >>> df.loc['total'] = [6, 15, 21]
    >>> drop_totals(df)
       A  B
    0  1  4
    1  2  5
    2  3  6
    """
    if not inplace:
        df = df.copy(deep=True)

    # ``DataFrame.drop`` returns a new object unless ``inplace=True`` is
    # passed; rebinding ``df = df.drop(...)`` would leave the caller's frame
    # unchanged and break the in-place contract.
    if column_total in df.columns:
        df.drop(columns=[column_total], inplace=True)
    if row_total in df.index:
        df.drop(index=[row_total], inplace=True)
    return df
def fix_datetime_tz_columns(df: pd.DataFrame, inplace=True) -> pd.DataFrame:
    """Strip timezone information from the timezone-aware datetime columns.

    Columns are picked with ``select_dtypes`` and each one is replaced by its
    ``tz_localize(None)`` counterpart; all other columns are left as they are.

    Parameters
    ----------
    df : pd.DataFrame
        The frame whose datetime columns should be made timezone-naive.
    inplace : bool, optional
        When True (the default) the columns of ``df`` itself are reassigned;
        otherwise a deep copy is processed and ``df`` is left untouched.

    Returns
    -------
    pd.DataFrame
        The processed frame (``df`` itself when ``inplace`` is True).
    """
    frame = df if inplace else df.copy(deep=True)

    tz_aware_names = frame.select_dtypes(include=["datetime64[ns, UTC]", "datetimetz"]).columns
    for name in tz_aware_names:
        naive_values = frame[name].dt.tz_localize(None)
        frame[name] = naive_values

    return frame
def top_n(df: pd.DataFrame, n: int, others: AnyStr = "others") -> pd.DataFrame:
    """Keep the first ``n`` rows and fold every later row into one summary row.

    The rows are taken in their existing order (no sorting is performed). The
    summary row holds the numeric-only sum of the rows after the first ``n``.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to truncate.
    n : int
        The number of leading rows to keep.
    others : AnyStr, optional
        Index label given to the summary row, by default "others".

    Returns
    -------
    pd.DataFrame
        The first ``n`` rows followed by the summary row. When ``n`` is at
        least ``len(df)`` there is nothing to fold and ``df`` itself is
        returned unchanged.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3, 4, 5], 'B': [5, 4, 3, 2, 1]})
    >>> top_n(df, 3)  # doctest: +NORMALIZE_WHITESPACE
            A  B
    0       1  5
    1       2  4
    2       3  3
    others  9  3
    """
    if len(df) <= n:
        return df

    head, tail = df.iloc[:n], df.iloc[n:]
    remainder = tail.sum(numeric_only=True)

    if not isinstance(remainder, (pd.Series, pd.DataFrame)):
        # A scalar sum is wrapped so it can be turned into a row below.
        remainder = pd.Series(remainder, name=others)
    else:
        remainder.name = others

    # The sums are indexed by column label; transpose to get a single row
    # whose index label is ``others``.
    others_row = remainder.to_frame().T
    return pd.concat([head, others_row])