Source code for mydatapreprocessing.helpers.helpers_internal

"""Module for helpers subpackage."""

from __future__ import annotations
from typing import overload

import numpy as np
import pandas as pd

from ..types import DataFrameOrArrayGeneric, PandasIndex


def check_column_in_df(df: pd.DataFrame, name_or_index: PandasIndex, source: None | str = None) -> None:
    """Raise an error if the requested column is not available in the DataFrame.

    Args:
        df (pd.DataFrame): Input data.
        name_or_index (PandasIndex): Column name, integer position (negative positions count from
            the end) or pandas.Index that holds exactly one column name or position.
        source (str, optional): If set, it is referenced in the caption of the raised KeyError
            message. Defaults to None.

    Raises:
        KeyError: If column name is not found in DataFrame.
        ValueError: If integer position is out of range or if pandas.Index has not size 1.
        TypeError: If `name_or_index` is not str, integer or pandas.Index.

    Example:
        >>> df = pd.DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        >>> check_column_in_df(df, "a")
        >>> check_column_in_df(df, "z")
        Traceback (most recent call last):
        KeyError...
    """
    if isinstance(name_or_index, str):
        if name_or_index not in df.columns:
            caption = "Column not found error." if not source else f"Error in '{source}.'"
            raise KeyError(
                f"{caption} Column '{name_or_index}' not found in data. Available columns: {list(df.columns)}"
            )

    elif isinstance(name_or_index, (int, np.integer)):
        columns_count = len(df.columns)
        # Negative positions are accepted, but only when they stay in range. Checking just the
        # upper bound would let e.g. -4 pass validation for a DataFrame with three columns.
        if not -columns_count <= name_or_index < columns_count:
            raise ValueError(
                f"Column '{name_or_index}' not available as df has only {df.shape[1]} columns.",
            )

    elif isinstance(name_or_index, pd.Index):
        if name_or_index.size != 1:
            raise ValueError(
                "If checking column in DataFrame with pandas.Index, it has to have size 1.",
            )
        # Validate the single wrapped value and keep `source`, so the error caption is not lost.
        check_column_in_df(df, name_or_index.values[0], source)

    else:
        raise TypeError("Checking column in DataFrame is possible only with PandasIndex types.")
@overload
def get_column_name(df: pd.DataFrame, index: str | int | np.integer) -> str:
    ...


@overload
def get_column_name(df: pd.DataFrame, index: pd.Index) -> pd.Index:
    ...


def get_column_name(df: pd.DataFrame, index: PandasIndex) -> str | pd.Index:
    """Return a key that can be used to access the column directly.

    User can define the column by its name or by its integer position and both need different
    syntax for selecting. Here the availability of the column is verified first and then
    integer position is replaced with the label stored on that position in `df.columns`, so
    the result can be always used the same way. Name and pandas.Index are returned untouched.

    Args:
        df (pd.DataFrame): Input data.
        index (PandasIndex): Column name, integer position or pandas.Index.

    Returns:
        str | pd.Index: Column label, or the original pandas.Index.

    Raises:
        TypeError: If `index` is not one of str | int | np.integer | pd.Index.

    Example:
        >>> df = pd.DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        >>> get_column_name(df, "b")
        'b'
        >>> get_column_name(df, 2)
        'c'
        >>> get_column_name(df, "z")
        Traceback (most recent call last):
        KeyError...
    """
    # Validation is done first, it raises if column is not available.
    check_column_in_df(df, index)

    # Name and pandas.Index are already usable as they are.
    if isinstance(index, (str, pd.Index)):
        return index

    # Integer position is translated to the label on that position.
    if isinstance(index, (int, np.integer)):
        return df.columns.values[index]

    raise TypeError("Column type must be one of str | int | np.integer | pd.Index.")
def get_copy_or_view(data: DataFrameOrArrayGeneric, inplace: bool) -> DataFrameOrArrayGeneric:
    """Return the original data if `inplace`, otherwise return a copy of it.

    It keeps the copy logic and its type hints on one place, so input type is also the output type.

    Args:
        data (DataFrameOrArrayGeneric): Input data.
        inplace (bool): If True, the same object is returned. If False, a copy is returned.

    Returns:
        DataFrameOrArrayGeneric: Copy or original data.

    Raises:
        TypeError: If copy is requested and data are neither pandas DataFrame nor numpy ndarray.

    Example:
        >>> a = np.array([1, 2, 3])
        >>> b = get_copy_or_view(a, inplace=True)
        >>> id(a) == id(b)
        True
        >>> b = get_copy_or_view(a, inplace=False)
        >>> id(a) == id(b)
        False
    """
    if inplace:
        return data

    if isinstance(data, (pd.DataFrame, np.ndarray)):
        return data.copy()

    # Fail explicitly. Falling through would silently return None, which breaks the declared
    # return type and only moves the error to some later place.
    raise TypeError(
        f"Only pandas DataFrame or numpy ndarray can be copied, but '{type(data).__name__}' was used."
    )
def check_not_empty(data: DataFrameOrArrayGeneric):
    """Verify that the data contain at least one element.

    Some functions would fail with not very clear error on empty data, so it can be checked
    here in advance.

    Args:
        data (DataFrameOrArrayGeneric): Data to be checked.

    Raises:
        TypeError: If `data.size == 0`.
    """
    # There is something in the data, nothing to report.
    if data.size:
        return

    container_label = "numpy ndarray" if isinstance(data, np.ndarray) else "pandas DataFrame"
    raise TypeError(f"Used {container_label} is empty which would cause further error.")