Source code for mydatapreprocessing.consolidation.consolidation_functions.consolidation_functions_internal

"""Module for consolidation_functions subpackage."""

from __future__ import annotations
from typing import Any

from typing_extensions import Literal
import pandas as pd

import mylogging

from ...types import DataFrameOrArrayGeneric, PandasIndex, Numeric
from ...helpers import get_copy_or_view, get_column_name

# Lazy loaded
# from pandas.tseries.frequencies import to_offset

# TODO implement inplace parameter as in preprocessing


def check_shape_and_transform(data: DataFrameOrArrayGeneric, inplace=False) -> DataFrameOrArrayGeneric:
    """Make sure the data are laid out as (n_samples, n_features).

    Some functions work with a defined shape of data - (n_samples, n_features). When the input has
    fewer rows than columns, it is transposed and the fact is logged. Otherwise the input is returned
    untouched.

    Args:
        data (DataFrameOrArrayGeneric): Input data. Both ``shape[0]`` and ``shape[1]`` are read, so the
            data must have at least two dimensions.
        inplace (bool, optional): Forwarded to ``get_copy_or_view`` before the transposition. If True,
            then original data are edited. If False, copy is created. Defaults to False.

    Returns:
        DataFrameOrArrayGeneric: Data with verified shape.

    Example:
        >>> import numpy as np
        >>> data = np.array([range(10), range(10)])
        >>> data.shape
        (2, 10)
        >>> data = check_shape_and_transform(data)
        >>> data.shape
        (10, 2)
    """
    n_rows = data.shape[0]
    n_columns = data.shape[1]

    # At least as many samples as features - the layout is considered correct
    if n_rows >= n_columns:
        return data

    data = get_copy_or_view(data, inplace)

    shape_message = (
        "Input data must be in shape (n_samples, n_features) that means (rows, columns) Your shape is "
        f" {data.shape}. It's unusual to have more features than samples. Probably wrong shape."
    )
    mylogging.info(shape_message, caption="Data transposed warning!!!")

    return data.T
def categorical_embedding(
    data: pd.DataFrame,
    embedding: Literal["label", "one-hot"] = "label",
    unique_threshold: Numeric = 0.6,
    inplace=False,
) -> pd.DataFrame:
    """Transform string categories such as 'US', 'FR' into numeric values.

    This is necessary for example in machine learning models.

    Args:
        data (pd.DataFrame): Data with string (pandas Object dtype) columns. Every column that is not of
            a numeric dtype is processed.
        embedding ("label", "one-hot", optional): Categorical encoding. 'label' gives each category
            (unique string) a concrete number, so the result has the same number of columns. 'one-hot'
            creates a new column for every category and removes the source column.
            Defaults to "label".
        unique_threshold (Numeric, optional): Remove string columns that have too many categories
            (ids, hashes etc.). A column is dropped when the ratio of unique values to the number of rows
            is greater than or equal to ``1 - unique_threshold``. E.g. with 0.9 and a column of length
            100, the column is dropped once it has 10 or more categories. Min is 0, max is 1.
            Defaults to 0.6.
        inplace (bool, optional): If True, then original data are edited. If False, copy is created.
            Defaults to False.

    Returns:
        pd.DataFrame: DataFrame where string columns are transformed to numeric ones. Columns whose
        unique values cannot be counted (a TypeError is raised, e.g. for unhashable objects) are dropped.

    Example:
        >>> df = pd.DataFrame(["One", "Two", "One", "Three", "One"])
        >>> categorical_embedding(df, embedding="label", unique_threshold=0.1)
           0
        0  0
        1  2
        2  0
        3  1
        4  0
        >>> categorical_embedding(df, embedding="one-hot", unique_threshold=0.1)
           One  Three  Two
        0    1      0    0
        1    0      0    1
        2    1      0    0
        3    0      1    0
        4    1      0    0
    """
    data = get_copy_or_view(data, inplace)

    to_drop = []

    for i in data.select_dtypes(exclude=["number"]):
        try:
            n_rows = len(data[i])
            # An empty column has no meaningful unique ratio - skip the division instead of
            # failing with ZeroDivisionError and let the column be encoded as usual
            if n_rows and (data[i].nunique() / n_rows) >= (1 - unique_threshold):
                to_drop.append(i)
                continue
        except TypeError:
            # Unique values cannot be counted (e.g. unhashable objects) - column is not usable
            to_drop.append(i)
            continue

        data[i] = data[i].astype("category", copy=False)

        if embedding == "label":
            data[i] = data[i].cat.codes
        elif embedding == "one-hot":
            data = data.join(pd.get_dummies(data[i]))
            # Source column is replaced by the generated indicator columns
            to_drop.append(i)

    # Drop columns with too many categories (and one-hot encoded sources) - drop all columns at once
    # for better performance
    data.drop(to_drop, axis=1, inplace=True)

    return data
def set_datetime_index(
    df: pd.DataFrame,
    name_or_index: PandasIndex,
    on_error: Literal["ignore", "raise"] = "ignore",
    inplace: bool = False,
) -> pd.DataFrame:
    """Use the defined column as index and try to convert the index to datetime.

    Args:
        df (pd.DataFrame): Input data.
        name_or_index (PandasIndex): Name or index of datetime column that will be set as index.
            It is resolved to a column name with ``get_column_name``.
        on_error (Literal["ignore", "raise"]): What happens if converting to datetime fails with
            ValueError. With "ignore" the column stays set as index, just not converted.
            Defaults to "ignore".
        inplace (bool, optional): If True, then original data are edited. If False, copy is created.
            Defaults to False.

    Raises:
        ValueError: If defined column failed to convert to datetime and ``on_error`` is "raise".

    Returns:
        pd.DataFrame: Data with the defined column used as index.

    Example:
        >>> from datetime import datetime
        ...
        >>> df = pd.DataFrame(
        ...     {
        ...         "col_1": [1] * 3,
        ...         "col_2": [2] * 3,
        ...         "date": [
        ...             datetime(2022, 1, 1),
        ...             datetime(2022, 2, 1),
        ...             datetime(2022, 3, 1),
        ...         ],
        ...     }
        ... )
        >>> df = set_datetime_index(df, 'date', inplace=True)
        >>> isinstance(df.index, pd.DatetimeIndex)
        True
    """
    frame = get_copy_or_view(df, inplace)
    datetime_column = get_column_name(frame, name_or_index)

    # The column is moved out of the data and becomes the index
    frame.set_index(datetime_column, drop=True, inplace=True)

    try:
        frame.index = pd.to_datetime(frame.index)  # type: ignore
    except ValueError as err:
        should_raise = on_error == "raise"
        if should_raise:
            failure_message = (
                "Error in 'mydatapreprocessing' package in 'set_datetime_index' function. Setting of "
                f"datetime index from column '{datetime_column}' failed."
            )
            raise ValueError(failure_message) from err

    return frame
def infer_frequency(
    df: pd.DataFrame, on_error: Literal[None, "warn", "raise"] = "warn", inplace=False
) -> pd.DataFrame:
    """When DataFrame has datetime index, it will try to infer its frequency.

    Args:
        df (pd.DataFrame): Input data.
        on_error (Literal[None, "warn", "raise"]): Define what to do when frequency is not inferred.
            None does nothing, "warn" logs a warning, "raise" raises an error. Defaults to "warn".
        inplace (bool, optional): If True, then original data are edited. If False, copy is created.
            Defaults to False.

    Raises:
        TypeError: If index is not pd.DatetimeIndex or pd.TimedeltaIndex, or if the frequency was not
            inferred and ``on_error`` is "raise".
        ValueError: If pandas refuses to infer the frequency (it needs at least three values) and
            ``on_error`` is "raise".

    Returns:
        pd.DataFrame: Data with datetime index that has frequency set if it was possible to infer it.

    Example:
        >>> df = pd.DataFrame([[1], [2], [3]], index=["08/04/2022", "09/04/2022", "10/04/2022"])
        >>> df.index = pd.to_datetime(df.index)
        >>> df = infer_frequency(df)
        >>> df.index.freq
        <Day>
    """
    df = get_copy_or_view(df, inplace)

    if not isinstance(df.index, (pd.DatetimeIndex, pd.TimedeltaIndex)):
        raise TypeError(
            "Error in 'mydatapreprocessing' package in 'infer_frequency' function. Index is not "
            "pd.DatetimeIndex | pd.TimedeltaIndex type. You can use 'set_datetime_index' function to convert "
            "it from string."
        )

    if df.index.freq is None:
        try:
            freq = pd.infer_freq(df.index)
        except ValueError:
            # pandas raises for too short index. Respect 'on_error': only propagate when the user asked
            # for raising, otherwise treat it as a failed inference handled below
            if on_error == "raise":
                raise
            freq = None

        if freq:
            # Lazy loaded
            from pandas.tseries.frequencies import to_offset

            df.index.freq = to_offset(freq)

    if df.index.freq is None:
        message = (
            "Error in 'mydatapreprocessing' package in 'infer_frequency' function. Frequency inferring "
            "failed. Check the datetime index and try 'set_datetime_index' first."
        )
        if on_error == "warn":
            mylogging.warn(message, caption="Error in 'infer_frequency'")
        elif on_error == "raise":
            raise TypeError(message)

    return df
def resample(
    df: pd.DataFrame,
    freq: Literal["S", "min", "H", "M", "Y"] | str,
    resample_function: Literal["sum", "mean"] = "sum",
) -> pd.DataFrame:
    """Change the sampling frequency.

    The input data are not modified. Sorting by index happens on a new object.

    Args:
        df (pd.DataFrame): Input data with an index that pandas is able to resample (e.g. datetime).
        freq (Literal["S", "min", "H", "M", "Y"] | str): Frequency of resampled data. For possible
            options check pandas 'Offset aliases'.
        resample_function (Literal['sum', 'mean'], optional): 'sum' or 'mean'. Whether sum resampled
            columns, or use average. Defaults to 'sum'.

    Raises:
        ValueError: If ``resample_function`` is not one of the supported values.

    Returns:
        pd.DataFrame: Resampled data.

    Example:
        >>> from datetime import datetime, timedelta
        ...
        >>> df = pd.DataFrame(
        ...     {
        ...         "date": [
        ...             datetime(2022, 1, 1),
        ...             datetime(2022, 1, 2),
        ...             datetime(2022, 2, 1),
        ...             datetime(2022, 4, 1)
        ...         ],
        ...         "col_1": [1] * 4,
        ...         "col_2": [2] * 4,
        ...     }
        ... )
        >>> df
                date  col_1  col_2
        0 2022-01-01      1      2
        1 2022-01-02      1      2
        2 2022-02-01      1      2
        3 2022-04-01      1      2
        >>> df = df.set_index("date")
        >>> df = resample(df, "M", "sum")
        >>> df
                    col_1  col_2
        date
        2022-01-31      2      4
        2022-02-28      1      2
        2022-03-31      0      0
        2022-04-30      1      2
        >>> df.index.freq
        <MonthEnd>
    """
    # Fail loudly instead of silently returning data that were not resampled at all
    if resample_function not in ("sum", "mean"):
        raise ValueError(
            "Error in 'mydatapreprocessing' package in 'resample' function. Parameter "
            f"'resample_function' has to be 'sum' or 'mean', but got {resample_function!r}."
        )

    # Sort on a new object so the caller's data keep the original order
    resampler = df.sort_index().resample(freq)

    if resample_function == "mean":
        return pd.DataFrame(resampler.mean())

    return pd.DataFrame(resampler.sum())
def move_on_first_column(df: pd.DataFrame, name_or_index: PandasIndex) -> pd.DataFrame:
    """Put the defined column on position 0.

    Use case for that can be for example to have it well visible in a generated table. The column is
    popped from the passed DataFrame and inserted back, so the input object itself is changed.

    Args:
        df (pd.DataFrame): Input data.
        name_or_index (PandasIndex): Index or name of the column that will be moved.

    Raises:
        KeyError: Defined column not found in data (presumably raised by ``get_column_name``).

    Returns:
        pd.DataFrame: DataFrame with defined column at index 0.

    Example:
        >>> move_on_first_column(pd.DataFrame([[1, 2, 3]], columns=["One", "Two", "Three"]), "Two").columns
        Index(['Two', 'One', 'Three']...
    """
    column_name = get_column_name(df, name_or_index)

    # Take the column out first, then put it back on the very first position
    moved_column = df.pop(column_name)  # type: ignore - It's validated in get_column_name
    df.insert(0, column_name, moved_column)  # type: ignore - It's validated in get_column_name

    return df
def remove_nans(
    data: DataFrameOrArrayGeneric,
    remove_all_column_with_nans_threshold: None | Numeric = None,
    remove_nans_type: None | Literal["interpolate", "mean", "neighbor", "remove"] | Any = None,
    inplace: bool = False,
) -> DataFrameOrArrayGeneric:
    """Remove NotANumber values.

    Columns where are too many NaN values are dropped. Then in rest of columns rows with NaNs are
    removed or NaNs are replaced.

    Args:
        data (DataFrameOrArrayGeneric): Data in shape (n_samples, n_features).
        remove_all_column_with_nans_threshold (None | Numeric, optional): From 0 to 1. Require that many
            non-nan values in column to not be deleted. E.g. if value is 0.9 with column with 10 values,
            90% must be non-nan, which implies max 1 np.nan can be present, otherwise column will be
            deleted. If None (or 0), no column is dropped. Defaults to None.
        remove_nans_type (None | Literal["interpolate", "mean", "neighbor", "remove"] | Any, optional):
            Remove or replace rest nan values. 'interpolate' interpolates the values, 'remove' drops rows
            with NaNs, 'neighbor' uses the previous value, 'mean' uses the column mean. For 'interpolate'
            and 'neighbor' the remaining leading NaNs are filled with the next valid value. If you want
            to use a concrete value, just use the value directly. If None, NaNs are kept.
            Defaults to None.
        inplace (bool, optional): If True, then original data are edited. If False, copy is created.
            Defaults to False.

    Returns:
        DataFrameOrArrayGeneric: DataFrame if the input was a DataFrame, otherwise array of values.

    Example:
        >>> import numpy as np
        ...
        >>> array = np.array([[1, 2, np.nan], [2, np.nan, np.nan], [3, 4, np.nan]])
        >>> array
        array([[ 1.,  2., nan],
               [ 2., nan, nan],
               [ 3.,  4., nan]])
        >>> cleaned_df = remove_nans(
        ...     array,
        ...     remove_all_column_with_nans_threshold=0.5,
        ...     remove_nans_type="interpolate"
        ... )
        >>> cleaned_df
        array([[1., 2.],
               [2., 3.],
               [3., 4.]])
    """
    data = get_copy_or_view(data, inplace)

    df = pd.DataFrame(data) if not isinstance(data, pd.DataFrame) else data

    # Remove columns that have too many nan values
    if remove_all_column_with_nans_threshold:
        df = df.dropna(axis=1, thresh=int(len(df) * (remove_all_column_with_nans_threshold)))

    if remove_nans_type is not None:
        # Replace rest of nan values
        if remove_nans_type == "interpolate":
            df.interpolate(inplace=True)

        elif remove_nans_type == "remove":
            df.dropna(axis=0, inplace=True)

        elif remove_nans_type == "neighbor":
            # 'fillna(method=...)' is deprecated in pandas - dedicated 'ffill' does the same
            df.ffill(inplace=True)

        elif remove_nans_type == "mean":
            for col in df.columns:
                df[col] = df[col].fillna(df[col].mean())

        else:
            df.fillna(remove_nans_type, inplace=True)

        # Forward fill and interpolate can miss some nans if on first row, so both directions are needed
        if remove_nans_type in ["interpolate", "neighbor"]:
            df.bfill(inplace=True)

    if isinstance(data, pd.DataFrame):
        return df
    else:
        return df.values
def cast_str_to_numeric(df: pd.DataFrame, on_error: Literal["ignore", "raise"] = "ignore") -> pd.DataFrame:
    """Convert string values in DataFrame to numbers where possible.

    Conversion is done column by column with ``pd.to_numeric``.

    Args:
        df (pd.DataFrame): Data
        on_error (Literal["ignore", "raise"]): What to do if meet error. With 'ignore', a column that
            cannot be converted is kept as it is. Any other value is passed to ``pd.to_numeric`` as its
            ``errors`` parameter. Defaults to 'ignore'.

    Returns:
        pd.DataFrame: Data with possibly converted types.

    Example:
        >>> df = cast_str_to_numeric(pd.DataFrame({"numbers": ["1", "2"], "words": ["one", "two"]}))
        >>> df["numbers"].tolist()
        [1, 2]
        >>> df["words"].tolist()
        ['one', 'two']
    """

    def to_numeric_or_original(column: pd.Series) -> pd.Series:
        """Convert one column, or return it untouched if it cannot be parsed."""
        try:
            return pd.to_numeric(column, errors="raise")
        except (ValueError, TypeError):
            return column

    if on_error == "ignore":
        # errors="ignore" is deprecated in pandas 'to_numeric', so the same behavior is
        # implemented explicitly - invalid parsing returns the input
        return df.apply(to_numeric_or_original)

    return df.apply(pd.to_numeric, errors=on_error)  # type: ignore