# Source code for mydatapreprocessing.consolidation.consolidation_config.subconfigurations.subconfigurations
"""Content for consolidation subconfigs subpackage."""
from __future__ import annotations
from typing import Any
from typing_extensions import Literal
import numpy as np
import pandas as pd
from mypythontools.config import Config, MyProperty
from ....types import PandasIndex, Numeric
class Datetime(Config):
    """Define whether to set a datetime index.

    Each option is declared with ``MyProperty``; the value returned by the
    decorated function is the option's default.
    """

    @MyProperty
    def datetime_column(self) -> PandasIndex | None:
        """Name or index of the column that is set as the index and converted to datetime.

        Type:
            PandasIndex | None

        Default:
            None

        If None, no column is set as the index.
        """
        return None

    @MyProperty
    def on_set_datetime_error(self) -> Literal["ignore", "raise"]:
        """Define what happens if the conversion to datetime fails.

        Type:
            Literal["ignore", "raise"]

        Default:
            "ignore"
        """
        return "ignore"
class Resample(Config):
    """Change the sampling frequency.

    Each option is declared with ``MyProperty``; the value returned by the
    decorated function is the option's default.
    """

    @MyProperty
    def resample(self) -> None | Literal["S", "min", "H", "M", "Y"] | str:
        """Frequency of the resampled data.

        Type:
            None | Literal["S", "min", "H", "M", "Y"] | str

        Default:
            None

        If None, the data are not resampled.
        """
        return None

    @MyProperty
    def resample_function(self) -> Literal["sum", "mean"]:
        """Define whether resampled values are the sum of the values or their average.

        Type:
            Literal["sum", "mean"]

        Default:
            "sum"
        """
        return "sum"
class RemoveMissingValues(Config):
    """Remove NaN values.

    Each option is declared with ``MyProperty``; the value returned by the
    decorated function is the option's default.
    """

    @MyProperty
    def remove_all_column_with_nans_threshold(self) -> None | Numeric:
        """Delete a whole column based on its share of NaN values.

        Type:
            None | Numeric

        Default:
            0.85

        From 0 to 1. The fraction of non-NaN numeric values a column needs in order not to be
        deleted. E.g. with a value of 0.9 and a column of 10 values, 90% must be numeric, so at most
        1 np.nan may be present; otherwise the column is deleted.
        """
        # NOTE(review): the annotation allows None, but its meaning is not documented here --
        # presumably it disables the column removal; confirm against the consumer of this option.
        return 0.85

    @MyProperty
    def remove_nans_type(self) -> None | Literal["interpolate", "mean", "neighbor", "remove"] | Any:
        """Remove rows containing NaN, or replace the remaining NaN values.

        Type:
            None | Literal["interpolate", "mean", "neighbor", "remove"] | Any

        Default:
            "interpolate"

        If None, NaN values are not removed. To replace NaN with a concrete value, pass a float or
        an int.
        """
        return "interpolate"
class StringsToNumeric(Config):
    """Remove string values or replace them with numeric ones.

    Each option is declared with ``MyProperty``; the value returned by the
    decorated function is the option's default.
    """

    @MyProperty
    def embedding(self) -> None | Literal["label", "one-hot"]:
        """Implement categorical encoding.

        Type:
            None | Literal["label", "one-hot"]

        Default:
            "label"

        Create numbers from strings. 'label' gives each category (unique string) a concrete number,
        so the result has the same number of columns. 'one-hot' creates a new column for every
        category. Only columns in which strings repeat (see 'unique_threshold') are used.
        """
        return "label"

    @MyProperty
    def cast_str_to_numeric(self) -> bool:
        """Try to convert strings to numeric values.

        Type:
            bool

        Default:
            True

        Errors are ignored, so a column that cannot be converted to numeric is left untouched.
        """
        return True

    @MyProperty
    def only_numeric(self) -> bool:
        """Remove all non-numeric values.

        Type:
            bool

        Default:
            True

        If True, all non-numeric columns are dropped. 'cast_str_to_numeric' and 'embedding' are
        applied before the columns are dropped.
        """
        return True

    @MyProperty
    def unique_threshold(self) -> Numeric:
        """Remove string columns that have too many categories.

        Type:
            Numeric

        Default:
            0.6

        E.g. 0.9 defines that if a column contains more than 90% of NOT unique values, it is
        deleted. Min is 0, max is 1. It removes ids, hashes etc.
        """
        # NOTE(review): "NOT unique" is kept from the original wording, but the stated goal
        # (dropping ids and hashes) suggests the ratio of *unique* values is what is compared
        # against the threshold -- confirm against the code that consumes this option.
        return 0.6