Source code for mydatapreprocessing.preprocessing.preprocessing_pipeline_internal

"""Module for preprocessing subpackage."""

from __future__ import annotations
from typing import TYPE_CHECKING, Generic, cast, Union

import numpy as np
import pandas as pd

from ..types import DataFrameOrArrayGeneric, Numeric
from . import preprocessing_functions as pf
from .preprocessing_config import default_preprocessing_config, PreprocessingConfig
from .inverse_preprocessing_config import InversePreprocessingConfig


if TYPE_CHECKING:
    from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

    ScalerType = Union[MinMaxScaler, RobustScaler, StandardScaler]

# Lazy load

# import scipy.signal
# import scipy.stats
# from sklearn import preprocessing


[docs]def preprocess_data( data: DataFrameOrArrayGeneric, config: PreprocessingConfig = default_preprocessing_config ) -> tuple[DataFrameOrArrayGeneric, InversePreprocessingConfig]: """Main preprocessing function, that call other functions based on configuration. Mostly for preparing data to be optimal as input into machine learning models. Args: data (DataFrameOrArrayGeneric): Input data that we want to preprocess. config (PreprocessingConfig): Configure data preprocessing. It's documented in `preprocessing_config` subpackage. You can import and edit `default_preprocessing_config`. Intellisense and static type analysis should work. Defaults to `default_preprocessing_config` Returns: PreprocessedData: If input in numpy array, then also output in array, if DataFrame input, then DataFrame output. Example: >>> import pandas as pd >>> import mydatapreprocessing.preprocessing as mdpp >>> df = pd.DataFrame( ... np.array([range(5), range(20, 25), np.random.randn(5)]).astype("float32").T, ... ) >>> df.iloc[2, 0] = 500 ... >>> config = mdpp.preprocessing_config.default_preprocessing_config.do.copy() >>> config.do.update({"remove_outliers": 1, "difference_transform": True, "standardize": "standardize"}) Predicted column moved to index 0, but for test reason test, use different one >>> processed_df, inverse_preprocessing_config_df = mdpp.preprocess_data(df, config) >>> processed_df 0 1 2 1 -0.707107 -0.707107 0.377062 3 1.414214 1.414214 0.991879 4 -0.707107 -0.707107 -1.368941 Inverse preprocessing is done for just one column - the one on index 0 You can use `first_column` in consolidation to move the column to that index. >>> inverse_preprocessing_config_df.difference_transform = df.iloc[0, 0] >>> inverse_processed_df = mdpp.preprocess_data_inverse( ... processed_df.iloc[:, 0].values, inverse_preprocessing_config_df ... ) >>> np.allclose(inverse_processed_df, np.array([1, 3, 4])) True """ preprocessed = data.copy() if config.remove_outliers: preprocessed = pf.remove_outliers(preprocessed, threshold=config.remove_outliers) if config.smooth: preprocessed = pf.smooth(preprocessed, config.smooth[0], config.smooth[1]) if config.difference_transform: if isinstance(preprocessed, np.ndarray): last_undiff_value = preprocessed[-1, 0] else: last_undiff_value = preprocessed.iloc[-1, 0] last_undiff_value = cast(Numeric, last_undiff_value) preprocessed = pf.do_difference(preprocessed) else: last_undiff_value = None if config.standardize: preprocessed, final_scaler = pf.standardize(preprocessed, used_scaler=config.standardize) else: final_scaler = None if config.discretization.discretize: preprocessed = pf.binning( preprocessed, config.discretization.discretize, config.discretization.binning_type ) preprocessed = cast(DataFrameOrArrayGeneric, preprocessed) inverse_preprocessing_config = InversePreprocessingConfig() inverse_preprocessing_config.difference_transform = last_undiff_value inverse_preprocessing_config.standardize = final_scaler return preprocessed, inverse_preprocessing_config
[docs]def preprocess_data_inverse( data: pd.DataFrame | np.ndarray, config: InversePreprocessingConfig ) -> np.ndarray: """Undo all data preprocessing to get real data. Does not inverse all the columns, but only defined one. Only predicted column is also returned. Order is reverse than preprocessing. Output is in numpy array. Args: data (pd.DataFrame | np.ndarray): One dimension (one column) preprocessed data. Do not use ndim > 1. config (InversePreprocessingConfig): Data necessary for inverse transformation. It does not need to be configured, but it is returned from `preprocess_data`. Returns: np.ndarray: Inverse preprocessed data Example: >>> import pandas as pd >>> import numpy as np >>> import mydatapreprocessing.preprocessing as mdpp ... >>> df = pd.DataFrame( ... np.array([range(5), range(20, 25), np.random.randn(5)]).astype("float32").T, ... ) >>> preprocessed, inverse_config = mdpp.preprocess_data(df.values) >>> preprocessed array([[-1.4142135 , -1.4142135 , 0.1004863 ], [-0.70710677, -0.70710677, 0.36739323], [ 0. , 0. , -1.1725829 ], [ 0.70710677, 0.70710677, 1.6235067 ], [ 1.4142135 , 1.4142135 , -0.9188035 ]], dtype=float32) >>> preprocess_data_inverse(preprocessed[:, 0], inverse_config) array([0., 1., 2., 3., 4.], dtype=float32) """ inverse = data.values if isinstance(data, pd.DataFrame) else data if config.standardize: inverse = config.standardize.inverse_transform(inverse.reshape(1, -1)).ravel() if config.difference_transform is not None: inverse = pf.inverse_difference(inverse, config.difference_transform) return inverse