"""Module for data_parsers subpackage."""
from __future__ import annotations
from typing import Any
import io
from pathlib import Path
from typing_extensions import Literal
import pandas as pd
import mylogging
from ...load_data_internal import load_data


def csv_load(
data: io.BytesIO | str | Path,
csv_style: Literal["infer"] | dict = "infer",
header: Literal["infer"] | None | int = None,
max_imported_length: None | int = None,
) -> pd.DataFrame:
"""Load CSV data and infer used separator.
Args:
data (io.BytesIO | str | Path): Input data.
        csv_style (Literal["infer"] | dict, optional): If "infer", the separator and decimal sign
            are inferred automatically, else define a dictionary with `sep` and `decimal` keys.
            E.g. {'sep': ';', 'decimal': ','}. Defaults to "infer".
        header (Literal['infer'] | None | int, optional): Row number to use as the header (usually
            the one with column names). Defaults to None.
        max_imported_length (None | int, optional): If set, only the last N rows are used.
            Defaults to None.

    Raises:
        RuntimeError: If loading fails.

    Returns:
        pd.DataFrame: Loaded data.
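
    Example:
        A minimal sketch, assuming a file "data.csv" with a header row (hypothetical path)::

            df = csv_load("data.csv", header=0)

        Separator and decimal sign can also be passed explicitly::

            df = csv_load("data.csv", csv_style={"sep": ";", "decimal": ","}, header=0)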
"""
    if csv_style == "infer":
        # Take a line from further down the file (to avoid a possible header row)
        # and infer the separator and decimal sign from it.
        if isinstance(data, io.BytesIO):
            data_line = _get_further_data_line(data)
            # Rewind the stream so read_csv starts from the beginning again.
            data.seek(0)
        else:
            with open(data, "r") as data_read:
                data_line = _get_further_data_line(data_read)

        # Heuristic: a semicolon separator usually implies a decimal comma, so semicolons
        # are expected to be at least nearly as frequent as commas on a data line.
        if data_line.count(";") and data_line.count(";") >= data_line.count(",") - 1:
            sep = ";"
            decimal = ","
        else:
            sep = ","
            decimal = "."
    else:
        sep = csv_style["sep"]
        decimal = csv_style["decimal"]
    try:
        loaded = pd.read_csv(data, header=header, sep=sep, decimal=decimal)
    except UnicodeDecodeError:
        # Retry with Windows encoding. A stream has to be rewound first, because the
        # failed attempt has already consumed part of it.
        if isinstance(data, io.BytesIO):
            data.seek(0)
        loaded = pd.read_csv(
            data,
            header=header,
            sep=sep,
            decimal=decimal,
            encoding="cp1252",
        )
    except Exception as err:
        raise RuntimeError(
            "CSV load failed. Try to set correct `header` and `csv_style`.",
        ) from err
if max_imported_length:
loaded = loaded.iloc[-max_imported_length:, :]
return loaded


def load_dict(
    data: dict[str, Any],
    data_orientation: Literal["index", "columns"] = "columns",
) -> pd.DataFrame:
"""Load dict with values to DataFrame.
Args:
data (dict[str, Any]): Data with array like values.
data_orientation (Literal["index", "columns"], optional): Define dict data orientation.
Defaults to "columns".

    Returns:
        pd.DataFrame: Loaded data.
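
    Example:
        A minimal sketch - scalar values would be wrapped into one-item lists the same way::

            df = load_dict({"col_1": [0, 1], "col_2": [2, 3]})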
"""
    # Scalar values are wrapped into one-item lists, so `from_dict` can build a DataFrame either way.
    if isinstance(next(iter(data.values())), list):
        dict_of_lists = data
    else:
        dict_of_lists = {key: [value] for key, value in data.items()}
return pd.DataFrame.from_dict(dict_of_lists, orient=data_orientation)


def json_load(
    data: str | Path | io.BytesIO,
    field: str = "",
    data_orientation: Literal["index", "columns"] = "columns",
) -> pd.DataFrame:
"""Load data from json to DataFrame.
The reason why pandas read_json is not used is that usually just some subfield with inner json is used.
Args:
        data (str | Path | io.BytesIO): Input data. Path to a file or io.BytesIO created for example
            from request content.
        field (str, optional): Node of the data to be used. You can use a dot to enter deeper
            levels of nested data, e.g. "key_1.sub_key_1". Defaults to "".
        data_orientation (Literal["index", "columns"], optional): Define dict data orientation.
            Defaults to "columns".

    Raises:
        KeyError: If the defined field is not available in the data.

    Returns:
        pd.DataFrame: Loaded data.
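
    Example:
        A minimal sketch parsing in-memory bytes (a file path works the same way)::

            df = json_load(io.BytesIO(b'{"data": {"col": [1, 2]}}'), field="data")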
"""
    # The json module is used directly (not pandas), so it is possible to use just one field's
    # values as a subset of the original data.
    import json
if isinstance(data, io.BytesIO):
data_loaded = json.loads(data.read())
else:
with open(data) as json_file:
data_loaded = json.load(json_file)
    if field:
        # Walk down the nested structure one dot-separated key at a time.
        for key in field.split("."):
            try:
                data_loaded = data_loaded[key]
            except KeyError as err:
                raise KeyError(
                    mylogging.format_str(f"Data load error. Defined field '{field}' not found in data.")
                ) from err
    # A list (e.g. JSON records) goes through the generic loader, a dict is loaded directly.
    if isinstance(data_loaded, list):
        return load_data(data_loaded)
    else:
        return load_dict(data_loaded, data_orientation)


def _get_further_data_line(data) -> str:
    """Return a line from further down the file, so the separator is not inferred from a header row."""
    data_line = ""
    # Read up to 20 lines and keep the last non-empty one.
    for _ in range(20):
        new_line = data.readline()
        if new_line:
            data_line = new_line
        else:
            break
    if isinstance(data_line, bytes):
        # io.BytesIO gives bytes - decode them so the separator counting works on real characters.
        return data_line.decode(errors="ignore")
    return str(data_line)