Source code for mydatapreprocessing.load_data.load_data_functions.load_data_functions_internal

"""Module for load_data_functions subpackage."""

from __future__ import annotations
from pathlib import Path
import io

import pandas as pd
from typing_extensions import Literal

from ... import datasets


def download_data_from_url(url: str, ssl_verification: None | bool | str = None) -> io.BytesIO:
    """Fetch the content behind a url and wrap it in an in-memory binary buffer.

    Args:
        url (str): Url with defined file.
        ssl_verification (None | bool | str, optional): Forwarded unchanged as the ``verify``
            argument of ``requests.get``, so it has the same meaning as in requests library.
            Defaults to None.

    Raises:
        FileNotFoundError: If the request itself fails, i.e. ``requests`` raises a
            ``RequestException``.
        RuntimeError: If the response object is falsy or its status code is outside
            the 200-299 range.

    Returns:
        io.BytesIO: Downloaded bytes wrapped in io.BytesIO so it can later be used for example
        in pandas read_x functions.

    Example:
        >>> downloaded = download_data_from_url(
        ...     "https://github.com/Malachov/mydatapreprocessing/blob/master/tests/test_files/csv.csv?raw=true"
        ... )
        >>> downloaded
        <_io.BytesIO object at...
        >>> downloaded.readline()
        b'Column 1, Column 2...
    """
    # Imported here rather than at module level, so requests is only needed when downloading.
    import requests

    try:
        response = requests.get(url, verify=ssl_verification)
    except requests.exceptions.RequestException as err:
        raise FileNotFoundError(
            f"Url '{url}' probably not available or no permissions available."
        ) from err

    # Happy path first: a truthy response carrying a 2xx status code.
    succeeded = bool(response) and 200 <= response.status_code < 300
    if succeeded:
        return io.BytesIO(response.content)

    raise RuntimeError(
        f"Request failed with status {response.status_code}.",
    )
def return_test_data(data: Literal["test_ramp", "test_sin", "test_random", "test_ecg"]) -> pd.DataFrame:
    """Return one of the bundled test datasets selected by its name.

    Args:
        data (Literal['test_ramp', 'test_sin', 'test_random', 'test_ecg']): Name of the requested
            test data. Each name maps to one function of the ``datasets`` module.

    Returns:
        pd.DataFrame: Test data. If the name matches none of the known datasets, nothing is
        built and None is returned.

    Example:
        >>> return_test_data('test_ramp')
               0
        0      0
        1      1
        2      2
        ...
    """
    # Pairs of (accepted name, function name in the ``datasets`` module). The function is
    # looked up only after a name matched, so unknown names never touch ``datasets``.
    generators = (
        ("test_ramp", "ramp"),
        ("test_sin", "sin"),
        ("test_random", "random"),
        ("test_ecg", "get_ecg"),
    )

    for dataset_name, generator_name in generators:
        if data == dataset_name:
            generate = getattr(datasets, generator_name)
            return pd.DataFrame(generate())

    return None
def get_file_type(data_path: Path, request_datatype_suffix: None | str = None) -> str:
    """Give file name or url with extension and return file extension.

    If the extension is not at the end of the path (typically a url), it can be passed
    explicitly and then takes precedence over whatever the path contains.

    Args:
        data_path (Path): Defined path. It can also be URL, but it must be pathlib.Path in format.
        request_datatype_suffix (None | str): If there is no extension in name, it can be defined
            via parameter. A leading dot is optional. Defaults to None.

    Raises:
        TypeError: If extension not inferred and not defined with param.

    Returns:
        str: Extension lowered like for example 'csv'.
    """
    if request_datatype_suffix:
        # Explicit suffix wins. Accept both 'csv' and '.csv'.
        file_type = request_datatype_suffix.lower()
        if file_type.startswith("."):
            file_type = file_type[1:]
    else:
        # For example csv or json. On url, take everything after last dot.
        # NOTE(review): a url query string after the extension ('file.csv?raw=true') stays part
        # of the suffix here - pass 'request_datatype_suffix' for such urls.
        file_type = data_path.suffix[1:].lower()

    # Checked for both branches, so an explicit suffix of just '.' is rejected as well.
    if not file_type:
        raise TypeError(
            "Data has no suffix (e.g. csv). If using url with no suffix, setup "
            "'request_datatype_suffix' or insert data with local path or insert data for example in "
            f"DataFrame or numpy array. \n\nParsed data are '{data_path}'",
        )

    return file_type