import datetime
import json
import numpy as np
import pandas as pd
import dask.array as da
import dask
from tqdm import tqdm
from tabulate import tabulate
from dama.abc.data import AbsData
from dama.data.it import Iterator, BaseIterator, BatchIterator
from dama.utils.core import Hash, Login, Metadata, Chunks, Shape
from dama.abc.driver import AbsDriver
from dama.drivers.core import Memory
from dama.drivers.sqlite import Sqlite
from dama.utils.logger import log_config
from dama.utils.config import get_settings
from dama.utils.decorators import cache, clean_cache
from dama.utils.files import get_dir_file_size
from dama.utils.order import order_table
from dama.groups.core import DaGroup
from pydoc import locate
settings = get_settings("paths")
log = log_config(__name__)
class Data(AbsData):
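    """Dataset abstraction backed by an AbsDriver (an in-memory driver by default).

    Descriptive attributes (author, description, hash, timestamp, compressor
    params) are stored in the driver's attrs; when the driver is persistent,
    a metadata record is also kept in a Sqlite table so the dataset can later
    be rebuilt with Data.load().
    """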
def __init__(self, name: str = None, driver: AbsDriver = None, group_name: str = None,
chunks: Chunks = None, auto_chunks=False, metadata_path: str = None):
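        """
        :param name: dataset name; required for any driver other than Memory.
        :param driver: storage backend (AbsDriver); defaults to Memory().
        :param group_name: optional group used when building the driver url.
        :param chunks: chunk specification used to read the data as a DaGroup.
        :param auto_chunks: if True, chunks are derived from shape and dtypes on open().
        :param metadata_path: location of the Sqlite metadata table (persistent
            drivers only); defaults to settings["metadata_path"].
        """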
if driver is None:
self.driver = Memory()
else:
self.driver = driver
if name is None and not isinstance(self.driver, Memory):
raise Exception("I can't build a dataset without a name, plese add a name to this dataset.")
if self.driver.persistent is True:
if metadata_path is not None:
self.metadata_path = metadata_path
else:
self.metadata_path = settings["metadata_path"]
self.metadata_driver = Sqlite(login=Login(table="data"), path=self.metadata_path)
else:
self.metadata_path = None
self.metadata_driver = None
self.name = name
self.header_map = ["author", "description"]
self.group_name = group_name
self.dtypes = None
self.hash = None
self.author = None
self.description = None
self.timestamp = None
self.compressor_params = None
self.chunksize = chunks
self.from_ds_hash = None
self.auto_chunks = auto_chunks
if self.driver.path is None:
path = settings["data_path"]
else:
path = self.driver.path
self.driver.build_url(self.name, group_level=self.group_name, path=path)
@property
def author(self):
return self._get_attr('author')
@author.setter
def author(self, value):
if value is not None:
self._set_attr('author', value)
@property
def dtype(self):
return self.driver.absgroup.dtype
@property
def description(self):
return self._get_attr('description')
@description.setter
def description(self, value):
if value is not None:
self._set_attr('description', value)
@property
def timestamp(self):
return self._get_attr('timestamp')
@timestamp.setter
def timestamp(self, value):
if value is not None:
self._set_attr('timestamp', value)
@property
def hash(self):
return self._get_attr('hash')
@hash.setter
def hash(self, value):
if value is not None:
self._set_attr('hash', value)
@property
def compressor_params(self):
return json.loads(self._get_attr('compressor_params'))
@compressor_params.setter
def compressor_params(self, value):
if value is not None:
self._set_attr('compressor_params', json.dumps(value))
@classmethod
def module_cls_name(cls):
return "{}.{}".format(cls.__module__, cls.__name__)
@property
@cache
def data(self) -> DaGroup:
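        """DaGroup view of the stored data, built by the driver with the configured
        chunks and cached until the cache is cleaned or the dataset is closed."""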
return self.driver.data(chunks=self.chunksize)
@data.setter
@clean_cache
def data(self, v):
pass
def clean_data_cache(self):
self.data = None
@property
def from_ds_hash(self):
return self._get_attr('from_ds_hash')
@from_ds_hash.setter
def from_ds_hash(self, value):
if value is not None:
self._set_attr('from_ds_hash', value)
def open(self):
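        """Open the driver. In write modes this propagates the driver's compressor
        params and, if auto_chunks is set, derives chunks from the stored shape and dtypes."""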
self.driver.open()
if self.driver.data_tag is None:
self.driver.data_tag = self.name
if self.driver.mode in ["w", "a", "r+"]:
if len(self.driver.compressor_params) > 0:
self.compressor_params = self.driver.compressor_params
if self.auto_chunks is True:
try:
self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)
except KeyError as e:
log.error(e)
def close(self):
self.driver.close()
self.data = None
def __getitem__(self, key):
return self.data[key]
def __setitem__(self, key, value):
self.data[key].set(key, value)
def __iter__(self):
return self
def __next__(self):
return next(self.data)
def _set_attr(self, name, value):
if value is not None:
log.debug("SET attribute {name} {value}".format(name=name, value=value))
self.driver.attrs[name] = value
def _get_attr(self, name):
try:
return self.driver.attrs[name]
except KeyError:
log.debug("Not found attribute {} in file {}".format(name, self.url))
return None
except IOError as e:
log.debug(e)
log.debug("Error opening {} in file {}".format(name, self.url))
return None
def batchs_writer(self, data):
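        """Write an iterator of batches into the driver: slice-wise when the iterator
        exposes a batch_size, otherwise element by element for every group."""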
batch_size = getattr(data, 'batch_size', 0)
log.info("Writing with chunks {}".format(batch_size))
if batch_size > 0:
absgroup = self.driver.absgroup
for smx in tqdm(data, total=data.num_splits()):
absgroup.set(smx.slice, smx)
else:
for i, smx in tqdm(enumerate(data), total=data.num_splits()):
for j, group in enumerate(self.groups):
self.data[group][i] = smx[j]
def destroy(self):
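        """Delete the stored data and, for persistent drivers, mark the matching
        metadata record as invalid."""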
hash_hex = self.hash
self.driver.destroy()
if self.driver.persistent is True:
with Metadata(self.metadata_driver) as metadata:
metadata.invalid(hash_hex)
@property
def url(self) -> str:
return self.driver.url
@property
def metadata_url(self) -> str:
return self.metadata_driver.url
def __len__(self):
return len(self.data)
def __repr__(self):
return repr(self.data)
@property
def shape(self):
return self.driver.absgroup.shape
@property
def groups(self) -> tuple:
return self.driver.groups
@property
def dtypes(self) -> np.dtype:
return self.driver.dtypes
@dtypes.setter
def dtypes(self, value):
if value is not None:
self.driver.set_schema(value)
def info(self):
print(' ')
print('Name: {}'.format(self.name))
print('Author: {}'.format(self.author))
print('Description: {}'.format(self.description))
print('URL path: {}'.format(self.driver.url))
print('Hash: {}'.format(self.hash))
print(' ')
headers = ["Group", "Shape"]
table = []
for group, shape in self.shape.items():
table.append([group, shape])
print(order_table(headers, table, "Group"))
def metadata(self) -> dict:
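        """Collect the dataset's descriptive fields (hash, paths, driver, size,
        timestamp, author, ...) into a plain dict."""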
meta_dict = dict()
meta_dict["hash"] = self.hash
meta_dict["path"] = self.driver.path
meta_dict["metadata_path"] = self.metadata_path
meta_dict["group_name"] = self.group_name
meta_dict["driver_module"] = self.driver.module_cls_name()
meta_dict["driver_name"] = self.driver.cls_name()
meta_dict["name"] = self.name
meta_dict["size"] = get_dir_file_size(self.url)
meta_dict["timestamp"] = self.timestamp
meta_dict["author"] = self.author
meta_dict["num_groups"] = len(self.groups)
meta_dict["description"] = self.description if self.description is None else ""
meta_dict["from_ds_hash"] = self.from_ds_hash
return meta_dict
def metadata_to_json(self, f):
metadata = self.metadata()
json.dump(metadata, f)
def write_metadata(self):
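        """Persist the metadata dict into the Sqlite metadata table, upserting on
        (path, name, driver_name, group_name)."""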
if self.driver.persistent is True:
with Metadata(self.metadata_driver, self.metadata()) as metadata:
dtypes = np.dtype([("hash", object), ("name", object), ("author", object),
("description", object), ("size", int), ("driver_module", object),
("path", object), ("driver_name", object), ("group_name", object),
("timestamp", np.dtype("datetime64[ns]")), ("num_groups", int),
("is_valid", bool), ("from_ds_hash", object)])
timestamp = metadata["timestamp"]
metadata["timestamp"] = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M UTC')
metadata["group_name"] = "s/n" if self.group_name is None else self.group_name
metadata["is_valid"] = True
metadata.set_schema(dtypes, unique_key=["hash", ["path", "name", "driver_name", "group_name"]])
metadata.insert_update_data(keys=["path", "name", "driver_name", "group_name"])
def calc_hash(self, with_hash: str = 'sha1') -> str:
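        """Hash the header attributes (author, description) together with every
        group's data, read in batches of the configured chunk size."""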
hash_obj = Hash(hash_fn=with_hash)
header = [getattr(self, attr) for attr in self.header_map]
header = [attr for attr in header if attr is not None]
hash_obj.hash.update("".join(header).encode("utf-8"))
for group in self.groups:
it = Iterator(self.data[group]).batchs(chunks=self.chunksize)
hash_obj.update(it.only_data())
return str(hash_obj)
def from_data(self, data, chunks=None, with_hash: str = "sha1", from_ds_hash=None):
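        """Fill the dataset from a dask array, Iterator, dict, DaGroup or array-like
        object, computing chunks when needed, then hash the contents, set the
        timestamp and write the metadata record."""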
if isinstance(data, da.Array):
data = DaGroup.from_da(data)
self.chunksize = data.chunksize
elif isinstance(data, Iterator):
if chunks is None:
self.chunksize = Chunks.build_from_shape(data.shape, data.dtypes)
else:
self.chunksize = Chunks.build_from(chunks, data.groups)
data = data.batchs(chunks=self.chunksize)
self.chunksize = data.chunksize
elif isinstance(data, dict):
if chunks is None:
shape, dtypes = Shape.get_shape_dtypes_from_dict(data)
self.chunksize = Chunks.build_from_shape(shape, dtypes)
else:
self.chunksize = Chunks.build_from(chunks, tuple(data.keys()))
data = DaGroup(data, chunks=self.chunksize)
        elif isinstance(data, DaGroup):
self.chunksize = data.chunksize
elif not isinstance(data, BaseIterator):
data = Iterator(data)
if chunks is None:
self.chunksize = Chunks.build_from_shape(data.shape, data.dtypes)
else:
self.chunksize = data.shape.to_chunks(chunks)
data = data.batchs(chunks=self.chunksize)
self.chunksize = data.chunksize
self.dtypes = data.dtypes
self.driver.set_data_shape(data.shape)
        if isinstance(data, (BatchIterator, Iterator)):
self.chunksize = data.chunksize
self.batchs_writer(data)
else:
data.store(self)
if with_hash is not None:
c_hash = self.calc_hash(with_hash=with_hash)
else:
c_hash = None
self.from_ds_hash = from_ds_hash
self.hash = c_hash
self.timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M UTC")
self.write_metadata()
def to_df(self) -> pd.DataFrame:
return self.data.to_df()
def to_ndarray(self, dtype=None) -> np.ndarray:
return self.data.to_ndarray(dtype=dtype)
def concat(self, datasets: tuple, axis=0):
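        """Concatenate the data of the given datasets along axis and load the
        result into this dataset."""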
da_groups = []
for ds in datasets:
da_groups.append(ds.data)
da_group = DaGroup.concat(da_groups, axis=axis)
self.from_data(da_group)
def stadistics(self):
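        """Build a per-group summary table: mean, std dev, min, max (plus
        percentiles for one-dimensional groups) for numeric dtypes, and the
        number of unique values otherwise."""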
headers = ["group", "mean", "std dev", "min", "25%", "50%", "75%", "max", "nonzero", "unique", "dtype", "shape"]
self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)
table = []
for group, (dtype, _) in self.dtypes.fields.items():
values = dict()
values["dtype"] = dtype
values["group"] = group
values["shape"] = self.shape[group]
darray = self.data[group].darray
if dtype == np.dtype(float) or dtype == np.dtype(int):
da_mean = da.around(darray.mean(), decimals=5)
da_std = da.around(darray.std(), decimals=5)
da_min = da.around(darray.min(), decimals=5)
da_max = da.around(darray.max(), decimals=5)
result = dask.compute([da_mean, da_std, da_min, da_max])[0]
values["mean"] = result[0]
values["std dev"] = result[1]
values["min"] = result[2]
values["max"] = result[3]
if len(self.shape[group]) == 1:
da_percentile = da.percentile(darray, [25, 50, 75])
result = da_percentile.compute()
values["25%"] = result[0]
values["50%"] = result[1]
values["75%"] = result[2]
else:
values["25%"] = "-"
values["50%"] = "-"
values["75%"] = "-"
values["nonzero"] = da.count_nonzero(darray).compute()
values["unique"] = "-"
else:
values["mean"] = "-"
values["std dev"] = "-"
values["min"] = "-"
values["max"] = "-"
values["25%"] = "-"
values["50%"] = "-"
values["75%"] = "-"
values["nonzero"] = "-"
da_unique = da.unique(darray)
values["unique"] = dask.compute(da_unique)[0].shape[0]
row = []
for column in headers:
row.append(values[column])
table.append(row)
return tabulate(table, headers)
@staticmethod
def load(hash: str, metadata_driver: AbsDriver, metadata_path: str=None) -> 'Data':
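        """Look up a dataset by hash in the metadata table and rebuild the
        corresponding Data instance with its driver in read mode."""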
with Metadata(metadata_driver) as metadata:
data = metadata.query(
"SELECT name, driver_module, path, group_name, hash FROM {} WHERE hash = ?".format(
metadata_driver.login.table),
(hash,))
if len(data) == 0:
log.warning("Resource {} does not exists in table '{}' in url {}".format(hash,
metadata_driver.login.table,
metadata_driver.url))
else:
row = data[0]
data_driver = locate(row[1])
path = row[2]
group_name = None if row[3] == "s/n" else row[3]
name = row[0]
return Data(name=name, group_name=group_name, driver=data_driver(path=path, mode="r"),
metadata_path=metadata_path)
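
# Illustrative usage sketch, not part of the module. It assumes the in-memory
# driver needs no arguments and that from_data accepts a dict of numpy arrays,
# as the code above suggests; swap in a persistent driver for real storage.
#
#     ds = Data(name="example", driver=Memory())
#     ds.open()
#     ds.from_data({"x": np.random.rand(100, 10)})
#     ds.info()
#     ds.close()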