Source code for almirah.dataset
""" Model classes to represent a dataset and its components."""
import pandas as pd
from typing import List
from typing import Optional
from sqlalchemy import ForeignKey
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import relationship
from sqlalchemy.orm import mapped_column
from .core import Base
from .core import uniquify
from .indexer import index
[docs]
class Component(Base):
    """Base mapped class for anything that can be stored as a component.

    Rows are kept in the ``components`` table. The ``type`` column is the
    polymorphic discriminator; this base class itself is registered under
    the identity ``"component"``.
    """

    __tablename__ = "components"
    __mapper_args__ = {
        "polymorphic_on": "type",
        "polymorphic_identity": "component",
    }

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    type: Mapped[str]

    def __repr__(self) -> str:
        ident, kind = self.id, self.type
        return f"<Component id: '{ident}' type: '{kind}'>"
[docs]
@uniquify(index)
class Dataset(Component):
    """Represents a collection of components as a dataset.

    A dataset is itself a ``Component`` (polymorphic identity ``"dataset"``)
    stored in the ``datasets`` table, and it references its members through
    the ``collections`` association table.
    """

    __tablename__ = "datasets"
    # NOTE(review): presumably consumed by ``uniquify`` to decide which
    # attributes identify an existing instance -- confirm in ``.core``.
    __identifier_attrs__ = {"name"}

    id: Mapped[int] = mapped_column(ForeignKey("components.id"), primary_key=True)
    name: Mapped[str] = mapped_column(unique=True, nullable=False)
    components: Mapped[List["Component"]] = relationship(secondary="collections")

    __mapper_args__ = {"polymorphic_identity": "dataset"}

    def __init__(self, *, name: str) -> None:
        self.name = name

    def add(self, *components: Component) -> None:
        """
        Adds components to the dataset.

        Parameters
        ----------
        components : Component
            `Component` instances to be added to the dataset.

        Raises
        ------
        TypeError
            If the dataset itself is passed, or if a passed component
            already lists this dataset among its own ``components``.
            Only this direct, one-level cycle is checked.
        """
        # Validate every candidate before mutating, so a rejected call
        # leaves ``self.components`` untouched.
        if any(c == self or self in getattr(c, "components", []) for c in components):
            raise TypeError("Dataset cannot include itself as a component")
        self.components.extend(components)

    def index(self, **kwargs):
        """
        Perform indexing on components.

        Calls ``index(**kwargs)`` on every component that exposes a callable
        ``index`` attribute; components without one are skipped.

        Parameters
        ----------
        kwargs : dict
            Keyword arguments forwarded unchanged to each component's
            ``index`` method.
        """
        for component in self.components:
            # Fetch the bound method itself. The previous code stored the
            # boolean result of ``callable(...)`` and then tried to call it.
            indexer = getattr(component, "index", None)
            if callable(indexer):
                indexer(**kwargs)

    def report(self) -> None:
        """Generate report for dataset.

        Prints the ``repr`` of every component first, then calls each
        component's own ``report()`` in the same order.
        """
        print(f"Components of {self}:")
        for c in self.components:
            print("{!r:5}".format(c))
        for c in self.components:
            c.report()

    def query(self, returns: Optional[List[str]] = None, **filters) -> List:
        """
        Query components based on filter criteria.

        Parameters
        ----------
        returns : list of str, optional
            Specific fields to return, defaults to all or object if None.
        filters : dict
            Filter conditions to apply on the query.

        Returns
        -------
        list or pandas.DataFrame
            The first non-empty ``DataFrame`` produced by a component is
            returned as-is. Otherwise, the non-DataFrame results of all
            components are concatenated into one list (possibly empty).
        """
        results = []
        for component in self.components:
            result = component.query(returns, **filters)
            if isinstance(result, pd.DataFrame):
                if not result.empty:
                    return result
                # An empty DataFrame has no usable truth value (``bool()``
                # raises ValueError), so skip it explicitly.
                continue
            if result:
                results.extend(result)
        return results

    def __repr__(self) -> str:
        return f"<Dataset name: '{self.name}'>"
class Collection(Base):
    """
    Association row linking one dataset to one of its components.

    Attributes
    ----------
    dataset_id : int
        Foreign key to ``datasets.id``; part of the composite primary key.
    component_id : int
        Foreign key to ``components.id``; part of the composite primary key.
    """

    __tablename__ = "collections"

    dataset_id: Mapped[int] = mapped_column(
        ForeignKey("datasets.id"),
        primary_key=True,
    )
    component_id: Mapped[int] = mapped_column(
        ForeignKey("components.id"),
        primary_key=True,
    )

    def __repr__(self):
        dataset, component = self.dataset_id, self.component_id
        return f"<Collection dataset: {dataset} component: {component}>"