# Source code for hypergol.dataset_factory

from pathlib import Path
from hypergol.repr import Repr
from hypergol.dataset import Dataset
from hypergol.repo_data import RepoData


class DatasetFactory(Repr):
    """Convenience class to create many datasets at once.

    Used in pipelines where multiple datasets are created in the same
    location, project and branch: the shared parameters are stored once on
    the factory and passed on to every ``Dataset`` built by :meth:`get`.
    """

    def __init__(self, location, project, branch, chunkCount, repoData=None):
        """
        Parameters
        ----------
        location : str
            path the project is in
        project : str
            project name
        branch : str
            branch name
        chunkCount : int = {16, 256, 4096}
            How many files the data will be stored in, sets the granularity of multithreaded processing
        repoData : RepoData=None
            stores the commit information at the creation of the dataset;
            when omitted (or falsy), ``RepoData.get_dummy()`` is used instead
        """
        self.location = location
        self.project = project
        self.branch = branch
        self.chunkCount = chunkCount
        # Fall back to a dummy RepoData so every dataset always receives one.
        self.repoData = repoData or RepoData.get_dummy()

    @property
    def projectDirectory(self):
        """``Path`` built from the factory's ``location`` and ``project``."""
        return Path(self.location, self.project)

    @property
    def branchDirectory(self):
        """``Path`` built from the factory's ``location``, ``project`` and ``branch``."""
        return Path(self.location, self.project, self.branch)

    def get(self, dataType, name, branch=None, chunkCount=None):
        """Creates a dataset with the parameters given and the factory's own parameters

        Parameters
        ----------
        dataType : BaseData
            Type of the dataset
        name : str
            Name of the dataset (recommended to be in snakecase)
        branch : str=None
            Name of the branch to load the dataset from (if None, defaults to the factory's branch)
        chunkCount : int=None
            Number of chunks, if None, the factory's own value will be used

        Returns
        -------
        Dataset
            A new ``Dataset`` using the factory's ``location``, ``project``
            and ``repoData`` together with the arguments above.
        """
        # Only ``None`` triggers the fallback, so explicit values are always respected.
        if chunkCount is None:
            chunkCount = self.chunkCount
        if branch is None:
            branch = self.branch
        return Dataset(
            dataType=dataType,
            location=self.location,
            project=self.project,
            branch=branch,
            name=name,
            chunkCount=chunkCount,
            repoData=self.repoData,
        )