# Source code for hypergol.dataset_factory

from pathlib import Path
from hypergol.repr import Repr
from hypergol.dataset import Dataset
from hypergol.repo_data import RepoData


class DatasetFactory(Repr):
    """Factory that builds many ``Dataset`` objects sharing one configuration.

    Intended for pipelines in which several datasets are created under the
    same location, project and branch, so those values only have to be
    supplied once.
    """

    def __init__(self, location, project, branch, chunkCount, repoData=None):
        """Store the settings shared by every dataset this factory creates.

        Parameters
        ----------
        location : str
            Path the project is in.
        project : str
            Project name.
        branch : str
            Branch name.
        chunkCount : int = {16, 256, 4096}
            How many files the data will be stored in; sets the granularity
            of multithreaded processing.
        repoData : RepoData
            Commit information recorded at the creation of the dataset.
            Any falsy value (including the default ``None``) is replaced by
            ``RepoData.get_dummy()``.
        """
        self.location = location
        self.project = project
        self.branch = branch
        self.chunkCount = chunkCount
        # Fall back to the dummy record only when nothing usable was passed.
        if not repoData:
            repoData = RepoData.get_dummy()
        self.repoData = repoData

    @property
    def projectDirectory(self):
        """``Path`` of the project: ``location/project``."""
        return Path(self.location).joinpath(self.project)

    @property
    def branchDirectory(self):
        """``Path`` of the branch: ``location/project/branch``."""
        return Path(self.location).joinpath(self.project, self.branch)

    def get(self, dataType, name, branch=None, chunkCount=None):
        """Create a dataset from the given arguments plus the factory's settings.

        Parameters
        ----------
        dataType : BaseData
            Type of the dataset.
        name : str
            Name of the dataset (recommended to be in snakecase).
        branch : str=None
            Name of the branch to load the dataset from; when ``None`` the
            factory's own branch is used.
        chunkCount : int=None
            Number of chunks; when ``None`` the factory's own value is used.

        Returns
        -------
        Dataset
            Dataset constructed with the factory's ``location``, ``project``
            and ``repoData`` and the resolved ``branch`` and ``chunkCount``.
        """
        # Only an explicit ``None`` triggers the factory default.
        resolvedChunkCount = self.chunkCount if chunkCount is None else chunkCount
        resolvedBranch = self.branch if branch is None else branch
        return Dataset(
            dataType=dataType,
            location=self.location,
            project=self.project,
            branch=resolvedBranch,
            name=name,
            chunkCount=resolvedChunkCount,
            repoData=self.repoData,
        )