# Source code for darwin.dataset.identifier

import re
from typing import Optional, Tuple, Union



class DatasetIdentifier:
    """
    Formal representation of a dataset identifier for the SDK.

    A dataset identifier is a string that uniquely identifies a dataset on Darwin.
    A dataset identifier is made of the following substrings:
    ``<team-slug>/<dataset-slug>:<version>``.

    ``team-slug`` and ``version`` are optional. When ``version`` is omitted this
    class stores ``None`` and leaves it out of the string form.
    NOTE(review): consumers presumably treat a missing version as ``latest`` --
    confirm against the code that resolves identifiers.

    Parameters
    ----------
    dataset_slug : str
        The slugified name of the dataset.
    team_slug : Optional[str], default: None
        The slugified name of the team.
    version : Optional[str], default: None
        The version of the identifier.

    Attributes
    ----------
    dataset_slug : str
        The slugified name of the dataset.
    team_slug : Optional[str], default: None
        The slugified name of the team.
    version : Optional[str], default: None
        The version of the identifier.
    """

    def __init__(
        self,
        dataset_slug: str,
        team_slug: Optional[str] = None,
        version: Optional[str] = None,
    ):
        self.dataset_slug = dataset_slug
        self.team_slug = team_slug
        self.version = version

    @classmethod
    def parse(cls, identifier: Union[str, "DatasetIdentifier"]) -> "DatasetIdentifier":
        """
        Parses the given identifier and returns the corresponding DatasetIdentifier.

        Parameters
        ----------
        identifier : Union[str, DatasetIdentifier]
            The identifier to be parsed. An existing ``DatasetIdentifier`` is
            returned unchanged (the very same object, not a copy).

        Returns
        -------
        DatasetIdentifier
            The SDK representation of a ``DatasetIdentifier``.

        Raises
        ------
        ValueError
            If the ``identifier`` given is invalid.
        """
        # Already parsed: hand the same instance back instead of rebuilding it.
        if isinstance(identifier, DatasetIdentifier):
            return identifier

        team_slug, dataset_slug, version = _parse(identifier)
        return cls(dataset_slug=dataset_slug, team_slug=team_slug, version=version)

    def __repr__(self) -> str:
        """Returns an unambiguous, constructor-like representation for debugging."""
        return (
            f"{type(self).__name__}("
            f"dataset_slug={self.dataset_slug!r}, "
            f"team_slug={self.team_slug!r}, "
            f"version={self.version!r})"
        )

    def __str__(self) -> str:
        """
        Returns the identifier in ``[<team-slug>/]<dataset-slug>[:<version>]`` form.

        The team prefix and the version suffix are emitted only when the
        corresponding attribute is truthy, so ``None`` and ``""`` are both omitted.
        """
        output = ""
        if self.team_slug:
            output = f"{self.team_slug}/"
        output = f"{output}{self.dataset_slug}"
        if self.version:
            output = f"{output}:{self.version}"
        return output



def _parse(slug: str) -> Tuple[Optional[str], str, Optional[str]]:
    """
    Splits a raw identifier string into its team, dataset and version parts.

    Parameters
    ----------
    slug : str
        Identifier in the form ``[<team-slug>/]<dataset-slug>[:<version>]``.

    Returns
    -------
    Tuple[Optional[str], str, Optional[str]]
        ``(team, dataset, version)``. ``team`` and ``version`` are ``None``
        when the corresponding part is absent from ``slug``.

    Raises
    ------
    ValueError
        If ``slug`` is rejected by ``_is_slug_valid`` or contains more than
        one ``/``.
    """
    if not _is_slug_valid(slug):
        raise ValueError(f"Invalid dataset identifier {slug}")

    team: Optional[str] = None
    version: Optional[str] = None

    initial_split = slug.split("/")
    if len(initial_split) == 1:
        dataset = initial_split[0]
    elif len(initial_split) == 2:
        team, dataset = initial_split
    else:
        # Defensive guard: kept so a malformed identifier still fails with a
        # clear message even if the validation above is ever relaxed.
        raise ValueError(f"Invalid dataset identifier {slug}")

    if ":" in dataset:
        # The validation pattern accepts ":" inside the version, so split on
        # the first colon only. An unbounded split would yield three or more
        # parts and fail to unpack for identifiers such as "team/ds:v1:2".
        dataset, version = dataset.split(":", 1)

    return team, dataset, version


def _is_slug_valid(slug: str) -> bool:
    """
    Tells whether ``slug`` is a well-formed dataset identifier.

    The whole string must have the shape
    ``[<team-slug>/]<dataset-slug>[:<version>]``. Team and dataset slugs are
    one or more letters, digits, ``_``, ``.`` or ``-``; the version accepts
    the same characters plus ``:``.

    Parameters
    ----------
    slug : str
        The candidate identifier.

    Returns
    -------
    bool
        ``True`` if the entire string matches the expected shape, ``False`` otherwise.
    """
    name_chars = r"[\_a-zA-Z0-9.-]+"
    version_chars = r"[\_a-zA-Z0-9.:-]+"
    # Optional "<team>/" prefix, mandatory dataset name, optional ":<version>" suffix.
    identifier_pattern = f"({name_chars}/)?{name_chars}(:{version_chars})?"
    found = re.fullmatch(identifier_pattern, slug)
    return found is not None