Source code for dandi.validate._types

from __future__ import annotations

from enum import Enum, IntEnum, auto, unique
import logging
from pathlib import Path
from typing import Annotated, Any, Union

from pydantic import BaseModel, BeforeValidator, Field
from pydantic.functional_serializers import PlainSerializer

import dandi
from dandi.utils import StrEnum



[docs]
class MissingFileContent(StrEnum):
    """Policy for handling files whose content is missing (e.g. broken symlinks
    in a datalad dataset without fetched data)."""

    error = auto()
    """Emit a concise error for each file with missing content (default)."""

    only_non_data = "only-non-data"
    """Skip content-dependent validators (e.g. pynwb, nwbinspector) but still
    validate non-data aspects such as path layout."""

    skip = auto()
    """Skip the file entirely; emit a WARNING noting that validation was skipped."""



# Module-level logger named after this module; `ValidationResult.model_post_init`
# uses it to report a record-version mismatch.
lgr = logging.getLogger(__name__)



[docs]
@unique
class Standard(StrEnum):
    """Standards to validate against"""

    # `@unique` makes class creation fail if two members share a value.
    # Members assigned `auto()` get their value from the `StrEnum` base imported
    # from `dandi.utils` (its value-generation rule is not visible in this
    # module). The explicitly spelled values all contain a hyphen, which cannot
    # appear in a member identifier.
    BIDS = auto()
    DANDI_LAYOUT = "DANDI-LAYOUT"
    DANDI_SCHEMA = "DANDI-SCHEMA"
    HED = auto()
    NWB = auto()
    OME_ZARR = "OME-ZARR"
    ZARR = auto()

    # File formats (for denoting validation failures at the file-format level)
    JSON = auto()
    TSV = auto()
    YAML = auto()




[docs]
@unique
class Validator(StrEnum):
    """Validators that are used to do validation"""

    # `@unique` makes class creation fail if two members share a value.
    # Members assigned `auto()` get their value from the `StrEnum` base imported
    # from `dandi.utils` (its value-generation rule is not visible in this
    # module). The explicitly spelled values contain a hyphen or a dot, neither
    # of which can appear in a member identifier.
    bids_validator_deno = "bids-validator-deno"
    bidsschematools = auto()
    dandi = auto()
    dandi_zarr = "dandi.zarr"
    dandischema = auto()
    hed_python_validator = "hed-python-validator"
    nwbinspector = auto()
    pynwb = auto()
    tensorstore = auto()
    zarr = auto()




[docs]
class OriginType(StrEnum):
    """Types of validation result origins"""

    INTERNAL = auto()
    """
    Validation result is originated from the validator but not necessarily relating
    to validation of the data"""

    VALIDATION = auto()
    """Validation result is originated from validation of the data"""




[docs]
class Origin(BaseModel):
    """
    Origin of the validation result
    """

    # Only `type`, `validator` and `validator_version` are required; the three
    # `standard*` fields default to `None`.

    # Whether the result stems from validating the data or from the validator
    # itself (see the members of `OriginType`)
    type: OriginType

    validator: Validator
    """The validator conducting the validation"""

    validator_version: str
    """The version of the validator"""

    standard: Standard | None = None
    """Standard being validated against"""

    standard_version: str | None = None
    """Version of the standard"""

    standard_schema_version: str | None = None
    """
    Version of the schema used in defining or implementing the standard,
    such as BIDS schema version in BIDS
    """



# Some commonly used `Origin` instances. All of them record the version of the
# running `dandi` package as the validator version.

# Data validation performed by the `dandi` validator, no standard specified
ORIGIN_VALIDATION_DANDI = Origin(
    type=OriginType.VALIDATION,
    validator=Validator.dandi,
    validator_version=dandi.__version__,
)
# Data validation performed by the `dandi` validator against the DANDI layout
ORIGIN_VALIDATION_DANDI_LAYOUT = Origin(
    type=OriginType.VALIDATION,
    validator=Validator.dandi,
    validator_version=dandi.__version__,
    standard=Standard.DANDI_LAYOUT,
)
# Data validation performed by the `dandi.zarr` validator, no standard specified
ORIGIN_VALIDATION_DANDI_ZARR = Origin(
    type=OriginType.VALIDATION,
    validator=Validator.dandi_zarr,
    validator_version=dandi.__version__,
)
# Result produced by the `dandi` validator that is internal in nature
# (`OriginType.INTERNAL`) rather than a finding about the data
ORIGIN_INTERNAL_DANDI = Origin(
    type=OriginType.INTERNAL,
    validator=Validator.dandi,
    validator_version=dandi.__version__,
)



[docs]
@unique
class Severity(IntEnum):
    """Severity levels for validation results"""

    # `@unique` (as already applied to `Standard` and `Validator`) rejects
    # accidental aliases: creating the class fails if two levels share a number.
    # Being an `IntEnum`, members compare by their numeric value, so
    # INFO < HINT < WARNING < ERROR < CRITICAL.

    INFO = 10
    """Not an indication of problem but information of status or confirmation"""

    HINT = 20
    """Data is valid but could be improved"""

    WARNING = 30
    """Data is not recognized as valid. Changes are needed to ensure validity"""

    ERROR = 40
    """Data is recognized as invalid"""

    CRITICAL = 50
    """
    A serious invalidity in data.
    E.g., an invalidity that prevents validation of other aspects of the data such
    as when validating against the BIDS standard, the data is without a `BIDSVersion`
    field or has an invalid `BIDSVersion` field.
    """



_SeverityName = Enum(  # type: ignore[misc]
    "_SeverityName", {name: name for name in Severity.__members__}
)
"""
Enum mirroring `Severity`: one member per `Severity` member name, each member's
value being that same name

It exists so that a JSON schema can be generated for `Severity_`
"""


def _accept_severity_by_name(v: Any) -> Any:
    """
    A validator function to be used in `BeforeValidator` to allow `Severity` member
    names to be validated as `Severity` values.
    """
    if isinstance(v, str):
        if v in Severity.__members__:
            return Severity[v]
        else:
            raise ValueError(
                f"Invalid severity name: {v}. "
                f"Valid names are: {', '.join(Severity.__members__.keys())}"
            )
    else:
        return v


Severity_ = Annotated[
    Severity,
    # Applied to the raw input before `Severity` validation: a string naming a
    # `Severity` member is replaced by that member (any other string raises
    # `ValueError`). `json_schema_input_type` declares that input may be either a
    # `Severity` value or one of the names listed in `_SeverityName`.
    BeforeValidator(
        _accept_severity_by_name, json_schema_input_type=Union[Severity, _SeverityName]
    ),
    # Only when serializing in "json" mode: emit the `_SeverityName` member that
    # carries the same name as the `Severity` member being serialized.
    PlainSerializer(
        lambda s: _SeverityName[s.name], return_type=_SeverityName, when_used="json"
    ),
]
"""
The annotated version of `Severity` with which the values of `Severity` are serialized
as their names when serialized to JSON, i.e. serialization done in the mode of "json".
"""



[docs]
@unique
class Scope(Enum):
    # Closed set of scopes a `ValidationResult` can be attributed to (see
    # `ValidationResult.scope`). Documented with comments rather than a docstring
    # so that the class's `__doc__` stays as it was.
    # `@unique` (as already applied to `Standard` and `Validator`) rejects
    # accidental aliases: creating the class fails if two scopes share a value.
    FILE = "file"
    FOLDER = "folder"
    DANDISET = "dandiset"
    DATASET = "dataset"



# Default value of `ValidationResult.record_version`; a `ValidationResult` carrying
# any other value triggers a warning in `ValidationResult.model_post_init`.
CURRENT_RECORD_VERSION = "1"



[docs]
class ValidationResult(BaseModel):
    # A single validation result as a pydantic model. Only `id`, `origin` and
    # `scope` are required; every other field has a default.

    record_version: str = CURRENT_RECORD_VERSION
    """Version of the serialized record format for forward compatibility"""

    # NOTE(review): presumably identifies the kind of finding being reported;
    # confirm against the code that produces these results
    id: str

    origin: Origin
    """Origin of the validation result as validator and standard used in producing it"""

    # What the result applies to: a file, a folder, a dandiset or a dataset
    scope: Scope

    # `exclude=True` keeps this field out of serialized output
    origin_result: Any | None = Field(default=None, exclude=True)
    """
    The representation of the validation result produced by the used validator,
    `self.origin.validator`, unchanged
    """

    # `Severity_` also accepts a `Severity` member name as input and serializes
    # the severity by name in JSON mode
    severity: Severity_ | None = None
    # asset_paths, if not populated, assumes [.path], but could be something like
    # {"path": "task-broken_bold.json",
    #  "asset_paths": ["sub-01/func/sub-01_task-broken_bold.json",
    #                  "sub-02/func/sub-02_task-broken_bold.json"]}
    asset_paths: list[str] | None = None
    # e.g. path within hdf5 file hierarchy
    # As a dict we will map asset_paths into location within them
    within_asset_paths: dict[str, str] | None = None
    dandiset_path: Path | None = None
    dataset_path: Path | None = None
    # TODO: locations analogous to nwbinspector.InspectorMessage.location
    # but due to multiple possible asset_paths, we might want to have it
    # as a dict to point to location in some or each affected assets
    message: str | None = None
    metadata: dict | None = None
    # ??? should it become a list e.g. for errors which rely on
    # multiple files, like mismatch between .nii.gz header and .json sidecar
    path: Path | None = None
    # Consulted by `purview` when `path` is not set
    path_regex: str | None = None


[docs]
    def model_post_init(self, __context: Any) -> None:
        """
        Warn when the record carries a version other than the current one.

        The record is kept as it is; a mismatch is only reported through the
        module logger.
        """
        if self.record_version == CURRENT_RECORD_VERSION:
            return
        lgr.warning(
            "record_version %r != current %r, loading anyway",
            self.record_version,
            CURRENT_RECORD_VERSION,
        )


    @property
    def purview(self) -> str | None:
        """
        The first available of `path`, `path_regex` and `dataset_path`, as a string.

        The three fields are consulted in that order; `None` is returned when
        none of them is set.
        """
        if self.path is not None:
            return str(self.path)
        if self.path_regex is not None:
            return self.path_regex
        if self.dataset_path is not None:
            return str(self.dataset_path)
        return None