# Source code for pyradise.fileio.crawling

import os
import warnings
from abc import ABC, abstractmethod
from typing import Any, Optional, Tuple

import itk
from pydicom.tag import Tag

from pyradise.data import Modality
from pyradise.utils import (assume_is_segmentation, is_dicom_file,
                            is_dir_and_exists, load_dataset_tag)

from .extraction import AnnotatorExtractor, ModalityExtractor, OrganExtractor
from .modality_config import ModalityConfiguration
from .series_info import (DicomSeriesImageInfo, DicomSeriesInfo,
                          DicomSeriesRegistrationInfo, DicomSeriesRTSSInfo,
                          DicomSeriesDoseInfo, FileSeriesInfo,
                          IntensityFileSeriesInfo, SegmentationFileSeriesInfo)

# Public API of this module: the abstract base crawler and its file-based / DICOM-based variants.
__all__ = [
    "Crawler",
    "SubjectFileCrawler",
    "DatasetFileCrawler",
    "SubjectDicomCrawler",
    "DatasetDicomCrawler",
]


class Crawler(ABC):
    """Abstract base class for crawlers which search a directory, or a hierarchy of directories, for files of a
    certain type.

    Subtypes must implement :meth:`execute`.

    Args:
        path (str): The directory path for which the crawling will be performed.
    """

    def __init__(self, path: str) -> None:
        super().__init__()

        # The path is normalized first; the value returned by is_dir_and_exists is what gets stored.
        # NOTE(review): presumably is_dir_and_exists raises for a missing / non-directory path - confirm in utils.
        self.path = is_dir_and_exists(os.path.normpath(path))

    @abstractmethod
    def execute(self) -> Any:
        """Execute the crawling process.

        Returns:
            Any: The crawled data.
        """
        raise NotImplementedError()


class SubjectFileCrawler(Crawler):
    """A crawler for retrieving :class:`~pyradise.fileio.series_info.FileSeriesInfo` entries from a subject directory
    containing discrete image files of a specified type (see ``extension`` parameter).

    The :class:`SubjectFileCrawler` is used for searching appropriate files within a specific subject directory
    containing all the subject's data. If there are multiple subjects in separate directories but within a
    common top-level directory to be crawled we recommend using the :class:`DatasetFileCrawler`.

    Important:
        The DICOM format is not supported by this crawler. Use the appropriate crawler variant instead.

    Args:
        path (str): The directory path to crawl for files.
        subject_name (str): The name of the subject.
        extension (str): The file extension of the files to be searched.
        modality_extractor (ModalityExtractor): The modality extractor.
        organ_extractor (OrganExtractor): The organ extractor.
        annotator_extractor (AnnotatorExtractor): The annotator extractor.

    Raises:
        ValueError: If the ``extension`` parameter contains ``dcm`` (e.g. the DICOM file extension ``.dcm``).
    """

    def __init__(
        self,
        path: str,
        subject_name: str,
        extension: str,
        modality_extractor: ModalityExtractor,
        organ_extractor: OrganExtractor,
        annotator_extractor: AnnotatorExtractor,
    ) -> None:
        super().__init__(path)

        # substring test: every extension containing "dcm" is rejected, not only the exact ".dcm"
        if "dcm" in extension:
            raise ValueError(
                f"The DICOM format is not supported by {self.__class__.__name__}! "
                "Use the appropriate DICOM variant instead."
            )

        self.extension = extension
        self.subject_name = subject_name
        self.modality_extractor = modality_extractor
        self.organ_extractor = organ_extractor
        self.annotator_extractor = annotator_extractor

    def execute(self) -> Tuple[FileSeriesInfo, ...]:
        """Execute the crawling process.

        The subject directory is walked recursively and one series info entry is created for every file whose name
        ends with the configured extension.

        Returns:
            Tuple[FileSeriesInfo, ...]: The crawled data.
        """
        series_infos = []
        for root, _, files in os.walk(self.path):
            for file in files:
                if not file.endswith(self.extension):
                    continue

                file_path = os.path.join(root, file)
                modality = self.modality_extractor.extract(file_path)

                # If the extractor reports the modality as an enumerated default modality, the decision is delegated
                # to assume_is_segmentation; otherwise a file without a modality is treated as a segmentation.
                if self.modality_extractor.is_enumerated_default_modality(modality):
                    is_segmentation = assume_is_segmentation(file_path)
                else:
                    is_segmentation = modality is None

                if is_segmentation:
                    organ = self.organ_extractor.extract(file_path)
                    annotator = self.annotator_extractor.extract(file_path)
                    series_info = SegmentationFileSeriesInfo(file_path, self.subject_name, organ, annotator)
                else:
                    series_info = IntensityFileSeriesInfo(file_path, self.subject_name, modality)

                series_infos.append(series_info)

        return tuple(series_infos)


class DatasetFileCrawler(Crawler):
    """An iterable crawler for retrieving :class:`~pyradise.fileio.series_info.FileSeriesInfo` entries from a dataset
    directory containing at least one subject directory with image files of a specified type (see ``extension``
    parameter).

    Every direct sub-directory of ``path`` which contains (at any depth) at least one file ending with ``extension``
    is treated as one subject. The name of the sub-directory is used as the subject name.

    If you want to load a large dataset with many subjects, we recommend using the iterative crawling approach instead
    of crawling the data via :meth:`execute` to reduce memory consumption (see example below).

    Important:
        The DICOM format is not supported by this crawler. Use the appropriate crawler variant instead.

    Example:

        Demonstration of the iterative and the non-iterative loading approach:

        >>> import os
        >>> from typing import Optional
        >>>
        >>> from pyradise.data import (Modality, Organ, Annotator)
        >>> from pyradise.fileio import (DatasetFileCrawler, ModalityExtractor,
        >>>                              OrganExtractor, AnnotatorExtractor, SubjectLoader)
        >>>
        >>>
        >>> # An example modality extractor
        >>> class MyModalityExtractor(ModalityExtractor):
        >>>
        >>>     def extract_from_dicom(self, path: str) -> Optional[Modality]:
        >>>         return None
        >>>
        >>>     def extract_from_path(self, path: str) -> Optional[Modality]:
        >>>         file_name = os.path.basename(path)
        >>>         if 't1' in file_name:
        >>>             return Modality('T1')
        >>>         elif 't2' in file_name:
        >>>             return Modality('T2')
        >>>         else:
        >>>             return None
        >>>
        >>>
        >>> # An example organ extractor
        >>> class MyOrganExtractor(OrganExtractor):
        >>>
        >>>     def extract(self, path: str) -> Optional[Organ]:
        >>>         file_name = os.path.basename(path).lower()
        >>>         if 'brainstem' in file_name:
        >>>             return Organ('Brainstem')
        >>>         elif 'tumor' in file_name:
        >>>             return Organ('Tumor')
        >>>         else:
        >>>             return None
        >>>
        >>>
        >>> # An example annotator extractor
        >>> class MyAnnotatorExtractor(AnnotatorExtractor):
        >>>
        >>>     def extract(self, path: str) -> Optional[Annotator]:
        >>>         file_name = os.path.basename(path).lower()
        >>>         if 'example_expert' in file_name:
        >>>             return Annotator('ExampleExpert')
        >>>         return None
        >>>
        >>>
        >>> def main_iterative_crawling(dataset_path: str) -> None:
        >>>     extension = '.nii.gz'
        >>>
        >>>     # Create the crawler
        >>>     crawler = DatasetFileCrawler(dataset_path, extension, MyModalityExtractor(),
        >>>                                  MyOrganExtractor(), MyAnnotatorExtractor())
        >>>
        >>>     # Use the crawler iteratively (more memory efficient)
        >>>     for series_info in crawler:
        >>>         subject = SubjectLoader().load(series_info)
        >>>         # Do something with the subject
        >>>         print(subject.get_name())
        >>>
        >>>
        >>> def main_crawling_using_execute_fn(dataset_path: str) -> None:
        >>>     extension = '.nii.gz'
        >>>
        >>>     # Create the crawler
        >>>     crawler = DatasetFileCrawler(dataset_path, extension, MyModalityExtractor(),
        >>>                                  MyOrganExtractor(), MyAnnotatorExtractor())
        >>>
        >>>     # Use the crawler with the execute function
        >>>     # (all series info entries are loaded in one step)
        >>>     series_infos = crawler.execute()
        >>>
        >>>     # Iterate over the series infos
        >>>     for series_info in series_infos:
        >>>         subject = SubjectLoader().load(series_info)
        >>>         # Do something with the subject
        >>>         print(subject.get_name())

    Args:
        path (str): The dataset directory path to crawl for data.
        extension (str): The file extension of the image files to be crawled.
        modality_extractor (ModalityExtractor): The modality extractor.
        organ_extractor (OrganExtractor): The organ extractor.
        annotator_extractor (AnnotatorExtractor): The annotator extractor.

    Raises:
        ValueError: If the ``extension`` parameter contains ``dcm`` (e.g. the DICOM file extension ``.dcm``).
    """

    def __init__(
        self,
        path: str,
        extension: str,
        modality_extractor: ModalityExtractor,
        organ_extractor: OrganExtractor,
        annotator_extractor: AnnotatorExtractor,
    ) -> None:
        super().__init__(path)

        # substring test: every extension containing "dcm" is rejected, not only the exact ".dcm"
        if "dcm" in extension:
            raise ValueError(
                f"The DICOM format is not supported by {self.__class__.__name__}! "
                "Use the appropriate DICOM variant instead."
            )
        self.extension = extension

        self.modality_extractor = modality_extractor
        self.organ_extractor = organ_extractor
        self.annotator_extractor = annotator_extractor

        # the subject directories are discovered once at construction time and kept in sorted order
        subject_dir_paths = self._get_subject_dir_paths(self.path, self.extension)
        self.subject_dir_path = tuple(sorted(subject_dir_paths))
        self.subject_names = tuple(os.path.basename(dir_path) for dir_path in self.subject_dir_path)

        # iteration state used by __iter__ / __next__
        self.current_idx = 0
        self.num_subjects = len(self.subject_dir_path)

    @staticmethod
    def _get_subject_dir_paths(path: str, extension: str) -> Tuple[str, ...]:
        """Get the paths of the subject directories containing valid files.

        A direct sub-directory of ``path`` is reported exactly once as soon as any file below it (at any depth)
        ends with ``extension``.

        Args:
            path (str): The directory path for which the crawling will be performed.
            extension (str): The file extension of the files to be considered.

        Returns:
            Tuple[str, ...]: The subject directory paths containing valid files.
        """
        candidate_dir_paths = [entry.path for entry in os.scandir(path) if entry.is_dir()]

        subject_dir_paths = []
        for candidate_dir_path in candidate_dir_paths:
            # any() stops the directory walk at the first hit, so a candidate with matching files in several
            # sub-folders is added only once (a plain ``break`` would only leave the innermost file loop)
            has_valid_file = any(
                file.endswith(extension) for _, _, files in os.walk(candidate_dir_path) for file in files
            )
            if has_valid_file:
                subject_dir_paths.append(candidate_dir_path)

        return tuple(subject_dir_paths)

    def _crawl_subject(self, idx: int) -> Tuple[FileSeriesInfo, ...]:
        """Crawl the subject directory with the specified index.

        Args:
            idx (int): The index of the subject within the sorted subject directories.

        Returns:
            Tuple[FileSeriesInfo, ...]: The crawled data of the subject.
        """
        subject_file_crawler = SubjectFileCrawler(
            self.subject_dir_path[idx],
            self.subject_names[idx],
            self.extension,
            self.modality_extractor,
            self.organ_extractor,
            self.annotator_extractor,
        )
        return subject_file_crawler.execute()

    def execute(self) -> Tuple[Tuple[FileSeriesInfo, ...], ...]:
        """Execute the crawling process for all subjects at once.

        The iteration state of the crawler is not modified by this method.

        Returns:
            Tuple[Tuple[FileSeriesInfo, ...], ...]: The crawled data (one inner tuple per subject).
        """
        return tuple(self._crawl_subject(idx) for idx in range(len(self.subject_dir_path)))

    def __iter__(self) -> "DatasetFileCrawler":
        # restart the iteration from the first subject
        self.current_idx = 0
        return self

    def __next__(self) -> Tuple[FileSeriesInfo, ...]:
        if self.current_idx >= self.num_subjects:
            raise StopIteration

        subject_info = self._crawl_subject(self.current_idx)
        self.current_idx += 1
        return subject_info

    def __len__(self) -> int:
        return self.num_subjects


class SubjectDicomCrawler(Crawler):
    """A crawler for retrieving :class:`~pyradise.fileio.series_info.DicomSeriesInfo` entries from a subject directory
    containing DICOM files (e.g. DICOM images, DICOM registrations, DICOM RTSS, DICOM RTDOSE). Files of other formats
    than DICOM will be ignored and can not be crawled with this type of crawler.

    The :class:`SubjectDicomCrawler` is used for searching appropriate files within a specific subject directory
    containing all the subject's data. If there are multiple subjects in separate directories but within a common
    top-level directory to be crawled we recommend using the :class:`DatasetDicomCrawler`.

    The prioritized method to determine the :class:`~pyradise.data.modality.Modality` for the retrieved images is the
    usage of a modality configuration file. If no modality configuration file is available the
    :class:`SubjectDicomCrawler` will try to extract the :class:`~pyradise.data.modality.Modality` from the retrieved
    images using the :class:`~pyradise.fileio.extraction.ModalityExtractor`; series for which the extraction fails
    get the default modality and a warning is issued. If neither a modality configuration file nor a
    :class:`~pyradise.fileio.extraction.ModalityExtractor` is available, the configuration derived from the series
    info entries is checked and a :class:`ValueError` is raised if it is ambiguous (duplicate modalities, or default
    modalities in a configuration with more than one entry) unless ``write_modality_config`` is ``True``.

    The :class:`SubjectDicomCrawler` can be used to generate the modality configuration file skeleton for a
    specific subject. In this case set the ``write_modality_config`` parameter to ``True`` and execute the
    crawling process. The generated modality configuration file skeleton will be saved in the subject directory.
    An already existing modality configuration file is not overwritten.

    Important:
        This crawler exclusively supports the DICOM file format and does not support any other file format.

    Args:
        path (str): The subject directory path to crawl.
        modality_extractor (Optional[ModalityExtractor]): The modality extractor (default: None).
        modality_config_file_name (str): The file name for the modality configuration file within the subject
         directory (default: modality_config.json).
        write_modality_config (bool): If True writes the modality configuration retrieved to the subject directory
         (default: False).
    """

    def __init__(
        self,
        path: str,
        modality_extractor: Optional[ModalityExtractor] = None,
        modality_config_file_name: str = "modality_config.json",
        write_modality_config: bool = False,
    ) -> None:
        super().__init__(path)
        self.modality_extractor: Optional[ModalityExtractor] = modality_extractor
        self.config_file_name = modality_config_file_name
        self.write_config = write_modality_config

    def _get_dicom_files(self) -> Tuple[str, ...]:
        """Get all DICOM files in the subject directory.

        Files named ``DICOMDIR`` are skipped; every other file is kept if :func:`is_dicom_file` accepts it.

        Returns:
            Tuple[str, ...]: The DICOM file paths.
        """
        file_paths = []
        for root, _, files in os.walk(self.path):
            for file in files:
                if file == "DICOMDIR":
                    continue

                file_path = os.path.join(root, file)
                if is_dicom_file(file_path):
                    file_paths.append(file_path)

        return tuple(file_paths)

    def _get_image_files(self) -> Tuple[Tuple[str, ...], ...]:
        """Get all DICOM image files in the subject directory.

        Notes:
            The DICOM image files are grouped by their SeriesInstanceUID. Series whose SOPClassUID contains ``481``
            (the DICOM RT SOP classes) are excluded because they are handled separately.

        Returns:
            Tuple[Tuple[str, ...], ...]: The DICOM image file paths separated by SeriesInstanceUID.
        """
        series_extractor = itk.GDCMSeriesFileNames.New()
        series_extractor.SetRecursive(True)
        series_extractor.SetDirectory(self.path)
        series_extractor.Update()

        image_series_paths = []
        for series_uid in series_extractor.GetSeriesUIDs():
            series_paths = [str(os.path.normpath(entry)) for entry in series_extractor.GetFileNames(series_uid)]

            # a series without files can not be inspected and carries no data
            if not series_paths:
                continue

            # check if the image belongs to the DICOM RT SOP Classes (only the first file of the series is read)
            dataset = load_dataset_tag(series_paths[0], (Tag(0x0008, 0x0016),))
            if "481" in str(dataset.get("SOPClassUID", "")):
                continue

            image_series_paths.append(tuple(series_paths))

        return tuple(image_series_paths)

    @staticmethod
    def _filter_by_sop_class(paths: Tuple[str, ...], sop_class_uids: Tuple[str, ...]) -> Tuple[str, ...]:
        """Select the DICOM files whose SOPClassUID is one of the specified UIDs.

        Args:
            paths (Tuple[str, ...]): The DICOM file paths to check.
            sop_class_uids (Tuple[str, ...]): The accepted SOP Class UIDs.

        Returns:
            Tuple[str, ...]: The file paths with a matching SOPClassUID in the order of ``paths``.
        """
        selected_paths = []
        for path in paths:
            # only the SOPClassUID tag (0008,0016) is requested from the file
            dataset = load_dataset_tag(path, (Tag(0x0008, 0x0016),))

            if dataset.get("SOPClassUID", None) in sop_class_uids:
                selected_paths.append(path)

        return tuple(selected_paths)

    @staticmethod
    def _exclude_paths(paths: Tuple[str, ...], excluded: Tuple[str, ...]) -> Tuple[str, ...]:
        """Remove the excluded paths from the specified paths while keeping the original order.

        Args:
            paths (Tuple[str, ...]): The paths to filter.
            excluded (Tuple[str, ...]): The paths to remove.

        Returns:
            Tuple[str, ...]: The entries of ``paths`` which are not contained in ``excluded``.
        """
        excluded_set = set(excluded)
        return tuple(path for path in paths if path not in excluded_set)

    @staticmethod
    def _get_registration_files(paths: Tuple[str, ...]) -> Tuple[str, ...]:
        """Get all DICOM registration files in the subject directory.

        Args:
            paths (Tuple[str, ...]): The DICOM file paths to check if they specify a DICOM registration file.

        Returns:
            Tuple[str, ...]: The DICOM registration file paths.
        """
        valid_sop_class_uids = (
            "1.2.840.10008.5.1.4.1.1.66.1",  # Spatial Registration Storage
            "1.2.840.10008.5.1.4.1.1.66.3",  # Deformable Spatial Registration Storage
        )

        return SubjectDicomCrawler._filter_by_sop_class(paths, valid_sop_class_uids)

    @staticmethod
    def _get_rtss_files(paths: Tuple[str, ...]) -> Tuple[str, ...]:
        """Get all DICOM RTSS files in the subject directory.

        Args:
            paths (Tuple[str, ...]): The DICOM file paths to check if they specify a DICOM RTSS file.

        Returns:
            Tuple[str, ...]: The DICOM RTSS file paths.
        """
        valid_sop_class_uid = "1.2.840.10008.5.1.4.1.1.481.3"  # RT Structure Set Storage

        return SubjectDicomCrawler._filter_by_sop_class(paths, (valid_sop_class_uid,))

    @staticmethod
    def _get_rtdose_files(paths: Tuple[str, ...]) -> Tuple[str, ...]:
        """Get all DICOM RTDOSE files in the subject directory.

        Args:
            paths (Tuple[str, ...]): The DICOM file paths to check if they specify a DICOM RTDOSE file.

        Returns:
            Tuple[str, ...]: The DICOM RTDOSE file paths.
        """
        valid_sop_class_uid = "1.2.840.10008.5.1.4.1.1.481.2"  # RT Dose Storage

        return SubjectDicomCrawler._filter_by_sop_class(paths, (valid_sop_class_uid,))

    @staticmethod
    def _generate_image_infos(image_paths: Tuple[Tuple[str, ...], ...]) -> Tuple[DicomSeriesImageInfo, ...]:
        """Generate the :class:`~pyradise.fileio.series_info.DicomSeriesImageInfo` entries for the DICOM file paths
        specified.

        Args:
            image_paths (Tuple[Tuple[str, ...], ...]): The DICOM image file paths provided (one inner tuple per
             series).

        Returns:
            Tuple[DicomSeriesImageInfo, ...]: The retrieved :class:`~pyradise.fileio.series_info.DicomSeriesImageInfo`
            entries.
        """
        return tuple(DicomSeriesImageInfo(paths) for paths in image_paths)

    @staticmethod
    def _generate_registration_infos(
        registration_paths: Tuple[str, ...], image_infos: Tuple[DicomSeriesImageInfo, ...]
    ) -> Tuple[DicomSeriesRegistrationInfo, ...]:
        """Generate the :class:`~pyradise.fileio.series_info.DicomSeriesRegistrationInfo` entries for the DICOM file
        paths specified.

        Args:
            registration_paths (Tuple[str, ...]): The DICOM registration file paths provided.
            image_infos (Tuple[DicomSeriesImageInfo, ...]): The available
             :class:`~pyradise.fileio.series_info.DicomSeriesImageInfo` entries.

        Returns:
            Tuple[DicomSeriesRegistrationInfo, ...]: The retrieved
             :class:`~pyradise.fileio.series_info.DicomSeriesRegistrationInfo` entries.
        """
        return tuple(
            DicomSeriesRegistrationInfo(path, image_infos, persistent_image_infos=False)
            for path in registration_paths
        )

    @staticmethod
    def _generate_rtss_info(rtss_paths: Tuple[str, ...]) -> Tuple[DicomSeriesRTSSInfo, ...]:
        """Generate the :class:`~pyradise.fileio.series_info.DicomSeriesRTSSInfo` entries for the DICOM file
        paths specified.

        Args:
            rtss_paths (Tuple[str, ...]): The DICOM RTSS file paths.

        Returns:
            Tuple[DicomSeriesRTSSInfo, ...]: The retrieved
             :class:`~pyradise.fileio.series_info.DicomSeriesRTSSInfo` entries.
        """
        return tuple(DicomSeriesRTSSInfo(path) for path in rtss_paths)

    @staticmethod
    def _generate_rtdose_info(rtdose_paths: Tuple[str, ...]) -> Tuple[DicomSeriesDoseInfo, ...]:
        """Generate the :class:`~pyradise.fileio.series_info.DicomSeriesDoseInfo` entries for the DICOM file
        paths specified.

        Args:
            rtdose_paths (Tuple[str, ...]): The DICOM RTDOSE file paths.

        Returns:
            Tuple[DicomSeriesDoseInfo, ...]: The retrieved
             :class:`~pyradise.fileio.series_info.DicomSeriesDoseInfo` entries.
        """
        return tuple(DicomSeriesDoseInfo(path) for path in rtdose_paths)

    def _export_modality_config(self, infos: Tuple[DicomSeriesInfo, ...]) -> None:
        """Export the retrieved :class:`~pyradise.fileio.modality_config.ModalityConfiguration` to a file.

        The file is written to the subject directory using the configured modality configuration file name.

        Args:
            infos (Tuple[DicomSeriesInfo, ...]): The :class:`~pyradise.fileio.series_info.DicomSeriesInfo` entries
             containing the information to export.

        Returns:
            None
        """
        config = ModalityConfiguration.from_dicom_series_info(infos)
        config.to_file(os.path.join(self.path, self.config_file_name))

    def _find_modality_config_file(self) -> str:
        """Search the subject directory recursively for the modality configuration file.

        A file matches if its name contains the configured modality configuration file name. The directory tree
        is walked top-down and the search stops at the first match.

        Returns:
            str: The path of the first matching file or an empty string if there is no match.
        """
        for root, _, files in os.walk(self.path):
            for file in files:
                if self.config_file_name in file:
                    return os.path.join(root, file)

        return ""

    def _apply_modality_config(self, infos: Tuple[DicomSeriesImageInfo, ...]) -> None:
        """Load the :class:`~pyradise.fileio.modality_config.ModalityConfiguration` from a file if available and apply
        it to the specified :class:`~pyradise.fileio.series_info.DicomSeriesImageInfo` entries. If the
        :class:`~pyradise.fileio.modality_config.ModalityConfiguration` file is not available and a
        :class:`~pyradise.fileio.extraction.ModalityExtractor` is provided the extractor is used for modality
        determination. If neither is available, the configuration derived from the entries is only validated (and
        written to the subject directory if requested).

        Args:
            infos (Tuple[DicomSeriesImageInfo, ...]): The available
             :class:`~pyradise.fileio.series_info.DicomSeriesImageInfo` entries to which the
             loaded :class:`~pyradise.fileio.modality_config.ModalityConfiguration` can be applied.
             Note that :meth:`execute` also passes the RTDOSE series info entries.

        Raises:
            ValueError: If the modalities contain duplicates, or if neither a configuration file nor a modality
             extractor is available, writing is not requested, and the derived configuration has more than one entry
             and contains a default modality.

        Returns:
            None
        """
        modality_file_path = self._find_modality_config_file()

        # case 1: a modality configuration file exists -> apply it
        if os.path.exists(modality_file_path):
            config = ModalityConfiguration.from_file(modality_file_path)
            config.add_modalities_to_info(infos)

            if config.has_default_modalities():
                warnings.warn("The modality configuration file contains at least one default modality.")

            if config.has_duplicate_modalities():
                raise ValueError(
                    "The modalities from the modality configuration file contain at least one duplicate "
                    "modality. This will cause ambiguity when loading the DICOM series."
                )

            if self.write_config:
                warnings.warn("The modality configuration file already exists and will not be overwritten.")

            return

        # case 2: no modality configuration file but a modality extractor -> extract the modality per series
        if self.modality_extractor is not None:
            extraction_possible_for_all = True
            for info in infos:
                modality = self.modality_extractor.extract(info.path[0])
                if modality is None:
                    # fall back to the default modality and remember the failure for the warning below
                    info.set_modality(Modality.get_default())
                    extraction_possible_for_all = False
                else:
                    info.set_modality(modality)

            # the configuration is written before the duplicate check, i.e. also if the check fails afterwards
            config = ModalityConfiguration.from_dicom_series_info(infos)
            if self.write_config:
                config.to_file(os.path.join(self.path, self.config_file_name))

            if config.has_duplicate_modalities():
                raise ValueError(
                    "The extracted modalities contain at least one duplicate modality. "
                    "This will cause ambiguity when loading the DICOM series."
                )

            if not extraction_possible_for_all:
                warnings.warn(
                    "Modality extraction failed for one DICOM series. The default modality will "
                    "be used for the series which failed during modality extraction."
                )

            return

        # case 3: neither a modality configuration file nor a modality extractor is available
        config = ModalityConfiguration.from_dicom_series_info(infos)

        if self.write_config:
            # writing the skeleton is the purpose of this run, thus ambiguities are tolerated
            config.to_file(os.path.join(self.path, self.config_file_name))
            return

        if config.has_duplicate_modalities():
            raise ValueError(
                "The extracted modalities contain at least one duplicate modality. "
                "This will cause ambiguity when loading the DICOM series. Use either a modality "
                "configuration file or a modality extractor to resolve this issue."
            )

        if config.has_default_modalities() and len(config.configuration) > 1:
            raise ValueError(
                "The extracted modalities contain at least one default modality. "
                "This will cause ambiguity when loading the DICOM series. Use either a modality "
                "configuration file or a modality extractor to resolve this issue."
            )

    def execute(self) -> Tuple[DicomSeriesInfo, ...]:
        """Execute the crawling process to retrieve the :class:`~pyradise.fileio.series_info.DicomSeriesInfo` entries.

        Returns:
            Tuple[DicomSeriesInfo, ...]: The retrieved :class:`~pyradise.fileio.series_info.DicomSeriesInfo` entries
            in the order images, registrations, RTSS, RTDOSE.
        """
        # get the dicom file paths and sort them according to the file content
        file_paths = self._get_dicom_files()
        image_paths = self._get_image_files()

        # each step only inspects the files which were not assigned to a previous category; the order of the
        # paths is preserved to get a reproducible result
        flat_image_paths = tuple(path for paths in image_paths for path in paths)
        remaining_paths = self._exclude_paths(file_paths, flat_image_paths)

        registration_paths = self._get_registration_files(remaining_paths)
        remaining_paths = self._exclude_paths(remaining_paths, registration_paths)

        rtss_paths = self._get_rtss_files(remaining_paths)
        remaining_paths = self._exclude_paths(remaining_paths, rtss_paths)

        rtdose_paths = self._get_rtdose_files(remaining_paths)

        # generate the series infos
        image_infos = self._generate_image_infos(image_paths)
        registration_infos = self._generate_registration_infos(registration_paths, image_infos)
        rtss_infos = self._generate_rtss_info(rtss_paths)
        rtdose_infos = self._generate_rtdose_info(rtdose_paths)

        # apply the modality config and write it to disk if requested
        self._apply_modality_config(image_infos + rtdose_infos)

        return image_infos + registration_infos + rtss_infos + rtdose_infos


[docs]class DatasetDicomCrawler(Crawler):
    """A crawler for retrieving :class:`~pyradise.fileio.series_info.DicomSeriesInfo` entries from a dataset directory
    containing at least one subject directory with DICOM files (e.g. DICOM images, DICOM registrations, DICOM RTSS).
    Files of other formats then DICOM will be ignored and can not be crawled with this type of crawler.

    The :class:`DatasetDicomCrawler` is used for searching appropriate files within a specific dataset directory
    containing at least one subject folder with DICOM files. If there is just one subject in a single directory to be
    crawled we recommend using the :class:`SubjectDicomCrawler`. If you want to load a large dataset with many subjects,
    we recommend using the iterative crawling approach instead of crawling the data via :meth:`execute` to reduce
    memory consumption (see example below).

    The prioritized method to extract the :class:`~pyradise.data.modality.Modality` for the retrieved images is the
    usage of a modality configuration file. If no modality configuration file is available for a specific subject
    directory the :class:`DatasetDicomCrawler` will try to extract the :class:`~pyradise.data.modality.Modality` from
    the retrieved subject images using the :class:`~pyradise.fileio.extraction.ModalityExtractor`. If no
    :class:`~pyradise.fileio.extraction.ModalityExtractor` is provided an exception will be raised.

    The :class:`DatasetDicomCrawler` can be used to generate the modality configuration file skeletons for all
    subjects in the dataset directory. In this case set the ``generate_modality_config`` parameter to ``True`` and
    execute the crawling process. The generated modality configuration file skeletons will be saved in the appropriate
    subject directories.

    Important:
        This crawler exclusively support the DICOM file format and does not support any other file format.

    Example:

        Demonstration of the iterative and the non-iterative loading approach:

        >>> from pyradise.fileio import (DatasetDicomCrawler, SubjectLoader)
        >>>
        >>>
        >>> def main_iterative_crawling(dataset_path: str) -> None:
        >>>     # Create the crawler (using the modality configuration file)
        >>>     crawler = DatasetDicomCrawler(dataset_path)
        >>>
        >>>     # Use the crawler iteratively (more memory efficient)
        >>>     for series_info in crawler:
        >>>         subject = SubjectLoader().load(series_info)
        >>>         # Do something with the subject
        >>>         print(subject.get_name())
        >>>
        >>>
        >>> def main_crawling_using_execute_fn(dataset_path: str) -> None:
        >>>     # Create the crawler (using the modality configuration file)
        >>>     crawler = DatasetDicomCrawler(dataset_path)
        >>>
        >>>     # Use the crawler with the execute function
        >>>     # (all series info entries are loaded in one step)
        >>>     series_infos = crawler.execute()
        >>>
        >>>     # Iterate over the series infos
        >>>     for series_info in series_infos:
        >>>         subject = SubjectLoader().load(series_info)
        >>>         # Do something with the subject
        >>>         print(subject.get_name())

    Args:
        path (str): The dataset directory path to crawl.
        modality_extractor (Optional[ModalityExtractor]): The modality extractor (default: None)
        modality_config_file_name (str): The file name for the modality configuration file within the subject
         directory (default: modality_config.json).
        write_modality_config (bool): If True writes the modality configuration retrieved to the subject directory
         (default: False).
    """

    def __init__(
        self,
        path: str,
        modality_extractor: Optional[ModalityExtractor] = None,
        modality_config_file_name: str = "modality_config.json",
        write_modality_config: bool = False,
    ) -> None:
        # The base class initializer receives the dataset directory path; the resulting ``self.path`` is read
        # later by :meth:`execute` and :meth:`__iter__`.
        super().__init__(path)

        # Settings forwarded unchanged to every per-subject SubjectDicomCrawler
        self.modality_extractor: Optional[ModalityExtractor] = modality_extractor
        self.config_file_name = modality_config_file_name
        self.write_config = write_modality_config

        # Filled by execute() / __iter__() with one path per discovered subject (None until then)
        self.subject_dir_paths: Optional[Tuple[str, ...]] = None

        # Iteration state: position of the next subject and the number of subjects found by __iter__()
        self.current_idx = 0
        self.num_subjects = 0

    @staticmethod
    def _get_subject_dir_paths(path: str) -> Tuple[str, ...]:
        """Get the paths of the subject directories containing DICOM files.

        All files below ``path`` are visited. Each DICOM file is assigned to a subject via its Patient ID
        tag (0010,0020) and the subject directory is the deepest directory that contains all DICOM files
        of that subject. Files named ``DICOMDIR`` are ignored.

        Args:
            path (str): The base directory path which contain the subject directories.

        Returns:
            Tuple[str, ...]: Sorted paths to all subject directories containing DICOM files.
        """
        # Map each patient id to the deepest directory containing all of its DICOM files seen so far
        subject_dirs = {}
        patient_id_tag = Tag(0x0010, 0x0020)  # Patient ID

        for root, _, files in os.walk(path):
            for file in files:
                if file == "DICOMDIR":
                    continue

                file_path = os.path.join(root, file)

                # only DICOM files contribute to the subject directories
                if not is_dicom_file(file_path):
                    continue

                # get the patient id
                patient_id = str(load_dataset_tag(file_path, (patient_id_tag,)).get(patient_id_tag).value)

                known_dir = subject_dirs.get(patient_id)
                if known_dir is None:
                    # Start from the containing directory (not the file itself) such that a subject with a
                    # single DICOM file still yields a directory path
                    subject_dirs[patient_id] = root
                else:
                    # Narrow down to the common ancestor of all directories holding this subject's files
                    subject_dirs[patient_id] = os.path.commonpath([known_dir, root])

        return tuple(sorted(subject_dirs.values()))

[docs]    def execute(self) -> Tuple[Tuple[DicomSeriesInfo, ...], ...]:
        """Execute the crawling process to retrieve the :class:`~pyradise.fileio.series_info.DicomSeriesInfo` entries.

        Returns:
            Tuple[Tuple[DicomSeriesInfo, ...], ...]: The retrieved :class:`~pyradise.fileio.series_info.DicomSeriesInfo`
             entries.
        """
        self.subject_dir_paths = self._get_subject_dir_paths(self.path)

        subject_infos = []
        for subject_dir_path in self.subject_dir_paths:
            subject_info = SubjectDicomCrawler(
                subject_dir_path, self.modality_extractor, self.config_file_name, self.write_config
            ).execute()

            subject_infos.append(subject_info) if subject_info else None

        return tuple(subject_infos)

    def __iter__(self) -> "DatasetDicomCrawler":
        """Prepare the crawler for iterating over the subjects.

        The subject directories are searched anew and the iteration position is reset to the first subject.

        Returns:
            DatasetDicomCrawler: The crawler itself which serves as the iterator.
        """
        discovered_paths = self._get_subject_dir_paths(self.path)

        self.subject_dir_paths = discovered_paths
        self.num_subjects = len(discovered_paths)
        self.current_idx = 0

        return self

    def __next__(self) -> Tuple[DicomSeriesInfo, ...]:
        if self.current_idx < self.num_subjects:
            subject_info = SubjectDicomCrawler(
                self.subject_dir_paths[self.current_idx],
                self.modality_extractor,
                self.config_file_name,
                self.write_config,
            ).execute()
            self.current_idx += 1
            return subject_info

        raise StopIteration

    def __len__(self) -> int:
        return self.num_subjects