# Source code for aidsorb.datamodules

# This file is part of AIdsorb.
# Copyright (C) 2024 Antonios P. Sarikas

# AIdsorb is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

r"""
:class:`~lightning.pytorch.core.LightningDataModule`'s for use with |lightning|.
"""

import os
from collections.abc import Callable, Sequence
from typing import Any
from pathlib import Path

import lightning as L
from torch.utils.data import DataLoader

from .data import PCDDataset, get_names



# [docs]
class PCDDataModule(L.LightningDataModule):
    r"""
    LightningDataModule for supervised/unsupervised learning on point clouds.

    Given the following directory structure::

        project_root
        ├── source      <-- path_to_X
        │   ├── foo.npy
        │   ├── ...
        │   └── bar.npy
        ├── test.json
        ├── train.json
        └── validation.json

    train, validation, and test datasets are set up, all of which are instances
    of :class:`~.PCDDataset`.

    .. note::
        Comma ``,`` is assumed as the field separator in ``.csv`` file.

    .. warning::
        * For validation and test dataloaders, ``shuffle=False`` and
          ``drop_last=False``.
        * If ``train_size`` is specified, the first ``train_size`` point clouds
          from ``train.json`` will be used. **If the data were not split with**
          :func:`~aidsorb.data.prepare_data`, **ensure that names in**
          ``train.json`` **don't follow a particular order**.

    .. todo::
        Add support for ``predict_dataloader``.

    Parameters
    ----------
    path_to_X : str
        Absolute or relative path to the directory holding the point clouds.
    path_to_Y : str, optional
        Absolute or relative path to the ``.csv`` file holding the labels of the
        point clouds.
    index_col : str, optional
        Column name of the ``.csv`` file to be used for indexing.
    labels : list, optional
        Column names of the ``.csv`` file containing the properties to be
        predicted.
    train_size : int, default=None
        Number of training samples. If :obj:`None`, all training samples are
        used. Must be non-negative.
    train_transform_x : callable, optional
        Transformation to apply to point cloud during training.
    eval_transform_x : callable, optional
        Transformation to apply to point cloud during validation and testing.
    transform_y : callable, optional
        Transformation to apply to label.
    shuffle : bool, default=False
        Only for train dataloader.
    drop_last : bool, default=False
        Only for train dataloader.
    train_batch_size : int, default=32
        Batch size for train dataloader.
    eval_batch_size : int, default=32
        Batch size for validation and test dataloaders.
    config_dataloaders : dict, optional
        Dictionary for configuring all dataloaders. For example::

            config_dataloaders = {
                'pin_memory': True,
                'num_workers': 2,
                }

        .. note::
            The dictionary is not copied. To avoid side effects, consider
            passing a copy.

    Raises
    ------
    ValueError
        If ``train_size`` is negative.

    See Also
    --------
    :class:`~torch.utils.data.DataLoader` :
        For a description of ``shuffle``, ``drop_last`` and valid options for
        ``config_dataloaders``.
    """
    def __init__(
            self,
            path_to_X: str,
            *,
            path_to_Y: str | None = None,
            index_col: str | None = None,
            labels: list[str] | None = None,
            train_size: int | None = None,
            train_transform_x: Callable | None = None,
            eval_transform_x: Callable | None = None,
            transform_y: Callable | None = None,
            shuffle: bool = False,
            drop_last: bool = False,
            train_batch_size: int = 32,
            eval_batch_size: int = 32,
            config_dataloaders: dict[str, Any] | None = None,
            ) -> None:
        super().__init__()
        self.save_hyperparameters()  # For argument-less load_from_checkpoint.

        # The training names are later sliced as ``names[:train_size]``; a
        # negative value would silently drop names from the end instead of
        # selecting the first ``train_size`` ones, so reject it up front.
        if train_size is not None and train_size < 0:
            raise ValueError(
                    f'train_size must be non-negative or None, got {train_size}'
                    )

        self.path_to_X = path_to_X
        self.path_to_Y = path_to_Y

        self.index_col = index_col
        self.labels = labels

        self.train_transform_x = train_transform_x
        self.eval_transform_x = eval_transform_x
        self.transform_y = transform_y

        self.train_size = train_size
        self.shuffle = shuffle
        self.drop_last = drop_last

        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size

        # Configuration shared by all dataloaders. A user-supplied dictionary
        # is stored as is (not copied), as stated in the class docstring.
        self.config_dataloaders = (
                config_dataloaders if config_dataloaders is not None else {}
                )

    def setup(self, stage: str | None = None) -> None:
        r"""
        Set up train, validation and test datasets.

        .. tip::
            Datasets are accessible via ``self.{train,validation,test}_dataset``.

        Parameters
        ----------
        stage : {None, 'fit', 'validate', 'test'}, default=None
            Which datasets to set up.

            * If ``'fit'``, only the train and validation datasets are set up.
            * If ``'validate'`` or ``'test'``, only the corresponding dataset is set up.
            * If :obj:`None`, all datasets are set up.
        """
        modes_per_stage = {
                'fit': ('train', 'validation'),
                'validate': ('validation',),
                'test': ('test',),
                None: ('train', 'validation', 'test'),
                }

        # Any stage not listed above sets up nothing.
        for mode in modes_per_stage.get(stage, ()):
            self._setup_dataset(mode)

    def _setup_dataset(self, mode: str) -> None:
        r"""
        Build the dataset for ``mode`` and store it as ``self.{mode}_dataset``.

        The names of the point clouds are read from ``{mode}.json``, which is
        looked up in the parent directory of ``path_to_X``.

        Parameters
        ----------
        mode : {'train', 'validation', 'test'}
            Which dataset to build.
        """
        path_to_names = Path(self.path_to_X).parent
        pcd_names = get_names(os.path.join(path_to_names, f'{mode}.json'))

        if mode == 'train':
            transform_x = self.train_transform_x
            # Set the training set size. Slicing with ``None`` keeps all names.
            pcd_names = pcd_names[:self.train_size]
        else:
            transform_x = self.eval_transform_x

        dataset = PCDDataset(
                pcd_names=pcd_names,
                path_to_X=self.path_to_X,
                path_to_Y=self.path_to_Y,
                index_col=self.index_col,
                labels=self.labels,
                transform_x=transform_x,
                transform_y=self.transform_y,
                )
        setattr(self, f'{mode}_dataset', dataset)

    def train_dataloader(self) -> DataLoader:
        r"""
        Return the train dataloader.

        Can be called only after :meth:`setup` has been called and
        ``stage`` is ``{None, 'fit'}``.

        Returns
        -------
            :class:`~torch.utils.data.DataLoader`
        """
        # The dataset attribute is created dynamically in ``_setup_dataset``.
        # pylint: disable=no-member
        return DataLoader(
                dataset=self.train_dataset,
                batch_size=self.train_batch_size,
                shuffle=self.shuffle,
                drop_last=self.drop_last,
                **self.config_dataloaders,
                )

    def val_dataloader(self) -> DataLoader:
        r"""
        Return the validation dataloader.

        Can be called only after :meth:`setup` has been called and
        ``stage`` is ``{None, 'fit', 'validate'}``.

        Returns
        -------
            :class:`~torch.utils.data.DataLoader`
        """
        # The dataset attribute is created dynamically in ``_setup_dataset``.
        # pylint: disable=no-member
        return DataLoader(
                dataset=self.validation_dataset,
                batch_size=self.eval_batch_size,
                shuffle=False,
                drop_last=False,
                **self.config_dataloaders,
                )

    def test_dataloader(self) -> DataLoader:
        r"""
        Return the test dataloader.

        Can be called only after :meth:`setup` has been called and
        ``stage`` is ``{None, 'test'}``.

        Returns
        -------
            :class:`~torch.utils.data.DataLoader`
        """
        # The dataset attribute is created dynamically in ``_setup_dataset``.
        # pylint: disable=no-member
        return DataLoader(
                dataset=self.test_dataset,
                batch_size=self.eval_batch_size,
                shuffle=False,
                drop_last=False,
                **self.config_dataloaders,
                )