Source code for datasets

import pickle

import pandas as pd
from PIL import Image

import torch as th
from torchvision import transforms
from torch.utils.data import Dataset

Tensor = th.Tensor
img_to_tsr = transforms.ToTensor()
grayscale = transforms.Grayscale()

class DatasetBase(Dataset):
    """Base class for storing datasets that can be readily used for training.

    Attributes:
        inputs: A list containing the networks' inputs, ready to be consumed
            by the model. All pre-processing must be done beforehand or
            during class initialization.
        output: A list containing the expected outputs for training or
            validation. They should likewise be ready to be read by the loss
            function in use.
    """

    def __init__(self, *args, **kwargs):
        super(DatasetBase, self).__init__()
        self.args = args
        self.kwargs = kwargs
        self.inputs = []
        self.output = []
        self.device = th.device("cpu")

    def __len__(self) -> int:
        return len(self.inputs)

    def __getitem__(self, item) -> tuple[Tensor, Tensor]:
        inputs = self.inputs[item]
        output = self.output[item]
        return inputs, output

    def __str__(self) -> str:
        return self.report()
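    # Usage sketch: a subclass is expected to fill ``inputs`` and ``output``
    # with already-preprocessed tensors, after which indexing and ``len()``
    # work out of the box. ``ToyDataset`` and the tensor shapes below are
    # illustrative assumptions, not part of this module:
    #
    #     class ToyDataset(DatasetBase):
    #         def __init__(self, n: int):
    #             super().__init__(n)
    #             self.inputs = [th.rand(1, 32, 32) for _ in range(n)]
    #             self.output = [th.LongTensor([0]) for _ in range(n)]
    #
    #     ds = ToyDataset(8)
    #     x, y = ds[0]  # tensors ready for the model and the loss function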
    def to(self, device: th.device):
        """Method for changing the input and output tensors' device.

        Args:
            device: A th.device.
        """
        if isinstance(self.inputs, list):
            for index in range(len(self.inputs)):
                self.inputs[index] = self.inputs[index].to(device)
        else:
            self.inputs = self.inputs.to(device)
        if isinstance(self.output, list):
            for index in range(len(self.output)):
                self.output[index] = self.output[index].to(device)
        else:
            self.output = self.output.to(device)
        self.device = device
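    # Example: moving a dataset to the GPU before training. The device string
    # is an assumption and only valid where CUDA is available:
    #
    #     dataset.to(th.device("cuda:0"))
    #     assert dataset.device == th.device("cuda:0")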
    def report(self) -> str:
        """See :func:`report`."""
        string = ''
        if not self.args == ():
            string += "Arguments:\n"
            for arg in self.args:
                string += f"\t{arg}\n"
        if not self.kwargs == {}:
            string += "Keyword Arguments:\n"
            for key in self.kwargs:
                string += f"\t{key}: {self.kwargs[key]}\n"
        string += f"Number of data points: {len(self)}\n"
        return string
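# Illustration of the report format, continuing the hypothetical ``ToyDataset``
# above (values are made up):
#
#     print(ToyDataset(8))
#     # Arguments:
#     #     8
#     # Number of data points: 8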
class HASYv2Dataset(DatasetBase):
    """Dataset class for preparing HASYv2 data for network training.

    Note that initializing the class won't load the data into it just yet.
    One of the specialized methods needs to be called for that.
    """

    default_paths = {
        "base": './/_Data/HASYv2',
        "fold": './/_Data/HASYv2/classification-task',
    }
    id_dict = {}
    latex_dict = {}
    try:
        symb_csv = pd.read_csv(f"{default_paths['base']}/symbols.csv")
        for index, symbol_id in enumerate(symb_csv['symbol_id']):
            id_dict[int(symbol_id)] = index
            latex_dict[index] = symb_csv['latex'][index]
    except FileNotFoundError:
        id_dict = None
        latex_dict = None

    def __init__(self, **kwargs):
        super(HASYv2Dataset, self).__init__()
        # Copy the defaults so per-instance overrides don't mutate the
        # shared class attribute.
        self.path = dict(HASYv2Dataset.default_paths)
        self.path.update(kwargs)
        if HASYv2Dataset.id_dict is None:
            self.id = {}
            self.latex = {}
            symb_csv = pd.read_csv(f"{self.path['base']}/symbols.csv")
            for index, symbol_id in enumerate(symb_csv['symbol_id']):
                self.id[int(symbol_id)] = index
                self.latex[index] = symb_csv['latex'][index]
        else:
            self.id = HASYv2Dataset.id_dict
            self.latex = HASYv2Dataset.latex_dict
        self.inputs = []
        self.output = []
        self.fold = None
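    # Construction sketch: the default paths can be overridden through keyword
    # arguments; the directory below is a hypothetical location, not one this
    # module ships with:
    #
    #     ds = HASYv2Dataset(base="/data/HASYv2",
    #                        fold="/data/HASYv2/classification-task")
    #     ds.latex[0]  # LaTeX command for class index 0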
    def cross_val(self, fold: int, train: bool, dataset: "HASYv2Dataset" = None):
        """Method for loading data from one fold into the dataset class.

        Args:
            fold: The number of the fold (1 to 10).
            train: Whether we want to load the training or the validation
                (test) data from the corresponding fold.
            dataset: Another :class:`HASYv2Dataset` object containing the
                entirety of the data (generated through the
                :func:`~HASYv2Dataset.for_colab` method). This is needed when
                the dataset's individual images are not locally available
                (like in Google Drive). This method will then use this base
                dataset instead of trying to find the files locally.
        """
        self.fold = fold
        self.inputs = []
        self.output = []
        if train:
            string = 'train'
        else:
            string = 'test'
        df = pd.read_csv(f"{self.path['fold']}/fold-{fold}/{string}.csv")
        if dataset is None:
            # Load the actual image files from disk.
            for index, path in enumerate(df['path']):
                path = path[6:]
                tsr = grayscale(img_to_tsr(
                    Image.open(f"{self.path['base']}/{path}")
                ))
                tsr = tsr - 0.5  # So data lies between -0.5 and 0.5
                label = th.LongTensor([
                    self.id[int(df['symbol_id'][index])]
                ])
                self.inputs.append(tsr)
                self.output.append(label)
        else:
            # Load data from the larger, already-loaded dataset.
            for index, path in enumerate(df['path']):
                path = path[-10:-4].replace('-', '')
                tsr = dataset.inputs[int(path)]
                label = dataset.output[int(path)]
                self.inputs.append(tsr)
                self.output.append(label)
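    # Usage sketch for one cross-validation split (folds are numbered 1 to 10
    # per the docstring). Wrapping the result in a DataLoader is an assumption
    # about downstream use, not something this module prescribes:
    #
    #     train_set = HASYv2Dataset()
    #     train_set.cross_val(fold=1, train=True)
    #     loader = th.utils.data.DataLoader(train_set, batch_size=64,
    #                                       shuffle=True)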
    def for_colab(self):
        """Loads the entirety of the dataset into one single class instance.

        This is useful for passing the complete dataset around without having
        to move all of the 160,000+ image files. It is particularly useful
        for training models in Google Colab, since uploading the raw dataset
        to Google Drive has failed several times.

        This method automatically saves the full :class:`HASYv2Dataset`
        instance in the dataset's base directory (``self.path["base"]``) as
        "colab_dataset.pkl", a pickle.
        """
        self.inputs = []
        self.output = []
        data_csv = pd.read_csv(f"{self.path['base']}/hasy-data-labels.csv")
        for index, path in enumerate(data_csv['path']):
            tsr = grayscale(img_to_tsr(
                Image.open(f"{self.path['base']}/{path}")
            ))
            tsr = tsr - 0.5  # So data lies between -0.5 and 0.5
            label = th.LongTensor([
                self.id[int(data_csv['symbol_id'][index])]
            ])
            self.inputs.append(tsr)
            self.output.append(label)
        with open(f"{self.path['base']}/colab_dataset.pkl", 'wb') as f:
            pickle.dump(self, f)
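# Round-trip sketch: build the single-file dataset once, then reload it and
# hand it to ``cross_val`` where the raw images are unavailable. The pickle
# path follows the default ``base`` directory; adjust it for your setup:
#
#     full = HASYv2Dataset()
#     full.for_colab()  # writes colab_dataset.pkl next to the data
#
#     with open(".//_Data/HASYv2/colab_dataset.pkl", 'rb') as f:
#         full = pickle.load(f)
#     fold_1 = HASYv2Dataset()
#     fold_1.cross_val(fold=1, train=True, dataset=full)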