import logging
import os
import pickle
import lmdb
import numpy as np
import torch.utils.data
from scipy.sparse import coo_matrix, csr_matrix
from tensorpack.utils.compatible_serialize import loads
from mlbench_core.dataset.util.tools import progress_download
_logger = logging.getLogger("mlbench")
# All available datasets
_LIBSVM_DATASETS = [
{"name": "australian_train", "n_samples": 690, "n_features": 14, "sparse": False},
{"name": "duke_train", "n_samples": 38, "n_features": 7129, "sparse": True},
{"name": "duke_test", "n_samples": 4, "n_features": 7129, "sparse": True},
{
"name": "epsilon_train",
"n_samples": 400000,
"n_features": 2000,
"sparse": False,
"url": "https://storage.googleapis.com/mlbench-datasets/libsvm"
"/epsilon_train.lmdb",
},
{
"name": "epsilon_test",
"n_samples": 100000,
"n_features": 2000,
"sparse": False,
"url": "https://storage.googleapis.com/mlbench-datasets/libsvm"
"/epsilon_test.lmdb",
},
{"name": "rcv1_train", "n_samples": 677399, "n_features": 47236, "sparse": True},
{"name": "synthetic_dense", "n_samples": 10000, "n_features": 100, "sparse": False},
{
"name": "webspam_train",
"n_samples": 350000,
"n_features": 16609143,
"sparse": True,
},
]
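# Each registry entry can be looked up by its full name via ``get_dataset_info``
# defined below; an illustrative lookup:
#
#     stats = get_dataset_info("epsilon_train")
#     stats["n_features"]  # -> 2000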
class LMDBDataset(torch.utils.data.Dataset):
"""
LMDB Dataset
Args:
root (string): Either root directory for the database files,
or a absolute path pointing to the file.
target_transform (callable, optional):
A function/transform that takes in the target and transforms it.
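
    Example:
        A minimal usage sketch (the root path is hypothetical; ``epsilon``
        is one of the downloadable datasets)::

            dataset = LMDBDataset(
                name="epsilon", data_type="train", root="/tmp/libsvm"
            )
            features, target = dataset[0]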
"""
def __init__(
self,
name,
data_type,
root,
target_transform=None,
):
root, self.transform = maybe_download_lmdb(name, data_type, root)
self.root = os.path.expanduser(root)
self.target_transform = target_transform
self.lmdb_files = self._get_valid_lmdb_files()
        # for each LMDB file, create an LMDBPTClass dataset
self.dbs = []
for lmdb_file in self.lmdb_files:
self.dbs.append(
LMDBPTClass(
root=lmdb_file,
transform=self.transform,
target_transform=target_transform,
)
)
# build up indices.
self.indices = np.cumsum([len(db) for db in self.dbs])
self.length = self.indices[-1]
self._get_index_zones = self._build_indices()
    def _get_valid_lmdb_files(self):
        """Yield the LMDB file(s) referenced by the given root."""
        if not self.root.endswith(".lmdb"):
            for fname in os.listdir(self.root):
                if "_" in fname and "-lock" not in fname:
                    yield os.path.join(self.root, fname)
        else:
            yield self.root
    def _build_indices(self):
        # Zone boundaries derived from the cumulative sizes in self.indices:
        # [0, n_0), [n_0, n_0 + n_1), ... Materialize the pairs as a list so
        # the lookup function can be called repeatedly (the original one-shot
        # iterator was exhausted after the first call).
        boundaries = np.concatenate(([0], self.indices))
        from_to_indices = list(enumerate(zip(boundaries[:-1], boundaries[1:])))

        def f(x):
            for ind, (from_index, to_index) in from_to_indices:
                if from_index <= x < to_index:
                    return ind, x - from_index
            raise IndexError("index {} is out of range".format(x))

        return f
    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: ``(data, target)``
        """
        block_index, item_index = self._get_index_zones(index)
        data, target = self.dbs[block_index][item_index]
        return data, np.array([target])
def __len__(self):
return self.length
def __repr__(self):
fmt_str = "Dataset {}\n".format(self.__class__.__name__)
fmt_str += " Number of datapoints: {}\n".format(self.__len__())
fmt_str += " Root Location: {}\n".format(self.root)
tmp = " Transforms (if any): "
fmt_str += "{0}{1}\n".format(
tmp, self.transform.__repr__().replace("\n", "\n" + " " * len(tmp))
)
tmp = " Target Transforms (if any): "
fmt_str += "{0}{1}".format(
tmp, self.target_transform.__repr__().replace("\n", "\n" + " " * len(tmp))
)
return fmt_str
class LMDBPTClass(torch.utils.data.Dataset):
"""
LMDB Dataset loader Class
Args:
root (string): Either root directory for the database files,
or a absolute path pointing to the file.
transform (callable, optional): A function/transform that
takes in an PIL image and returns a transformed version.
E.g, ``transforms.RandomCrop``
target_transform (callable, optional):
A function/transform that takes in the target and transforms it.
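
    Example:
        A minimal sketch, assuming ``/tmp/libsvm/epsilon_train.lmdb``
        already exists locally (the path is hypothetical)::

            db = LMDBPTClass(root="/tmp/libsvm/epsilon_train.lmdb")
            data, target = db[0]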
"""
def __init__(self, root, transform=None, target_transform=None):
self.root = os.path.expanduser(root)
self.transform = transform
self.target_transform = target_transform
# open lmdb env.
self.env = self._open_lmdb()
# get file stats.
self._get_length()
# prepare cache_file
self._prepare_cache()
def _open_lmdb(self):
return lmdb.open(
self.root,
subdir=os.path.isdir(self.root),
readonly=True,
lock=False,
readahead=False,
map_size=1099511627776 * 2,
max_readers=1,
meminit=False,
)
    def _get_length(self):
        with self.env.begin(write=False) as txn:
            self.length = txn.stat()["entries"]
            # the optional "__keys__" entry is metadata, not a sample.
            if txn.get(b"__keys__") is not None:
                self.length -= 1
    def _prepare_cache(self):
        # cache the list of keys so the database is only scanned once.
        cache_file = self.root + "_cache_"
        if os.path.isfile(cache_file):
            with open(cache_file, "rb") as f:
                self.keys = pickle.load(f)
        else:
            with self.env.begin(write=False) as txn:
                self.keys = [key for key, _ in txn.cursor() if key != b"__keys__"]
            with open(cache_file, "wb") as f:
                pickle.dump(self.keys, f)
def __getitem__(self, index):
env = self.env
with env.begin(write=False) as txn:
bin_file = txn.get(self.keys[index])
        # deserialize the stored (data, target) pair.
        data, target = loads(bin_file)
if self.transform is not None:
data = self.transform(data)
if self.target_transform is not None:
target = self.target_transform(target)
return data, target
def __len__(self):
return self.length
def __repr__(self):
return self.__class__.__name__ + " (" + self.root + ")"
def construct_sparse_matrix(triplet, n_features):
    """Build a CSR matrix from a ``(data, row, col)`` triplet."""
    data, row, col = triplet
    mat = coo_matrix((data, (row, col)), shape=(len(set(row)), n_features))
    return csr_matrix(mat)[list(set(row))]
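# A worked example (illustrative values only): a triplet with entries
# (0, 0) = 1.0 and (1, 2) = 2.0 in a 4-feature space yields a 2 x 4 CSR
# matrix:
#
#     triplet = (np.array([1.0, 2.0]), np.array([0, 1]), np.array([0, 2]))
#     construct_sparse_matrix(triplet, n_features=4).toarray()
#     # -> [[1., 0., 0., 0.],
#     #     [0., 0., 2., 0.]]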
def maybe_transform_sparse(stats):
    """Return a sparse-matrix transform if the dataset is sparse, else None."""
    return (
        (lambda x: construct_sparse_matrix(x, stats["n_features"]))
        if stats["sparse"]
        else None
    )
def get_dataset_info(name):
    """Look up the metadata entry for ``name`` in ``_LIBSVM_DATASETS``."""
    stats = list(filter(lambda x: x["name"] == name, _LIBSVM_DATASETS))
    assert len(stats) == 1, "{} not found.".format(name)
    return stats[0]
def maybe_download_lmdb(name, data_type, dataset_dir):
    """Downloads the given dataset if it is not already present.

    Args:
        name (str): Name of the dataset, one of
            `[australian, duke, epsilon, rcv1, synthetic, webspam]`
        data_type (str): One of `test` and `train`
        dataset_dir (str): Directory where to store the dataset

    Returns:
        (str, callable or None): Path of the lmdb file and, for sparse
            datasets, the transform that reconstructs samples as sparse
            matrices (``None`` for dense datasets)
    """
    full_name = "{}_{}".format(name, data_type)
    stats = get_dataset_info(full_name)
    lmdb_path = os.path.join(dataset_dir, "{}.lmdb".format(full_name))
    if not os.path.isfile(lmdb_path):
        if "url" not in stats:
            raise FileNotFoundError(
                "Could not download LIBSVM dataset {}".format(full_name)
            )
        _logger.info("Downloading dataset {}".format(full_name))
        progress_download(stats["url"], dest=lmdb_path)
    return lmdb_path, maybe_transform_sparse(stats)
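# A direct usage sketch (the directory below is hypothetical):
#
#     lmdb_path, transform = maybe_download_lmdb("epsilon", "train", "/tmp/libsvm")
#     # -> ("/tmp/libsvm/epsilon_train.lmdb", None); epsilon is dense,
#     #    so no sparse transform is returned.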