Source code for mlbench_core.utils.pytorch.helpers

r"""Helper functions."""

import logging
import os
import random
import shutil
import socket

import numpy as np
import torch
from torch import distributed as dist

from mlbench_core.utils.pytorch.topology import FCGraph


def config_logging(logging_level="INFO", logging_file="/mlbench.log"):
    """Set up logging.

    A stream handler and a file handler are added to the default logger
    `mlbench`.

    Args:
        logging_level (str): Log level
        logging_file (str): Log file
    """

    class RankFilter(logging.Filter):
        def filter(self, record):
            record.rank = dist.get_rank()
            return True

    logger = logging.getLogger("mlbench")
    if len(logger.handlers) >= 2:
        return

    logger.setLevel(logging_level)
    logger.addFilter(RankFilter())

    formatter = logging.Formatter(
        "%(asctime)s %(name)s %(rank)2s %(levelname)s: %(message)s",
        "%Y-%m-%d %H:%M:%S",
    )

    ch = logging.StreamHandler()
    ch.setLevel(logging_level)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    fh = logging.FileHandler(logging_file)
    fh.setLevel(logging_level)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
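
# Minimal usage sketch (not part of the original module). `config_logging`
# assumes the process group is already initialized, because the rank filter
# calls `dist.get_rank()` on every record. The backend, rendezvous method,
# and log path below are illustrative assumptions.
def _example_config_logging():
    dist.init_process_group("gloo", init_method="env://")
    config_logging(logging_level="INFO", logging_file="/tmp/mlbench.log")
    logging.getLogger("mlbench").info("stream and file handlers attached")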

def config_pytorch(use_cuda=False, seed=None, cudnn_deterministic=False):
    """Configure pytorch packages.

    Fix random number for packages and initialize the distributed
    environment for pytorch. Set up the cuda environment for pytorch.

    Args:
        use_cuda (bool): Use CUDA acceleration
        seed (int | None): Random seed to use
        cudnn_deterministic (bool): Set `cudnn.deterministic=True`

    Returns:
        (int, int, :obj:`FCGraph`): The rank, world size, and network graph
    """
    # Setting `cudnn.deterministic = True` turns on the CUDNN deterministic
    # setting, which can slow down training considerably. Unexpected behavior
    # may also be observed when restarting from checkpoints.
    # See: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    if cudnn_deterministic:
        print(
            "You have chosen to seed training. "
            "This will turn on the CUDNN deterministic setting, "
            "which can slow down your training considerably! "
            "You may see unexpected behavior when restarting "
            "from checkpoints."
        )

    if seed:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)
        os.environ["PYTHONHASHSEED"] = str(seed)

    # Define the graph for the computation.
    if use_cuda:
        assert torch.cuda.is_available()

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    backend = dist.get_backend() if dist.is_initialized() else None

    graph = FCGraph(rank, world_size, use_cuda)

    # Enable the cudnn accelerator if we are using cuda.
    if use_cuda:
        graph.assigned_gpu_id()
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = False
        if cudnn_deterministic:
            torch.backends.cudnn.deterministic = True

        if torch.backends.cudnn.version() is None:
            print("CUDNN not found on device.")

    print(
        "World size={}, Rank={}, hostname={}, backend={}, "
        "cuda_available={}, cuda_device={}".format(
            world_size,
            rank,
            socket.gethostname(),
            backend,
            torch.cuda.is_available(),
            torch.cuda.current_device() if torch.cuda.is_available() else None,
        )
    )

    return rank, world_size, graph
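
# Hedged end-to-end sketch (not part of the original module): initialize
# torch.distributed first, then let `config_pytorch` seed the RNGs and build
# the fully-connected FCGraph. The backend choice and seed value are
# illustrative assumptions.
def _example_config_pytorch():
    dist.init_process_group("gloo", init_method="env://")
    rank, world_size, graph = config_pytorch(
        use_cuda=torch.cuda.is_available(), seed=42, cudnn_deterministic=True
    )
    logging.getLogger("mlbench").info(
        "rank %s of %s, graph %s", rank, world_size, graph
    )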

def config_path(ckpt_run_dir, delete_existing_ckpts=False):
    """Configure the checkpoint directory used during the experiments.

    Args:
        ckpt_run_dir (str): Checkpoint directory for this run
        delete_existing_ckpts (bool): Delete any existing checkpoint directory
            before creating it
    """
    if delete_existing_ckpts:
        print("Remove previous checkpoint directory: {}".format(ckpt_run_dir))
        shutil.rmtree(ckpt_run_dir, ignore_errors=True)
    os.makedirs(ckpt_run_dir, exist_ok=True)
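
# Usage sketch (not part of the original module). The checkpoint path is an
# illustrative assumption; `delete_existing_ckpts=True` wipes any previous
# run's checkpoints before recreating the directory.
def _example_config_path():
    config_path("/tmp/mlbench-ckpts/run-1", delete_existing_ckpts=True)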