|
|
|
import builtins
|
|
|
|
import math
|
|
|
|
import os
|
|
|
|
import random
|
|
|
|
import shutil
|
|
|
|
import time
|
|
|
|
import warnings
|
|
|
|
|
|
|
|
import torch
|
|
|
|
import torch.nn as nn
|
|
|
|
import torch.nn.parallel
|
|
|
|
import torch.backends.cudnn as cudnn
|
|
|
|
import torch.distributed as dist
|
|
|
|
import torch.optim
|
|
|
|
import torch.multiprocessing as mp
|
|
|
|
import torch.utils.data
|
|
|
|
import torch.utils.data.distributed
|
|
|
|
import torchvision.models as models
|
|
|
|
|
|
|
|
from scripts.parser import parser
|
|
|
|
from scripts.meter import AverageMeter, ProgressMeter
|
|
|
|
import scripts.augmentation as aug
|
|
|
|
import scripts.momentum as momentum
|
|
|
|
import scripts.clustering as clustering
|
|
|
|
import scripts.loss as loss_script
|
|
|
|
from scripts.data_process import data_process
|
|
|
|
import pcl.builder
|
|
|
|
import pcl.loader
|
|
|
|
|
|
|
|
|
|
|
|
def worker_loader():
    """Parse the command line, prepare the run, and launch the worker(s).

    Steps performed, in order:

    1. Parse arguments with the project ``parser()``.
    2. If ``args.seed`` is set, seed Python's ``random`` and torch, and warn.
    3. Warn when a specific GPU was requested.
    4. Resolve ``args.world_size`` from the ``WORLD_SIZE`` environment variable
       when ``env://`` initialisation is used and no size was given.
    5. Derive ``args.distributed`` and split ``args.num_cluster`` (a
       comma-separated string) into a list of strings.
    6. Make sure the experiment directory exists.
    7. Either spawn one ``worker`` process per visible GPU, or call ``worker``
       directly in this process.

    Raises:
        KeyError: if ``WORLD_SIZE`` is needed but missing from the environment.
    """
    args = parser().parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    # Comma-separated string -> list of strings; consumers convert with int().
    args.num_cluster = args.num_cluster.split(',')

    # makedirs(exist_ok=True) avoids the check-then-create race of
    # exists()+mkdir() and also creates missing parent directories.
    os.makedirs(args.exp_dir, exist_ok=True)

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total
        # world_size needs to be adjusted accordingly.
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch one `worker` process per
        # GPU; spawn passes the process index as the first argument.
        mp.spawn(worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call the `worker` function in this process.
        worker(args.gpu, ngpus_per_node, args)
|
|
|
|
|
|
|
|
|
|
|
|
def worker(gpu, ngpus_per_node, args):
    """Per-process training entry point: build everything and run all epochs.

    Sets up the process group, builds the ``pcl.builder.MoCo`` model wrapped in
    ``DistributedDataParallel``, creates the optimizer, optionally resumes from
    a checkpoint, builds the three data loaders (pre-train, train, eval) and
    runs the epoch loop. From ``args.warmup_epoch`` onwards each epoch first
    computes features on the eval loader, clusters them on the process whose
    ``args.gpu`` is 0 and broadcasts the result from rank 0.

    Args:
        gpu: Device index for this process (the process index when launched
            through ``mp.spawn``), or ``None``.
        ngpus_per_node: Number of GPUs per node; used to compute the global
            rank and to split ``args.batch_size`` / ``args.workers``.
        args: Parsed argument namespace. It is mutated in place (``gpu``,
            ``rank``, ``batch_size``, ``workers``, ``start_epoch`` ...).

    Raises:
        NotImplementedError: when not running distributed (only
            DistributedDataParallel is supported), or when the per-class
            pre-train batch size would be below 2.
    """
    args.gpu = gpu
    # NOTE(review): this overrides whatever value the launcher parsed, so the
    # multiprocessing-distributed code paths below are always taken. Kept
    # as-is; confirm this is intentional.
    args.multiprocessing_distributed = True

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*_args, **_kwargs):
            # Must accept whatever print() accepts: the previous zero-argument
            # stub raised TypeError on the first print(...) call.
            pass

        builtins.print = print_pass

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    print("=> create model '{}'".format(args.arch))
    model = pcl.builder.MoCo(
        models.__dict__[args.arch],
        args.low_dim, args.pcl_r, args.moco_m, args.temperature, args.mlp)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True
    cudnn.deterministic = True

    # Data loading code
    pre_train_dir = os.path.join(args.data, 'pre_train')
    train_dir = os.path.join(args.data, 'train')
    # The eval loader reads the same 'train' folder as the train loader.
    eval_dir = os.path.join(args.data, 'train')

    # center-crop augmentation
    eval_augmentation = aug.moco_eval()

    pre_train_dataset = pcl.loader.PreImager(pre_train_dir, eval_augmentation)

    train_dataset = pcl.loader.ImageFolderInstance(
        train_dir,
        pcl.loader.TwoCropsTransform(eval_augmentation))
    eval_dataset = pcl.loader.ImageFolderInstance(
        eval_dir,
        eval_augmentation)

    if args.distributed:
        pre_train_sampler = torch.utils.data.distributed.DistributedSampler(pre_train_dataset)
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        eval_sampler = torch.utils.data.distributed.DistributedSampler(eval_dataset, shuffle=False)
    else:
        pre_train_sampler = None
        train_sampler = None
        eval_sampler = None

    # The pre-train loader's batch size is the per-process batch size divided
    # by the number of classes reported by the pre-train dataset.
    pre_train_batch_size = args.batch_size // pre_train_dataset.class_number
    if pre_train_batch_size < 2:
        raise NotImplementedError("Batch size must above double number of classes.")

    pre_train_loader = torch.utils.data.DataLoader(
        pre_train_dataset,
        batch_size=pre_train_batch_size,
        shuffle=(pre_train_sampler is None),
        num_workers=args.workers,
        pin_memory=True,
        sampler=pre_train_sampler,
        drop_last=True)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size // 2, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    # dataloader for center-cropped images, use larger batch size to increase speed
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset, batch_size=args.batch_size * 5, shuffle=False,
        sampler=eval_sampler, num_workers=args.workers, pin_memory=True)

    print("=> Pre-train")
    # main loop
    for epoch in range(args.start_epoch, args.epochs):

        cluster_result = None
        if epoch >= args.warmup_epoch:
            # compute momentum features for center-cropped images
            features = momentum.compute_features(eval_loader, model, args)

            # placeholder for clustering result
            cluster_result = {'im2cluster': [], 'centroids': [], 'density': []}
            for num_cluster in args.num_cluster:
                cluster_result['im2cluster'].append(torch.zeros(len(eval_dataset), dtype=torch.long).cuda())
                cluster_result['centroids'].append(torch.zeros(int(num_cluster), args.low_dim).cuda())
                cluster_result['density'].append(torch.zeros(int(num_cluster)).cuda())

            if args.gpu == 0:
                # account for the few samples that are computed twice
                features[torch.norm(features, dim=1) > 1.5] /= 2
                features = features.numpy()
                # run kmeans clustering on master node
                cluster_result = clustering.run_kmeans(features, args)

            dist.barrier()
            # broadcast clustering result: other processes receive rank 0's
            # tensors into their zero-filled placeholders.
            for data_list in cluster_result.values():
                for data_tensor in data_list:
                    dist.broadcast(data_tensor, 0, async_op=False)

        if args.distributed:
            # Both samplers need the epoch, otherwise the warm-up loader would
            # replay the same shuffle order on every warm-up epoch.
            pre_train_sampler.set_epoch(epoch)
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch: warm-up epochs use the pre-train loader and no
        # clustering result (cluster_result is None).
        if epoch >= args.warmup_epoch:
            train(train_loader, model, criterion, optimizer, epoch, args, cluster_result)
        else:
            train(pre_train_loader, model, criterion, optimizer, epoch, args, cluster_result)

        # Save every `save_freq` epochs, from one process per node only.
        is_saving_process = (not args.multiprocessing_distributed
                             or args.rank % ngpus_per_node == 0)
        if (epoch + 1) % args.save_freq == 0 and is_saving_process:
            torch.save({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, '{}/checkpoint_{:04d}.pth.tar'.format(args.exp_dir, epoch))
|
|
|
|
|
|
|
|
|
|
|
|
def train(train_loader, model, criterion, optimizer, epoch, args, cluster_result=None):
    """Run one optimisation pass over ``train_loader``.

    For every batch: prepare the query/key inputs with ``data_process``, run
    the model, compute the loss with ``loss_script.proto_with_quality``, update
    the running meters, and take one optimizer step. Progress is displayed
    every ``args.print_freq`` iterations.

    Args:
        train_loader: Iterable yielding ``(images, index)`` pairs.
        model: Model called with ``im_q``, ``im_k``, ``cluster_result`` and
            ``index`` keyword arguments.
        criterion: Loss criterion forwarded to ``proto_with_quality``.
        optimizer: Optimizer stepped once per batch.
        epoch: Current epoch number (used only for the progress prefix).
        args: Namespace providing ``gpu``, ``num_cluster`` and ``print_freq``.
        cluster_result: Clustering result forwarded to ``data_process`` and
            the model; ``None`` when no clustering is available.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    acc_inst = AverageMeter('Acc@Inst', ':6.2f')
    acc_proto = AverageMeter('Acc@Proto', ':6.2f')

    meters = [batch_time, data_time, losses, acc_inst, acc_proto]
    progress = ProgressMeter(len(train_loader), meters, prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    tick = time.time()
    for step, (images, index) in enumerate(train_loader):
        # time spent waiting for the loader
        data_time.update(time.time() - tick)

        im_q, im_k = data_process(cluster_result, images, args.gpu)

        # forward pass
        output, target, output_proto, target_proto = model(
            im_q=im_q,
            im_k=im_k,
            cluster_result=cluster_result,
            index=index,
        )

        loss = loss_script.proto_with_quality(
            output, target, output_proto, target_proto,
            criterion, acc_proto, images, args.num_cluster,
        )

        # the meters are weighted by the size of the first element of `images`
        n_samples = images[0].size(0)
        losses.update(loss.item(), n_samples)
        inst_acc = loss_script.accuracy(output, target)[0]
        acc_inst.update(inst_acc[0], n_samples)

        # backward pass and SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # total time for this iteration, then restart the clock
        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % args.print_freq == 0:
            progress.display(step)
|
|
|
|
|
|
|
|
|
|
|
|
def adjust_learning_rate(optimizer, epoch, args):
    """Set the learning rate of every param group for the given epoch.

    With ``args.cos`` the base rate ``args.lr`` follows a half-cosine over
    ``args.epochs``; otherwise it is multiplied by 0.1 once for each milestone
    in ``args.schedule`` that ``epoch`` has reached.
    """
    if args.cos:
        # cosine lr schedule
        cosine_scale = 0.5 * (1. + math.cos(math.pi * epoch / args.epochs))
        new_lr = args.lr * cosine_scale
    else:
        # stepwise lr schedule
        new_lr = args.lr
        for milestone in args.schedule:
            step_scale = 0.1 if epoch >= milestone else 1.
            new_lr = new_lr * step_scale

    for group in optimizer.param_groups:
        group['lr'] = new_lr
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: only launch when executed directly, not when imported.
if __name__ == "__main__":
    worker_loader()
|