diff --git a/INSTALLATION_SUPPORT.md b/INSTALLATION_SUPPORT.md index e9585e7..981237f 100644 --- a/INSTALLATION_SUPPORT.md +++ b/INSTALLATION_SUPPORT.md @@ -5,7 +5,7 @@ The library can be installed with: ```shell pip install difflogic ``` -> ⚠️ Note that `difflogic` requires CUDA, the CUDA Toolkit (for compilation), and `torch>=1.9.0` (matching the CUDA version). +> ⚠️ Note that, by default, `difflogic` requires CUDA, the CUDA Toolkit (for compilation), and `torch>=1.9.0` (matching the CUDA version). CUDA can be disabled by setting a flag, e.g. `export DIFFLOGIC_BUILD_CUDA_EXT=false`, before running `pip install .`. Only the much slower pure Python implementation is available in that case. **It is very important that the installed version of PyTorch was compiled with a CUDA version that is compatible with the CUDA version of the locally installed CUDA Toolkit.** diff --git a/difflogic/difflogic.py b/difflogic/difflogic.py index bd2310c..4983e73 100644 --- a/difflogic/difflogic.py +++ b/difflogic/difflogic.py @@ -1,9 +1,13 @@ +import warnings import torch -import difflogic_cuda import numpy as np from .functional import bin_op_s, get_unique_connections, GradFactor from .packbitstensor import PackBitsTensor +try: + import difflogic_cuda +except ImportError: + warnings.warn('failed to import difflogic_cuda. 
no cuda features will be available', ImportWarning) ######################################################################################################################## @@ -95,9 +99,7 @@ def forward_python(self, x): assert x.shape[-1] == self.in_dim, (x[0].shape[-1], self.in_dim) if self.indices[0].dtype == torch.int64 or self.indices[1].dtype == torch.int64: - print(self.indices[0].dtype, self.indices[1].dtype) self.indices = self.indices[0].long(), self.indices[1].long() - print(self.indices[0].dtype, self.indices[1].dtype) a, b = x[..., self.indices[0]], x[..., self.indices[1]] if self.training: diff --git a/difflogic/packbitstensor.py b/difflogic/packbitstensor.py index bc1afd3..b5553e1 100644 --- a/difflogic/packbitstensor.py +++ b/difflogic/packbitstensor.py @@ -1,7 +1,12 @@ -import difflogic_cuda +import warnings import torch import numpy as np +try: + import difflogic_cuda +except ImportError: + warnings.warn('failed to import difflogic_cuda. no cuda features will be available', ImportWarning) + class PackBitsTensor: def __init__(self, t: torch.BoolTensor, bit_count=32, device='cuda'): diff --git a/experiments/main.py b/experiments/main.py index b55bde2..bd7e9de 100644 --- a/experiments/main.py +++ b/experiments/main.py @@ -22,6 +22,11 @@ 64: torch.float64 } +IMPL_TO_DEVICE = { + 'cuda': 'cuda', + 'python': 'cpu' +} + def load_dataset(args): validation_loader = None @@ -117,7 +122,13 @@ def num_classes_of_dataset(dataset): def get_model(args): - llkw = dict(grad_factor=args.grad_factor, connections=args.connections) + + llkw = { + 'grad_factor': args.grad_factor, + 'connections': args.connections, + 'implementation': args.implementation, + 'device': IMPL_TO_DEVICE[args.implementation] + } in_dim = input_dim_of_dataset(args.dataset) class_count = num_classes_of_dataset(args.dataset) @@ -158,7 +169,7 @@ def get_model(args): 'total_num_weights': total_num_weights, }) - model = model.to('cuda') + model = model.to(llkw['device']) print(model) if 
args.experiment_id is not None: @@ -181,13 +192,13 @@ def train(model, x, y, loss_fn, optimizer): return loss.item() -def eval(model, loader, mode): +def eval(model, loader, mode, device='cuda'): orig_mode = model.training with torch.no_grad(): model.train(mode=mode) res = np.mean( [ - (model(x.to('cuda').round()).argmax(-1) == y.to('cuda')).to(torch.float32).mean().item() + (model(x.to(device).round()).argmax(-1) == y.to(device)).to(torch.float32).mean().item() for x, y in loader ] ) @@ -195,14 +206,14 @@ def eval(model, loader, mode): return res.item() -def packbits_eval(model, loader): +def packbits_eval(model, loader, device='cuda'): orig_mode = model.training with torch.no_grad(): model.eval() res = np.mean( [ - (model(PackBitsTensor(x.to('cuda').reshape(x.shape[0], -1).round().bool())).argmax(-1) == y.to( - 'cuda')).to(torch.float32).mean().item() + (model(PackBitsTensor(x.to(device).reshape(x.shape[0], -1).round().bool())).argmax(-1) == y.to( + device)).to(torch.float32).mean().item() for x, y in loader ] ) @@ -258,6 +269,8 @@ def packbits_eval(model, loader): print(vars(args)) + device = IMPL_TO_DEVICE[args.implementation] + assert args.num_iterations % args.eval_freq == 0, ( f'iteration count ({args.num_iterations}) has to be divisible by evaluation frequency ({args.eval_freq})' ) @@ -283,23 +296,23 @@ def packbits_eval(model, loader): desc='iteration', total=args.num_iterations, ): - x = x.to(BITS_TO_TORCH_FLOATING_POINT_TYPE[args.training_bit_count]).to('cuda') - y = y.to('cuda') + x = x.to(BITS_TO_TORCH_FLOATING_POINT_TYPE[args.training_bit_count]).to(device) + y = y.to(device) loss = train(model, x, y, loss_fn, optim) if (i+1) % args.eval_freq == 0: if args.extensive_eval: - train_accuracy_train_mode = eval(model, train_loader, mode=True) - valid_accuracy_eval_mode = eval(model, validation_loader, mode=False) - valid_accuracy_train_mode = eval(model, validation_loader, mode=True) + train_accuracy_train_mode = eval(model, train_loader, mode=True, 
device=device) + valid_accuracy_eval_mode = eval(model, validation_loader, mode=False, device=device) + valid_accuracy_train_mode = eval(model, validation_loader, mode=True, device=device) else: train_accuracy_train_mode = -1 valid_accuracy_eval_mode = -1 valid_accuracy_train_mode = -1 - train_accuracy_eval_mode = eval(model, train_loader, mode=False) - test_accuracy_eval_mode = eval(model, test_loader, mode=False) - test_accuracy_train_mode = eval(model, test_loader, mode=True) + train_accuracy_eval_mode = eval(model, train_loader, mode=False, device=device) + test_accuracy_eval_mode = eval(model, test_loader, mode=False, device=device) + test_accuracy_train_mode = eval(model, test_loader, mode=True, device=device) r = { 'train_acc_eval_mode': train_accuracy_eval_mode, @@ -311,9 +324,9 @@ def packbits_eval(model, loader): } if args.packbits_eval: - r['train_acc_eval'] = packbits_eval(model, train_loader) - r['valid_acc_eval'] = packbits_eval(model, train_loader) - r['test_acc_eval'] = packbits_eval(model, test_loader) + r['train_acc_eval'] = packbits_eval(model, train_loader, device=device) + r['valid_acc_eval'] = packbits_eval(model, validation_loader, device=device) + r['test_acc_eval'] = packbits_eval(model, test_loader, device=device) if args.experiment_id is not None: results.store_results(r) diff --git a/setup.py b/setup.py index 89804c3..d0c4a70 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,25 @@ +import os from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension with open('README.md', 'r', encoding='utf-8') as fh: long_description = fh.read() + +# decide from env variable if cuda extension should be built (default is 'true') +build_cuda_ext = os.getenv('DIFFLOGIC_BUILD_CUDA_EXT', 'true').lower() + +if build_cuda_ext == 'true': + from torch.utils.cpp_extension import BuildExtension, CUDAExtension + ext_modules = [ + CUDAExtension('difflogic_cuda', [ + 'difflogic/cuda/difflogic.cpp', + 
'difflogic/cuda/difflogic_kernel.cu', + ], extra_compile_args={'nvcc': ['-lineinfo']}) + ] +else: + ext_modules = [] + + setup( name='difflogic', version='0.1.0', @@ -25,11 +41,8 @@ ], package_dir={'difflogic': 'difflogic'}, packages=['difflogic'], - ext_modules=[CUDAExtension('difflogic_cuda', [ - 'difflogic/cuda/difflogic.cpp', - 'difflogic/cuda/difflogic_kernel.cu', - ], extra_compile_args={'nvcc': ['-lineinfo']})], - cmdclass={'build_ext': BuildExtension}, + ext_modules=ext_modules, + cmdclass={'build_ext': BuildExtension} if ext_modules else {}, # Only if building extensions python_requires='>=3.6', install_requires=[ 'torch>=1.6.0',