diff --git a/INSTALLATION_SUPPORT.md b/INSTALLATION_SUPPORT.md index e9585e7..981237f 100644 --- a/INSTALLATION_SUPPORT.md +++ b/INSTALLATION_SUPPORT.md @@ -5,7 +5,7 @@ The library can be installed with: ```shell pip install difflogic ``` -> ⚠️ Note that `difflogic` requires CUDA, the CUDA Toolkit (for compilation), and `torch>=1.9.0` (matching the CUDA version). +> ⚠️ Note that, by default, `difflogic` requires CUDA, the CUDA Toolkit (for compilation), and `torch>=1.9.0` (matching the CUDA version). CUDA can be disabled by setting a flag, e.g. `export DIFFLOGIC_BUILD_CUDA_EXT=false`, before running `pip install .`. Only the much slower pure Python implementation is available in that case. **It is very important that the installed version of PyTorch was compiled with a CUDA version that is compatible with the CUDA version of the locally installed CUDA Toolkit.** diff --git a/difflogic/difflogic.py b/difflogic/difflogic.py index bd2310c..4983e73 100644 --- a/difflogic/difflogic.py +++ b/difflogic/difflogic.py @@ -1,9 +1,13 @@ +import warnings import torch -import difflogic_cuda import numpy as np from .functional import bin_op_s, get_unique_connections, GradFactor from .packbitstensor import PackBitsTensor +try: + import difflogic_cuda +except ImportError: + warnings.warn('failed to import difflogic_cuda. 
no cuda features will be available', ImportWarning) ######################################################################################################################## @@ -95,9 +99,7 @@ def forward_python(self, x): assert x.shape[-1] == self.in_dim, (x[0].shape[-1], self.in_dim) if self.indices[0].dtype == torch.int64 or self.indices[1].dtype == torch.int64: - print(self.indices[0].dtype, self.indices[1].dtype) self.indices = self.indices[0].long(), self.indices[1].long() - print(self.indices[0].dtype, self.indices[1].dtype) a, b = x[..., self.indices[0]], x[..., self.indices[1]] if self.training: diff --git a/difflogic/packbitstensor.py b/difflogic/packbitstensor.py index bc1afd3..b5553e1 100644 --- a/difflogic/packbitstensor.py +++ b/difflogic/packbitstensor.py @@ -1,7 +1,12 @@ -import difflogic_cuda +import warnings import torch import numpy as np +try: + import difflogic_cuda +except ImportError: + warnings.warn('failed to import difflogic_cuda. no cuda features will be available', ImportWarning) + class PackBitsTensor: def __init__(self, t: torch.BoolTensor, bit_count=32, device='cuda'): diff --git a/experiments/main.py b/experiments/main.py index b55bde2..bd7e9de 100644 --- a/experiments/main.py +++ b/experiments/main.py @@ -22,6 +22,11 @@ 64: torch.float64 } +IMPL_TO_DEVICE = { + 'cuda': 'cuda', + 'python': 'cpu' +} + def load_dataset(args): validation_loader = None @@ -117,7 +122,13 @@ def num_classes_of_dataset(dataset): def get_model(args): - llkw = dict(grad_factor=args.grad_factor, connections=args.connections) + + llkw = { + 'grad_factor': args.grad_factor, + 'connections': args.connections, + 'implementation': args.implementation, + 'device': IMPL_TO_DEVICE[args.implementation] + } in_dim = input_dim_of_dataset(args.dataset) class_count = num_classes_of_dataset(args.dataset) @@ -158,7 +169,7 @@ def get_model(args): 'total_num_weights': total_num_weights, }) - model = model.to('cuda') + model = model.to(llkw['device']) print(model) if 
args.experiment_id is not None: @@ -181,13 +192,13 @@ def train(model, x, y, loss_fn, optimizer): return loss.item() -def eval(model, loader, mode): +def eval(model, loader, mode, device='cuda'): orig_mode = model.training with torch.no_grad(): model.train(mode=mode) res = np.mean( [ - (model(x.to('cuda').round()).argmax(-1) == y.to('cuda')).to(torch.float32).mean().item() + (model(x.to(device).round()).argmax(-1) == y.to(device)).to(torch.float32).mean().item() for x, y in loader ] ) @@ -195,14 +206,14 @@ def eval(model, loader, mode): return res.item() -def packbits_eval(model, loader): +def packbits_eval(model, loader, device='cuda'): orig_mode = model.training with torch.no_grad(): model.eval() res = np.mean( [ - (model(PackBitsTensor(x.to('cuda').reshape(x.shape[0], -1).round().bool())).argmax(-1) == y.to( - 'cuda')).to(torch.float32).mean().item() + (model(PackBitsTensor(x.to(device).reshape(x.shape[0], -1).round().bool())).argmax(-1) == y.to( + device)).to(torch.float32).mean().item() for x, y in loader ] ) @@ -258,6 +269,8 @@ def packbits_eval(model, loader): print(vars(args)) + device = IMPL_TO_DEVICE[args.implementation] + assert args.num_iterations % args.eval_freq == 0, ( f'iteration count ({args.num_iterations}) has to be divisible by evaluation frequency ({args.eval_freq})' ) @@ -283,23 +296,23 @@ def packbits_eval(model, loader): desc='iteration', total=args.num_iterations, ): - x = x.to(BITS_TO_TORCH_FLOATING_POINT_TYPE[args.training_bit_count]).to('cuda') - y = y.to('cuda') + x = x.to(BITS_TO_TORCH_FLOATING_POINT_TYPE[args.training_bit_count]).to(device) + y = y.to(device) loss = train(model, x, y, loss_fn, optim) if (i+1) % args.eval_freq == 0: if args.extensive_eval: - train_accuracy_train_mode = eval(model, train_loader, mode=True) - valid_accuracy_eval_mode = eval(model, validation_loader, mode=False) - valid_accuracy_train_mode = eval(model, validation_loader, mode=True) + train_accuracy_train_mode = eval(model, train_loader, mode=True, 
device=device) + valid_accuracy_eval_mode = eval(model, validation_loader, mode=False, device=device) + valid_accuracy_train_mode = eval(model, validation_loader, mode=True, device=device) else: train_accuracy_train_mode = -1 valid_accuracy_eval_mode = -1 valid_accuracy_train_mode = -1 - train_accuracy_eval_mode = eval(model, train_loader, mode=False) - test_accuracy_eval_mode = eval(model, test_loader, mode=False) - test_accuracy_train_mode = eval(model, test_loader, mode=True) + train_accuracy_eval_mode = eval(model, train_loader, mode=False, device=device) + test_accuracy_eval_mode = eval(model, test_loader, mode=False, device=device) + test_accuracy_train_mode = eval(model, test_loader, mode=True, device=device) r = { 'train_acc_eval_mode': train_accuracy_eval_mode, @@ -311,9 +324,9 @@ def packbits_eval(model, loader): } if args.packbits_eval: - r['train_acc_eval'] = packbits_eval(model, train_loader) - r['valid_acc_eval'] = packbits_eval(model, train_loader) - r['test_acc_eval'] = packbits_eval(model, test_loader) + r['train_acc_eval'] = packbits_eval(model, train_loader, device=device) + r['valid_acc_eval'] = packbits_eval(model, validation_loader, device=device) + r['test_acc_eval'] = packbits_eval(model, test_loader, device=device) if args.experiment_id is not None: results.store_results(r) diff --git a/setup.py b/setup.py index 89804c3..d0c4a70 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,25 @@ +import os from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension with open('README.md', 'r', encoding='utf-8') as fh: long_description = fh.read() + +# decide from env variable if cuda extension should be built (default is 'true') +build_cuda_ext = os.getenv('DIFFLOGIC_BUILD_CUDA_EXT', 'true').lower() + +if build_cuda_ext == 'true': + from torch.utils.cpp_extension import BuildExtension, CUDAExtension + ext_modules = [ + CUDAExtension('difflogic_cuda', [ + 'difflogic/cuda/difflogic.cpp', + 
'difflogic/cuda/difflogic_kernel.cu', + ], extra_compile_args={'nvcc': ['-lineinfo']}) + ] +else: + ext_modules = [] + + setup( name='difflogic', version='0.1.0', @@ -25,11 +41,8 @@ ], package_dir={'difflogic': 'difflogic'}, packages=['difflogic'], - ext_modules=[CUDAExtension('difflogic_cuda', [ - 'difflogic/cuda/difflogic.cpp', - 'difflogic/cuda/difflogic_kernel.cu', - ], extra_compile_args={'nvcc': ['-lineinfo']})], - cmdclass={'build_ext': BuildExtension}, + ext_modules=ext_modules, + cmdclass={'build_ext': BuildExtension} if ext_modules else {}, # Only if building extensions python_requires='>=3.6', install_requires=[ 'torch>=1.6.0',