diff --git a/.gitignore b/.gitignore
index 5d34afb..6fa4e72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,44 @@ sensepy/__pycache__
 *.txt
 *.png
 *.pdf
+*.pyo
+*.pyd
+*.pdb
+*.egg
+*.egg-info
+*.whl
+*.manifest
+*.spec
+*.log
+*.pot
+*.mo
+*.so
+*.dll
+*.dylib
+*.a
+*.lib
+*.swp
+*.swo
+*.tmp
+*.bak
+*.old
+*.orig
+*.rej
+*.sublime-project
+*.sublime-workspace
+*.project
+*.pydevproject
+*.idea/
+.vscode/
+__pycache__/
+*.coverage
+.coverage.*
+.cache
+.tox/
+.nox/
+.pytest_cache/
+htmlcov/
+dist/
+build/
+site/
+docs/_build/
diff --git a/setup.py b/setup.py
index 07cd061..8e0e9fe 100755
--- a/setup.py
+++ b/setup.py
@@ -1,29 +1,35 @@
 from setuptools import setup
 
 packages = [
-    'numpy',
-    'scipy',
-    'matplotlib',
-    'sklearn',
-    'tensorflow',
-    'cvxpy',
-    'torch',
-    'pymanopt',
-    'pandas',
-    'mosek',
-    'quadprog',
-    'cvxpylayers',
-    'functorch',
-    'autograd_minimize'
+    "numpy",
+    "scipy",
+    "matplotlib",
+    "scikit-learn",
+    "tensorflow",
+    "cvxpy",
+    "torch",
+    "pymanopt",
+    "pandas",
+    "mosek",
+    "quadprog",
+    "cvxpylayers",
+    "autograd_minimize",
+    "torch-cluster",
+    "nmf-torch",
+    "fast-pytorch-kmeans",
+    "tqdm",
 ]
 #
-setup(name='stpy',
-      version='0.0.2',
-      description='Stochastic Process Library for Python',
-      url='',
-      author='Mojmir Mutny',
-      author_email='mojmir.mutny@inf.ethz.ch',
-      license='custom ',
-      packages=['stpy'],
-	    zip_safe=False,
-      install_requires=packages)
+setup(
+    name="stpy",
+    version="0.0.2",
+    description="Stochastic Process Library for Python",
+    url="",
+    author="Mojmir Mutny",
+    author_email="mojmir.mutny@inf.ethz.ch",
+    license="custom ",
+    packages=["stpy"],
+    zip_safe=False,
+    install_requires=packages,
+    setup_requires=["torch", "Cython"],
+)
diff --git a/stpy.egg-info/PKG-INFO b/stpy.egg-info/PKG-INFO
deleted file mode 100644
index b9cb176..0000000
--- a/stpy.egg-info/PKG-INFO
+++ /dev/null
@@ -1,8 +0,0 @@
-Metadata-Version: 2.1
-Name: stpy
-Version: 0.0.2
-Summary: Stochastic Process Library for Python
-Home-page: 
-Author: Mojmir Mutny
-Author-email: mojmir.mutny@inf.ethz.ch
-License: custom 
diff --git a/stpy.egg-info/not-zip-safe b/stpy.egg-info/not-zip-safe
deleted file mode 100644
index 8b13789..0000000
--- a/stpy.egg-info/not-zip-safe
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/stpy/approx_inference/expected-propagation.py b/stpy/approx_inference/expected-propagation.py
index 44b6b0e..fbb1132 100644
--- a/stpy/approx_inference/expected-propagation.py
+++ b/stpy/approx_inference/expected-propagation.py
@@ -3,63 +3,67 @@
 from scipy.stats import multivariate_normal
 
 
-class ExpectedPropagationQuadratic():
+class ExpectedPropagationQuadratic:
 
-	def __init__(self, mu_prior, Sigma_prior, likelihood_single, data):
+    def __init__(self, mu_prior, Sigma_prior, likelihood_single, data):
 
-		# takes two arguments param, theta
-		self.likelihood_single = likelihood_single
+        # takes two arguments param, theta
+        self.likelihood_single = likelihood_single
 
-		# prior information
-		self.mu_prior = mu_prior
-		self.Sigma_prior = Sigma_prior
+        # prior information
+        self.mu_prior = mu_prior
+        self.Sigma_prior = Sigma_prior
 
-		self.d = mu_prior.size()[1]
+        self.d = mu_prior.size()[1]
 
-		self.n = len(self.data)
-		self.data = data
+        self.data = data
+        self.n = len(self.data)
 
-		self.approx = []
-		for i in range(self.n):
-			mu = torch.zeros(size=(1, self.d)).double()
-			Sigma = torch.eye(size=(self.d, self.d)).double()
-			self.approx.append((mu, Sigma))
+        self.approx = []
+        for _ in range(self.n):
+            mu = torch.zeros(size=(1, self.d)).double()
+            Sigma = torch.eye(self.d).double()  # torch.eye takes n, not size=
+            self.approx.append((mu, Sigma))
 
-	def marginalized_version(self, j):
-		mu = torch.zeros(size=(1, self.d)).double()
-		Sigma = torch.zeros(size=(self.d, self.d)).double()
+    def marginalized_version(self, j):
+        mu = torch.zeros(size=(1, self.d)).double()
+        Sigma = torch.zeros(size=(self.d, self.d)).double()
 
-		for i in range(self.n):
-			if i != j:
-				Sigma_elem = self.approx[j][0]
-				mu_elem = self.approx[j][1]
-				Sigma_elem_inv = torch.inverse(Sigma_elem)
-				mu += Sigma_elem_inv @ mu_elem
-				Sigma += Sigma_elem_inv
-		Sigma = torch.inverse(Sigma)
-		mu = Sigma @ mu
-		return (mu, Sigma)
+        for i in range(self.n):
+            if i != j:
+                # sites are stored as (mu, Sigma); accumulate every site except j
+                mu_elem, Sigma_elem = self.approx[i]
+                Sigma_elem_inv = torch.inverse(Sigma_elem)
+                mu += mu_elem @ Sigma_elem_inv
+                Sigma += Sigma_elem_inv
+        Sigma = torch.inverse(Sigma)
+        mu = mu @ Sigma
+        return (mu, Sigma)
 
-	def match_likelihood(self, j):
-		mu, Sigma = self.marginalized_version(j)
-		lik = lambda x: self.likelihood_single(torch.from_numpy(x), self.data[j]).numpy()
-		prob = lambda x: multivariate_normal.pdf(x, mean=mu.view(-1).reshape.numpy(), cov=Sigma.numpy())
-		first_moment = integrate.quad(lambda x: x * lik(x) * prob(x), 0.0, 10e10)
-		second_moment = integrate.quad(lambda x: x * x * lik(x) * prob(x), 0.0, 10e10)
+    def match_likelihood(self, j):
+        mu, Sigma = self.marginalized_version(j)
+        lik = lambda x: self.likelihood_single(
+            torch.from_numpy(x), self.data[j]
+        ).numpy()
+        prob = lambda x: multivariate_normal.pdf(
+            x, mean=mu.view(-1).reshape.numpy(), cov=Sigma.numpy()
+        )
+        first_moment = integrate.quad(lambda x: x * lik(x) * prob(x), 0.0, 10e10)
+        second_moment = integrate.quad(lambda x: x * x * lik(x) * prob(x), 0.0, 10e10)
 
-		self.approx[j][0] = first_moment
-		self.approx[j][1] = second_moment
+        self.approx[j][0] = first_moment
+        self.approx[j][1] = second_moment
 
-		return (first_moment, second_moment - first_moment ** 2)
+        return (first_moment, second_moment - first_moment**2)
 
-	def finalize(self):
-		pass
+    def finalize(self):
+        pass
 
-	def fit_gp(self, iterations='auto'):
-		if iterations == 'auto':
-			T = 100
-		for i in range(T):
-			for j in range(self.n):
-				self.match_likelihood(j)
-		mu, Sigma = self.finalize()
-		return mu, Sigma
+    def fit_gp(self, iterations="auto"):
+        """Run EP sweeps over every site; "auto" means 100 sweeps."""
+        T = 100 if iterations == "auto" else int(iterations)
+        for _ in range(T):
+            for j in range(self.n):
+                self.match_likelihood(j)
+        mu, Sigma = self.finalize()
+        return mu, Sigma
diff --git a/stpy/approx_inference/hmc.py b/stpy/approx_inference/hmc.py
index 879fd17..1e6ce13 100644
--- a/stpy/approx_inference/hmc.py
+++ b/stpy/approx_inference/hmc.py
@@ -1,5 +1,7 @@
-params_hmc = hamiltorch.sample(log_prob_func=log_prob_func,
-							   params_init=params_init,
-							   num_samples=num_samples,
-							   step_size=step_size,
-							   num_steps_per_sample=num_steps_per_sample)
+params_hmc = hamiltorch.sample(
+    log_prob_func=log_prob_func,
+    params_init=params_init,
+    num_samples=num_samples,
+    step_size=step_size,
+    num_steps_per_sample=num_steps_per_sample,
+)
diff --git a/stpy/approx_inference/langevin.py b/stpy/approx_inference/langevin.py
index 3ed2dc2..21e430a 100644
--- a/stpy/approx_inference/langevin.py
+++ b/stpy/approx_inference/langevin.py
@@ -3,25 +3,29 @@
 import torch
 
 
-class LangevinSampler():
+class LangevinSampler:
 
-	def __init__(self, verbose=False):
-		self.verbose = verbose
-		pass
+    def __init__(self, verbose=False):
+        """Remember whether `sample` should print every iterate."""
+        self.verbose = verbose
 
-	def calculate(self, HessianF, theta0):
-		W = HessianF(theta0)
-		L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-3))
-		return L
+    def calculate(self, HessianF, theta0):
+        """Return the largest-magnitude eigenvalue of HessianF(theta0)."""
+        W = HessianF(theta0)
+        # eigsh returns a length-1 array for k=1; index it instead of float(array)
+        top = scipy.sparse.linalg.eigsh(
+            W.numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-3
+        )
+        return float(top[0])
 
-	def sample(self, F, nablaF, HessianF, theta0, steps=100):
-		L = self.calculate(HessianF, theta0)
-		eta = 0.5 / (L + 1)
-		m = theta0.size()[0]
-		theta = theta0
-		for k in range(steps):
-			w = torch.randn(size=(m, 1)).double()
-			theta = theta - eta * nablaF(theta) + np.sqrt(2 * eta) * w
-			if self.verbose == True:
-				print("Iter:", k, theta.T)
-		return theta
+    def sample(self, F, nablaF, HessianF, theta0, steps=100):
+        """Run `steps` Langevin updates from theta0; `F` is accepted but unused."""
+        eta = 0.5 / (self.calculate(HessianF, theta0) + 1)
+        m = theta0.size()[0]
+        theta = theta0
+        for k in range(steps):
+            w = torch.randn(size=(m, 1)).double()
+            theta = theta - eta * nablaF(theta) + np.sqrt(2 * eta) * w
+            if self.verbose:
+                print("Iter:", k, theta.T)
+        return theta
diff --git a/stpy/approx_inference/proximal_langevin.py b/stpy/approx_inference/proximal_langevin.py
index f1da7b6..fec409d 100644
--- a/stpy/approx_inference/proximal_langevin.py
+++ b/stpy/approx_inference/proximal_langevin.py
@@ -3,18 +3,23 @@
 
 
 def ProximalLangevin(LangevinSampler):
-	def sample(self, F, nablaF, HessianF, theta0, prox, steps=100):
-		L = self.calculate(HessianF, theta0)
-		eta = 0.5 / (L + 1)
-		m = theta0.size()[0]
-		theta = theta0
-		for k in range(steps):
-			w = torch.randn(size=(m, 1)).double()
-			theta = (1 - eta) * theta - eta * nablaF(theta) + eta * prox(theta) + np.sqrt(2 * eta) * w
-			if self.verbose == True:
-				print("Iter:", k, theta.T)
-		return prox(theta)
+    def sample(self, F, nablaF, HessianF, theta0, prox, steps=100):
+        # NOTE(review): enclosing `def ProximalLangevin` was likely meant as `class`.
+        eta = 0.5 / (self.calculate(HessianF, theta0) + 1)
+        m = theta0.size()[0]
+        theta = theta0
+        for k in range(steps):
+            w = torch.randn(size=(m, 1)).double()
+            theta = (
+                (1 - eta) * theta
+                - eta * nablaF(theta)
+                + eta * prox(theta)
+                + np.sqrt(2 * eta) * w
+            )
+            if self.verbose:
+                print("Iter:", k, theta.T)
+        return prox(theta)
 
 
 def MirrorLangevin(LangvinSampler):
-	pass
+    pass
diff --git a/stpy/approx_inference/sampling_helper.py b/stpy/approx_inference/sampling_helper.py
index 8976e70..960a10e 100644
--- a/stpy/approx_inference/sampling_helper.py
+++ b/stpy/approx_inference/sampling_helper.py
@@ -4,53 +4,53 @@
 
 
 def get_increment(eta, steps, f, w0, path=False):
-	"""
+    """
 
-	:param eta: terminal time
-	:param steps: number of steps
-	:param f: the operator
-	:param w0: initial point
-	:return:
-	"""
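+    :param eta: terminal time
+    :param steps: number of steps; each step has size tau = eta / steps
+    :param f: the operator, called as f(w, n) with the current point and fresh noise
+    :param w0: initial point; also fixes the shape of the noise draws
+    :return: the last iterate, or the list of all iterates when path is True
+    """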
 
-	tau = eta / steps
-	w = w0
-	sequence = []
+    tau = eta / steps
+    w = w0
+    sequence = []
 
-	for i in range(steps):
+    for i in range(steps):
 
-		n = torch.randn(size=w0.size()).double()
-		w = w + np.sqrt(2 * tau) * f(w, n)
-		if path:
-			sequence.append(w)
+        n = torch.randn(size=w0.size()).double()
+        w = w + np.sqrt(2 * tau) * f(w, n)
+        if path:
+            sequence.append(w)
 
-	if path:
-		return sequence
-	else:
-		return w
+    if path:
+        return sequence
+    else:
+        return w
 
 
 if __name__ == "__main__":
 
-	f = lambda w: torch.diag(1. / torch.abs(w.view(-1)))
-	d = 1
-	w0 = torch.zeros(size=(d, 1)).double() + 2
-	step = 100
-	path = get_increment(2, step, f, w0, path=True)
-	# plt.plot(path)
-
-	i = 0
-	colors = ['k', 'r', 'b', 'orange', 'brown', 'purple']
-	for steps in [5, 10, 20, 100, 200, 500]:
-
-		repeats = 100
-		ws = []
-		for _ in range(repeats):
-			path = get_increment(2, steps, f, w0, path=True)
-			xtest = torch.linspace(0, 2, steps)
-			plt.plot(xtest, path, color=colors[i])
-		i = i + 1
-	#	plt.hist(np.array(ws), label = str(step))
-
-	plt.legend()
-	plt.show()
+    f = lambda w, n: torch.diag(1.0 / torch.abs(w.view(-1))) @ n
+    d = 1
+    w0 = torch.zeros(size=(d, 1)).double() + 2
+    step = 100
+    path = get_increment(2, step, f, w0, path=True)
+    # plt.plot(path)
+
+    i = 0
+    colors = ["k", "r", "b", "orange", "brown", "purple"]
+    for steps in [5, 10, 20, 100, 200, 500]:
+
+        repeats = 100
+        ws = []
+        for _ in range(repeats):
+            path = get_increment(2, steps, f, w0, path=True)
+            xtest = torch.linspace(0, 2, steps)
+            plt.plot(xtest, path, color=colors[i])
+        i = i + 1
+    # 	plt.hist(np.array(ws), label = str(step))
+
+    plt.legend()
+    plt.show()
diff --git a/stpy/approx_inference/variational_mf.py b/stpy/approx_inference/variational_mf.py
index 5fff78d..9eae29c 100644
--- a/stpy/approx_inference/variational_mf.py
+++ b/stpy/approx_inference/variational_mf.py
@@ -16,9 +16,10 @@
 You should have received a copy of the GNU General Public License
 along with SGCP_Inference.  If not, see <http://www.gnu.org/licenses/>.
 """
-__author__ = 'Christian Donner'
-__email__ = 'christian.donner(at)bccn-berlin.de'
-__license__ = 'gpl-3.0'
+
+__author__ = "Christian Donner"
+__email__ = "christian.donner(at)bccn-berlin.de"
+__license__ = "gpl-3.0"
 
 import time
 
@@ -28,673 +29,752 @@
 from scipy.special import digamma, gammaln
 
 
-class VMF_SGCP():
-
-	def __init__(self, S_borders, X, cov_params, num_inducing_points,
-				 lmbda_star=None, conv_crit=1e-4,
-				 num_integration_points=1000, output=False,
-				 update_hyperparams=True,
-				 noise=1e-4, epsilon=5e-2):
-		""" Class initialisation for variational mean field inference for
-		sigmoidal Gaussian Cox process.
-
-		:param S_borders: numpy.ndarray [D x 2]
-			Limits of the region of interest.
-		:param X: numpy.ndarray [num_points x D]
-			Positions of the observations.
-		:param cov_params: numpy.ndarray [D + 1]
-			Hyperparameters of the covariance functions. First is amplitude,
-			and the others the length scale for each dimension.
-		:param num_inducing_points: int
-			Number of inducing points (Should be a power of dimensions)
-		:param lmbda_star: float
-			Maximal intensity. If None it is initialized as twice the mean
-			observation rate for a homogeneous process. (Default=None)
-		:param conv_crit:
-			Convergence criterion, when algorithm should stop. (Default=1e-4)
-		:param num_integration_points: int
-			Number of points that should be used for Monte Carlo integration.
-			(Default = 1000)
-		:param output: bool
-			Prints info after each optimisation step. (Default=False)
-		:param update_hyperparams: bool
-			Whether the hyperparameters are updated (by Adam) or not.  (
-			Default=False)
-		:param noise: float
-			Noise added to the diagonal of the covariance matrix (should be
-			small). (Default=1e-4)
-		param epsilon: float
-			Step size for Adam in the hyperparameter update. (Default=5e-2)
-		"""
-
-		self.S_borders = S_borders
-		self.S = S_borders[:, 1] - S_borders[:, 0]
-		self.R = numpy.prod(self.S)
-		self.D = S_borders.shape[0]
-		self.noise = noise
-		self.cov_params = cov_params
-		self.num_integration_points = num_integration_points
-		self.num_inducing_points = num_inducing_points  # must be power of D
-		self.X = X
-
-		self.place_inducing_points()
-		self.mu_g_s = numpy.zeros(self.induced_points.shape[0])
-		self.Sigma_g_s = numpy.identity(self.induced_points.shape[0])
-		self.Sigma_g_s_inv = numpy.identity(self.induced_points.shape[0])
-		self.Ks = self.cov_func(self.induced_points, self.induced_points)
-		L = numpy.linalg.cholesky(self.Ks + self.noise * numpy.eye(
-			self.Ks.shape[0]))
-		L_inv = solve_triangular(L, numpy.eye(L.shape[0]), lower=True,
-								 check_finite=False)
-		self.Ks_inv = L_inv.T.dot(L_inv)
-		self.logdet_Ks = 2. * numpy.sum(numpy.log(L.diagonal()))
-
-		self.place_integration_points()
-		self.ks_X = self.cov_func(self.induced_points, self.X)
-		self.LB_list = []
-		self.times = []
-
-		self.kappa_X = self.Ks_inv.dot(self.ks_X)
-		self.kappa_int_points = self.Ks_inv.dot(self.ks_int_points)
-		self.mu_g_X, var_g_X = self.predictive_posterior_GP(self.X, 'X')
-		self.mu_g2_X = var_g_X + self.mu_g_X ** 2
-		self.mu_g_int_points, var_g_int_points = self.predictive_posterior_GP(
-			self.integration_points, 'int_points')
-		self.mu_g2_int_points = var_g_int_points + self.mu_g_int_points ** 2
-		self.epsilon = epsilon
-		self.alpha0 = 4.
-		self.beta0 = 2. / (float(self.X.shape[0] / self.R))
-		if lmbda_star is None:
-			self.lmbda_star_q1 = self.alpha0 / self.beta0
-			self.log_lmbda_star_q1 = digamma(self.alpha0) - numpy.log(self.beta0)
-		else:
-			self.lmbda_star_q1 = lmbda_star
-			self.log_lmbda_star_q1 = numpy.log(lmbda_star)
-		self.alpha_q1 = self.alpha0
-		self.beta_q1 = self.beta0
-		self.convergence = numpy.inf
-		self.conv_crit = conv_crit
-		self.num_iterations = 0
-		self.output = output
-		self.update_hyperparams = update_hyperparams
-
-		# ADAM parameters
-		self.beta1_adam = .9
-		self.beta2_adam = .99
-		self.epsilon_adam = 1e-5
-		self.m_hyper_adam = numpy.zeros(self.D + 1)
-		self.v_hyper_adam = numpy.zeros(self.D + 1)
-		self.m_bm_adam = numpy.zeros(self.D)
-		self.v_bm_adam = numpy.zeros(self.D)
-
-	def place_inducing_points(self):
-		""" Places the induced points for sparse GP.
-		"""
-
-		num_per_dim = int(numpy.ceil(self.num_inducing_points ** (1. / self.D)))
-		induced_grid = numpy.empty([num_per_dim, self.D])
-		for di in range(self.D):
-			dist_between_points = self.S[di] / num_per_dim
-			induced_grid[:, di] = numpy.arange(.5 * dist_between_points,
-											   self.S[di],
-											   dist_between_points)
-
-		self.induced_points = numpy.meshgrid(*induced_grid.T.tolist())
-		self.induced_points = numpy.array(self.induced_points).reshape([
-			self.D, -1]).T
-
-	def run(self):
-		""" Fitting function for the variational mean-field algorithm.
-		"""
-
-		# Initialisation
-		self.times.append(time.perf_counter())
-		self.calculate_PG_expectations()
-		self.calculate_posterior_intensity()
-		converged = False
-		while not converged:
-			self.num_iterations += 1
-			# Update second factor q2
-			self.calculate_postrior_GP()
-			self.update_predictive_posterior()
-			self.update_max_intensity()
-			# Update first factor q1
-			self.calculate_PG_expectations()
-			self.calculate_posterior_intensity()
-			# Update hyperparameters
-			if self.update_hyperparams:
-				self.update_hyperparameters()
-			# Calculate lower bound
-			self.LB_list.append(self.calculate_lower_bound())
-			# Check for convergence
-			if self.num_iterations > 1:
-				self.convergence = numpy.absolute(self.LB_list[-1] -
-												  self.LB_list[
-													  -2]) / numpy.amax([numpy.abs(self.LB_list[-1]),
-																		 numpy.abs(self.LB_list[-2]), 1])
-			converged = self.convergence < self.conv_crit
-			self.times.append(time.perf_counter())
-			if self.output:
-				self.print_info()
-
-	def print_info(self):
-		""" Functions to print info, while iteratively updating posterior.
-		"""
-		print((' +-----------------+ ' +
-			   '\n |  Iteration %4d |' +
-			   '\n |  Conv. = %.4f |' +
-			   '\n +-----------------+') % (self.num_iterations,
-											self.convergence_inner))
-
-	def place_integration_points(self):
-		""" Places the integration points for Monte Carlo integration and
-		updates all related kernels.
-		"""
-
-		self.integration_points = numpy.random.rand(
-			self.num_integration_points, self.D)
-		self.integration_points *= self.S[numpy.newaxis]
-		self.ks_int_points = self.cov_func(self.induced_points,
-										   self.integration_points)
-		self.kappa_int_points = self.Ks_inv.dot(self.ks_int_points)
-
-	def calculate_posterior_intensity(self):
-		""" The rate of the posterior process is updated.
-		"""
-
-		self.lmbda_q2 = .5 * numpy.exp(
-			-.5 * self.mu_g_int_points + self.log_lmbda_star_q1) / \
-						numpy.cosh(.5 * self.c_int_points)
-
-	def calculate_PG_expectations(self):
-		""" The Polya-Gamma posterior is updated.
-		"""
-
-		self.c_X = numpy.sqrt(self.mu_g2_X)
-		self.mu_omega_X = .5 / self.c_X * numpy.tanh(
-			.5 * self.c_X)
-		self.c_int_points = numpy.sqrt(self.mu_g2_int_points)
-		self.mu_omega_int_points = .5 / self.c_int_points \
-								   * numpy.tanh(.5 * self.c_int_points)
-
-	def calculate_predictive_posterior_intensity(self, X_prime):
-		""" Calculates the posterior intensity at X_prime for the latent
-		Poisson process. (Not the intensity of the observed Poisson process!!!)
-
-		:param X_prime: numpy.ndarray [num_points x D]
-			Position of points, that should be evaluated.
-
-		:return: numpy.ndarray [num_points]
-			Posterior intensity.
-		"""
-		mu_g, var_g = self.predictive_posterior_GP(X_prime)
-		mu_g = mu_g
-		mu_g2 = var_g + mu_g ** 2
-		c = numpy.sqrt(mu_g2)
-		pred_lmbda_q2 = .5 * numpy.exp(
-			-.5 * mu_g + self.log_lmbda_star_q1) / \
-						numpy.cosh(.5 * c)
-		return pred_lmbda_q2
-
-	def calculate_postrior_GP(self):
-		""" The new GP at the inducing points is calculated.
-		"""
-
-		A_int_points = self.lmbda_q2 * self.mu_omega_int_points
-		A_X = self.mu_omega_X
-		kAk = self.kappa_X.dot(A_X[:, numpy.newaxis] * self.kappa_X.T) + \
-			  self.kappa_int_points.dot(A_int_points[:, numpy.newaxis] *
-										self.kappa_int_points.T) \
-			  / self.num_integration_points * self.R
-		self.Sigma_g_s_inv = kAk + self.Ks_inv
-		L_inv = numpy.linalg.cholesky(self.Sigma_g_s_inv + self.noise *
-									  numpy.eye(
-										  self.Sigma_g_s_inv.shape[0]))
-		L = solve_triangular(L_inv, numpy.eye(L_inv.shape[0]), lower=True,
-							 check_finite=False)
-		self.Sigma_g_s = L.T.dot(L)
-		self.logdet_Sigma_g_s = 2 * numpy.sum(numpy.log(L.diagonal()))
-		b_int_points = -.5 * self.lmbda_q2
-		b_X = .5 * numpy.ones(self.X.shape[0])
-		kb = self.ks_X.dot(b_X) + self.ks_int_points.dot(b_int_points) / \
-			 self.num_integration_points * self.R
-		self.mu_g_s = self.Sigma_g_s.dot(kb.dot(self.Ks_inv))
-
-	def predictive_posterior_GP(self, x_prime, points=None):
-		""" Computes the predictive posterior for given points
-
-		:param x_prime: numpy.ndarray [num_points x D]
-			Points, which should be predicted for.
-		:param points: str
-			If 'int_points' or 'X' posterior for integration points or
-			observation points is calculated, respectively. (Default=None)
-		:returns:
-			numpy.ndarray [num_points]: mean of predictive posterior
-			numpy.ndarray [num_points]: variance of predictive posterior
-		"""
-		if points is None:
-			ks_x_prime = self.cov_func(self.induced_points, x_prime)
-			kappa = self.Ks_inv.dot(ks_x_prime)
-		elif points is 'int_points':
-			ks_x_prime = self.ks_int_points
-			kappa = self.kappa_int_points
-		elif points is 'X':
-			ks_x_prime = self.ks_X
-			kappa = self.kappa_X
-
-		mu_g_x_prime = kappa.T.dot(self.mu_g_s)
-		K_xx = self.cov_func(x_prime, x_prime, only_diagonal=True)
-		var_g_x_prime = K_xx - numpy.sum(kappa * (ks_x_prime - kappa.T.dot(
-			self.Sigma_g_s).T), axis=0)
-		return mu_g_x_prime, var_g_x_prime
-
-	def cov_func(self, x, x_prime, only_diagonal=False):
-		""" Computes the covariance functions between x and x_prime.
-
-		:param x: numpy.ndarray [num_points x D]
-			Contains coordinates for points of x
-		:param x_prime: numpy.ndarray [num_points_prime x D]
-			Contains coordinates for points of x_prime
-		:param only_diagonal: bool
-			If true only diagonal is computed (Works only if x and x_prime
-			are the same, Default=False)
-
-		:return: numpy.ndarray [num_points x num_points_prime]
-		([num_points_prime] if only diagonal)
-			Kernel matrix.
-		"""
-
-		theta_1, theta_2 = self.cov_params[0], self.cov_params[1]
-		if only_diagonal:
-			return theta_1 * numpy.ones(x.shape[0])
-
-		else:
-			x_theta2 = x / theta_2
-			xprime_theta2 = x_prime / theta_2
-			h = numpy.sum(x_theta2 ** 2, axis=1)[:, None] - 2. * numpy.dot(
-				x_theta2, xprime_theta2.T) + \
-				numpy.sum(xprime_theta2 ** 2, axis=1)[None]
-			return theta_1 * numpy.exp(-.5 * h)
-
-	def calculate_lower_bound(self):
-		""" Calculates the variational lower bound for current posterior.
-
-		:return: float
-			Variational lower bound.
-		"""
-
-		Sigma_s_mugmug = self.Sigma_g_s + numpy.outer(self.mu_g_s, self.mu_g_s)
-		f_int_points = .5 * (- self.mu_g_int_points -
-							 self.mu_g2_int_points * self.mu_omega_int_points) - \
-					   numpy.log(2)
-		integrand = f_int_points - \
-					numpy.log(self.lmbda_q2 * numpy.cosh(.5 * self.c_int_points)) \
-					+ self.log_lmbda_star_q1 + \
-					.5 * self.c_int_points ** 2 * self.mu_omega_int_points + 1.
-		f_X = .5 * (self.mu_g_X - self.mu_g2_X * self.mu_omega_X) - \
-			  numpy.log(2)
-		summand = f_X + self.log_lmbda_star_q1 - numpy.log(numpy.cosh(
-			.5 * self.c_X)) + .5 * self.c_X ** 2 * self.mu_omega_X
-
-		L = integrand.dot(self.lmbda_q2) / self.num_integration_points * self.R
-		L -= self.lmbda_star_q1 * self.R
-		L += numpy.sum(summand)
-		L -= .5 * numpy.trace(self.Ks_inv.dot(Sigma_s_mugmug))
-		L -= .5 * self.logdet_Ks
-		L += .5 * self.logdet_Sigma_g_s + .5 * self.num_inducing_points
-		L += self.alpha0 * numpy.log(self.beta0) - gammaln(self.alpha0) + \
-			 (self.alpha0 - 1) * self.log_lmbda_star_q1 - \
-			 self.beta0 * self.lmbda_star_q1
-		L += self.alpha_q1 - numpy.log(self.beta_q1) + gammaln(self.alpha_q1) \
-			 + (1. - self.alpha_q1) * digamma(self.alpha_q1)
-
-		return L
-
-	def update_max_intensity(self):
-		""" Updates the posterior for the maximal intensity.
-		"""
-		self.alpha_q1 = self.X.shape[0] + numpy.sum(
-			self.lmbda_q2) / self.num_integration_points * self.R + self.alpha0
-		self.beta_q1 = self.beta0 + self.R
-		self.lmbda_star_q1 = self.alpha_q1 / self.beta_q1
-		self.log_lmbda_star_q1 = digamma(self.alpha_q1) - \
-								 numpy.log(self.beta_q1)
-
-	def update_kernels(self):
-		""" Updates all kernels (for inducing, observed and integration points).
-		"""
-		self.ks_int_points = self.cov_func(self.induced_points,
-										   self.integration_points)
-		self.ks_X = self.cov_func(self.induced_points, self.X)
-		self.Ks = self.cov_func(self.induced_points, self.induced_points)
-		L = numpy.linalg.cholesky(self.Ks + self.noise * numpy.eye(
-			self.Ks.shape[0]))
-		L_inv = solve_triangular(L, numpy.eye(L.shape[0]), lower=True,
-								 check_finite=False)
-		self.Ks_inv = L_inv.T.dot(L_inv)
-		self.logdet_Ks = 2. * numpy.sum(numpy.log(L.diagonal()))
-		self.kappa_X = self.Ks_inv.dot(self.ks_X)
-		self.kappa_int_points = self.Ks_inv.dot(self.ks_int_points)
-
-	def calculate_hyperparam_derivative(self):
-		""" Calculates the derivative of the hyperparameters.
-
-		:return: numpy.ndarray [D + 1]
-			Derivatives of hyperparameters.
-		"""
-
-		theta1, theta2 = self.cov_params[0], numpy.copy(
-			self.cov_params[1])
-		Sigma_s_mugmug = self.Sigma_g_s + numpy.outer(self.mu_g_s, self.mu_g_s)
-		dks_X = numpy.empty([self.ks_X.shape[0], self.ks_X.shape[1],
-							 1 + theta2.shape[0]])
-		dks_int_points = numpy.empty(
-			[self.ks_int_points.shape[0], self.ks_int_points.shape[1],
-			 1 + theta2.shape[0]])
-		dKs = numpy.empty([self.Ks.shape[0], self.Ks.shape[1],
-						   1 + theta2.shape[0]])
-		dKss = numpy.zeros([1 + theta2.shape[0]])
-		dKss[0] = 1.
-
-		# kernel derivatives wrt theta1
-		dks_X[:, :, 0] = self.ks_X / theta1
-		dks_int_points[:, :, 0] = self.ks_int_points / theta1
-		dKs[:, :, 0] = self.Ks / theta1
-		# kernel derivatives wrt theta2
-		dx = numpy.subtract(self.induced_points[:, None],
-							self.X[None])
-		dks_X[:, :, 1:] = self.ks_X[:, :, None] * (dx ** 2) / \
-						  (theta2[None, None] ** 3)
-		dx = numpy.subtract(self.induced_points[:, None],
-							self.integration_points[None])
-		dks_int_points[:, :, 1:] = self.ks_int_points[:, :, None] * \
-								   (dx ** 2) / (theta2[None, None] ** 3)
-		dx = numpy.subtract(self.induced_points[:, None],
-							self.induced_points[None])
-		dKs[:, :, 1:] = self.Ks[:, :, None] * (dx ** 2) / (
-				theta2[None, None] ** 3)
-		dL_dtheta = numpy.empty(1 + len(theta2))
-
-		for itheta in range(1 + len(theta2)):
-			dKs_inv = -self.Ks_inv.dot(dKs[:, :, itheta].dot(self.Ks_inv))
-
-			dkappa_X = self.Ks_inv.dot(dks_X[:, :, itheta]) + dKs_inv.dot(
-				self.ks_X)
-			dkappa_int_points = self.Ks_inv.dot(
-				dks_int_points[:, :, itheta]) + dKs_inv.dot(
-				self.ks_int_points)
-
-			dKtilde_X = dKss[itheta] - numpy.sum(
-				dks_X[:, :, itheta] * self.kappa_X, axis=0) - numpy.sum(
-				self.ks_X * dkappa_X, axis=0)
-			dKtilde_int_points = dKss[itheta] - numpy.sum(
-				dks_int_points[:, :, itheta] * self.kappa_int_points,
-				axis=0) - numpy.sum(self.ks_int_points * dkappa_int_points,
-									axis=0)
-
-			dg1_X = self.mu_g_s.dot(dkappa_X)
-			dg1_int_points = self.mu_g_s.dot(dkappa_int_points)
-
-			dg2_X = (dKtilde_X + 2. * numpy.sum(
-				self.kappa_X * Sigma_s_mugmug.dot(dkappa_X),
-				axis=0)) * self.mu_omega_X
-			dg2_int_points = (dKtilde_int_points + 2. * numpy.sum(
-				self.kappa_int_points * Sigma_s_mugmug.dot(dkappa_int_points),
-				axis=0)) * self.mu_omega_int_points
-
-			dL_dtheta[itheta] = .5 * (numpy.sum(dg1_X) - numpy.sum(dg2_X))
-			dL_dtheta[itheta] += .5 * numpy.dot(
-				-dg1_int_points - dg2_int_points,
-				self.lmbda_q2) / self.num_integration_points * self.R
-			dL_dtheta[itheta] -= .5 * numpy.trace(self.Ks_inv.dot(
-				dKs[:, :, itheta]))
-			dL_dtheta[itheta] += .5 * numpy.trace(
-				self.Ks_inv.dot(dKs[:, :, itheta].dot(
-					self.Ks_inv.dot(Sigma_s_mugmug))))
-
-		return dL_dtheta
-
-	def update_hyperparameters(self):
-		""" Updates the hyperparameters with Adam.
-		"""
-		dL_dtheta = self.calculate_hyperparam_derivative()
-		logtheta1, logtheta2 = numpy.log(self.cov_params[0]), \
-							   numpy.log(self.cov_params[1])
-		dL_dlogtheta1 = dL_dtheta[0] * numpy.exp(logtheta1)
-		dL_dlogtheta2 = dL_dtheta[1:] * numpy.exp(logtheta2)
-
-		self.m_hyper_adam[0] = self.beta1_adam * self.m_hyper_adam[0] + \
-							   (1. - self.beta1_adam) * dL_dlogtheta1
-		self.v_hyper_adam[0] = self.beta2_adam * self.v_hyper_adam[0] + \
-							   (1. - self.beta2_adam) * dL_dlogtheta1 ** 2
-		self.m_hyper_adam[1:] = self.beta1_adam * self.m_hyper_adam[1:] + \
-								(1. - self.beta1_adam) * dL_dlogtheta2
-		self.v_hyper_adam[1:] = self.beta2_adam * self.v_hyper_adam[1:] + \
-								(1. - self.beta2_adam) * dL_dlogtheta2 ** 2
-		m_hat = self.m_hyper_adam / (1. - self.beta1_adam)
-		v_hat = self.v_hyper_adam / (1. - self.beta2_adam)
-		logtheta1 += self.epsilon * m_hat[0] / (numpy.sqrt(v_hat[0]) +
-												self.epsilon_adam)
-		logtheta2 += self.epsilon * m_hat[1:] / (numpy.sqrt(v_hat[1:]) +
-												 self.epsilon_adam)
-		self.cov_params[0] = numpy.exp(logtheta1)
-		self.cov_params[1] = numpy.exp(logtheta2)
-		self.update_kernels()
-		self.update_predictive_posterior()
-
-	def update_predictive_posterior(self, only_int_points=False):
-		""" Updates the function g (mean & variance) at each point (observed
-		and points for monte carlo integral)
-
-		:param only_int_points: bool
-			If True it only updates the integration points. (Default=False)
-		"""
-
-		if not only_int_points:
-			mu_g_X, var_g_X = self.predictive_posterior_GP(
-				self.X, points='X')
-			self.mu_g_X = mu_g_X
-			self.mu_g2_X = var_g_X + mu_g_X ** 2
-		mu_g_int_points, var_g_int_points = self.predictive_posterior_GP(
-			self.integration_points, points='int_points')
-		self.mu_g_int_points = mu_g_int_points
-		self.mu_g2_int_points = var_g_int_points + mu_g_int_points ** 2
-
-	def predictive_intensity_function(self, X_eval):
-		""" Computes the predictive intensity function at X_eval by Gaussian
-		quadrature.
-
-		:param X_eval: numpy.ndarray [num_points_eval x D]
-			Points where the intensity function should be evaluated.
-
-		:returns:
-			numpy.ndarray [num_points]: mean of predictive posterior intensity
-			numpy.ndarray [num_points]: variance of predictive posterior
-										intensity
-		"""
-		num_preds = X_eval.shape[0]
-		mu_pred, var_pred = self.predictive_posterior_GP(X_eval)
-
-		mean_lmbda_pred, var_lmbda_pred = numpy.empty(num_preds), \
-										  numpy.empty(num_preds)
-
-		mean_lmbda_q1 = self.lmbda_star_q1
-		var_lmbda_q1 = self.alpha_q1 / (self.beta_q1 ** 2)
-		mean_lmbda_q1_squared = var_lmbda_q1 + mean_lmbda_q1 ** 2
-
-		for ipred in range(num_preds):
-			mu, std = mu_pred[ipred], numpy.sqrt(var_pred[ipred])
-			func1 = lambda g_pred: 1. / (1. + numpy.exp(-g_pred)) * \
-								   numpy.exp(-.5 * (g_pred - mu) ** 2 / std ** 2) / \
-								   numpy.sqrt(2. * numpy.pi * std ** 2)
-			a, b = mu - 10. * std, mu + 10. * std
-			mean_lmbda_pred[ipred] = mean_lmbda_q1 * quadrature(func1, a, b,
-																maxiter=500)[0]
-			func2 = lambda g_pred: (1. / (1. + numpy.exp(-g_pred))) ** 2 * \
-								   numpy.exp(
-									   -.5 * (g_pred - mu) ** 2 / std ** 2) / \
-								   numpy.sqrt(2. * numpy.pi * std ** 2)
-			a, b = mu - 10. * std, mu + 10. * std
-			mean_lmbda_pred_squared = mean_lmbda_q1_squared * \
-									  quadrature(func2, a, b, maxiter=500)[0]
-			var_lmbda_pred[ipred] = mean_lmbda_pred_squared - mean_lmbda_pred[
-				ipred] ** 2
-
-		return mean_lmbda_pred, var_lmbda_pred
-
-	def sample_posterior(self, X_test, num_samples=1):
-		""" Samples log predictive likelihood for test set from posterior.
-
-		:param X_test: [num_X_test x D]
-			Observations in test set.
-		:param num_samples: int
-			How many samples of the intensity function should be drawn from
-			the posterior. (Default=1e4)
-
-		:return: numpy.ndarray [num_samples]
-			Returns the array of sampled likelihoods.
-		"""
-
-		num_events = X_test.shape[0]
-		num_samples = int(num_samples)
-		X = numpy.concatenate([X_test, self.integration_points])
-		K = self.cov_func(X, X)
-		kx = self.cov_func(X, self.induced_points)
-		kappa = kx.dot(self.Ks_inv)
-		Sigma_post = K - kappa.dot(kx.T - self.Sigma_g_s.dot(kappa.T))
-		mu_post = kappa.dot(self.mu_g_s)
-		L_post = numpy.linalg.cholesky(Sigma_post + self.noise * numpy.eye(
-			Sigma_post.shape[0]))
-
-		num_points = X.shape[0]
-		num_hundreds = int(num_samples)
-		pred_log_likelihood = numpy.empty([num_samples])
-
-		samples = []
-		# samples hundred instances at a time
-		for ihundreds in range(num_hundreds):
-			rand_nums = numpy.random.randn(num_points, 1)
-			g_sample = mu_post[:, None] + L_post.dot(rand_nums)
-			lmbda_max_sample = numpy.random.gamma(shape=self.alpha_q1,
-												  scale=1. / self.beta_q1,
-												  size=1)
-			lmbda_sample = lmbda_max_sample / (1. + numpy.exp(-g_sample))
-			samples.append(lmbda_sample)
-		return samples
-
-	def predictive_log_likelihood(self, X_test, num_samples=1e4):
-		""" Samples log predictive likelihood for test set from posterior.
-
-		:param X_test: [num_X_test x D]
-			Observations in test set.
-		:param num_samples: int
-			How many samples of the intensity function should be drawn from
-			the posterior. (Default=1e4)
-
-		:return: numpy.ndarray [num_samples]
-			Returns the array of sampled likelihoods.
-		"""
-
-		num_events = X_test.shape[0]
-		num_samples = int(num_samples)
-		X = numpy.concatenate([X_test, self.integration_points])
-		K = self.cov_func(X, X)
-		kx = self.cov_func(X, self.induced_points)
-		kappa = kx.dot(self.Ks_inv)
-		Sigma_post = K - kappa.dot(kx.T - self.Sigma_g_s.dot(kappa.T))
-		mu_post = kappa.dot(self.mu_g_s)
-		L_post = numpy.linalg.cholesky(Sigma_post + self.noise * numpy.eye(
-			Sigma_post.shape[0]))
-
-		num_points = X.shape[0]
-		num_hundreds = int(num_samples / 1e2)
-		pred_log_likelihood = numpy.empty([num_samples])
-
-		# samples hundred instances at a time
-		for ihundreds in range(num_hundreds):
-			rand_nums = numpy.random.randn(num_points, 100)
-			g_sample = mu_post[:, None] + L_post.dot(rand_nums)
-			lmbda_max_sample = numpy.random.gamma(shape=self.alpha_q1,
-												  scale=1. / self.beta_q1,
-												  size=100)
-			lmbda_sample = lmbda_max_sample / (1. + numpy.exp(-g_sample))
-
-			pred_log_likelihood[ihundreds * 100:(ihundreds + 1) * 100] = \
-				numpy.sum(numpy.log(lmbda_sample[:num_events]), axis=0)
-			pred_log_likelihood[ihundreds * 100:(ihundreds + 1) * 100] -= \
-				numpy.mean(lmbda_sample[num_events:], axis=0) * self.R
-
-		return pred_log_likelihood
-
-	def expanded_predictive_log_likelihood(self, X_test):
-		""" Fast approximation for log predictive test likelihood (Eq. 33 in
-		paper).
-
-		:param X_test: [num_X_test x D]
-			Observations in test set.
-
-		:return: float
-			Approximation of log predictive test likelihood.
-		"""
-		self.update_predictive_posterior(only_int_points=True)
-		N = X_test.shape[0]
-		ks_x_test = self.cov_func(self.induced_points, X_test)
-		mu_g_X_test = ks_x_test.T.dot(self.Ks_inv.dot(self.mu_g_s))
-		u_mean = -self.lmbda_star_q1 * numpy.mean(
-			1. / (1. + numpy.exp(-self.mu_g_int_points))) * self.R - \
-				 numpy.sum(numpy.log(1. + numpy.exp(-mu_g_X_test))) + \
-				 N * numpy.log(self.lmbda_star_q1)
-
-		log_pred_likelihood = u_mean
-		du_dg = numpy.empty(N + self.num_integration_points)
-		du_dg[:N] = 1. / (1. + numpy.exp(mu_g_X_test))
-		du_dg[N:] = - self.lmbda_star_q1 / (1. + numpy.exp(
-			-self.mu_g_int_points)) * (1. - 1. / (1. + numpy.exp(
-			-self.mu_g_int_points))) \
-					/ self.num_integration_points * self.R
-		du_dg2 = numpy.empty(N + self.num_integration_points)
-		du_dg2[:N] = - (1. - 1. / (1. + numpy.exp(mu_g_X_test))) / \
-					 (1. + numpy.exp(mu_g_X_test))
-		du_dg2[N:] = - self.lmbda_star_q1 / (1. + numpy.exp(
-			-self.mu_g_int_points)) * (1. - 1. / (1. + numpy.exp(
-			-self.mu_g_int_points))) * (1. - 2. / (1. + numpy.exp(
-			-self.mu_g_int_points))) / self.num_integration_points * self.R
-
-		du_dlambda = - self.R * numpy.mean(
-			1. / (1. + numpy.exp(-self.mu_g_int_points))) + N / self.lmbda_star_q1
-		du_dlmbda2 = - N / self.lmbda_star_q1 ** 2
-
-		C = numpy.empty([N + self.num_integration_points,
-						 N + self.num_integration_points])
-		inner_matrix = self.Ks_inv.dot(
-			numpy.identity(self.num_inducing_points) -
-			self.Sigma_g_s.dot(self.Ks_inv))
-
-		K_X = self.cov_func(X_test, X_test) + self.noise * numpy.identity(
-			X_test.shape[0])
-
-		C[:N, :N] = K_X - ks_x_test.T.dot(inner_matrix.dot(
-			ks_x_test))
-		del K_X
-		K_int_points = self.cov_func(self.integration_points,
-									 self.integration_points) + \
-					   self.noise * numpy.identity(
-			self.integration_points.shape[0])
-
-		C[N:, N:] = K_int_points - self.ks_int_points.T.dot(inner_matrix.dot(
-			self.ks_int_points))
-		del K_int_points
-
-		K_X_int_points = self.cov_func(self.integration_points, X_test)
-		C[N:, :N] = K_X_int_points - self.ks_int_points.T.dot(inner_matrix.dot(
-			ks_x_test))
-		del K_X_int_points
-
-		C[:N, N:] = C[N:, :N].T
-
-		log_pred_likelihood_corr = .5 * numpy.trace(C.dot(numpy.diag(
-			du_dg2) + numpy.outer(du_dg, du_dg))) \
-								   + .5 * (du_dlmbda2 + du_dlambda ** 2) * self.alpha_q1 / self.beta_q1 ** 2
-		log_pred_likelihood += log_pred_likelihood_corr
-
-		return log_pred_likelihood
+class VMF_SGCP:
+
+    def __init__(
+        self,
+        S_borders,
+        X,
+        cov_params,
+        num_inducing_points,
+        lmbda_star=None,
+        conv_crit=1e-4,
+        num_integration_points=1000,
+        output=False,
+        update_hyperparams=True,
+        noise=1e-4,
+        epsilon=5e-2,
+    ):
+        """Class initialisation for variational mean field inference for
+        sigmoidal Gaussian Cox process.
+
+        :param S_borders: numpy.ndarray [D x 2]
+                Limits of the region of interest.
+        :param X: numpy.ndarray [num_points x D]
+                Positions of the observations.
+        :param cov_params: numpy.ndarray [D + 1]
+                Hyperparameters of the covariance functions. First is amplitude,
+                and the others the length scale for each dimension.
+        :param num_inducing_points: int
+                Number of inducing points (Should be a power of dimensions)
+        :param lmbda_star: float
+                Maximal intensity. If None it is initialized as twice the mean
+                observation rate for a homogeneous process. (Default=None)
+        :param conv_crit:
+                Convergence criterion, when algorithm should stop. (Default=1e-4)
+        :param num_integration_points: int
+                Number of points that should be used for Monte Carlo integration.
+                (Default = 1000)
+        :param output: bool
+                Prints info after each optimisation step. (Default=False)
+        :param update_hyperparams: bool
+                Whether the hyperparameters are updated (by Adam) or not.
+                (Default=True)
+        :param noise: float
+                Noise added to the diagonal of the covariance matrix (should be
+                small). (Default=1e-4)
+        :param epsilon: float
+                Step size for Adam in the hyperparameter update. (Default=5e-2)
+        """
+
+        self.S_borders = S_borders
+        self.S = S_borders[:, 1] - S_borders[:, 0]
+        self.R = numpy.prod(self.S)
+        self.D = S_borders.shape[0]
+        self.noise = noise
+        self.cov_params = cov_params
+        self.num_integration_points = num_integration_points
+        self.num_inducing_points = num_inducing_points  # must be power of D
+        self.X = X
+
+        self.place_inducing_points()
+        self.mu_g_s = numpy.zeros(self.induced_points.shape[0])
+        self.Sigma_g_s = numpy.identity(self.induced_points.shape[0])
+        self.Sigma_g_s_inv = numpy.identity(self.induced_points.shape[0])
+        self.Ks = self.cov_func(self.induced_points, self.induced_points)
+        L = numpy.linalg.cholesky(self.Ks + self.noise * numpy.eye(self.Ks.shape[0]))
+        L_inv = solve_triangular(
+            L, numpy.eye(L.shape[0]), lower=True, check_finite=False
+        )
+        self.Ks_inv = L_inv.T.dot(L_inv)
+        self.logdet_Ks = 2.0 * numpy.sum(numpy.log(L.diagonal()))
+
+        self.place_integration_points()
+        self.ks_X = self.cov_func(self.induced_points, self.X)
+        self.LB_list = []
+        self.times = []
+
+        self.kappa_X = self.Ks_inv.dot(self.ks_X)
+        self.kappa_int_points = self.Ks_inv.dot(self.ks_int_points)
+        self.mu_g_X, var_g_X = self.predictive_posterior_GP(self.X, "X")
+        self.mu_g2_X = var_g_X + self.mu_g_X**2
+        self.mu_g_int_points, var_g_int_points = self.predictive_posterior_GP(
+            self.integration_points, "int_points"
+        )
+        self.mu_g2_int_points = var_g_int_points + self.mu_g_int_points**2
+        self.epsilon = epsilon
+        self.alpha0 = 4.0
+        self.beta0 = 2.0 / (float(self.X.shape[0] / self.R))
+        if lmbda_star is None:
+            self.lmbda_star_q1 = self.alpha0 / self.beta0
+            self.log_lmbda_star_q1 = digamma(self.alpha0) - numpy.log(self.beta0)
+        else:
+            self.lmbda_star_q1 = lmbda_star
+            self.log_lmbda_star_q1 = numpy.log(lmbda_star)
+        self.alpha_q1 = self.alpha0
+        self.beta_q1 = self.beta0
+        self.convergence = numpy.inf
+        self.conv_crit = conv_crit
+        self.num_iterations = 0
+        self.output = output
+        self.update_hyperparams = update_hyperparams
+
+        # ADAM parameters
+        self.beta1_adam = 0.9
+        self.beta2_adam = 0.99
+        self.epsilon_adam = 1e-5
+        self.m_hyper_adam = numpy.zeros(self.D + 1)
+        self.v_hyper_adam = numpy.zeros(self.D + 1)
+        self.m_bm_adam = numpy.zeros(self.D)
+        self.v_bm_adam = numpy.zeros(self.D)
+
+    def place_inducing_points(self):
+        """Places the induced points for sparse GP."""
+
+        num_per_dim = int(numpy.ceil(self.num_inducing_points ** (1.0 / self.D)))
+        induced_grid = numpy.empty([num_per_dim, self.D])
+        for di in range(self.D):
+            dist_between_points = self.S[di] / num_per_dim
+            induced_grid[:, di] = numpy.arange(
+                0.5 * dist_between_points, self.S[di], dist_between_points
+            )
+
+        self.induced_points = numpy.meshgrid(*induced_grid.T.tolist())
+        self.induced_points = numpy.array(self.induced_points).reshape([self.D, -1]).T
+
+    def run(self):
+        """Fitting function for the variational mean-field algorithm."""
+
+        # Initialisation
+        self.times.append(time.perf_counter())
+        self.calculate_PG_expectations()
+        self.calculate_posterior_intensity()
+        converged = False
+        while not converged:
+            self.num_iterations += 1
+            # Update second factor q2
+            self.calculate_postrior_GP()
+            self.update_predictive_posterior()
+            self.update_max_intensity()
+            # Update first factor q1
+            self.calculate_PG_expectations()
+            self.calculate_posterior_intensity()
+            # Update hyperparameters
+            if self.update_hyperparams:
+                self.update_hyperparameters()
+            # Calculate lower bound
+            self.LB_list.append(self.calculate_lower_bound())
+            # Check for convergence
+            if self.num_iterations > 1:
+                self.convergence = numpy.absolute(
+                    self.LB_list[-1] - self.LB_list[-2]
+                ) / numpy.amax(
+                    [numpy.abs(self.LB_list[-1]), numpy.abs(self.LB_list[-2]), 1]
+                )
+            converged = self.convergence < self.conv_crit
+            self.times.append(time.perf_counter())
+            if self.output:
+                self.print_info()
+
+    def print_info(self):
+        """Prints the iteration number and the convergence value set by run()."""
+        print(
+            (
+                " +-----------------+ "
+                + "\n |  Iteration %4d |"
+                + "\n |  Conv. = %.4f |"
+                + "\n +-----------------+"
+            )
+            % (self.num_iterations, self.convergence)
+        )
+
+    def place_integration_points(self):
+        """Places the integration points for Monte Carlo integration and
+        updates all related kernels.
+        """
+
+        self.integration_points = numpy.random.rand(self.num_integration_points, self.D)
+        self.integration_points *= self.S[numpy.newaxis]
+        self.ks_int_points = self.cov_func(self.induced_points, self.integration_points)
+        self.kappa_int_points = self.Ks_inv.dot(self.ks_int_points)
+
+    def calculate_posterior_intensity(self):
+        """The rate of the posterior process is updated."""
+
+        self.lmbda_q2 = (
+            0.5
+            * numpy.exp(-0.5 * self.mu_g_int_points + self.log_lmbda_star_q1)
+            / numpy.cosh(0.5 * self.c_int_points)
+        )
+
+    def calculate_PG_expectations(self):
+        """The Polya-Gamma posterior is updated."""
+
+        self.c_X = numpy.sqrt(self.mu_g2_X)
+        self.mu_omega_X = 0.5 / self.c_X * numpy.tanh(0.5 * self.c_X)
+        self.c_int_points = numpy.sqrt(self.mu_g2_int_points)
+        self.mu_omega_int_points = (
+            0.5 / self.c_int_points * numpy.tanh(0.5 * self.c_int_points)
+        )
+
+    def calculate_predictive_posterior_intensity(self, X_prime):
+        """Calculates the posterior intensity at X_prime for the latent
+        Poisson process. (Not the intensity of the observed Poisson process!!!)
+
+        :param X_prime: numpy.ndarray [num_points x D]
+                Position of points, that should be evaluated.
+
+        :return: numpy.ndarray [num_points]
+                Posterior intensity.
+        """
+        mu_g, var_g = self.predictive_posterior_GP(X_prime)
+        # Second moment of g from its predictive mean and variance.
+        mu_g2 = var_g + mu_g**2
+        c = numpy.sqrt(mu_g2)
+        pred_lmbda_q2 = (
+            0.5 * numpy.exp(-0.5 * mu_g + self.log_lmbda_star_q1) / numpy.cosh(0.5 * c)
+        )
+        return pred_lmbda_q2
+
+    def calculate_postrior_GP(self):
+        """The new GP at the inducing points is calculated."""
+
+        A_int_points = self.lmbda_q2 * self.mu_omega_int_points
+        A_X = self.mu_omega_X
+        kAk = (
+            self.kappa_X.dot(A_X[:, numpy.newaxis] * self.kappa_X.T)
+            + self.kappa_int_points.dot(
+                A_int_points[:, numpy.newaxis] * self.kappa_int_points.T
+            )
+            / self.num_integration_points
+            * self.R
+        )
+        self.Sigma_g_s_inv = kAk + self.Ks_inv
+        L_inv = numpy.linalg.cholesky(
+            self.Sigma_g_s_inv + self.noise * numpy.eye(self.Sigma_g_s_inv.shape[0])
+        )
+        L = solve_triangular(
+            L_inv, numpy.eye(L_inv.shape[0]), lower=True, check_finite=False
+        )
+        self.Sigma_g_s = L.T.dot(L)
+        self.logdet_Sigma_g_s = 2 * numpy.sum(numpy.log(L.diagonal()))
+        b_int_points = -0.5 * self.lmbda_q2
+        b_X = 0.5 * numpy.ones(self.X.shape[0])
+        kb = (
+            self.ks_X.dot(b_X)
+            + self.ks_int_points.dot(b_int_points)
+            / self.num_integration_points
+            * self.R
+        )
+        self.mu_g_s = self.Sigma_g_s.dot(kb.dot(self.Ks_inv))
+
+    def predictive_posterior_GP(self, x_prime, points=None):
+        """Computes the predictive posterior for given points
+
+        :param x_prime: numpy.ndarray [num_points x D]
+                Points, which should be predicted for.
+        :param points: str or None (Default=None)
+                'int_points' or 'X' reuses the cached kernels of the integration
+                or observation points; any other non-None value raises ValueError.
+        :returns:
+                numpy.ndarray [num_points]: mean of predictive posterior
+                numpy.ndarray [num_points]: variance of predictive posterior
+        """
+        if points is None:
+            ks_x_prime = self.cov_func(self.induced_points, x_prime)
+            kappa = self.Ks_inv.dot(ks_x_prime)
+        elif points == "int_points":  # compare by value, not object identity
+            ks_x_prime = self.ks_int_points
+            kappa = self.kappa_int_points
+        elif points == "X":
+            ks_x_prime = self.ks_X
+            kappa = self.kappa_X
+        else:
+            raise ValueError("points must be None, 'int_points' or 'X'")
+        mu_g_x_prime = kappa.T.dot(self.mu_g_s)
+        K_xx = self.cov_func(x_prime, x_prime, only_diagonal=True)
+        resid = ks_x_prime - kappa.T.dot(self.Sigma_g_s).T
+        var_g_x_prime = K_xx - numpy.sum(kappa * resid, axis=0)
+        return mu_g_x_prime, var_g_x_prime
+
+    def cov_func(self, x, x_prime, only_diagonal=False):
+        """Computes the covariance functions between x and x_prime.
+
+        :param x: numpy.ndarray [num_points x D]
+                Contains coordinates for points of x
+        :param x_prime: numpy.ndarray [num_points_prime x D]
+                Contains coordinates for points of x_prime
+        :param only_diagonal: bool
+                If true only diagonal is computed (Works only if x and x_prime
+                are the same, Default=False)
+
+        :return: numpy.ndarray [num_points x num_points_prime]
+        ([num_points_prime] if only diagonal)
+                Kernel matrix.
+        """
+
+        theta_1, theta_2 = self.cov_params[0], self.cov_params[1]
+        if only_diagonal:
+            return theta_1 * numpy.ones(x.shape[0])
+
+        else:
+            x_theta2 = x / theta_2
+            xprime_theta2 = x_prime / theta_2
+            h = (
+                numpy.sum(x_theta2**2, axis=1)[:, None]
+                - 2.0 * numpy.dot(x_theta2, xprime_theta2.T)
+                + numpy.sum(xprime_theta2**2, axis=1)[None]
+            )
+            return theta_1 * numpy.exp(-0.5 * h)
+
+    def calculate_lower_bound(self):
+        """Calculates the variational lower bound for current posterior.
+
+        :return: float
+                Variational lower bound.
+        """
+
+        Sigma_s_mugmug = self.Sigma_g_s + numpy.outer(self.mu_g_s, self.mu_g_s)
+        f_int_points = 0.5 * (
+            -self.mu_g_int_points - self.mu_g2_int_points * self.mu_omega_int_points
+        ) - numpy.log(2)
+        integrand = (
+            f_int_points
+            - numpy.log(self.lmbda_q2 * numpy.cosh(0.5 * self.c_int_points))
+            + self.log_lmbda_star_q1
+            + 0.5 * self.c_int_points**2 * self.mu_omega_int_points
+            + 1.0
+        )
+        f_X = 0.5 * (self.mu_g_X - self.mu_g2_X * self.mu_omega_X) - numpy.log(2)
+        summand = (
+            f_X
+            + self.log_lmbda_star_q1
+            - numpy.log(numpy.cosh(0.5 * self.c_X))
+            + 0.5 * self.c_X**2 * self.mu_omega_X
+        )
+
+        L = integrand.dot(self.lmbda_q2) / self.num_integration_points * self.R
+        L -= self.lmbda_star_q1 * self.R
+        L += numpy.sum(summand)
+        L -= 0.5 * numpy.trace(self.Ks_inv.dot(Sigma_s_mugmug))
+        L -= 0.5 * self.logdet_Ks
+        L += 0.5 * self.logdet_Sigma_g_s + 0.5 * self.num_inducing_points
+        L += (
+            self.alpha0 * numpy.log(self.beta0)
+            - gammaln(self.alpha0)
+            + (self.alpha0 - 1) * self.log_lmbda_star_q1
+            - self.beta0 * self.lmbda_star_q1
+        )
+        L += (
+            self.alpha_q1
+            - numpy.log(self.beta_q1)
+            + gammaln(self.alpha_q1)
+            + (1.0 - self.alpha_q1) * digamma(self.alpha_q1)
+        )
+
+        return L
+
+    def update_max_intensity(self):
+        """Updates the posterior for the maximal intensity."""
+        self.alpha_q1 = (
+            self.X.shape[0]
+            + numpy.sum(self.lmbda_q2) / self.num_integration_points * self.R
+            + self.alpha0
+        )
+        self.beta_q1 = self.beta0 + self.R
+        self.lmbda_star_q1 = self.alpha_q1 / self.beta_q1
+        self.log_lmbda_star_q1 = digamma(self.alpha_q1) - numpy.log(self.beta_q1)
+
+    def update_kernels(self):
+        """Updates all kernels (for inducing, observed and integration points)."""
+        self.ks_int_points = self.cov_func(self.induced_points, self.integration_points)
+        self.ks_X = self.cov_func(self.induced_points, self.X)
+        self.Ks = self.cov_func(self.induced_points, self.induced_points)
+        L = numpy.linalg.cholesky(self.Ks + self.noise * numpy.eye(self.Ks.shape[0]))
+        L_inv = solve_triangular(
+            L, numpy.eye(L.shape[0]), lower=True, check_finite=False
+        )
+        self.Ks_inv = L_inv.T.dot(L_inv)
+        self.logdet_Ks = 2.0 * numpy.sum(numpy.log(L.diagonal()))
+        self.kappa_X = self.Ks_inv.dot(self.ks_X)
+        self.kappa_int_points = self.Ks_inv.dot(self.ks_int_points)
+
+    def calculate_hyperparam_derivative(self):
+        """Calculates the derivative of the hyperparameters.
+
+        :return: numpy.ndarray [D + 1]
+                Derivatives of hyperparameters.
+        """
+
+        theta1, theta2 = self.cov_params[0], numpy.copy(self.cov_params[1])
+        Sigma_s_mugmug = self.Sigma_g_s + numpy.outer(self.mu_g_s, self.mu_g_s)
+        dks_X = numpy.empty(
+            [self.ks_X.shape[0], self.ks_X.shape[1], 1 + theta2.shape[0]]
+        )
+        dks_int_points = numpy.empty(
+            [
+                self.ks_int_points.shape[0],
+                self.ks_int_points.shape[1],
+                1 + theta2.shape[0],
+            ]
+        )
+        dKs = numpy.empty([self.Ks.shape[0], self.Ks.shape[1], 1 + theta2.shape[0]])
+        dKss = numpy.zeros([1 + theta2.shape[0]])
+        dKss[0] = 1.0
+
+        # kernel derivatives wrt theta1
+        dks_X[:, :, 0] = self.ks_X / theta1
+        dks_int_points[:, :, 0] = self.ks_int_points / theta1
+        dKs[:, :, 0] = self.Ks / theta1
+        # kernel derivatives wrt theta2
+        dx = numpy.subtract(self.induced_points[:, None], self.X[None])
+        dks_X[:, :, 1:] = self.ks_X[:, :, None] * (dx**2) / (theta2[None, None] ** 3)
+        dx = numpy.subtract(self.induced_points[:, None], self.integration_points[None])
+        dks_int_points[:, :, 1:] = (
+            self.ks_int_points[:, :, None] * (dx**2) / (theta2[None, None] ** 3)
+        )
+        dx = numpy.subtract(self.induced_points[:, None], self.induced_points[None])
+        dKs[:, :, 1:] = self.Ks[:, :, None] * (dx**2) / (theta2[None, None] ** 3)
+        dL_dtheta = numpy.empty(1 + len(theta2))
+
+        for itheta in range(1 + len(theta2)):
+            dKs_inv = -self.Ks_inv.dot(dKs[:, :, itheta].dot(self.Ks_inv))
+
+            dkappa_X = self.Ks_inv.dot(dks_X[:, :, itheta]) + dKs_inv.dot(self.ks_X)
+            dkappa_int_points = self.Ks_inv.dot(
+                dks_int_points[:, :, itheta]
+            ) + dKs_inv.dot(self.ks_int_points)
+
+            dKtilde_X = (
+                dKss[itheta]
+                - numpy.sum(dks_X[:, :, itheta] * self.kappa_X, axis=0)
+                - numpy.sum(self.ks_X * dkappa_X, axis=0)
+            )
+            dKtilde_int_points = (
+                dKss[itheta]
+                - numpy.sum(
+                    dks_int_points[:, :, itheta] * self.kappa_int_points, axis=0
+                )
+                - numpy.sum(self.ks_int_points * dkappa_int_points, axis=0)
+            )
+
+            dg1_X = self.mu_g_s.dot(dkappa_X)
+            dg1_int_points = self.mu_g_s.dot(dkappa_int_points)
+
+            dg2_X = (
+                dKtilde_X
+                + 2.0 * numpy.sum(self.kappa_X * Sigma_s_mugmug.dot(dkappa_X), axis=0)
+            ) * self.mu_omega_X
+            dg2_int_points = (
+                dKtilde_int_points
+                + 2.0
+                * numpy.sum(
+                    self.kappa_int_points * Sigma_s_mugmug.dot(dkappa_int_points),
+                    axis=0,
+                )
+            ) * self.mu_omega_int_points
+
+            dL_dtheta[itheta] = 0.5 * (numpy.sum(dg1_X) - numpy.sum(dg2_X))
+            dL_dtheta[itheta] += (
+                0.5
+                * numpy.dot(-dg1_int_points - dg2_int_points, self.lmbda_q2)
+                / self.num_integration_points
+                * self.R
+            )
+            dL_dtheta[itheta] -= 0.5 * numpy.trace(self.Ks_inv.dot(dKs[:, :, itheta]))
+            dL_dtheta[itheta] += 0.5 * numpy.trace(
+                self.Ks_inv.dot(dKs[:, :, itheta].dot(self.Ks_inv.dot(Sigma_s_mugmug)))
+            )
+
+        return dL_dtheta
+
+    def update_hyperparameters(self):
+        """Updates the hyperparameters with Adam."""
+        dL_dtheta = self.calculate_hyperparam_derivative()
+        logtheta1, logtheta2 = numpy.log(self.cov_params[0]), numpy.log(
+            self.cov_params[1]
+        )
+        dL_dlogtheta1 = dL_dtheta[0] * numpy.exp(logtheta1)
+        dL_dlogtheta2 = dL_dtheta[1:] * numpy.exp(logtheta2)
+
+        self.m_hyper_adam[0] = (
+            self.beta1_adam * self.m_hyper_adam[0]
+            + (1.0 - self.beta1_adam) * dL_dlogtheta1
+        )
+        self.v_hyper_adam[0] = (
+            self.beta2_adam * self.v_hyper_adam[0]
+            + (1.0 - self.beta2_adam) * dL_dlogtheta1**2
+        )
+        self.m_hyper_adam[1:] = (
+            self.beta1_adam * self.m_hyper_adam[1:]
+            + (1.0 - self.beta1_adam) * dL_dlogtheta2
+        )
+        self.v_hyper_adam[1:] = (
+            self.beta2_adam * self.v_hyper_adam[1:]
+            + (1.0 - self.beta2_adam) * dL_dlogtheta2**2
+        )
+        m_hat = self.m_hyper_adam / (1.0 - self.beta1_adam)
+        v_hat = self.v_hyper_adam / (1.0 - self.beta2_adam)
+        logtheta1 += (
+            self.epsilon * m_hat[0] / (numpy.sqrt(v_hat[0]) + self.epsilon_adam)
+        )
+        logtheta2 += (
+            self.epsilon * m_hat[1:] / (numpy.sqrt(v_hat[1:]) + self.epsilon_adam)
+        )
+        self.cov_params[0] = numpy.exp(logtheta1)
+        self.cov_params[1] = numpy.exp(logtheta2)
+        self.update_kernels()
+        self.update_predictive_posterior()
+
+    def update_predictive_posterior(self, only_int_points=False):
+        """Updates the function g (mean & variance) at each point (observed
+        and points for monte carlo integral)
+
+        :param only_int_points: bool
+                If True it only updates the integration points. (Default=False)
+        """
+
+        if not only_int_points:
+            mu_g_X, var_g_X = self.predictive_posterior_GP(self.X, points="X")
+            self.mu_g_X = mu_g_X
+            self.mu_g2_X = var_g_X + mu_g_X**2
+        mu_g_int_points, var_g_int_points = self.predictive_posterior_GP(
+            self.integration_points, points="int_points"
+        )
+        self.mu_g_int_points = mu_g_int_points
+        self.mu_g2_int_points = var_g_int_points + mu_g_int_points**2
+
+    def predictive_intensity_function(self, X_eval):
+        """Computes the predictive intensity function at X_eval by Gaussian
+        quadrature.
+
+        :param X_eval: numpy.ndarray [num_points_eval x D]
+                Points where the intensity function should be evaluated.
+
+        :returns:
+                numpy.ndarray [num_points]: mean of predictive posterior intensity
+                numpy.ndarray [num_points]: variance of predictive posterior
+                                                                        intensity
+        """
+        num_preds = X_eval.shape[0]
+        mu_pred, var_pred = self.predictive_posterior_GP(X_eval)
+
+        mean_lmbda_pred, var_lmbda_pred = numpy.empty(num_preds), numpy.empty(num_preds)
+
+        mean_lmbda_q1 = self.lmbda_star_q1
+        var_lmbda_q1 = self.alpha_q1 / (self.beta_q1**2)
+        mean_lmbda_q1_squared = var_lmbda_q1 + mean_lmbda_q1**2
+
+        for ipred in range(num_preds):
+            mu, std = mu_pred[ipred], numpy.sqrt(var_pred[ipred])
+            func1 = (
+                lambda g_pred: 1.0
+                / (1.0 + numpy.exp(-g_pred))
+                * numpy.exp(-0.5 * (g_pred - mu) ** 2 / std**2)
+                / numpy.sqrt(2.0 * numpy.pi * std**2)
+            )
+            a, b = mu - 10.0 * std, mu + 10.0 * std
+            mean_lmbda_pred[ipred] = (
+                mean_lmbda_q1 * quadrature(func1, a, b, maxiter=500)[0]
+            )
+            func2 = (
+                lambda g_pred: (1.0 / (1.0 + numpy.exp(-g_pred))) ** 2
+                * numpy.exp(-0.5 * (g_pred - mu) ** 2 / std**2)
+                / numpy.sqrt(2.0 * numpy.pi * std**2)
+            )
+            a, b = mu - 10.0 * std, mu + 10.0 * std
+            mean_lmbda_pred_squared = (
+                mean_lmbda_q1_squared * quadrature(func2, a, b, maxiter=500)[0]
+            )
+            var_lmbda_pred[ipred] = (
+                mean_lmbda_pred_squared - mean_lmbda_pred[ipred] ** 2
+            )
+
+        return mean_lmbda_pred, var_lmbda_pred
+
+    def sample_posterior(self, X_test, num_samples=1):
+        """Draws samples of the intensity function from the posterior.
+
+        :param X_test: [num_X_test x D]
+                Points at which the intensity is sampled; the integration
+                points are appended after them.
+        :param num_samples: int
+                How many joint samples of the intensity are drawn. (Default=1)
+
+        :return: list of numpy.ndarray [(num_X_test + num_int_points) x 1]
+                One sampled intensity vector per draw.
+        """
+
+        num_events = X_test.shape[0]
+        num_samples = int(num_samples)
+        X = numpy.concatenate([X_test, self.integration_points])
+        K = self.cov_func(X, X)
+        kx = self.cov_func(X, self.induced_points)
+        kappa = kx.dot(self.Ks_inv)
+        Sigma_post = K - kappa.dot(kx.T - self.Sigma_g_s.dot(kappa.T))
+        mu_post = kappa.dot(self.mu_g_s)
+        L_post = numpy.linalg.cholesky(
+            Sigma_post + self.noise * numpy.eye(Sigma_post.shape[0])
+        )
+
+        num_points = X.shape[0]
+        num_hundreds = int(num_samples)
+        pred_log_likelihood = numpy.empty([num_samples])
+
+        samples = []
+        # draws one joint sample per loop iteration
+        for ihundreds in range(num_hundreds):
+            rand_nums = numpy.random.randn(num_points, 1)
+            g_sample = mu_post[:, None] + L_post.dot(rand_nums)
+            lmbda_max_sample = numpy.random.gamma(
+                shape=self.alpha_q1, scale=1.0 / self.beta_q1, size=1
+            )
+            lmbda_sample = lmbda_max_sample / (1.0 + numpy.exp(-g_sample))
+            samples.append(lmbda_sample)
+        return samples
+
+    def predictive_log_likelihood(self, X_test, num_samples=1e4):
+        """Samples log predictive likelihood for test set from posterior.
+
+        :param X_test: [num_X_test x D]
+                Observations in test set.
+        :param num_samples: int
+                How many samples of the intensity function should be drawn from
+                the posterior in batches of 100 (use a multiple of 100). (Default=1e4)
+
+        :return: numpy.ndarray [num_samples]
+                Returns the array of sampled likelihoods.
+        """
+
+        num_events = X_test.shape[0]
+        num_samples = int(num_samples)
+        X = numpy.concatenate([X_test, self.integration_points])
+        K = self.cov_func(X, X)
+        kx = self.cov_func(X, self.induced_points)
+        kappa = kx.dot(self.Ks_inv)
+        Sigma_post = K - kappa.dot(kx.T - self.Sigma_g_s.dot(kappa.T))
+        mu_post = kappa.dot(self.mu_g_s)
+        L_post = numpy.linalg.cholesky(
+            Sigma_post + self.noise * numpy.eye(Sigma_post.shape[0])
+        )
+
+        num_points = X.shape[0]
+        num_hundreds = int(num_samples / 1e2)
+        pred_log_likelihood = numpy.empty([num_samples])
+
+        # samples hundred instances at a time
+        for ihundreds in range(num_hundreds):
+            rand_nums = numpy.random.randn(num_points, 100)
+            g_sample = mu_post[:, None] + L_post.dot(rand_nums)
+            lmbda_max_sample = numpy.random.gamma(
+                shape=self.alpha_q1, scale=1.0 / self.beta_q1, size=100
+            )
+            lmbda_sample = lmbda_max_sample / (1.0 + numpy.exp(-g_sample))
+
+            pred_log_likelihood[ihundreds * 100 : (ihundreds + 1) * 100] = numpy.sum(
+                numpy.log(lmbda_sample[:num_events]), axis=0
+            )
+            pred_log_likelihood[ihundreds * 100 : (ihundreds + 1) * 100] -= (
+                numpy.mean(lmbda_sample[num_events:], axis=0) * self.R
+            )
+
+        return pred_log_likelihood
+
+    def expanded_predictive_log_likelihood(self, X_test):
+        """Fast approximation for log predictive test likelihood (Eq. 33 in
+        paper).
+
+        :param X_test: [num_X_test x D]
+                Observations in test set.
+
+        :return: float
+                Approximation of log predictive test likelihood.
+        """
+        self.update_predictive_posterior(only_int_points=True)
+        N = X_test.shape[0]
+        ks_x_test = self.cov_func(self.induced_points, X_test)
+        mu_g_X_test = ks_x_test.T.dot(self.Ks_inv.dot(self.mu_g_s))
+        u_mean = (
+            -self.lmbda_star_q1
+            * numpy.mean(1.0 / (1.0 + numpy.exp(-self.mu_g_int_points)))
+            * self.R
+            - numpy.sum(numpy.log(1.0 + numpy.exp(-mu_g_X_test)))
+            + N * numpy.log(self.lmbda_star_q1)
+        )
+
+        log_pred_likelihood = u_mean
+        du_dg = numpy.empty(N + self.num_integration_points)
+        du_dg[:N] = 1.0 / (1.0 + numpy.exp(mu_g_X_test))
+        du_dg[N:] = (
+            -self.lmbda_star_q1
+            / (1.0 + numpy.exp(-self.mu_g_int_points))
+            * (1.0 - 1.0 / (1.0 + numpy.exp(-self.mu_g_int_points)))
+            / self.num_integration_points
+            * self.R
+        )
+        du_dg2 = numpy.empty(N + self.num_integration_points)
+        du_dg2[:N] = -(1.0 - 1.0 / (1.0 + numpy.exp(mu_g_X_test))) / (
+            1.0 + numpy.exp(mu_g_X_test)
+        )
+        du_dg2[N:] = (
+            -self.lmbda_star_q1
+            / (1.0 + numpy.exp(-self.mu_g_int_points))
+            * (1.0 - 1.0 / (1.0 + numpy.exp(-self.mu_g_int_points)))
+            * (1.0 - 2.0 / (1.0 + numpy.exp(-self.mu_g_int_points)))
+            / self.num_integration_points
+            * self.R
+        )
+
+        du_dlambda = (
+            -self.R * numpy.mean(1.0 / (1.0 + numpy.exp(-self.mu_g_int_points)))
+            + N / self.lmbda_star_q1
+        )
+        du_dlmbda2 = -N / self.lmbda_star_q1**2
+
+        C = numpy.empty(
+            [N + self.num_integration_points, N + self.num_integration_points]
+        )
+        inner_matrix = self.Ks_inv.dot(
+            numpy.identity(self.num_inducing_points) - self.Sigma_g_s.dot(self.Ks_inv)
+        )
+
+        K_X = self.cov_func(X_test, X_test) + self.noise * numpy.identity(
+            X_test.shape[0]
+        )
+
+        C[:N, :N] = K_X - ks_x_test.T.dot(inner_matrix.dot(ks_x_test))
+        del K_X
+        K_int_points = self.cov_func(
+            self.integration_points, self.integration_points
+        ) + self.noise * numpy.identity(self.integration_points.shape[0])
+
+        C[N:, N:] = K_int_points - self.ks_int_points.T.dot(
+            inner_matrix.dot(self.ks_int_points)
+        )
+        del K_int_points
+
+        K_X_int_points = self.cov_func(self.integration_points, X_test)
+        C[N:, :N] = K_X_int_points - self.ks_int_points.T.dot(
+            inner_matrix.dot(ks_x_test)
+        )
+        del K_X_int_points
+
+        C[:N, N:] = C[N:, :N].T
+
+        log_pred_likelihood_corr = (
+            0.5 * numpy.trace(C.dot(numpy.diag(du_dg2) + numpy.outer(du_dg, du_dg)))
+            + 0.5 * (du_dlmbda2 + du_dlambda**2) * self.alpha_q1 / self.beta_q1**2
+        )
+        log_pred_likelihood += log_pred_likelihood_corr
+
+        return log_pred_likelihood
diff --git a/stpy/borel_set.py b/stpy/borel_set.py
index 2760ffc..2440e4f 100644
--- a/stpy/borel_set.py
+++ b/stpy/borel_set.py
@@ -6,299 +6,314 @@
 from stpy.helpers.helper import cartesian
 
 
-class BorelSet():
-
-	def __init__(self, d, bounds):
-		self.d = d
-		self.bounds = bounds
-		self.calculate_volume()
-		self.type = "box"
-
-	def description(self):
-		return self.bounds
-
-	def calculate_volume(self):
-		self.vol = 1.
-		for i in range(self.d):
-			self.vol = self.vol * (self.bounds[i, 1] - self.bounds[i, 0])
-
-	def volume(self):
-		return self.vol
-
-	def center_point(self):
-		return (self.bounds[:, 1] + self.bounds[:, 0]) / 2
-
-	def perimeter(self):
-		cir = 0.
-		for i in range(self.d):
-			cir += 2 * (self.bounds[i, 1] - self.bounds[i, 0])
-		return cir
-
-	def uniform_sample(self, n):
-		sample = torch.zeros(n, self.d).double()
-		for i in range(self.d):
-			sample_i = torch.from_numpy(np.random.uniform(self.bounds[i, 0], self.bounds[i, 1], n))
-			sample[:, i] = sample_i
-		return sample
-
-	def return_legendre_discretization(self, n):
-		nodes, weights = np.polynomial.legendre.leggauss(n)
-		nodes_arr = []
-		weights_arr = []
-		for i in range(self.d):
-			a, b = float(self.bounds[i, 0]), float(self.bounds[i, 1])
-			nodes = nodes * (b - a) / 2. + (a + b) / 2.
-			nodes_arr.append(nodes)
-			weights_arr.append(weights * 0.5 * (b - a))
-
-		nodes = cartesian(nodes_arr)
-		weights = cartesian(weights_arr)
-		return torch.prod(torch.from_numpy(weights), dim=1), torch.from_numpy(nodes)
-
-	def return_discretization(self, n, offsets=None):
-		dis = []
-		for i in range(self.d):
-			if offsets is None:
-				x = np.linspace(self.bounds[i, 0], self.bounds[i, 1], n)
-			else:
-				x = np.linspace(self.bounds[i, 0] - offsets[i], self.bounds[i, 1] + offsets[i], n)
-			dis.append(x)
-		r = cartesian(dis)
-		r = torch.from_numpy(r)
-		return r
-
-	def inside(self, set):
-		"""
-		Tests if set is inside this set
-		:param set:
-		:return:
-		"""
-		for i in range(self.d):
-			if self.bounds[i, 0] > set.bounds[i, 0] or self.bounds[i, 1] < set.bounds[i, 1]:
-				return False
-		return True
-
-	def is_inside(self, x):
-		"""
-		:param x:  (n,d) to check if a<=x<b
-		:return: bool
-		"""
-		mask = torch.full((x.size()[0], 1), True, dtype=torch.bool).view(-1)
-		for i in range(self.d):
-			mask1 = self.bounds[i, 0] <= x[:, i]
-			mask2 = x[:, i] < self.bounds[i, 1]
-			mask = mask1 * mask2 * mask
-		return mask
+class BorelSet:
+
+    def __init__(self, d, bounds):
+        self.d = d
+        self.bounds = bounds
+        self.calculate_volume()
+        self.type = "box"
+
+    def description(self):
+        return self.bounds
+
+    def calculate_volume(self):
+        self.vol = 1.0
+        for i in range(self.d):
+            self.vol = self.vol * (self.bounds[i, 1] - self.bounds[i, 0])
+
+    def volume(self):
+        return self.vol
+
+    def center_point(self):
+        return (self.bounds[:, 1] + self.bounds[:, 0]) / 2
+
+    def perimeter(self):
+        cir = 0.0
+        for i in range(self.d):
+            cir += 2 * (self.bounds[i, 1] - self.bounds[i, 0])
+        return cir
+
+    def uniform_sample(self, n):
+        sample = torch.zeros(n, self.d).double()
+        for i in range(self.d):
+            sample_i = torch.tensor(
+                np.random.uniform(self.bounds[i, 0], self.bounds[i, 1], n)
+            )
+            sample[:, i] = sample_i
+        return sample
+
+    def return_legendre_discretization(self, n):
+        nodes, weights = np.polynomial.legendre.leggauss(n)
+        nodes_arr = []
+        weights_arr = []
+        for i in range(self.d):
+            a, b = float(self.bounds[i, 0]), float(self.bounds[i, 1])
+            nodes = nodes * (b - a) / 2.0 + (a + b) / 2.0
+            nodes_arr.append(nodes)
+            weights_arr.append(weights * 0.5 * (b - a))
+
+        nodes = cartesian(nodes_arr)
+        weights = cartesian(weights_arr)
+        return torch.prod(torch.tensor(weights), dim=1), torch.tensor(nodes)
+
+    def return_discretization(self, n, offsets=None):
+        dis = []
+        for i in range(self.d):
+            if offsets is None:
+                x = np.linspace(self.bounds[i, 0].cpu(), self.bounds[i, 1].cpu(), n)
+            else:
+                x = np.linspace(
+                    self.bounds[i, 0].cpu() - offsets[i].cpu(),
+                    self.bounds[i, 1].cpu() + offsets[i].cpu(),
+                    n,
+                )
+            dis.append(x)
+        r = cartesian(dis)
+        r = torch.tensor(r)
+        return r
+
+    def inside(self, set):
+        """
+        Tests if set is inside this set
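+        :param set: set exposing ``bounds`` (lower/upper per dimension)
+        :return: True if every interval of ``set`` lies within this set's bounds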
+        """
+        for i in range(self.d):
+            if (
+                self.bounds[i, 0] > set.bounds[i, 0]
+                or self.bounds[i, 1] < set.bounds[i, 1]
+            ):
+                return False
+        return True
+
+    def is_inside(self, x):
+        """
+        :param x:  (n,d) to check if a<=x<b
+        :return: boolean mask of shape (n,)
+        """
+        mask = torch.full((x.size()[0], 1), True, dtype=torch.bool).view(-1)
+        for i in range(self.d):
+            mask1 = self.bounds[i, 0] <= x[:, i]
+            mask2 = x[:, i] < self.bounds[i, 1]
+            mask = mask1 * mask2 * mask
+        return mask
 
 
 class BallSet(BorelSet):
-	def __init__(self, d, center, radius):
-		self.d = d
-		self.center = center
-		self.radius = radius
-		self.calculate_volume()
-		self.type = "round"
-
-	def calculate_volume(self):
-		self.vol = (self.radius ** self.d) * np.pi ** (self.d // 2) / (scipy.special.gamma(self.d // 2 + 1))
-
-	def description(self):
-		return self.center, self.radius
-
-	def return_discretization(self, n):
-		if self.d == 1:
-			dis = []
-			x = np.linspace(self.center - self.radius, self.center + self.radius, n)
-			dis.append(x)
-			r = cartesian(dis)
-			r = torch.from_numpy(r)
-			return r
-
-
-		elif self.d == 2:
-
-			p, w = np.polynomial.legendre.leggauss(n)
-			mu = np.arange(1, n + 1)
-			points = np.array(
-				[
-					np.tile(self.radius * np.cos(mu * np.pi / (n + 1)), n),
-					np.outer(p, self.radius * np.sin(mu * np.pi / (n + 1))).flatten(),
-				]
-			).T
-			points[:, 0] += float(self.center[0])
-			points[:, 1] += float(self.center[1])
-
-		# k = n - 2
-		# theta = 2 * np.pi * np.arange(1, k + 2) / (k + 1)
-		# p, w = np.polynomial.legendre.leggauss(n + 1)
-		# # scale points to [r0, r1] (where r0 = 0, r1 = 1 for now)
-		# p = np.sqrt(0.5 * (p + 1.0))
-		# p_theta = np.dstack(np.meshgrid(p, theta)).reshape(-1, 2).T
-		# points = np.array(
-		# 	[p_theta[0] * self.radius * np.cos(p_theta[1]), p_theta[0] * self.radius * np.sin(p_theta[1])]
-		# ).T
-		# points[:,0] += float(self.center[0])
-		# points[:,1] += float(self.center[1])
-		#
-		# points = np.concatenate((points,self.center.view(-1,self.d).numpy()))
-
-		return torch.from_numpy(points)
-
-	def return_legendre_discretization(self, n):
-		if self.d == 2:
-			p, w = np.polynomial.legendre.leggauss(n)
-			mu = np.arange(1, n + 1)
-			points = np.array(
-				[
-					np.tile(self.radius * np.cos(mu * np.pi / (n + 1)), n),
-					np.outer(p, self.radius * np.sin(mu * np.pi / (n + 1))).flatten(),
-				]
-			).T
-			points[:, 0] += float(self.center[0])
-			points[:, 1] += float(self.center[1])
-			weights = np.outer(w, np.sin(mu * np.pi / (n + 1)) ** 2).flatten() / (n + 1)
-			return torch.from_numpy(weights), torch.from_numpy(points)
-		else:
-			raise AssertionError("Wrong type of set considered.")
-
-	def inside(self, set):
-		"""
-		Tests if set is inside this set
-		:param set:
-		:return:
-		"""
-
-		## the tested set is box
-		if set.type == "box":
-			for i in range(self.d):
-				if set.bounds[i, 0] > self.center[i] - self.radius or set.bounds[i, 1] < self.center[i] - self.radius:
-					return False
-		## set is round
-		else:
-			if (self.center - set.center) ** 2 > self.radius ** 2:
-				return False
-		return True
-
-	def is_inside(self, x):
-		"""
-		:param x:  (n,d) to check if a<=x<b
-		:return: bool
-		"""
-		n = x.size()[0]
-		mask = torch.full((x.size()[0], 1), True, dtype=torch.bool).view(-1)
-		mask = self.radius ** 2 >= (x - torch.tile(self.center, (n, 1))) ** 2
-		return mask
+    def __init__(self, d, center, radius):
+        self.d = d
+        self.center = center
+        self.radius = radius
+        self.calculate_volume()
+        self.type = "round"
+
+    def calculate_volume(self):
+        self.vol = (
+            (self.radius**self.d)
+            * np.pi ** (self.d // 2)
+            / (scipy.special.gamma(self.d // 2 + 1))
+        )
+
+    def description(self):
+        return self.center, self.radius
+
+    def return_discretization(self, n):
+        if self.d == 1:
+            dis = []
+            x = np.linspace(self.center - self.radius, self.center + self.radius, n)
+            dis.append(x)
+            r = cartesian(dis)
+            r = torch.tensor(r)
+            return r
+
+        elif self.d == 2:
+
+            p, w = np.polynomial.legendre.leggauss(n)
+            mu = np.arange(1, n + 1)
+            points = np.array(
+                [
+                    np.tile(self.radius * np.cos(mu * np.pi / (n + 1)), n),
+                    np.outer(p, self.radius * np.sin(mu * np.pi / (n + 1))).flatten(),
+                ]
+            ).T
+            points[:, 0] += float(self.center[0])
+            points[:, 1] += float(self.center[1])
+
+        # k = n - 2
+        # theta = 2 * np.pi * np.arange(1, k + 2) / (k + 1)
+        # p, w = np.polynomial.legendre.leggauss(n + 1)
+        # # scale points to [r0, r1] (where r0 = 0, r1 = 1 for now)
+        # p = np.sqrt(0.5 * (p + 1.0))
+        # p_theta = np.dstack(np.meshgrid(p, theta)).reshape(-1, 2).T
+        # points = np.array(
+        # 	[p_theta[0] * self.radius * np.cos(p_theta[1]), p_theta[0] * self.radius * np.sin(p_theta[1])]
+        # ).T
+        # points[:,0] += float(self.center[0])
+        # points[:,1] += float(self.center[1])
+        #
+        # points = np.concatenate((points,self.center.view(-1,self.d).numpy()))
+
+        return torch.tensor(points)
+
+    def return_legendre_discretization(self, n):
+        if self.d == 2:
+            p, w = np.polynomial.legendre.leggauss(n)
+            mu = np.arange(1, n + 1)
+            points = np.array(
+                [
+                    np.tile(self.radius * np.cos(mu * np.pi / (n + 1)), n),
+                    np.outer(p, self.radius * np.sin(mu * np.pi / (n + 1))).flatten(),
+                ]
+            ).T
+            points[:, 0] += float(self.center[0])
+            points[:, 1] += float(self.center[1])
+            weights = np.outer(w, np.sin(mu * np.pi / (n + 1)) ** 2).flatten() / (n + 1)
+            return torch.tensor(weights), torch.tensor(points)
+        else:
+            raise AssertionError("Wrong type of set considered.")
+
+    def inside(self, set):
+        """
+        Tests if set is inside this set
+        :param set:
+        :return:
+        """
+
+        ## the tested set is box; NOTE(review): both bounds use center - radius — confirm
+        if set.type == "box":
+            for i in range(self.d):
+                if (
+                    set.bounds[i, 0] > self.center[i] - self.radius
+                    or set.bounds[i, 1] < self.center[i] - self.radius
+                ):
+                    return False
+        ## set is round
+        else:
+            if (self.center - set.center) ** 2 > self.radius**2:
+                return False
+        return True
+
+    def is_inside(self, x):
+        """
+        :param x:  (n,d) to check if (x - center)**2 <= radius**2 per coordinate
+        :return: boolean mask (elementwise; not reduced over d)
+        """
+        n = x.size()[0]
+        mask = torch.full((x.size()[0], 1), True, dtype=torch.bool).view(-1)
+        mask = self.radius**2 >= (x - torch.tile(self.center, (n, 1))) ** 2
+        return mask
 
 
 class Node(BorelSet):
 
-	def __init__(self, d, bounds, parent):
-		super().__init__(d, bounds)
-		self.left = None
-		self.right = None
-		self.children = None
-		self.parent = parent
-
-		if self.parent is None:
-			self.level = 1
-		else:
-			self.level = parent.level + 1
-
-
-class HierarchicalBorelSets():
-
-	def __init__(self, d, interval, levels):
-		if d == 1:
-			self.top_node = Node(d, torch.Tensor([interval]), None)
-		elif d == 2:
-			self.top_node = Node(d, torch.Tensor(interval), None)
-
-		self.Sets = [self.top_node]
-		self.levels = levels
-		if d == 1:
-			self.construct_1d(interval, levels, self.Sets, self.top_node)
-		else:
-			self.construct_2d(self.top_node.bounds, levels, self.Sets, self.top_node)
-		self.d = d
-
-	def get_parent_set(self):
-		return self.top_node
-
-	def get_sets_level(self, l):
-		out = []
-		for s in self.Sets:
-			if s.level == l:
-				out.append(s)
-		return out
-
-	def get_all_sets(self):
-		return self.Sets
-
-	def get_ball_coverings(self, n, radius='auto'):
-		D = self.get_parent_set()
-		centers = D.return_discretization(n)
-		n = centers.size()[0]
-		sets = []
-		for i in range(n):
-			if radius == 'auto':
-				sets.append(BallSet(D.d, centers[i, :], 2. / n))
-			else:
-				sets.append(BallSet(D.d, centers[i, :], radius))
-		return sets
-
-	def construct_1d(self, interval, levels, S, parent):
-
-		if levels > 1:
-			a, b = interval
-			c = (a + b) / 2.
-
-			S_1 = Node(1, torch.Tensor([[a, c]]), parent)
-			S_2 = Node(1, torch.Tensor([[c, b]]), parent)
-
-			parent.left = S_1
-			parent.right = S_2
-
-			S.append(S_1)
-			self.construct_1d((a, c), levels - 1, S, S_1)
-			S.append(S_2)
-			self.construct_1d((c, b), levels - 1, S, S_2)
-
-		else:
-			return None
-
-	def construct_2d(self, interval, levels, S, parent):
-		if levels > 1:
-			xa = interval[0, 0]
-			xb = interval[0, 1]
-			ya = interval[1, 0]
-			yb = interval[1, 1]
-
-			midx = xa + (xb - xa) / 2.
-			midy = ya + (yb - ya) / 2.
-
-			S1 = Node(2, torch.Tensor([[xa, midx], [ya, midy]]), parent)
-			S2 = Node(2, torch.Tensor([[xa, midx], [midy, yb]]), parent)
-			S3 = Node(2, torch.Tensor([[midx, xb], [ya, midy]]), parent)
-			S4 = Node(2, torch.Tensor([[midx, xb], [midy, yb]]), parent)
-
-			parent.children = [S1, S2, S3, S4]
-
-			for child in parent.children:
-				S.append(child)
-				self.construct_2d(child.bounds, levels - 1, S, child)
-		else:
-			return None
+    def __init__(self, d, bounds, parent):
+        super().__init__(d, bounds)
+        self.left = None
+        self.right = None
+        self.children = None
+        self.parent = parent
+
+        if self.parent is None:
+            self.level = 1
+        else:
+            self.level = parent.level + 1
+
+
+class HierarchicalBorelSets:
+
+    def __init__(self, d, interval, levels):
+        if d == 1:
+            self.top_node = Node(d, torch.tensor([interval]), None)
+        elif d == 2:
+            self.top_node = Node(d, torch.tensor(interval), None)
+
+        self.Sets = [self.top_node]
+        self.levels = levels
+        if d == 1:
+            self.construct_1d(interval, levels, self.Sets, self.top_node)
+        else:
+            self.construct_2d(self.top_node.bounds, levels, self.Sets, self.top_node)
+        self.d = d
+
+    def get_parent_set(self):
+        return self.top_node
+
+    def get_sets_level(self, l):
+        out = []
+        for s in self.Sets:
+            if s.level == l:
+                out.append(s)
+        return out
+
+    def get_all_sets(self):
+        return self.Sets
+
+    def get_ball_coverings(self, n, radius="auto"):
+        D = self.get_parent_set()
+        centers = D.return_discretization(n)
+        n = centers.size()[0]
+        sets = []
+        for i in range(n):
+            if radius == "auto":
+                sets.append(BallSet(D.d, centers[i, :], 2.0 / n))
+            else:
+                sets.append(BallSet(D.d, centers[i, :], radius))
+        return sets
+
+    def construct_1d(self, interval, levels, S, parent):
+
+        if levels > 1:
+            a, b = interval
+            c = (a + b) / 2.0
+
+            S_1 = Node(1, torch.tensor([[a, c]]), parent)
+            S_2 = Node(1, torch.tensor([[c, b]]), parent)
+
+            parent.left = S_1
+            parent.right = S_2
+
+            S.append(S_1)
+            self.construct_1d((a, c), levels - 1, S, S_1)
+            S.append(S_2)
+            self.construct_1d((c, b), levels - 1, S, S_2)
+
+        else:
+            return None
+
+    def construct_2d(self, interval, levels, S, parent):
+        if levels > 1:
+            xa = interval[0, 0]
+            xb = interval[0, 1]
+            ya = interval[1, 0]
+            yb = interval[1, 1]
+
+            midx = xa + (xb - xa) / 2.0
+            midy = ya + (yb - ya) / 2.0
+
+            S1 = Node(2, torch.tensor([[xa, midx], [ya, midy]]), parent)
+            S2 = Node(2, torch.tensor([[xa, midx], [midy, yb]]), parent)
+            S3 = Node(2, torch.tensor([[midx, xb], [ya, midy]]), parent)
+            S4 = Node(2, torch.tensor([[midx, xb], [midy, yb]]), parent)
+
+            parent.children = [S1, S2, S3, S4]
+
+            for child in parent.children:
+                S.append(child)
+                self.construct_2d(child.bounds, levels - 1, S, child)
+        else:
+            return None
 
 
 if __name__ == "__main__":
-	center = torch.Tensor([0.5, 0.5]).double()
-	radius = 0.1
-	d = 2
-	B = BallSet(d, center, radius)
-
-	weights, xtest = B.return_legendre_discretization(10)
-	xtest2 = B.return_discretization(10)
-	print(torch.sum(weights))
-	plt.plot(xtest[:, 0], xtest[:, 1], 'ko')
-	plt.plot(xtest2[:, 0], xtest2[:, 1], 'ro')
-	plt.show()
+    center = torch.tensor([0.5, 0.5]).double()
+    radius = 0.1
+    d = 2
+    B = BallSet(d, center, radius)
+
+    weights, xtest = B.return_legendre_discretization(10)
+    xtest2 = B.return_discretization(10)
+    print(torch.sum(weights))
+    plt.plot(xtest[:, 0], xtest[:, 1], "ko")
+    plt.plot(xtest2[:, 0], xtest2[:, 1], "ro")
+    plt.show()
diff --git a/stpy/candidate_set.py b/stpy/candidate_set.py
index 663a2aa..e3333e7 100644
--- a/stpy/candidate_set.py
+++ b/stpy/candidate_set.py
@@ -2,60 +2,61 @@
 import torch
 
 
-class CandidateSet():
+class CandidateSet:
+
+    def __init__(self):
+        pass
 
-	def __init__(self):
-		pass
 
 class CandidateDiscreteSet(CandidateSet):
 
-	def __init__(self, xtest):
-		super().__init__()
-		self.xtest = xtest
-		self.embedded = False
-
-	def get_set_size(self):
-		return self.xtest.size()[0]
-
-	def get_dim(self):
-		return self.xtest.size()[1]
-
-	def get_emb_dim(self):
-		if self.embedded:
-			return self.emb_xtest.size()[1]
-		else:
-			return self.xtest.size()[1]
-
-	def get_random_elements(self, size = 1):
-		n = self.get_set_size()
-		indices = np.random.choice(np.arange(0,n,1), size)
-		print (indices)
-		if self.embedded:
-			elem = self.emb_xtest[indices, :]
-		else:
-			elem = self.xtest[indices,:]
-		print (elem)
-		return elem
-
-	def debug_subsample(self):
-		self.xtest = self.xtest[0:20000,:]
-
-	def get_options_per_dim(self):
-		d = {}
-		dims = self.get_dim()
-		for i in range(dims):
-			d[i] = torch.unique(self.xtest[:,i])
-		return d
-
-	def get_options(self):
-		if self.embedded:
-			return self.emb_xtest
-		else:
-			return self.xtest
-
-	def get_options_raw(self):
-		return self.xtest
-
-	def use_embedding(self, embed):
-		self.embedded = True
-		self.emb_xtest = embed(self.xtest)
+    def __init__(self, xtest):
+        super().__init__()
+        self.xtest = xtest
+        self.embedded = False
+
+    def get_set_size(self):
+        return self.xtest.size()[0]
+
+    def get_dim(self):
+        return self.xtest.size()[1]
+
+    def get_emb_dim(self):
+        if self.embedded:
+            return self.emb_xtest.size()[1]
+        else:
+            return self.xtest.size()[1]
+
+    def get_random_elements(self, size=1):
+        n = self.get_set_size()
+        indices = np.random.choice(np.arange(0, n, 1), size)
+        print(indices)
+        if self.embedded:
+            elem = self.emb_xtest[indices, :]
+        else:
+            elem = self.xtest[indices, :]
+        print(elem)
+        return elem
+
+    def debug_subsample(self):
+        self.xtest = self.xtest[0:20000, :]
+
+    def get_options_per_dim(self):
+        d = {}
+        dims = self.get_dim()
+        for i in range(dims):
+            d[i] = torch.unique(self.xtest[:, i])
+        return d
+
+    def get_options(self):
+        if self.embedded:
+            return self.emb_xtest
+        else:
+            return self.xtest
+
+    def get_options_raw(self):
+        return self.xtest
+
+    def use_embedding(self, embed):
+        self.embedded = True
+        self.emb_xtest = embed(self.xtest)
diff --git a/stpy/continuous_processes/categorical_mixture.py b/stpy/continuous_processes/categorical_mixture.py
index c22bff8..8dc3689 100755
--- a/stpy/continuous_processes/categorical_mixture.py
+++ b/stpy/continuous_processes/categorical_mixture.py
@@ -8,179 +8,188 @@
 
 class CategoricalMixture(GaussianProcess):
 
-	def __init__(self, processes, init_weights=None, d=1, bounds=None):
-		if init_weights is None:
-			self.k = len(processes)
-			init_weights = torch.ones(size=(self.k, 1)).view(-1).double() * 1. / float(self.k)
-		else:
-			self.k = len(processes)
-
-		if len(processes) != init_weights.shape[0]:
-			raise AssertionError("Not the same number")
-
-		self.processes = processes
-		self.bounds = bounds
-		self.beta = 2.
-		self.d = d
-		self.x = None
-		self.y = None
-		self.init_weights = init_weights
-		if torch.sum(self.init_weights) > 1.:
-			self.init_weights = self.init_weights / torch.sum(self.init_weights)
-		self.weights = self.init_weights
-
-	def add_data_point(self, x, y):
-		for model in self.processes:
-			model.add_data_point(x, y)
-
-	def log_prob_normal(self, K, y):
-		Knumpy = K.detach().numpy()
-		ynumpy = y.detach().numpy()
-
-		decomp = scipy.linalg.lu_factor(Knumpy)
-		alpha = scipy.linalg.lu_solve(decomp, ynumpy)
-
-		logprob = -0.5 * ynumpy.T.dot(alpha) - 0.5 * np.linalg.slogdet(Knumpy)[1] - 0.5 * ynumpy.shape[0] * np.log(
-			2 * np.pi)
-
-		return float(logprob)
-
-	def fit_gp(self, x, y, iterative=False):
-		self.x = x
-		self.y = y
-
-		logprobs = torch.zeros(size=(self.k, 1)).view(-1).double()
-
-		for j in range(self.k):
-			GP = self.processes[j]
-			GP.fit(x, y)
-			K = GP.get_kernel()
-			logprobs[j] = self.log_prob_normal(K, y)
-
-		# print("Neg. log likelihood vector:", -logprobs)
-
-		log_init_prob = torch.log(self.init_weights)
-		log_posterior = log_init_prob + logprobs
-		log_evidence = torch.logsumexp(log_posterior, dim=0)
-		self.weights = torch.exp(log_posterior - log_evidence)
-
-		# print ("Categorical Probability: ",self.weights)
-		# print ("---------------------------------")
-
-		self.fit = True
-		return True
-
-	def mean_std(self, xtest):
-		mu = torch.zeros(size=(xtest.size()[0], 1)).double()
-		s = torch.zeros(size=(xtest.size()[0], 1)).double()
-		for j in range(self.k):
-			(a1, a2) = self.processes[j].mean_std(xtest)
-
-			mu = mu + self.weights[j] * a1
-			s = s + self.weights[j] * a2 ** 2
-		s = torch.sqrt(s)
-		return (mu, s)
-
-	def sample(self, xtest, size=1, with_mask=False):
-		# sample a GP
-		k = np.random.choice(np.arange(0, self.k, 1), p=self.weights.flatten())
-		mask = [k]
-		if self.fit == True:
-			self.processes[k].fit(self.x, self.y)
-			samples = self.processes[k].sample(xtest, size=1)
-		else:
-			samples = self.processes[k].sample(xtest, size=1)
-
-		for s in range(size - 1):
-			k = np.random.choice(np.arange(0, self.k, 1), p=self.weights.flatten())
-			mask.append(k)
-			if self.fit == True:
-				self.processes[k].fit(self.x, self.y)
-				sample = self.processes[k].sample(xtest, size=1)
-				samples = torch.cat((samples, sample), dim=1)
-			else:
-				sample = self.processes[k].sample(xtest, size=1)
-				samples = torch.cat((samples, sample), dim=1)
-		if with_mask == True:
-			return (samples, mask)
-		else:
-			return samples
+    def __init__(self, processes, init_weights=None, d=1, bounds=None):
+        if init_weights is None:
+            self.k = len(processes)
+            init_weights = (
+                torch.ones(size=(self.k, 1)).view(-1).double() * 1.0 / float(self.k)
+            )
+        else:
+            self.k = len(processes)
+
+        if len(processes) != init_weights.shape[0]:
+            raise AssertionError("Not the same number")
+
+        self.processes = processes
+        self.bounds = bounds
+        self.beta = 2.0
+        self.d = d
+        self.x = None
+        self.y = None
+        self.init_weights = init_weights
+        if torch.sum(self.init_weights) > 1.0:
+            self.init_weights = self.init_weights / torch.sum(self.init_weights)
+        self.weights = self.init_weights
+
+    def add_data_point(self, x, y):
+        for model in self.processes:
+            model.add_data_point(x, y)
+
+    def log_prob_normal(self, K, y):
+        Knumpy = K.detach().numpy()
+        ynumpy = y.detach().numpy()
+
+        decomp = scipy.linalg.lu_factor(Knumpy)
+        alpha = scipy.linalg.lu_solve(decomp, ynumpy)
+
+        logprob = (
+            -0.5 * ynumpy.T.dot(alpha)
+            - 0.5 * np.linalg.slogdet(Knumpy)[1]
+            - 0.5 * ynumpy.shape[0] * np.log(2 * np.pi)
+        )
+
+        return float(logprob)
+
+    def fit_gp(self, x, y, iterative=False):
+        self.x = x
+        self.y = y
+
+        logprobs = torch.zeros(size=(self.k, 1)).view(-1).double()
+
+        for j in range(self.k):
+            GP = self.processes[j]
+            GP.fit(x, y)
+            K = GP.get_kernel()
+            logprobs[j] = self.log_prob_normal(K, y)
+
+        # print("Neg. log likelihood vector:", -logprobs)
+
+        log_init_prob = torch.log(self.init_weights)
+        log_posterior = log_init_prob + logprobs
+        log_evidence = torch.logsumexp(log_posterior, dim=0)
+        self.weights = torch.exp(log_posterior - log_evidence)
+
+        # print ("Categorical Probability: ",self.weights)
+        # print ("---------------------------------")
+
+        self.fit = True
+        return True
+
+    def mean_std(self, xtest):
+        mu = torch.zeros(size=(xtest.size()[0], 1)).double()
+        s = torch.zeros(size=(xtest.size()[0], 1)).double()
+        for j in range(self.k):
+            (a1, a2) = self.processes[j].mean_std(xtest)
+
+            mu = mu + self.weights[j] * a1
+            s = s + self.weights[j] * a2**2
+        s = torch.sqrt(s)
+        return (mu, s)
+
+    def sample(self, xtest, size=1, with_mask=False):
+        # sample a GP
+        k = np.random.choice(np.arange(0, self.k, 1), p=self.weights.flatten())
+        mask = [k]
+        if self.fit == True:
+            self.processes[k].fit(self.x, self.y)
+            samples = self.processes[k].sample(xtest, size=1)
+        else:
+            samples = self.processes[k].sample(xtest, size=1)
+
+        for s in range(size - 1):
+            k = np.random.choice(np.arange(0, self.k, 1), p=self.weights.flatten())
+            mask.append(k)
+            if self.fit == True:
+                self.processes[k].fit(self.x, self.y)
+                sample = self.processes[k].sample(xtest, size=1)
+                samples = torch.cat((samples, sample), dim=1)
+            else:
+                sample = self.processes[k].sample(xtest, size=1)
+                samples = torch.cat((samples, sample), dim=1)
+        if with_mask == True:
+            return (samples, mask)
+        else:
+            return samples
 
 
 if __name__ == "__main__":
 
-	# domain size
-	L_infinity_ball = 5
-	# dimension
-	d = 1
-	# error variance
-	s = 0.001
-	# grid density
-	n = 512
-	# number of intial points
-	N = 15
-
-	# model
-	# GP1 = GaussianProcess(kernel="squared_exponential", s=s, gamma = 1.5, diameter=L_infinity_ball)
-	GP1 = GaussianProcess(kernel="modified_matern", s=s, kappa=1., nu=2, gamma=1.5)
-	GP2 = GaussianProcess(kernel="modified_matern", s=s, kappa=1., nu=1, gamma=0.7)
-	# GP2 = GaussianProcess(kernel="squared_exponential", s=s, gamma=1.1)
-	GP3 = GaussianProcess(kernel="modified_matern", s=s, kappa=1., nu=2, gamma=1)
-	GP4 = GaussianProcess(kernel="linear", s=s, kappa=1.)
-
-	# data
-	# GPTrue = GaussianProcess(kernel="linear", s=0, kappa=1., diameter=L_infinity_ball)
-	# GPTrue = GaussianProcess(kernel="squared_exponential", s=s, gamma=2., kappa = 1)
-	GPTrue = GaussianProcess(kernel="modified_matern", s=s, kappa=1., nu=2, gamma=1.1)
-
-	# test environment
-
-	d = 1
-	from stpy.test_functions.benchmarks import GaussianProcessSample
-
-	BenchmarkFunc = GaussianProcessSample(d=d, n=n, sigma=0., gamma=0.2, name="squared_exponential")
-	x = BenchmarkFunc.initial_guess(N)
-	xtest = BenchmarkFunc.interval(n)
-	BenchmarkFunc.optimize(xtest, s)
-	gamma = BenchmarkFunc.bandwidth()
-	bounds = BenchmarkFunc.bounds()
-	BenchmarkFunc.scale_max(xtest=xtest)
-	F = lambda x: BenchmarkFunc.eval(x, sigma=s)
-
-	# targets
-	y = F(x)
-	GPs = [GP1, GP2, GP3, GP4]
-	# Mix = CategoricalMixture(GPs,init_weights=np.array([0.01,0.01,0.98]))
-	Mix = CategoricalMixture(GPs)
-
-	for j in range(N):
-		plt.figure(1)
-		plt.clf()
-		X = x[0:j + 1, :].reshape(-1, 1)
-		y = F(X)
-		Mix.fit_gp(X, y)
-		(mu, var) = Mix.mean_std(xtest)
-		samples = Mix.sample(xtest, size=5)
-		f = F(xtest).numpy()
-		mu = mu.numpy()
-		var = var.numpy()
-		samples = samples.numpy()
-		xtest2 = xtest.numpy()
-
-		plt.plot(xtest2, samples, '--', linewidth=2, alpha=0.3)
-		plt.plot(xtest2, mu, 'k', linewidth=3)
-		plt.plot(xtest2, mu, 'k', linewidth=3)
-		plt.fill_between(xtest2.flat, (mu - 2 * var).flat, (mu + 2 * var).flat, color="#dddddd")
-		plt.plot(X, y, 'ro', markersize=10)
-		plt.plot(xtest2, f, 'g', linewidth=3)
-		plt.draw()
-
-		plt.figure(2)
-		plt.clf()
-		plt.title("Probability of Category")
-		plt.bar(np.arange(len(GPs)), Mix.weights, np.ones(len(GPs)) * 0.5)
-		plt.xticks(np.arange(len(GPs)), [GP.description() for GP in GPs], rotation=30)
-		plt.subplots_adjust(bottom=0.35)
-		plt.plot()
-		plt.show()
+    # domain size
+    L_infinity_ball = 5
+    # dimension
+    d = 1
+    # error variance
+    s = 0.001
+    # grid density
+    n = 512
+    # number of initial points
+    N = 15
+
+    # model
+    # GP1 = GaussianProcess(kernel="squared_exponential", s=s, gamma = 1.5, diameter=L_infinity_ball)
+    GP1 = GaussianProcess(kernel="modified_matern", s=s, kappa=1.0, nu=2, gamma=1.5)
+    GP2 = GaussianProcess(kernel="modified_matern", s=s, kappa=1.0, nu=1, gamma=0.7)
+    # GP2 = GaussianProcess(kernel="squared_exponential", s=s, gamma=1.1)
+    GP3 = GaussianProcess(kernel="modified_matern", s=s, kappa=1.0, nu=2, gamma=1)
+    GP4 = GaussianProcess(kernel="linear", s=s, kappa=1.0)
+
+    # data
+    # GPTrue = GaussianProcess(kernel="linear", s=0, kappa=1., diameter=L_infinity_ball)
+    # GPTrue = GaussianProcess(kernel="squared_exponential", s=s, gamma=2., kappa = 1)
+    GPTrue = GaussianProcess(kernel="modified_matern", s=s, kappa=1.0, nu=2, gamma=1.1)
+
+    # test environment
+
+    d = 1
+    from stpy.test_functions.benchmarks import GaussianProcessSample
+
+    BenchmarkFunc = GaussianProcessSample(
+        d=d, n=n, sigma=0.0, gamma=0.2, name="squared_exponential"
+    )
+    x = BenchmarkFunc.initial_guess(N)
+    xtest = BenchmarkFunc.interval(n)
+    BenchmarkFunc.optimize(xtest, s)
+    gamma = BenchmarkFunc.bandwidth()
+    bounds = BenchmarkFunc.bounds()
+    BenchmarkFunc.scale_max(xtest=xtest)
+    F = lambda x: BenchmarkFunc.eval(x, sigma=s)
+
+    # targets
+    y = F(x)
+    GPs = [GP1, GP2, GP3, GP4]
+    # Mix = CategoricalMixture(GPs,init_weights=np.array([0.01,0.01,0.98]))
+    Mix = CategoricalMixture(GPs)
+
+    for j in range(N):
+        plt.figure(1)
+        plt.clf()
+        X = x[0 : j + 1, :].reshape(-1, 1)
+        y = F(X)
+        Mix.fit_gp(X, y)
+        (mu, var) = Mix.mean_std(xtest)
+        samples = Mix.sample(xtest, size=5)
+        f = F(xtest).numpy()
+        mu = mu.numpy()
+        var = var.numpy()
+        samples = samples.numpy()
+        xtest2 = xtest.numpy()
+
+        plt.plot(xtest2, samples, "--", linewidth=2, alpha=0.3)
+        plt.plot(xtest2, mu, "k", linewidth=3)
+        plt.plot(xtest2, mu, "k", linewidth=3)
+        plt.fill_between(
+            xtest2.flat, (mu - 2 * var).flat, (mu + 2 * var).flat, color="#dddddd"
+        )
+        plt.plot(X, y, "ro", markersize=10)
+        plt.plot(xtest2, f, "g", linewidth=3)
+        plt.draw()
+
+        plt.figure(2)
+        plt.clf()
+        plt.title("Probability of Category")
+        plt.bar(np.arange(len(GPs)), Mix.weights, np.ones(len(GPs)) * 0.5)
+        plt.xticks(np.arange(len(GPs)), [GP.description() for GP in GPs], rotation=30)
+        plt.subplots_adjust(bottom=0.35)
+        plt.plot()
+        plt.show()
 # plt.pause(4)
diff --git a/stpy/continuous_processes/convex_rkhs.py b/stpy/continuous_processes/convex_rkhs.py
index 30f3775..a3df5e1 100644
--- a/stpy/continuous_processes/convex_rkhs.py
+++ b/stpy/continuous_processes/convex_rkhs.py
@@ -2,17 +2,21 @@
 import torch
 from torchmin import minimize
 from stpy.candidate_set import CandidateDiscreteSet
-from stpy.generative_models.conditional_generative_model import ConditionalGenerativeModel
+from stpy.generative_models.conditional_generative_model import (
+    ConditionalGenerativeModel,
+)
+
+
 class ConvexRKHS(KernelizedFeatures):
-    """
-    """
+    """ """
 
-    def __init__(self, embedding, m, lam = 0. , s = 0.01):
+    def __init__(self, embedding, m, lam=0.0, s=0.01):
         super().__init__(embedding, m)
         self.Gamma = torch.eye(m, requires_grad=True).double()
         self.lam = lam
         self.s = s
-    def fit(self,x=None,y=None):
+
+    def fit(self, x=None, y=None):
         """
         legacy method
         :param x:
@@ -20,32 +24,53 @@ def fit(self,x=None,y=None):
         :return:
         """
         pass
+
     def weight_scaling(self, Gamma, scale, x_single, y, Phi):
         x = torch.tile(x_single, (y.size()[0], 1))
-        return torch.exp(-torch.sum(((Phi(x) - Phi(y)) @ Gamma /scale) ** 2, axis=1))
+        return torch.exp(-torch.sum(((Phi(x) - Phi(y)) @ Gamma / scale) ** 2, axis=1))
 
     def local_fit(self, weights):
         D = torch.diag(weights)
         X = self.embed(self.x)
-        theta = torch.linalg.inv((X.T @ D @ X) + self.lam * torch.eye(self.m)) @ X.T @ D @ self.y
+        theta = (
+            torch.linalg.inv((X.T @ D @ X) + self.lam * torch.eye(self.m))
+            @ X.T
+            @ D
+            @ self.y
+        )
         return theta
-    def optimize_params(self, type='bandwidth', restarts=10, regularizer=None,
-						maxiter=1000, mingradnorm=1e-4, verbose=False, optimizer="pymanopt", scale=1., weight=1., save = False,
-								save_name = 'model.np', init_func = None, bounds = None, parallel = False, cores = None):
+
+    def optimize_params(
+        self,
+        type="bandwidth",
+        restarts=10,
+        regularizer=None,
+        maxiter=1000,
+        mingradnorm=1e-4,
+        verbose=False,
+        optimizer="pymanopt",
+        scale=1.0,
+        weight=1.0,
+        save=False,
+        save_name="model.np",
+        init_func=None,
+        bounds=None,
+        parallel=False,
+        cores=None,
+    ):
 
         x_data = self.x
         y_data = self.y
         Phi = lambda x: self.embedding.embed(x)
         m = self.get_basis_size()
 
-
         def total_loss(gamma):
             weights = []
             predictions = []
             for i in range(x_data.size()[0]):
                 x = x_data[i]
-                Gamma =  torch.diag(gamma)
-                w = self.weight_scaling(Gamma, 1., x, x_data, Phi)
+                Gamma = torch.diag(gamma)
+                w = self.weight_scaling(Gamma, 1.0, x, x_data, Phi)
                 X = Phi(x_data)
 
                 # local fit in the new coordinates
@@ -61,21 +86,21 @@ def total_loss(gamma):
 
             for p1, w1 in zip(predictions, weights):
                 # loss that makes sure we predict correctly
-                loss = 1* torch.sum(((p1 - y_data) ** 2)/(self.s**2) * (w1)) / 2
+                loss = 1 * torch.sum(((p1 - y_data) ** 2) / (self.s**2) * (w1)) / 2
 
                 for p2, w2 in zip(predictions, weights):
                     # loss that makes sure the predictions are consistent (this can be a larger set)
-                    loss += 1* torch.sum((p1 - p2)**2/(self.s**2) * (w1 * w2))
+                    loss += 1 * torch.sum((p1 - p2) ** 2 / (self.s**2) * (w1 * w2))
 
-            return loss + 0.001*torch.sum(gamma**2)
+            return loss + 0.001 * torch.sum(gamma**2)
 
         # optimize this
         vals = []
         args = []
         for _ in range(restarts):
-            gamma = torch.randn(m, requires_grad=True).double()**2
+            gamma = torch.randn(m, requires_grad=True).double() ** 2
             total_loss(gamma)
-            result = minimize(total_loss, gamma, method='bfgs', disp=2)
+            result = minimize(total_loss, gamma, method="bfgs", disp=2)
             vals.append(result.fun)
             args.append(result.x)
 
@@ -83,11 +108,11 @@ def total_loss(gamma):
 
     def mean(self, xtest):
         phitest = self.embed(xtest)
-        out = torch.zeros(size = (phitest.size()[0],1)).double()
+        out = torch.zeros(size=(phitest.size()[0], 1)).double()
         for i, x in enumerate(xtest):
-            w = self.weight_scaling(self.Gamma, 1., x, self.x, self.embed)
-            out[i] = 0.
-            f = self.embed(x)@self.local_fit(w)
+            w = self.weight_scaling(self.Gamma, 1.0, x, self.x, self.embed)
+            out[i] = 0.0
+            f = self.embed(x) @ self.local_fit(w)
             out[i] = f
         return out
 
@@ -96,17 +121,17 @@ def best_points_so_far(self):
         get all points which are above max - 2*s
         :return:
         """
-        conservative_best_value = torch.max(self.y) - 2*self.s
+        conservative_best_value = torch.max(self.y) - 2 * self.s
         mask = self.y > conservative_best_value
-        return self.x[mask,:]
+        return self.x[mask, :]
 
-    def sample_neighbourhood_sample(self, x_loc, candidate_set, cut_off = 0.01, size = 10):
-        if isinstance(CandidateDiscreteSet,candidate_set):
+    def sample_neighbourhood_sample(self, x_loc, candidate_set, cut_off=0.01, size=10):
+        if isinstance(CandidateDiscreteSet, candidate_set):
             xtest = self.embed(candidate_set.get_options_raw)
-            w = self.weight_scaling(self.Gamma, 1., x_loc,xtest, self.embed)
+            w = self.weight_scaling(self.Gamma, 1.0, x_loc, xtest, self.embed)
             selection = xtest[w > cut_off]
             max_v = selection.size()[0]
-            indices = np.random.choice(max_v, size = size)
+            indices = np.random.choice(max_v, size=size)
             out = selection[indices]
             return out
         elif isinstance(ConditionalGenerativeModel, candidate_set):
@@ -115,7 +140,7 @@ def sample_neighbourhood_sample(self, x_loc, candidate_set, cut_off = 0.01, size
             NotImplementedError("The requested candidate set method is not implemented")
 
     def func_gradient(self, x):
-        w = self.weight_scaling(self.Gamma, 1., x, self.x,  self.embed)
+        w = self.weight_scaling(self.Gamma, 1.0, x, self.x, self.embed)
         return self.local_fit(weights=w)
 
 
@@ -129,23 +154,23 @@ def func_gradient(self, x):
     n = 256
     N = 4
     lam = 1e-6
-    gamma_original = torch.randn(size = (embedding.get_m(),)).double()
+    gamma_original = torch.randn(size=(embedding.get_m(),)).double()
     xtest = interval_torch(d=1, n=n)
-    x = torch.zeros(size =(N,1)).double()
+    x = torch.zeros(size=(N, 1)).double()
     x = x.uniform_()
 
     Phi_original = lambda x: embedding.embed(x) @ torch.diag(gamma_original)
     Phi = lambda x: embedding.embed(x)
     y = torch.sum(Phi_original(x) ** 2, axis=1).view(-1)
-    ytest= torch.sum(Phi_original(xtest) ** 2, axis=1).view(-1)
-    Estimator = ConvexRKHS(embedding, embedding.get_m(), lam = lam )
-    #Estimator = torch.compile(Estimator)
+    ytest = torch.sum(Phi_original(xtest) ** 2, axis=1).view(-1)
+    Estimator = ConvexRKHS(embedding, embedding.get_m(), lam=lam)
+    # Estimator = torch.compile(Estimator)
 
     Estimator.load_data((x, y))
     Estimator.optimize_params()
 
-    print ("True gamma:",gamma_original)
-    print ("Optimized gamma:", torch.diag(Estimator.Gamma))
+    print("True gamma:", gamma_original)
+    print("Optimized gamma:", torch.diag(Estimator.Gamma))
     offset = 20
     Phi = lambda x: embedding.embed(x)
     fig, ax1 = plt.subplots()
@@ -153,24 +178,31 @@ def func_gradient(self, x):
 
     for i in range(xtest.size()[0]):
         x = xtest[i]
-        w = Estimator.weight_scaling(Estimator.Gamma,  1., x, xtest, Phi)
+        w = Estimator.weight_scaling(Estimator.Gamma, 1.0, x, xtest, Phi)
         D = torch.diag(w)
         X = Phi(xtest)
-        theta = torch.linalg.inv((X.T@D@X) + lam * torch.eye(embedding.get_m()))@X.T@D@ytest
-        prediction = (X@theta).detach()
-
-        if i%64 == 0:
-            p = ax1.plot(xtest[i],
-                     prediction[i],'o',ms = 10)
-
-            ax1.plot(xtest[np.max([0,i-offset]):np.min([i+offset,n])],
-                     prediction[np.max([0,i-offset]):np.min([i+offset,n])], color = p[0].get_color())
-            ax2.plot(xtest, w, color = p[0].get_color())
+        theta = (
+            torch.linalg.inv((X.T @ D @ X) + lam * torch.eye(embedding.get_m()))
+            @ X.T
+            @ D
+            @ ytest
+        )
+        prediction = (X @ theta).detach()
+
+        if i % 64 == 0:
+            p = ax1.plot(xtest[i], prediction[i], "o", ms=10)
+
+            ax1.plot(
+                xtest[np.max([0, i - offset]) : np.min([i + offset, n])],
+                prediction[np.max([0, i - offset]) : np.min([i + offset, n])],
+                color=p[0].get_color(),
+            )
+            ax2.plot(xtest, w, color=p[0].get_color())
 
     mu = Estimator.mean(xtest)
 
-    ax1.plot(xtest, mu, 'b')
-    ax1.plot(xtest,ytest,'k--')
-    ax1.plot(Estimator.x,Estimator.y,'ko')
+    ax1.plot(xtest, mu, "b")
+    ax1.plot(xtest, ytest, "k--")
+    ax1.plot(Estimator.x, Estimator.y, "ko")
 
-    plt.show()
\ No newline at end of file
+    plt.show()
diff --git a/stpy/continuous_processes/dirichlet_mixture.py b/stpy/continuous_processes/dirichlet_mixture.py
index 2839b61..cb6a6ff 100755
--- a/stpy/continuous_processes/dirichlet_mixture.py
+++ b/stpy/continuous_processes/dirichlet_mixture.py
@@ -7,117 +7,127 @@
 
 class DirichletMixture(Estimator):
 
-	def __init__(self, processes):
-		self.processes = processes
-		self.k = len(self.processes)
-		self.s = processes[0].s
-
-	def fit_GP(self, X, y, xtest=None, N=200):
-		self.X = X
-		self.y = y
-		n = X.shape[0]
-		self.fit = True
-		return True
-
-	def custom_kernel(self, a, b, alpha):
-		kernel = alpha[0] * self.processes[0].kernel(a, b)
-		for j in np.arange(1, self.k, 1):
-			kernel = kernel + alpha[j] * self.processes[j].kernel(a, b)
-		return kernel
-
-	def mean_var(self, xtest, N=100):
-
-		self.K_mix = np.zeros(shape=(n, n))
-
-		mu = xtest * 0
-		s = xtest * 0
-
-		samples = np.zeros(shape=(N, xtest.shape[0], xtest.shape[1]))
-
-		for i in range(N):
-			alpha = np.random.dirichlet(np.ones(shape=(self.k)) * (1. / float(self.k)), 1)[0]
-			print("Dirichlet sample:", alpha)
-			kernel = lambda a, b: self.custom_kernel(a, b, alpha)
-			GP_mix = GaussianProcess(kernel="custom", custom=kernel, s=self.s)
-			GP_mix.fit_GP(self.X, self.y)
-			samples[i, :, :] = GP_mix.sample(xtest)
-
-		mu = np.mean(samples, axis=0)
-		s = np.var(samples, axis=0)
-		s = np.sqrt(s)
-
-		return (mu, s)
-
-	def sample(self, xtest, size=1, with_mask=False):
-		# sample a GP
-		if self.fit == True:
-			alpha = np.random.dirichlet(np.ones(shape=(self.k)) * (1. / float(self.k)), 1)[0]
-			kernel = lambda a, b: self.custom_kernel(a, b, alpha)
-			GP_mix = GaussianProcess(kernel="custom", custom=kernel, s=self.s)
-			GP_mix.fit_GP(self.X, self.y)
-			return GP_mix.sample(xtest)
-		else:
-			alpha = np.random.dirichlet(np.ones(shape=(self.k)) * (1. / float(self.k)), 1)[0]
-			kernel = lambda a, b: self.custom_kernel(a, b, alpha)
-			GP_mix = GaussianProcess(kernel="custom", custom=kernel, s=self.s)
-			return GP_mix.sample(xtest)
+    def __init__(self, processes):
+        self.processes = processes
+        self.k = len(self.processes)
+        self.s = processes[0].s
+
+    def fit_GP(self, X, y, xtest=None, N=200):
+        self.X = X
+        self.y = y
+        n = X.shape[0]
+        self.fit = True
+        return True
+
+    def custom_kernel(self, a, b, alpha):
+        kernel = alpha[0] * self.processes[0].kernel(a, b)
+        for j in np.arange(1, self.k, 1):
+            kernel = kernel + alpha[j] * self.processes[j].kernel(a, b)
+        return kernel
+
+    def mean_var(self, xtest, N=100):
+
+        self.K_mix = np.zeros(shape=(n, n))
+
+        mu = xtest * 0
+        s = xtest * 0
+
+        samples = np.zeros(shape=(N, xtest.shape[0], xtest.shape[1]))
+
+        for i in range(N):
+            alpha = np.random.dirichlet(
+                np.ones(shape=(self.k)) * (1.0 / float(self.k)), 1
+            )[0]
+            print("Dirichlet sample:", alpha)
+            kernel = lambda a, b: self.custom_kernel(a, b, alpha)
+            GP_mix = GaussianProcess(kernel="custom", custom=kernel, s=self.s)
+            GP_mix.fit_GP(self.X, self.y)
+            samples[i, :, :] = GP_mix.sample(xtest)
+
+        mu = np.mean(samples, axis=0)
+        s = np.var(samples, axis=0)
+        s = np.sqrt(s)
+
+        return (mu, s)
+
+    def sample(self, xtest, size=1, with_mask=False):
+        # sample a GP
+        if self.fit == True:
+            alpha = np.random.dirichlet(
+                np.ones(shape=(self.k)) * (1.0 / float(self.k)), 1
+            )[0]
+            kernel = lambda a, b: self.custom_kernel(a, b, alpha)
+            GP_mix = GaussianProcess(kernel="custom", custom=kernel, s=self.s)
+            GP_mix.fit_GP(self.X, self.y)
+            return GP_mix.sample(xtest)
+        else:
+            alpha = np.random.dirichlet(
+                np.ones(shape=(self.k)) * (1.0 / float(self.k)), 1
+            )[0]
+            kernel = lambda a, b: self.custom_kernel(a, b, alpha)
+            GP_mix = GaussianProcess(kernel="custom", custom=kernel, s=self.s)
+            return GP_mix.sample(xtest)
 
 
 if __name__ == "__main__":
 
-	# domain size
-	L_infinity_ball = 5
-	# dimension
-	d = 1
-	# error variance
-	s = 0.001
-	# grid density
-	n = 1024
-	# number of intial points
-	N = 15
-	# smoothness
-	gamma = 2
-
-	# model
-	GP1 = GaussianProcess(kernel="squared_exponential", s=s, gamma=1.5, diameter=L_infinity_ball)
-	GP2 = GaussianProcess(kernel="squared_exponential", s=s, gamma=1.1)
-	GP3 = GaussianProcess(kernel="modified_matern", s=s, kappa=1., nu=2, gamma=1.1)
-	GP4 = GaussianProcess(kernel="linear", s=s, kappa=1.)
-
-	# data
-	# GPTrue = GaussianProcess(kernel="linear", s=0, kappa=1., diameter=L_infinity_ball)
-	GPTrue = GaussianProcess(kernel="squared_exponential", s=s, gamma=2., kappa=1)
-	# GPTrue = GaussianProcess(kernel = "modified_matern", s =s, kappa = 1., nu = 2, gamma = 1.1)
-
-	# test environment
-	TT = code.test_problems.test_functions.test_function()
-	(d, xtest, x, gamma) = TT.sample_ss_bounds(N, n, d=d, L_infinity_ball=L_infinity_ball)
-	f = lambda x: TT.sample_ss(x, sigma=0, GP=GPTrue)
-
-	# targets
-	y = f(x)
-	GPs = [GP1, GP2, GP3, GP4]
-	Mix = DirichletMixture(GPs)
-	for j in range(N):
-		plt.figure(1)
-		plt.clf()
-		X = x[0:j + 1, :].reshape(-1, 1)
-		y = f(X)
-		Mix.fit_GP(X, y)
-		(mu, var) = Mix.mean_var(xtest)
-		samples = Mix.sample(xtest, size=5)
-		plt.plot(xtest, samples, '--', linewidth=3, alpha=0.1)
-		plt.plot(xtest, mu, 'k', linewidth=4)
-		plt.plot(xtest, mu, 'k', linewidth=4)
-		plt.fill_between(xtest.flat, (mu - var).flat, (mu + var).flat, color="#dddddd")
-		plt.plot(X, y, 'ro', markersize=10)
-		plt.plot(xtest, f(xtest), 'g', linewidth=4)
-		plt.draw()
-		# plt.figure(2)
-		# plt.clf()
-		# plt.title("Probability of Category")
-		# plt.bar(np.arange(len(GPs)), Mix.weights, np.ones(len(GPs))*0.5)
-		# plt.xticks(np.arange(len(GPs)), [GP.description() for GP in GPs], rotation=30)
-		# plt.subplots_adjust(bottom=0.35)
-		# plt.draw()
-		plt.pause(4)
+    # domain size
+    L_infinity_ball = 5
+    # dimension
+    d = 1
+    # error variance
+    s = 0.001
+    # grid density
+    n = 1024
+    # number of initial points
+    N = 15
+    # smoothness
+    gamma = 2
+
+    # model
+    GP1 = GaussianProcess(
+        kernel="squared_exponential", s=s, gamma=1.5, diameter=L_infinity_ball
+    )
+    GP2 = GaussianProcess(kernel="squared_exponential", s=s, gamma=1.1)
+    GP3 = GaussianProcess(kernel="modified_matern", s=s, kappa=1.0, nu=2, gamma=1.1)
+    GP4 = GaussianProcess(kernel="linear", s=s, kappa=1.0)
+
+    # data
+    # GPTrue = GaussianProcess(kernel="linear", s=0, kappa=1., diameter=L_infinity_ball)
+    GPTrue = GaussianProcess(kernel="squared_exponential", s=s, gamma=2.0, kappa=1)
+    # GPTrue = GaussianProcess(kernel = "modified_matern", s =s, kappa = 1., nu = 2, gamma = 1.1)
+
+    # test environment
+    TT = code.test_problems.test_functions.test_function()
+    (d, xtest, x, gamma) = TT.sample_ss_bounds(
+        N, n, d=d, L_infinity_ball=L_infinity_ball
+    )
+    f = lambda x: TT.sample_ss(x, sigma=0, GP=GPTrue)
+
+    # targets
+    y = f(x)
+    GPs = [GP1, GP2, GP3, GP4]
+    Mix = DirichletMixture(GPs)
+    for j in range(N):
+        plt.figure(1)
+        plt.clf()
+        X = x[0 : j + 1, :].reshape(-1, 1)
+        y = f(X)
+        Mix.fit_GP(X, y)
+        (mu, var) = Mix.mean_var(xtest)
+        samples = Mix.sample(xtest, size=5)
+        plt.plot(xtest, samples, "--", linewidth=3, alpha=0.1)
+        plt.plot(xtest, mu, "k", linewidth=4)
+        plt.plot(xtest, mu, "k", linewidth=4)
+        plt.fill_between(xtest.flat, (mu - var).flat, (mu + var).flat, color="#dddddd")
+        plt.plot(X, y, "ro", markersize=10)
+        plt.plot(xtest, f(xtest), "g", linewidth=4)
+        plt.draw()
+        # plt.figure(2)
+        # plt.clf()
+        # plt.title("Probability of Category")
+        # plt.bar(np.arange(len(GPs)), Mix.weights, np.ones(len(GPs))*0.5)
+        # plt.xticks(np.arange(len(GPs)), [GP.description() for GP in GPs], rotation=30)
+        # plt.subplots_adjust(bottom=0.35)
+        # plt.draw()
+        plt.pause(4)
diff --git a/stpy/continuous_processes/fourier_fea.py b/stpy/continuous_processes/fourier_fea.py
index b635c1c..c39333c 100755
--- a/stpy/continuous_processes/fourier_fea.py
+++ b/stpy/continuous_processes/fourier_fea.py
@@ -5,500 +5,662 @@
 
 
 class GaussianProcessFF(KernelizedFeatures):
-	'''	
-		Random Fourier Features for Gaussian Kernel
-	'''
-
-	def __init__(self, project=None, gamma=0.1, s=0.001, approx="rff", m=100, d=1, diameter=1.0, verbose=True,
-				 groups=None,
-				 bounds=None, scale=1.0, kernel="squared_exponential", nu=0.5, kappa=1.0):
-
-		self.gamma = gamma
-		self.s = s
-		self.x = None
-		self.K = 0
-		self.mu = 0.0
-		self.fit = False
-		self.beta = None
-		self.m = m
-		self.project = None
-		self.nu = nu
-		self.lam = 1.
-		if groups is None:
-			self.no_groups = 1
-		else:
-			self.no_groups = len(groups)
-
-		self.approx = approx
-		self.d = d
-		self.bounds = bounds
-		self.groups = groups
-		self.diameter = diameter
-		self.admits_first_order = True
-		self.verbose = verbose
-		self.kernel = kernel
-		self.scale = scale
-		self.m_old = None
-		self.kappa = kappa
-		self.heuristic_variance = False
-		if self.groups is None:
-			self.embedding_map = self.sample_embedding(self.d, self.m, self.gamma)
-			self.m = self.embedding_map.m
-		else:
-			self.no_groups = float(len(self.groups))
-			self.embedding_map = self.sample_embedding_group()
-
-	def resample(self):
-		self.embedding_map = self.sample_embedding_group()
-
-	def description(self):
-		"""
-		Description of GP in text
-		:return: string with description
-		"""
-		return "Fourier Features object\n" + "Appprox: " + self.approx + "\n" + "Bandwidth: " + str(
-			self.gamma) + "\n" + "Groups:" + str(self.groups) + "\n noise: " + str(self.s)
-
-	def get_gamma(self, t):
-		if self.kernel == "squared_exponential" and self.groups is None:
-			return (np.log(t)) ** self.d
-		elif self.kernel == "linear":
-			return 10 * self.m
-		elif self.kernel == "squared_exponential" and self.groups is not None:
-			return len(self.groups) * (np.log(t))
-		elif self.kernel == "matern":
-			return (np.log(t)) ** self.d
-		elif self.kernel == "modified_matern":
-			return (np.log(t)) ** self.d
-
-	def sample_embedding_group(self):
-		# self.m is a vector of ms
-		# self.gamma is a vector of gammas
-		embedding_map = []
-
-		self.d_effective = int(self.d / self.no_groups)
-
-		if self.groups is not None:
-			self.d_group_sizes = [len(group) for group in self.groups]
-			self.d_effective = max(self.d_group_sizes)
-
-		if np.sum(np.array(list(self.gamma.size()))) > 1:
-			self.gamma = self.gamma
-		else:
-			self.gamma = torch.ones(int(self.no_groups), dtype=torch.float64) * self.gamma
-
-		for i, group in enumerate(self.groups):
-			embedding_map.append(self.sample_embedding(len(group), self.m[i], self.gamma[i]))
-			self.m[i] = embedding_map[i].m
-		return embedding_map
-
-	def sample_embedding(self, d_effective, m, gamma):
-		if self.m_old is not None:
-			self.m = self.m_old
-
-		if self.approx == "quad":
-			embedding_map = QuadratureEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective, diameter=self.diameter,
-												groups=None,
-												kernel=self.kernel, approx=self.approx)
-		elif self.approx == "rff":
-			embedding_map = RFFEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective, diameter=self.diameter,
-										 groups=None,
-										 kernel=self.kernel, approx=self.approx)
-		elif self.approx == "rff2":
-			embedding_map = RFFEmbedding(biased=True, gamma=gamma, nu=self.nu, m=m, d=d_effective,
-										 diameter=self.diameter, groups=None,
-										 kernel=self.kernel, approx=self.approx)
-		elif self.approx == "halton":
-			embedding_map = RFFEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective, diameter=self.diameter,
-										 groups=None,
-										 kernel=self.kernel, approx=self.approx)
-		elif self.approx == "hermite":
-			embedding_map = HermiteEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective, diameter=self.diameter,
-											 groups=None,
-											 kernel=self.kernel, approx=self.approx)
-		elif self.approx == "trapezoidal":
-			embedding_map = TrapezoidalEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective, diameter=self.diameter,
-												 groups=None,
-												 kernel=self.kernel, approx=self.approx)
-		elif self.approx == "ccff":
-			embedding_map = ClenshawCurtisEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective, diameter=self.diameter,
-													groups=None,
-													kernel=self.kernel, approx=self.approx)
-		elif self.approx == "matern_secific":
-			embedding_map = MaternEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective, diameter=self.diameter,
-											groups=None,
-											kernel=self.kernel, approx=self.approx)
-		elif self.approx == "quad_periodic":
-			embedding_map = QuadPeriodicEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective, diameter=self.diameter,
-												  groups=None,
-												  kernel=self.kernel, approx=self.approx)
-		elif self.approx == "kl":
-			embedding_map = KLEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective,
-										diameter=self.diameter, groups=None, kernel=self.kernel, approx=self.approx)
-		elif self.approx == "orf":
-			embedding_map = RFFEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective, diameter=self.diameter,
-										 groups=None,
-										 kernel=self.kernel, approx=self.approx)
-		else:
-			embedding_map = QuadratureEmbedding(gamma=gamma, nu=self.nu, m=m, d=d_effective, diameter=self.diameter,
-												groups=None,
-												kernel=self.kernel, approx=self.approx)
-		self.m_old = self.m
-
-		return embedding_map
-
-	def embed(self, x):
-		if self.groups is None:
-
-			if self.project is not None:
-				x = self.project(x)
-
-			return self.embedding_map.embed(x)
-
-		else:
-			return self.embed_whole(x)
-
-	def embed_group(self, x, group):
-		return self.embedding_map[group].embed(x) / (np.sqrt(self.no_groups))
-
-	def embed_whole(self, x):
-		if self.project is not None:
-			x = self.project(x)
-
-		if self.groups == None:
-			return self.embed(x)
-		else:
-			n = x.size()[0]
-			M = torch.zeros(int(torch.sum(self.m)), n, dtype=torch.float64)
-			for i, group in enumerate(self.groups):
-				embeding = self.embed_group(x[:, group], i)
-				index = int(torch.sum(self.m[0:i], dim=0))
-				index_next = int(torch.sum(self.m[0:i + 1], dim=0))
-				M[index:index_next, :] = torch.t(embeding)
-		return torch.t(M)
-
-	def get_basis_size(self):
-		return self.m
-
-	def set_basis_size(self, m):
-		self.m_old = None
-		self.m = m
-
-	def right_kernel(self):
-		embeding = self.embed(self.x)
-		Z = self.linear_kernel(embeding, embeding)
-		K = (Z + self.s * self.s * torch.eye(self.n, dtype=torch.float64))
-		return K
-
-	def fit_gp(self, x, y, iterative=False):
-		'''
-			Function to Fit GP
-		'''
-
-		self.x = x
-		self.y = y
-		self.n = list(self.x.size())[0]
-		self.linear_kernel = KernelFunction(kernel_name="linear").linear_kernel
-
-		if self.groups == None:
-			embeding = self.embed(x)
-			self.Z_ = self.linear_kernel(torch.t(embeding), torch.t(embeding))
-			self.K = (self.Z_ + self.s * self.s * torch.eye(self.m, dtype=torch.float64))
-			self.Q = torch.t(embeding)
-
-		else:  ## additive models
-			M = torch.t(self.embed_whole(x))
-			self.Q = M
-			self.Z_ = self.linear_kernel(M, M)
-			self.K = self.kappa * self.Z_ + self.s * self.s * torch.eye(int(torch.sum(self.m)), dtype=torch.float64)
-
-		self.fit = True
-
-		return None
-
-	def log_marginal_likelihood_self(self):
-		return self.log_marginal_likelihood(self.gamma, torch.eye(self.d, dtype=torch.float64), self.kappa)
-
-	def log_marginal_likelihood(self, gamma, Rot, kappa, kernel="default"):
-		"""
-		Calculated the log marginal likelihood
-		:param kernel: custom kenrel object
-		:return: float
-		"""
-		# func = self.kernel_object.get_kernel_function()
-
-		self.x = torch.mm(self.x, Rot)
-		L = torch.torch.cholesky(self.K, upper=False)
-		logdet = -0.5 * 2 * torch.sum(torch.log(torch.diag(L)))
-
-		Q = self.embed_whole(self.x)
-		rhs = torch.mm(torch.t(Q), self.y)
-		alpha, _ = torch.solve(rhs, self.K)
-		logprob = -0.5 * (torch.mm(torch.t(self.y), self.y) - torch.mm(torch.t(rhs),
-																	   alpha)) / self.s ** 2 + logdet  # - 0.5*self.n*np.log(2*np.pi)
-		logprob = -logprob
-
-		return logprob
-
-	def mean_std(self, xtest, reuse=False):
-		'''
-			Calculate mean and variance for GP at xtest points
-		'''
-		# compute the mean at our test points.
-
-		if self.project is not None:
-			self.project(xtest)
-
-		if self.groups == None:
-			embeding = self.embed(xtest)
-			Q = self.embed(self.x)
-		else:
-			self.Z_ = self.K - self.s * self.s * torch.eye(int(torch.sum(self.m)), dtype=torch.float64)
-			embeding = self.embed_whole(xtest)
-			Q = self.embed_whole(self.x)
-
-		theta_mean, _ = torch.solve(torch.mm(torch.t(Q), self.y), self.K)
-		ymean = torch.mm(embeding, theta_mean)
-
-		temp = torch.t(torch.solve(torch.t(embeding), self.K)[0])
-		diagonal = self.s * self.s * torch.einsum('ij,ji->i', (temp, torch.t(embeding))).view(-1, 1)
-		yvar = torch.sqrt(diagonal)
-
-		return (ymean, yvar)
-
-	# def posterior_inf(self, xtest, tol=10e-5, max_int=20000):
-	# 	alpha = np.random.randn(self.n, 1)
-	# 	err = 10.
-	# 	F = 10.0
-	# 	counter = 0
-	# 	embeding = self.embed(self.x)
-	# 	K = (linear_kernel(embeding.T, embeding.T) + self.s * self.s * np.eye(self.n))
-	# 	Kinv = np.linalg.pinv(K)
-	#
-	# 	q = []
-	# 	for index in range(self.n):
-	# 		q.append(self.embed(self.x[index, :].reshape(1, -1)))
-	# 	q = np.array(q)
-	#
-	# 	while (counter < max_int and err / F > tol):
-	# 		# first find which index gives maximum
-	# 		# print (K.shape)
-	# 		index = np.argmax(np.abs(K.dot(alpha) - self.y))
-	# 		sign = np.sign(K.dot(alpha)[index] - self.y[index])
-	#
-	# 		k = linear_kernel(embeding.T, q[index, :, :].T).reshape(-1, 1)
-	# 		# print ("k: ", k.shape)
-	# 		oldalpha = alpha
-	# 		alpha = alpha - 1. / np.sqrt(counter + 1) * Kinv.dot(self.s * K.dot(alpha) + sign * k)
-	# 		err = np.linalg.norm(oldalpha - alpha)
-	# 		counter += 1
-	# 		F = np.max(np.abs(K.dot(alpha) - self.y)) + self.s * alpha.T.dot(K.dot(alpha))[0][0]
-	#
-	# 	y_inf = linear_kernel(self.embed(self.x).T, self.embed(xtest).T).T.dot(alpha)
-	# 	return y_inf
-
-	def sample_theta(self, size=1):
-		if self.groups is None:
-			basis = self.m
-		else:
-			basis = int(int(torch.sum(self.m)))
-		zeros = torch.zeros(basis, size, dtype=torch.float64)
-		random_vector = torch.normal(mean=zeros, std=1.)
-
-		if self.fit == True:
-			# random vector
-			Z = torch.pinverse(self.K)
-			self.L = torch.cholesky(Z, upper=False)
-			theta_mean = torch.mm(Z, torch.mm(self.Q, self.y))
-			theta = torch.mm(self.s * self.L, random_vector)
-			theta = theta + theta_mean
-		else:
-			theta_mean = 0
-			Z = (1. + self.s * self.s) * torch.eye(basis, dtype=torch.float64)
-			L = torch.cholesky(Z, upper=False)
-			theta = torch.mm(L, random_vector) + theta_mean
-		return theta
-
-	def sample(self, xtest, size=1):
-		'''
-			Sample functions from Gaussian Process
-		'''
-		theta = self.sample_theta(size=size)
-		if self.groups == None:
-			f = torch.mm(self.embed(xtest), theta)
-		else:
-			f = torch.zeros(xtest.size()[0], size, dtype=torch.float64)
-			for i, group in enumerate(self.groups):
-				embeding = self.embed_group(xtest[:, group], i)
-				index = int(torch.sum(self.m[0:i], dim=0))
-				index_next = int(torch.sum(self.m[0:i + 1], dim=0))
-				f += torch.mm(embeding, theta[index:index_next, :])
-		return f
-
-	def sample_and_max(self, xtest, size=1):
-		'''
-			Sample functions from Gaussian Process and take Maximum
-		'''
-		f = self.sample(xtest, size=size)
-
-		index = np.argmax(f.detach(), axis=0)
-		return (xtest[index, :], f[index, :])
-
-	def ucb_optimize(self, beta, multistart=25):
-
-		mean = lambda x: self.mean_std(torch.from_numpy(x).view(1, -1))[0][0][0]
-		sigma = lambda x: self.mean_std(torch.from_numpy(x).view(1, -1))[1][0][0]
-
-		fun = lambda x: -(mean(x) + np.sqrt(beta) * sigma(x))
-		# grad = lambda x: -complex_step_derivative(fun,1e-10,x.reshape(1,-1))
-
-		mybounds = self.bounds
-		results = []
-		from scipy.optimize import minimize
-
-		for i in range(multistart):
-			x0 = np.random.randn(self.d)
-			for i in range(self.d):
-				x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
-
-			res = minimize(fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds)
-			solution = res.x
-			results.append([solution, -fun(solution)])
-
-		results = np.array(results)
-		index = np.argmax(results[:, 1])
-		solution = results[index, 0]
-
-		return (solution, -fun(solution))
-
-	def special_embed_eval(self, x, theta):
-		f = 0
-		x = torch.from_numpy(x)
-		# print (x)
-		for i, group in enumerate(self.groups):
-			embeding = self.embed_group(x[group].view(-1, len(group)), i)
-			index = torch.sum(self.m[0:i], dim=0)
-			index_next = torch.sum(self.m[0:i + 1], dim=0)
-			f += torch.mm(embeding, theta[int(index):int(index_next), :])
-		return f.numpy()
-
-	def special_embed_eval_grad(self, x, theta):
-		ff = lambda x: self.special_embed_eval(x.flatten(), theta)
-		grad = complex_step_derivative(ff, 1e-10, x.reshape(-1, 1).T).flatten()
-		return grad
-
-	def get_lambdas_additive(self, theta):
-		fun = lambda x: -self.special_embed_eval(x, theta)
-		grad = lambda x: -self.special_embed_eval_grad(x, theta)
-		return [fun, grad]
-
-	def get_lambdas(self, theta):
-
-		# complex step differentiation
-		fun = lambda x: -(torch.mm(self.embed(torch.from_numpy(x).view(1, self.d)), theta).numpy()).flatten()
-		grad = lambda x: -complex_step_derivative(fun, 1e-10, x.reshape(self.d, 1).T).flatten()
-		return [fun, grad]
-
-	def sample_and_optimize(self, xtest=None, multistart=25, minimizer="L-BFGS-B", grid=100, verbose=0):
-		'''
-			Sample functions from Gaussian Process and take Maximum using
-			first order maximization
-		'''
-
-		# sample linear approximating
-		theta = self.sample_theta()
-		from scipy.optimize import minimize
-
-		# get bounds
-		if self.bounds == None:
-			mybounds = tuple([(-self.diameter, self.diameter) for i in range(self.d)])
-		else:
-			mybounds = self.bounds
-
-		fun = lambda x: -torch.mm(torch.t(theta), torch.t(self.embed(torch.from_numpy(x).view(1, -1)))).numpy()
-
-		results = []
-		for j in range(multistart):
-			x0 = np.random.randn(self.d)
-			for i in range(self.d):
-				x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
-
-			if minimizer == "L-BFGS-B":
-				res = minimize(fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds)
-				solution = res.x
-			elif minimizer == "ProjGD":
-				res = projected_gradient_descent(fun, grad, x0, mybounds, tol=0.001,
-												 nu=1. / (self.m * np.max(np.abs(theta))))
-				solution = res.x
-			elif minimizer == "coordinate-wise":
-
-				solution = np.random.randn(self.d)
-				for i in range(self.d):
-					if verbose > 0:
-						print("Dimension: ", i)
-					fun_cw = lambda x: lambda_coordinate(fun, x0, i, x)
-					ranges = [slice(mybounds[i][0], mybounds[i][1], 1. / float(grid))]
-					out = scipy.optimize.brute(fun_cw, ranges, finish=None)
-					solution[i] = out
-				if verbose > 0:
-					print("Soln:", out.T)
-			elif minimizer == "CD_cw":
-				raise BaseException("Not implemented yet")
-			else:
-				raise AssertionError("Wrong optimizer selected.")
-
-			results.append([solution, -fun(solution)])
-
-		results = np.array(results)
-		index = np.argmax(results[:, 1])
-		solution = results[index, 0]
-
-		return (torch.from_numpy(solution), -torch.from_numpy(fun(solution)))
+    """
+    Random Fourier Features for Gaussian Kernel
+    """
+
+    def __init__(
+        self,
+        project=None,
+        gamma=0.1,
+        s=0.001,
+        approx="rff",
+        m=100,
+        d=1,
+        diameter=1.0,
+        verbose=True,
+        groups=None,
+        bounds=None,
+        scale=1.0,
+        kernel="squared_exponential",
+        nu=0.5,
+        kappa=1.0,
+    ):
+
+        self.gamma = gamma
+        self.s = s
+        self.x = None
+        self.K = 0
+        self.mu = 0.0
+        self.fit = False
+        self.beta = None
+        self.m = m
+        self.project = None
+        self.nu = nu
+        self.lam = 1.0
+        if groups is None:
+            self.no_groups = 1
+        else:
+            self.no_groups = len(groups)
+
+        self.approx = approx
+        self.d = d
+        self.bounds = bounds
+        self.groups = groups
+        self.diameter = diameter
+        self.admits_first_order = True
+        self.verbose = verbose
+        self.kernel = kernel
+        self.scale = scale
+        self.m_old = None
+        self.kappa = kappa
+        self.heuristic_variance = False
+        if self.groups is None:
+            self.embedding_map = self.sample_embedding(self.d, self.m, self.gamma)
+            self.m = self.embedding_map.m
+        else:
+            self.no_groups = float(len(self.groups))
+            self.embedding_map = self.sample_embedding_group()
+
+    def resample(self):
+        self.embedding_map = self.sample_embedding_group()
+
+    def description(self):
+        """
+        Description of GP in text
+        :return: string with description
+        """
+        return (
+            "Fourier Features object\n"
+            + "Appprox: "
+            + self.approx
+            + "\n"
+            + "Bandwidth: "
+            + str(self.gamma)
+            + "\n"
+            + "Groups:"
+            + str(self.groups)
+            + "\n noise: "
+            + str(self.s)
+        )
+
+    def get_gamma(self, t):
+        if self.kernel == "squared_exponential" and self.groups is None:
+            return (np.log(t)) ** self.d
+        elif self.kernel == "linear":
+            return 10 * self.m
+        elif self.kernel == "squared_exponential" and self.groups is not None:
+            return len(self.groups) * (np.log(t))
+        elif self.kernel == "matern":
+            return (np.log(t)) ** self.d
+        elif self.kernel == "modified_matern":
+            return (np.log(t)) ** self.d
+
+    def sample_embedding_group(self):
+        # self.m is a vector of ms
+        # self.gamma is a vector of gammas
+        embedding_map = []
+
+        self.d_effective = int(self.d / self.no_groups)
+
+        if self.groups is not None:
+            self.d_group_sizes = [len(group) for group in self.groups]
+            self.d_effective = max(self.d_group_sizes)
+
+        if np.sum(np.array(list(self.gamma.size()))) > 1:
+            self.gamma = self.gamma
+        else:
+            self.gamma = (
+                torch.ones(int(self.no_groups), dtype=torch.float64) * self.gamma
+            )
+
+        for i, group in enumerate(self.groups):
+            embedding_map.append(
+                self.sample_embedding(len(group), self.m[i], self.gamma[i])
+            )
+            self.m[i] = embedding_map[i].m
+        return embedding_map
+
+    def sample_embedding(self, d_effective, m, gamma):
+        if self.m_old is not None:
+            self.m = self.m_old
+
+        if self.approx == "quad":
+            embedding_map = QuadratureEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        elif self.approx == "rff":
+            embedding_map = RFFEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        elif self.approx == "rff2":
+            embedding_map = RFFEmbedding(
+                biased=True,
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        elif self.approx == "halton":
+            embedding_map = RFFEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        elif self.approx == "hermite":
+            embedding_map = HermiteEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        elif self.approx == "trapezoidal":
+            embedding_map = TrapezoidalEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        elif self.approx == "ccff":
+            embedding_map = ClenshawCurtisEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        elif self.approx == "matern_secific":
+            embedding_map = MaternEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        elif self.approx == "quad_periodic":
+            embedding_map = QuadPeriodicEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        elif self.approx == "kl":
+            embedding_map = KLEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        elif self.approx == "orf":
+            embedding_map = RFFEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        else:
+            embedding_map = QuadratureEmbedding(
+                gamma=gamma,
+                nu=self.nu,
+                m=m,
+                d=d_effective,
+                diameter=self.diameter,
+                groups=None,
+                kernel=self.kernel,
+                approx=self.approx,
+            )
+        self.m_old = self.m
+
+        return embedding_map
+
+    def embed(self, x):
+        if self.groups is None:
+
+            if self.project is not None:
+                x = self.project(x)
+
+            return self.embedding_map.embed(x)
+
+        else:
+            return self.embed_whole(x)
+
+    def embed_group(self, x, group):
+        return self.embedding_map[group].embed(x) / (np.sqrt(self.no_groups))
+
+    def embed_whole(self, x):
+        if self.project is not None:
+            x = self.project(x)
+
+        if self.groups == None:
+            return self.embed(x)
+        else:
+            n = x.size()[0]
+            M = torch.zeros(int(torch.sum(self.m)), n, dtype=torch.float64)
+            for i, group in enumerate(self.groups):
+                embeding = self.embed_group(x[:, group], i)
+                index = int(torch.sum(self.m[0:i], dim=0))
+                index_next = int(torch.sum(self.m[0 : i + 1], dim=0))
+                M[index:index_next, :] = torch.t(embeding)
+        return torch.t(M)
+
+    def get_basis_size(self):
+        return self.m
+
+    def set_basis_size(self, m):
+        self.m_old = None
+        self.m = m
+
+    def right_kernel(self):
+        embeding = self.embed(self.x)
+        Z = self.linear_kernel(embeding, embeding)
+        K = Z + self.s * self.s * torch.eye(self.n, dtype=torch.float64)
+        return K
+
+    def fit_gp(self, x, y, iterative=False):
+        """
+        Function to Fit GP
+        """
+
+        self.x = x
+        self.y = y
+        self.n = list(self.x.size())[0]
+        self.linear_kernel = KernelFunction(kernel_name="linear").linear_kernel
+
+        if self.groups == None:
+            embeding = self.embed(x)
+            self.Z_ = self.linear_kernel(torch.t(embeding), torch.t(embeding))
+            self.K = self.Z_ + self.s * self.s * torch.eye(self.m, dtype=torch.float64)
+            self.Q = torch.t(embeding)
+
+        else:  ## additive models
+            M = torch.t(self.embed_whole(x))
+            self.Q = M
+            self.Z_ = self.linear_kernel(M, M)
+            self.K = self.kappa * self.Z_ + self.s * self.s * torch.eye(
+                int(torch.sum(self.m)), dtype=torch.float64
+            )
+
+        self.fit = True
+
+        return None
+
+    def log_marginal_likelihood_self(self):
+        return self.log_marginal_likelihood(
+            self.gamma, torch.eye(self.d, dtype=torch.float64), self.kappa
+        )
+
+    def log_marginal_likelihood(self, gamma, Rot, kappa, kernel="default"):
+        """
+        Calculate the negated log marginal likelihood (2*pi constant omitted)
+        :param kernel: custom kernel object (currently unused)
+        :return: float
+        """
+        # func = self.kernel_object.get_kernel_function()
+
+        self.x = torch.mm(self.x, Rot)
+        L = torch.torch.cholesky(self.K, upper=False)
+        logdet = -0.5 * 2 * torch.sum(torch.log(torch.diag(L)))
+
+        Q = self.embed_whole(self.x)
+        rhs = torch.mm(torch.t(Q), self.y)
+        alpha, _ = torch.solve(rhs, self.K)
+        logprob = (
+            -0.5
+            * (torch.mm(torch.t(self.y), self.y) - torch.mm(torch.t(rhs), alpha))
+            / self.s**2
+            + logdet
+        )  # - 0.5*self.n*np.log(2*np.pi)
+        logprob = -logprob
+
+        return logprob
+
+    def mean_std(self, xtest, reuse=False):
+        """
+        Calculate mean and standard deviation for GP at xtest points
+        """
+        # compute the mean at our test points.
+
+        if self.project is not None:
+            self.project(xtest)
+
+        if self.groups == None:
+            embeding = self.embed(xtest)
+            Q = self.embed(self.x)
+        else:
+            self.Z_ = self.K - self.s * self.s * torch.eye(
+                int(torch.sum(self.m)), dtype=torch.float64
+            )
+            embeding = self.embed_whole(xtest)
+            Q = self.embed_whole(self.x)
+
+        theta_mean, _ = torch.solve(torch.mm(torch.t(Q), self.y), self.K)
+        ymean = torch.mm(embeding, theta_mean)
+
+        temp = torch.t(torch.solve(torch.t(embeding), self.K)[0])
+        diagonal = (
+            self.s
+            * self.s
+            * torch.einsum("ij,ji->i", (temp, torch.t(embeding))).view(-1, 1)
+        )
+        yvar = torch.sqrt(diagonal)
+
+        return (ymean, yvar)
+
+    # def posterior_inf(self, xtest, tol=10e-5, max_int=20000):
+    # 	alpha = np.random.randn(self.n, 1)
+    # 	err = 10.
+    # 	F = 10.0
+    # 	counter = 0
+    # 	embeding = self.embed(self.x)
+    # 	K = (linear_kernel(embeding.T, embeding.T) + self.s * self.s * np.eye(self.n))
+    # 	Kinv = np.linalg.pinv(K)
+    #
+    # 	q = []
+    # 	for index in range(self.n):
+    # 		q.append(self.embed(self.x[index, :].reshape(1, -1)))
+    # 	q = np.array(q)
+    #
+    # 	while (counter < max_int and err / F > tol):
+    # 		# first find which index gives maximum
+    # 		# print (K.shape)
+    # 		index = np.argmax(np.abs(K.dot(alpha) - self.y))
+    # 		sign = np.sign(K.dot(alpha)[index] - self.y[index])
+    #
+    # 		k = linear_kernel(embeding.T, q[index, :, :].T).reshape(-1, 1)
+    # 		# print ("k: ", k.shape)
+    # 		oldalpha = alpha
+    # 		alpha = alpha - 1. / np.sqrt(counter + 1) * Kinv.dot(self.s * K.dot(alpha) + sign * k)
+    # 		err = np.linalg.norm(oldalpha - alpha)
+    # 		counter += 1
+    # 		F = np.max(np.abs(K.dot(alpha) - self.y)) + self.s * alpha.T.dot(K.dot(alpha))[0][0]
+    #
+    # 	y_inf = linear_kernel(self.embed(self.x).T, self.embed(xtest).T).T.dot(alpha)
+    # 	return y_inf
+
+    def sample_theta(self, size=1):
+        if self.groups is None:
+            basis = self.m
+        else:
+            basis = int(int(torch.sum(self.m)))
+        zeros = torch.zeros(basis, size, dtype=torch.float64)
+        random_vector = torch.normal(mean=zeros, std=1.0)
+
+        if self.fit == True:
+            # random vector
+            Z = torch.pinverse(self.K)
+            self.L = torch.cholesky(Z, upper=False)
+            theta_mean = torch.mm(Z, torch.mm(self.Q, self.y))
+            theta = torch.mm(self.s * self.L, random_vector)
+            theta = theta + theta_mean
+        else:
+            theta_mean = 0
+            Z = (1.0 + self.s * self.s) * torch.eye(basis, dtype=torch.float64)
+            L = torch.cholesky(Z, upper=False)
+            theta = torch.mm(L, random_vector) + theta_mean
+        return theta
+
+    def sample(self, xtest, size=1):
+        """
+        Sample functions from Gaussian Process
+        """
+        theta = self.sample_theta(size=size)
+        if self.groups == None:
+            f = torch.mm(self.embed(xtest), theta)
+        else:
+            f = torch.zeros(xtest.size()[0], size, dtype=torch.float64)
+            for i, group in enumerate(self.groups):
+                embeding = self.embed_group(xtest[:, group], i)
+                index = int(torch.sum(self.m[0:i], dim=0))
+                index_next = int(torch.sum(self.m[0 : i + 1], dim=0))
+                f += torch.mm(embeding, theta[index:index_next, :])
+        return f
+
+    def sample_and_max(self, xtest, size=1):
+        """
+        Sample functions from Gaussian Process and take Maximum
+        """
+        f = self.sample(xtest, size=size)
+
+        index = np.argmax(f.detach(), axis=0)
+        return (xtest[index, :], f[index, :])
+
+    def ucb_optimize(self, beta, multistart=25):
+
+        mean = lambda x: self.mean_std(torch.from_numpy(x).view(1, -1))[0][0][0]
+        sigma = lambda x: self.mean_std(torch.from_numpy(x).view(1, -1))[1][0][0]
+
+        fun = lambda x: -(mean(x) + np.sqrt(beta) * sigma(x))
+        # grad = lambda x: -complex_step_derivative(fun,1e-10,x.reshape(1,-1))
+
+        mybounds = self.bounds
+        results = []
+        from scipy.optimize import minimize
+
+        for i in range(multistart):
+            x0 = np.random.randn(self.d)
+            for i in range(self.d):
+                x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
+
+            res = minimize(
+                fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds
+            )
+            solution = res.x
+            results.append([solution, -fun(solution)])
+
+        results = np.array(results)
+        index = np.argmax(results[:, 1])
+        solution = results[index, 0]
+
+        return (solution, -fun(solution))
+
+    def special_embed_eval(self, x, theta):
+        f = 0
+        x = torch.from_numpy(x)
+        # print (x)
+        for i, group in enumerate(self.groups):
+            embeding = self.embed_group(x[group].view(-1, len(group)), i)
+            index = torch.sum(self.m[0:i], dim=0)
+            index_next = torch.sum(self.m[0 : i + 1], dim=0)
+            f += torch.mm(embeding, theta[int(index) : int(index_next), :])
+        return f.numpy()
+
+    def special_embed_eval_grad(self, x, theta):
+        ff = lambda x: self.special_embed_eval(x.flatten(), theta)
+        grad = complex_step_derivative(ff, 1e-10, x.reshape(-1, 1).T).flatten()
+        return grad
+
+    def get_lambdas_additive(self, theta):
+        fun = lambda x: -self.special_embed_eval(x, theta)
+        grad = lambda x: -self.special_embed_eval_grad(x, theta)
+        return [fun, grad]
+
+    def get_lambdas(self, theta):
+
+        # complex step differentiation
+        fun = lambda x: -(
+            torch.mm(self.embed(torch.from_numpy(x).view(1, self.d)), theta).numpy()
+        ).flatten()
+        grad = lambda x: -complex_step_derivative(
+            fun, 1e-10, x.reshape(self.d, 1).T
+        ).flatten()
+        return [fun, grad]
+
+    def sample_and_optimize(
+        self, xtest=None, multistart=25, minimizer="L-BFGS-B", grid=100, verbose=0
+    ):
+        """
+        Sample functions from Gaussian Process and take Maximum using
+        first order maximization
+        """
+
+        # sample linear approximating
+        theta = self.sample_theta()
+        from scipy.optimize import minimize
+
+        # get bounds
+        if self.bounds == None:
+            mybounds = tuple([(-self.diameter, self.diameter) for i in range(self.d)])
+        else:
+            mybounds = self.bounds
+
+        fun = lambda x: -torch.mm(
+            torch.t(theta), torch.t(self.embed(torch.from_numpy(x).view(1, -1)))
+        ).numpy()
+
+        results = []
+        for j in range(multistart):
+            x0 = np.random.randn(self.d)
+            for i in range(self.d):
+                x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
+
+            if minimizer == "L-BFGS-B":
+                res = minimize(
+                    fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds
+                )
+                solution = res.x
+            elif minimizer == "ProjGD":
+                res = projected_gradient_descent(
+                    fun,
+                    grad,
+                    x0,
+                    mybounds,
+                    tol=0.001,
+                    nu=1.0 / (self.m * np.max(np.abs(theta))),
+                )
+                solution = res.x
+            elif minimizer == "coordinate-wise":
+
+                solution = np.random.randn(self.d)
+                for i in range(self.d):
+                    if verbose > 0:
+                        print("Dimension: ", i)
+                    fun_cw = lambda x: lambda_coordinate(fun, x0, i, x)
+                    ranges = [slice(mybounds[i][0], mybounds[i][1], 1.0 / float(grid))]
+                    out = scipy.optimize.brute(fun_cw, ranges, finish=None)
+                    solution[i] = out
+                if verbose > 0:
+                    print("Soln:", out.T)
+            elif minimizer == "CD_cw":
+                raise BaseException("Not implemented yet")
+            else:
+                raise AssertionError("Wrong optimizer selected.")
+
+            results.append([solution, -fun(solution)])
+
+        results = np.array(results)
+        index = np.argmax(results[:, 1])
+        solution = results[index, 0]
+
+        return (torch.from_numpy(solution), -torch.from_numpy(fun(solution)))
 
 
 if __name__ == "__main__":
-	# domain size
-	L_infinity_ball = 1
-	# dimension
-	d = 2
-	# error variance
-	s = 0.001
-	# grid density
-	n = 50
-	# number of intial points
-	N = 200
-	# smoothness
-	gamma = torch.from_numpy(np.array([0.4, 0.4]))
-	# test problem
-
-	xtest = torch.from_numpy(interval(n, d))
-	x = torch.from_numpy(np.random.uniform(-L_infinity_ball, L_infinity_ball, size=(N, d)))
-
-	f_no_noise = lambda q: torch.sin(torch.sum(q * 4, dim=1)).view(-1, 1)
-	# f_no_noise = lambda q: torch.sin((q[:,0] * 4)).view(-1, 1)
-
-	f = lambda q: f_no_noise(q) + torch.normal(mean=torch.zeros(q.size()[0], 1, dtype=torch.float64), std=1.,
-											   out=None) * s
-	# targets
-	y = f(x)
-
-	# GP model with squared exponential
-	m = torch.from_numpy(np.array([100, 100]))
-
-	groups = [[0], [1]]
-	GP = GaussianProcessFF(kernel="squared_exponential", s=s, m=m, d=d, gamma=gamma, groups=groups, approx="hermite")
-	# GP2 = GaussianProcess(kernel="ard", s=s, d=d, gamma=gamma, groups=None)
-
-	# fit GP
-	GP.fit_gp(x, y)
-	# GP2.fit_gp(x,y)
-
-	GP.optimize_params("rots", 10, optimizer="pymanopt")
-
-	print("Log probability:", GP.log_marginal_likelihood_self())
-	# print ("Log probability:", GP2.log_marginal_likelihood_self() )
-
-	GP.visualize(xtest, f_true=f_no_noise)
+    # domain size
+    L_infinity_ball = 1
+    # dimension
+    d = 2
+    # error variance
+    s = 0.001
+    # grid density
+    n = 50
+    # number of initial points
+    N = 200
+    # smoothness
+    gamma = torch.from_numpy(np.array([0.4, 0.4]))
+    # test problem
+
+    xtest = torch.from_numpy(interval(n, d))
+    x = torch.from_numpy(
+        np.random.uniform(-L_infinity_ball, L_infinity_ball, size=(N, d))
+    )
+
+    f_no_noise = lambda q: torch.sin(torch.sum(q * 4, dim=1)).view(-1, 1)
+    # f_no_noise = lambda q: torch.sin((q[:,0] * 4)).view(-1, 1)
+
+    f = (
+        lambda q: f_no_noise(q)
+        + torch.normal(
+            mean=torch.zeros(q.size()[0], 1, dtype=torch.float64), std=1.0, out=None
+        )
+        * s
+    )
+    # targets
+    y = f(x)
+
+    # GP model with squared exponential
+    m = torch.from_numpy(np.array([100, 100]))
+
+    groups = [[0], [1]]
+    GP = GaussianProcessFF(
+        kernel="squared_exponential",
+        s=s,
+        m=m,
+        d=d,
+        gamma=gamma,
+        groups=groups,
+        approx="hermite",
+    )
+    # GP2 = GaussianProcess(kernel="ard", s=s, d=d, gamma=gamma, groups=None)
+
+    # fit GP
+    GP.fit_gp(x, y)
+    # GP2.fit_gp(x,y)
+
+    GP.optimize_params("rots", 10, optimizer="pymanopt")
+
+    print("Log probability:", GP.log_marginal_likelihood_self())
+    # print ("Log probability:", GP2.log_marginal_likelihood_self() )
+
+    GP.visualize(xtest, f_true=f_no_noise)
 # GP2.visualize(xtest, f_true=f_no_noise)
 
 
diff --git a/stpy/continuous_processes/ga_process.py b/stpy/continuous_processes/ga_process.py
index c0a1537..6317b08 100755
--- a/stpy/continuous_processes/ga_process.py
+++ b/stpy/continuous_processes/ga_process.py
@@ -5,208 +5,234 @@
 
 class GammaContProcess(Estimator):
 
-	def __init__(self, gamma=1, s=0.001, kappa=1., kernel="squared_exponential", diameter=1.0,
-				 groups=None, bounds=None, nu=2, safe=False, kernel_custom=None, d=1):
-		"""
-
-		:param gamma: Smoothnes parameter for squared exponential, laplace and matern kernel
-		:param s: level of noise
-		:param kernel: choose from a list
-		:param diameter: diameter of the set (deprecated)
-		:param groups: additive groups
-		:param bounds: bounds for the continuous optimization
-		:param v: parameter for matern kernel
-		"""
-
-		## GP properties
-		self.s = s
-		self.d = d
-		self.x = None
-		self.K = np.array([1.0])
-		self.mu = 0.0
-		self.safe = False
-		self.fit = False
-		self.diameter = diameter
-		self.bounds = bounds
-		self.admits_first_order = False
-		self.back_prop = True
-
-		## kernel hyperparameters
-		if kernel_custom is not None:
-			self.kernel_object = kernel_custom
-			self.kernel = kernel_custom.kernel
-		else:
-			self.kernel_object = KernelFunction(kernel_name=kernel, gamma=gamma, nu=nu, groups=groups, kappa=kappa)
-			self.kernel = self.kernel_object.kernel
-
-			self.gamma = gamma
-			self.v = nu
-			self.groups = groups
-			self.kappa = kappa
-			self.custom = kernel_custom
-			self.optkernel = kernel
-
-	def description(self):
-		"""
-		Description of GP in text
-		:return: string with description
-		"""
-		return self.kernel_object.description() + "\n noise: " + str(self.s)
-
-	def get_gamma(self, t):
-		"""
-		??
-		:param t:
-		:return:
-		"""
-		if self.optkernel == "squared_exponential" and self.groups is None:
-			return (np.log(t)) ** self.d
-		elif self.optkernel == "linear":
-			return 10 * self.d
-		elif self.optkernel == "squared_exponential" and self.groups is not None:
-			return len(self.groups) * (np.log(t))
-		elif self.optkernel == "matern":
-			return (np.log(t)) ** self.d
-		elif self.optkernel == "modified_matern":
-			return (np.log(t)) ** self.d
-
-	def make_safe(self, x):
-		"""
-		Make the input dataset numerically stable by removing duplicates?
-		:param x:
-		:return:
-		"""
-		self.epsilon = 0.001
-		# remove vectors that are very close to each other
-		return x
-
-	def fit_gp(self, x, y, iterative=False, extrapoint=False):
-		"""
-		Fits the Gaussian process, possible update is via iterative inverse
-		:param x: data x
-		:param y: values y
-		:param iterative: iterative inverse, where only last point of x is used
-		:param extrapoint: iterative inverse must be allowed, x is the only addition
-		:return:
-		"""
-		# first fit
-		if (self.fit == False or iterative == False):
-			if self.safe == True:
-				x = self.make_safe(x)
-
-			self.x = x
-			self.y = y
-			try:
-				self.n, self.d = list(x.size())
-			except:
-				self.n, self.d = x.shape
-			self.K = self.kernel(x, x) + self.s * self.s * torch.eye(self.n, dtype=torch.float64)
-
-			self.fit = True
-		else:
-			# iterative inverse
-			if (iterative == True):
-				if extrapoint == False:
-					last_point = self.x[-1, :].view(1, -1)
-				else:
-					last_point = x
-				old_K = self.K
-				old_Kinv = self.Kinv
-			else:
-				pass
-
-		return None
-
-	def beta(self, delta=1e-12, norm=1):
-		beta_value = self.s * norm + torch.sqrt(
-			2 * torch.log(1. / delta + torch.log(torch.det(self.K) / self.s ** self.n)))
-		return beta_value
-
-	def execute(self, xtest):
-		if self.fit == True:
-			K_star = self.kernel(self.x, xtest)
-		else:
-			K_star = None
-		K_star_star = self.kernel(xtest, xtest)
-		return (K_star, K_star_star)
-
-	# @check_numpy(1)
-	def mean_var(self, xtest, full=False):
-		"""
-		Return posterior mean and variance as tuple
-		:param xtest: grid, numpy array (2D)
-		:param full: Instead of just poinwise variance, full covariance can be outputed (bool)
-		:return: (tensor,tensor)
-		"""
-
-		(K_star, K_star_star) = self.execute(xtest)
-
-		if self.fit == False:
-			if full == False:
-
-				x = torch.sum(xtest, dim=1)
-				first = torch.diag(K_star_star).view(-1, 1)
-				variance = first
-				yvar = torch.sqrt(variance)
-			else:
-				first = K_star_star
-				yvar = first
-
-			return (0 * x.view(-1, 1), yvar)
-
-		if self.back_prop == False:
-			decomp = torch.btrifact(self.K.unsqueeze(0))
-			A = torch.btrisolve(self.y.unsqueeze(0), *decomp)[0, :, :]
-			self.B = torch.t(torch.btrisolve(torch.t(K_star).unsqueeze(0), *decomp)[0, :, :])
-		else:
-			A, _ = torch.gesv(self.y, self.K)
-			self.B = torch.t(torch.gesv(torch.t(K_star), self.K)[0])
-
-		ymean = torch.mm(K_star, A)
-
-		if full == False:
-			first = torch.diag(K_star_star).view(-1, 1)
-			second = torch.einsum('ij,ji->i', (self.B, torch.t(K_star))).view(-1, 1)
-			variance = first - second
-			yvar = torch.sqrt(variance)
-		else:
-			first = K_star_star
-			second = torch.mm(self.B, torch.t(K_star))
-			yvar = first - second
-
-		return (ymean, yvar)
-
-	def sample(self, xtest, size=1):
-		"""
-		Samples Path from GP, return a numpy array evaluated over grid
-		:param xtest: grid
-		:param size: number of samples
-		:return: numpy array
-		"""
-		nn = list(xtest.size())[0]
-
-		if self.fit == True:
-			(ymean, yvar) = self.mean_var(xtest, full=True)
-			Cov = yvar + self.s * self.s * torch.eye(nn, dtype=torch.float64)
-			L = torch.cholesky(Cov, upper=False)
-			random_vector = torch.normal(mean=torch.zeros(nn, size, dtype=torch.float64), std=1.)
-			f = ymean + torch.abs(torch.mm(L, random_vector))
-		else:
-			(K_star, K_star_star) = self.execute(xtest)
-			L = torch.cholesky(K_star_star + (10e-10 + self.s * self.s) * torch.eye(nn, dtype=torch.float64),
-							   upper=False)
-			random_vector = torch.normal(mean=torch.zeros(nn, size, dtype=torch.float64), std=1.)
-			f = self.mu + torch.mm(L, random_vector)
-		return f
-
-	def sample_and_max(self, xtest, size=1):
-		"""
-		Samples Path from GP and takes argmax
-		:param xtest: grid
-		:param size: number of samples
-		:return: (argmax, max)
-		"""
-		f = self.sample(xtest, size=size)
-		self.temp = f
-		val, index = torch.max(f, dim=0)
-		return (xtest[index, :], val)
+    def __init__(
+        self,
+        gamma=1,
+        s=0.001,
+        kappa=1.0,
+        kernel="squared_exponential",
+        diameter=1.0,
+        groups=None,
+        bounds=None,
+        nu=2,
+        safe=False,
+        kernel_custom=None,
+        d=1,
+    ):
+        """
+
+        :param gamma: Smoothness parameter for squared exponential, laplace and matern kernel
+        :param s: level of noise
+        :param kernel: choose from a list
+        :param diameter: diameter of the set (deprecated)
+        :param groups: additive groups
+        :param bounds: bounds for the continuous optimization
+        :param nu: parameter for matern kernel
+        """
+
+        ## GP properties
+        self.s = s
+        self.d = d
+        self.x = None
+        self.K = np.array([1.0])
+        self.mu = 0.0
+        self.safe = False
+        self.fit = False
+        self.diameter = diameter
+        self.bounds = bounds
+        self.admits_first_order = False
+        self.back_prop = True
+
+        ## kernel hyperparameters
+        if kernel_custom is not None:
+            self.kernel_object = kernel_custom
+            self.kernel = kernel_custom.kernel
+        else:
+            self.kernel_object = KernelFunction(
+                kernel_name=kernel, gamma=gamma, nu=nu, groups=groups, kappa=kappa
+            )
+            self.kernel = self.kernel_object.kernel
+
+            self.gamma = gamma
+            self.v = nu
+            self.groups = groups
+            self.kappa = kappa
+            self.custom = kernel_custom
+            self.optkernel = kernel
+
+    def description(self):
+        """
+        Description of GP in text
+        :return: string with description
+        """
+        return self.kernel_object.description() + "\n noise: " + str(self.s)
+
+    def get_gamma(self, t):
+        """
+        Return a kernel-dependent function of t: log(t)**d, 10*d or len(groups)*log(t).
+        :param t:
+        :return:
+        """
+        if self.optkernel == "squared_exponential" and self.groups is None:
+            return (np.log(t)) ** self.d
+        elif self.optkernel == "linear":
+            return 10 * self.d
+        elif self.optkernel == "squared_exponential" and self.groups is not None:
+            return len(self.groups) * (np.log(t))
+        elif self.optkernel == "matern":
+            return (np.log(t)) ** self.d
+        elif self.optkernel == "modified_matern":
+            return (np.log(t)) ** self.d
+
+    def make_safe(self, x):
+        """
+        Intended to make the input dataset numerically stable by removing near-duplicates; currently only sets self.epsilon and returns x unchanged.
+        :param x:
+        :return:
+        """
+        self.epsilon = 0.001
+        # remove vectors that are very close to each other
+        return x
+
+    def fit_gp(self, x, y, iterative=False, extrapoint=False):
+        """
+        Fits the Gaussian process, possible update is via iterative inverse
+        :param x: data x
+        :param y: values y
+        :param iterative: iterative inverse, where only last point of x is used
+        :param extrapoint: iterative inverse must be allowed, x is the only addition
+        :return:
+        """
+        # first fit
+        if self.fit == False or iterative == False:
+            if self.safe == True:
+                x = self.make_safe(x)
+
+            self.x = x
+            self.y = y
+            try:
+                self.n, self.d = list(x.size())
+            except:
+                self.n, self.d = x.shape
+            self.K = self.kernel(x, x) + self.s * self.s * torch.eye(
+                self.n, dtype=torch.float64
+            )
+
+            self.fit = True
+        else:
+            # iterative inverse
+            if iterative == True:
+                if extrapoint == False:
+                    last_point = self.x[-1, :].view(1, -1)
+                else:
+                    last_point = x
+                old_K = self.K
+                old_Kinv = self.Kinv
+            else:
+                pass
+
+        return None
+
+    def beta(self, delta=1e-12, norm=1):
+        beta_value = self.s * norm + torch.sqrt(
+            2 * torch.log(1.0 / delta + torch.log(torch.det(self.K) / self.s**self.n))
+        )
+        return beta_value
+
+    def execute(self, xtest):
+        if self.fit == True:
+            K_star = self.kernel(self.x, xtest)
+        else:
+            K_star = None
+        K_star_star = self.kernel(xtest, xtest)
+        return (K_star, K_star_star)
+
+    # @check_numpy(1)
+    def mean_var(self, xtest, full=False):
+        """
+        Return the posterior mean and the pointwise standard deviation (or the full covariance) as a tuple
+        :param xtest: grid, numpy array (2D)
+        :param full: If True, return the full covariance matrix instead of the pointwise standard deviation (bool)
+        :return: (tensor,tensor)
+        """
+
+        (K_star, K_star_star) = self.execute(xtest)
+
+        if self.fit == False:
+            if full == False:
+
+                x = torch.sum(xtest, dim=1)
+                first = torch.diag(K_star_star).view(-1, 1)
+                variance = first
+                yvar = torch.sqrt(variance)
+            else:
+                first = K_star_star
+                yvar = first
+
+            return (0 * x.view(-1, 1), yvar)
+
+        if self.back_prop == False:
+            decomp = torch.btrifact(self.K.unsqueeze(0))
+            A = torch.btrisolve(self.y.unsqueeze(0), *decomp)[0, :, :]
+            self.B = torch.t(
+                torch.btrisolve(torch.t(K_star).unsqueeze(0), *decomp)[0, :, :]
+            )
+        else:
+            A, _ = torch.gesv(self.y, self.K)
+            self.B = torch.t(torch.gesv(torch.t(K_star), self.K)[0])
+
+        ymean = torch.mm(K_star, A)
+
+        if full == False:
+            first = torch.diag(K_star_star).view(-1, 1)
+            second = torch.einsum("ij,ji->i", (self.B, torch.t(K_star))).view(-1, 1)
+            variance = first - second
+            yvar = torch.sqrt(variance)
+        else:
+            first = K_star_star
+            second = torch.mm(self.B, torch.t(K_star))
+            yvar = first - second
+
+        return (ymean, yvar)
+
+    def sample(self, xtest, size=1):
+        """
+        Samples paths from the GP, evaluated over the grid
+        :param xtest: grid
+        :param size: number of samples
+        :return: torch tensor with one column per sample
+        """
+        nn = list(xtest.size())[0]
+
+        if self.fit == True:
+            (ymean, yvar) = self.mean_var(xtest, full=True)
+            Cov = yvar + self.s * self.s * torch.eye(nn, dtype=torch.float64)
+            L = torch.cholesky(Cov, upper=False)
+            random_vector = torch.normal(
+                mean=torch.zeros(nn, size, dtype=torch.float64), std=1.0
+            )
+            f = ymean + torch.abs(torch.mm(L, random_vector))
+        else:
+            (K_star, K_star_star) = self.execute(xtest)
+            L = torch.cholesky(
+                K_star_star
+                + (10e-10 + self.s * self.s) * torch.eye(nn, dtype=torch.float64),
+                upper=False,
+            )
+            random_vector = torch.normal(
+                mean=torch.zeros(nn, size, dtype=torch.float64), std=1.0
+            )
+            f = self.mu + torch.mm(L, random_vector)
+        return f
+
+    def sample_and_max(self, xtest, size=1):
+        """
+        Samples Path from GP and takes argmax
+        :param xtest: grid
+        :param size: number of samples
+        :return: (argmax, max)
+        """
+        f = self.sample(xtest, size=size)
+        self.temp = f
+        val, index = torch.max(f, dim=0)
+        return (xtest[index, :], val)
diff --git a/stpy/continuous_processes/gauss_procc.py b/stpy/continuous_processes/gauss_procc.py
index d7379ff..e8882e1 100755
--- a/stpy/continuous_processes/gauss_procc.py
+++ b/stpy/continuous_processes/gauss_procc.py
@@ -4,8 +4,6 @@
 import scipy as scipy
 import torch
 from cvxpylayers.torch import CvxpyLayer
-#from functorch import hessian
-import functorch
 from pymanopt.manifolds import Euclidean, Stiefel, PSDFixedRank
 from torch.autograd import grad
 from torchmin import minimize as minimize_torch
@@ -17,1132 +15,1524 @@
 
 class GaussianProcess(Estimator):
 
-	def __init__(self, gamma=1, s=0.001, kappa=1., kernel_name="squared_exponential", diameter=1.0,
-				 groups=None, bounds=None, nu=1.5, kernel=None, d=1, power=2, lam=1., loss = 'squared', huber_delta = 1.35,
-				 hyper = 'classical', B = 1., svr_eps = 0.1):
-		"""
-
-		:param gamma: Smoothnes parameter for squared exponential, laplace and matern kernel
-		:param s: level of noise
-		:param kernel: choose from a list
-		:param diameter: diameter of the set (deprecated)
-		:param groups: additive groups
-		:param bounds: bounds for the continuous optimization
-		:param v: parameter for matern kernel
-		"""
-
-		## GP properties
-		self.s = s
-		self.d = d
-		self.x = None
-		self.K = np.array([1.0])
-		self.mu = 0.0
-		self.lam = lam
-		self.total_bound = B
-		self.prob = 0.5
-		self.svr_eps = svr_eps
-		self.safe = False
-		self.fitted = False
-		self.diameter = diameter
-		self.bounds = bounds
-		self.admits_first_order = False
-		self.back_prop = True
-		self.loss = loss
-		self.huber_delta = huber_delta
-		self.hyper = hyper
-		self.prepared_log_marginal = False
-		self.warm_start_solution = None
-		self.max_size = 10000
-		## kernel hyperparameters
-		if kernel is not None:
-			self.kernel_object = kernel
-			self.kernel = kernel.kernel
-			self.d = kernel.d
-		else:
-			self.kernel_object = KernelFunction(kernel_name=kernel_name, gamma=gamma, nu=nu, groups=groups, kappa=kappa,
-												power=power, d=d)
-			self.kernel = self.kernel_object.kernel
-
-			self.gamma = gamma
-			self.v = nu
-			self.groups = groups
-			self.kappa = kappa
-			self.custom = kernel
-			self.optkernel = kernel_name
-
-	def residuals(self,x,y):
-		res = (self.mean(x) - y)
-		return res
-
-	def description(self):
-		"""
-		Description of GP in text
-		:return: string with description
-		"""
-		return self.kernel_object.description() + "\nlambda=" + str(self.s)
-
-	def embed(self, x):
-		return self.kernel_object.embed(x)
-
-	def get_basis_size(self):
-		return self.kernel_object.get_basis_size()
-
-	def make_safe(self, x):
-		"""
-		Make the input dataset numerically stable by removing duplicates?
-		:param x:
-		:return:
-		"""
-		self.epsilon = 0.001
-		# remove vectors that are very close to each other
-		return x
-
-	def add_data_point(self, x, y, Sigma = None):
-
-		if self.x is not None:
-			self.x = torch.cat((self.x, x), dim=0)
-			self.y = torch.cat((self.y, y), dim=0)
-			if Sigma is None:
-				self.Sigma = torch.block_diag(self.Sigma, torch.eye(x.size()[0],dtype = torch.double) * self.s)
-		else:
-			self.x = x
-			self.y = y
-			self.Sigma = Sigma
-		self.fit_gp(self.x, self.y, Sigma = self.Sigma)
-
-	def fit(self, x=None, y=None):
-		if x is not None:
-			self.fit_gp(x,y)
-		else:
-			self.fit_gp(self.x,self.y)
-
-	def lcb(self, xtest):
-		"""
-		Lower confidence bound
-		:return:
-		"""
-		mu, s = self.mean_std(xtest)
-		return mu - 2 * s
-
-	def ucb(self, xtest):
-		"""
-		Upper confidence bound
-		:param xtest:
-		:return:
-		"""
-		mu, s = self.mean_std(xtest)
-		return mu + 2*s
-
-	def fit_gp(self, x, y, Sigma = None, iterative=False, extrapoint=False):
-		"""
-		Fits the Gaussian process, possible update is via iterative inverse
-		:param x: data x
-		:param y: values y
-		:param iterative: iterative inverse, where only last point of x is used
-		:param extrapoint: iterative inverse must be allowed, x is the only addition
-		:return:
-		"""
-		# first fit
-		try:
-			self.n, self.d = list(x.size())
-		except:
-			self.n, self.d = x.shape
-
-		if Sigma is None:
-			self.Sigma = (self.s) * torch.eye(self.n, dtype=torch.float64)
-		else:
-			self.Sigma = Sigma
-
-		if (self.fitted == False or iterative == False):
-
-			if self.safe == True:
-				x = self.make_safe(x)
-
-			self.x = x
-			self.y = y
-			self.K = self.kernel(x, x) + self.Sigma.T @ self.Sigma
-			self.fitted = True
-		else:
-			# iterative inverse
-			if (iterative == True):
-				if extrapoint == False:
-					last_point = self.x[-1, :].view(1, -1)
-				else:
-					last_point = x
-				old_K = self.K
-				old_Kinv = self.Kinv
-			else:
-				pass
-		self.mean_std(x)
-		return None
-
-	def norm(self):
-		if self.fitted:
-			val = torch.sqrt(self.A.T @ self.kernel(self.x, self.x) @ self.A)
-			return val
-		else:
-			return None
-
-	def beta(self, delta=1e-3, norm=1):
-		"""
-		return concentration parameter given the current estimates
-
-		:param delta: failure probability
-		:param norm: norm assumption
-		:return:
-		"""
-		beta_value = self.s * norm + \
-					 torch.sqrt(2 * torch.log(1. / delta + torch.log(torch.det(self.K) / self.s ** self.n)))
-		return beta_value
-
-	def execute(self, xtest):
-		"""
-		Calculates the covariance between data and xtest
-		:param xtest:
-		:return:
-		"""
-		if self.fitted == True:
-			K_star = self.kernel(self.x, xtest)
-		else:
-			K_star = None
-		K_star_star = self.kernel(xtest, xtest)
-		return (K_star, K_star_star)
-
-	def _huber_fit(self, K_star, newK = None):
-		alpha = cp.Variable(self.n)
-		self.jitter = 10e-5
-		if newK is None:
-			K = self.kernel(self.x, self.x) + self.jitter * torch.eye(self.n, dtype=torch.float64)
-		else:
-			K = newK.detach()
-		K = cp.atoms.affine.wraps.psd_wrap(K)
-		objective = cp.Minimize(cp.sum(cp.huber((K @ alpha - self.y.view(-1).numpy())/self.s,M = self.huber_delta)) + self.lam * cp.quad_form(alpha, K))
-		prob = cp.Problem(objective)
-		prob.solve(solver = cp.MOSEK, enforce_dpp = False)
-		if K_star is not None:
-			return K_star@torch.from_numpy(alpha.value).view(-1,1)
-		else:
-			return torch.from_numpy(alpha.value).view(-1,1)
-
-	def _svr_fit(self, K_star, newK = None):
-		alpha = cp.Variable(self.n)
-		self.jitter = 10e-5
-		if newK is None:
-			K = self.kernel(self.x, self.x) + self.jitter * torch.eye(self.n, dtype=torch.float64)
-		else:
-			K = newK.detach()
-
-		K = cp.atoms.affine.wraps.psd_wrap(K)
-		objective = cp.Minimize(self.lam * cp.quad_form(alpha, K))
-		constraints = [cp.abs(K @ alpha - self.y.view(-1).numpy()) <= self.svr_eps ]
-		prob = cp.Problem(objective, constraints)
-		prob.solve(solver = cp.MOSEK, enforce_dpp = False)
-		if K_star is not None:
-			return K_star@torch.from_numpy(alpha.value).view(-1,1)
-		else:
-			return torch.from_numpy(alpha.value).view(-1,1)
-
-
-	def _unif_fit(self, K_star, newK = None):
-		alpha = cp.Variable((self.n,1))
-		self.jitter = 10e-5
-		if newK is None:
-			K = self.kernel(self.x, self.x) + self.jitter * torch.eye(self.n, dtype=torch.float64)
-		else:
-			K = newK.detach()
-
-		K = cp.atoms.affine.wraps.psd_wrap(K)
-		con = 2*self.total_bound*self.prob/((1-self.prob)*np.sqrt(2*np.pi*self.s**2))
-		objective = cp.Minimize(cp.sum(cp.logistic(cp.square(
-			(K @ alpha - self.y.view(-1, 1).numpy())/ (np.sqrt(2)*self.s)) + np.log(con) )) + self.lam * cp.quad_form(alpha, K))
-		prob = cp.Problem(objective)
-		prob.solve(solver = cp.MOSEK, enforce_dpp = False)
-		if K_star is not None:
-			return K_star@torch.from_numpy(alpha.value).view(-1,1)
-		else:
-			return torch.from_numpy(alpha.value).view(-1,1)
-
-	def _unif_fit_torch(self, K_star, newK = None, warm_start = None):
-		self.jitter = 10e-5
-		if newK is None:
-			K = self.kernel(self.x, self.x) + self.jitter * torch.eye(self.n, dtype=torch.float64)
-		else:
-			K = newK.detach()
-
-		con = 2 * self.total_bound * self.prob / ((1 - self.prob) * np.sqrt(2 * np.pi * self.s ** 2))
-		unif = lambda alpha: torch.sum(torch.log(torch.exp( ((K@alpha-self.y.view(-1))**2)/(2*self.s**2) + np.log(con) ) + 1 ) ) \
-										  + self.lam * alpha  @ K@ alpha
-		if warm_start is None:
-			x_init = torch.zeros(size = (self.n,1)).view(-1).double()
-		else:
-			x_init = warm_start.view(-1)
-
-		res = minimize_torch(unif, x_init, method='l-bfgs', tol=1e-3, disp=0,
-							 options={'max_iter': 200, 'gtol': 1e-3})
-		alpha = res.x
-
-		if K_star is not None:
-			return K_star @ alpha.view(-1, 1)
-		else:
-			return alpha.view(-1, 1)
-
-	def _huber_fit_torch(self, K_star, newK = None):
-		self.jitter = 10e-5
-		if newK is None:
-			K = self.kernel(self.x, self.x) + self.jitter * torch.eye(self.n, dtype=torch.float64)
-		else:
-			K = newK
-		L = torch.linalg.cholesky(K)
-
-		huber = lambda beta: torch.nn.functional.huber_loss(L @ beta / self.s, self.y.view(-1) / self.s,
-															reduction='sum',
-															delta=self.huber_delta) + self.lam * beta @ beta
-		#x_init = torch.linalg.solve(L.T@L+torch.eye(self.n).double()*self.s**2*self.lam, self.y)
-		x_init = torch.zeros(size = (self.n,1)).view(-1).double()
-		res = minimize_torch(huber, x_init, method='l-bfgs', tol=1e-4, disp=0,
-							 options={'max_iter': 10**3, 'gtol': 1e-4})
-		alpha = torch.linalg.solve(L,res.x)
-		if K_star is not None:
-			return K_star @ alpha.view(-1, 1)
-		else:
-			return alpha.view(-1,1)
-
-	def mean_std(self, xtest, full=False, reuse=False):
-		if xtest.size()[0]<self.max_size:
-			return self.mean_std_sub(xtest,full=full, reuse=reuse)
-		else:
-			stepby = self.max_size
-			mu = torch.zeros(size=(xtest.size()[0], 1)).double()
-			std = torch.zeros(size=(xtest.size()[0], 1)).double()
-
-			# first
-			i = 0
-			mu[i * stepby:(i + 1) * stepby], std[i * stepby:(i + 1) * stepby] = self.mean_std_sub(
-				xtest[i * stepby:(i + 1) * stepby, :], reuse=False)
-
-			for i in np.arange(1, xtest.size()[0] // stepby, 1):
-				print(i, "/", xtest.size()[0] // stepby)
-				mu[i * stepby:(i + 1) * stepby], std[i * stepby:(i + 1) * stepby] = self.mean_std_sub(
-					xtest[i * stepby:(i + 1) * stepby, :], reuse=True)
-
-			# last
-			if xtest.size()[0] % stepby > 0:
-				mu[xtest.size()[0] - xtest.size()[0] % stepby:], std[
-															  xtest.size()[0] - xtest.size()[0] % stepby:] = self.mean_std_sub(
-					xtest[xtest.size()[0] - xtest.size()[0] % stepby:, :], reuse=True)
-
-			return mu, std
-
-	def mean_std_sub(self, xtest, full=False, reuse=False):
-		"""
-		Return posterior mean and variance as tuple
-		:param xtest: grid, numpy array (2D)
-		:param full: Instead of just poinwise variance, full covariance can be outputed (bool)
-		:return: (tensor,tensor)
-		"""
-		if full:
-			(K_star, K_star_star) = self.execute(xtest)
-		else:
-			K_star = self.kernel(self.x, xtest)
-			diag_K_star_star = torch.hstack([self.kernel(xtest[i,:].view(1,-1),xtest[i,:].view(1,-1)).view(1) for i in range(xtest.size()[0])])
-
-		if self.fitted == False:
-			# the process is not fitted
-
-			if full == False:
-				x = torch.sum(xtest, dim=1)
-				#first = torch.diag(K_star_star).view(-1, 1)
-				first = diag_K_star_star.view(-1,1)
-				variance = first
-				yvar = torch.sqrt(variance)
-			else:
-				x = torch.sum(xtest, dim=1)
-				first = K_star_star
-				yvar = first
-
-			return (0 * x.view(-1, 1), yvar)
-
-		else:
-
-			if self.back_prop == False:
-				if reuse == False:
-					#self.decomp = torch.lu(self.K.unsqueeze(0))
-					self.LU, self.pivot = torch.linalg.lu_factor(self.K.unsqueeze(0))
-					#self.A = torch.lu_solve(self.y.unsqueeze(0), *self.decomp)[0, :, :]
-					self.A = torch.linalg.lu_solve(self.LU, self.pivot, self.y.unsqueeze(0))[0,:,:]
-				self.B = torch.t(torch.linalg.lu_solve(self.LU, self.pivot ,torch.t(K_star).unsqueeze(0))[0, :, :])
-			else:
-				if reuse == False:
-					self.A = torch.linalg.lstsq(self.K, self.y)[0]
-				#self.B = torch.t(torch.linalg.solve(self.K, torch.t(K_star)))
-				self.B = torch.t(torch.linalg.lstsq(self.K, torch.t(K_star))[0])
-
-			if self.loss == "squared":
-				ymean = torch.mm(K_star, self.A)
-			elif self.loss == "huber":
-				ymean = self._huber_fit(K_star)
-			elif self.loss == "svr":
-				ymean = self._svr_fit(K_star)
-			elif self.loss == "unif"  or self.loss == "unif_new":
-				ymean = self._unif_fit_torch(K_star)
-			else:
-				raise AssertionError("Loss function not implemented.")
-
-			if full == False:
-				first = diag_K_star_star.view(-1,1)
-				second = torch.einsum('ij,ji->i', (self.B, torch.t(K_star))).view(-1, 1)
-				variance = first - second
-				yvar = torch.sqrt(variance)
-			else:
-				first = K_star_star
-				second = torch.mm(self.B, torch.t(K_star))
-				yvar = first - second
-
-			return (ymean, yvar)
-
-	def mean(self, xtest):
-		"""
-		Calculates the mean prediction over a specific input space
-		:param xtest: input
-		:return:
-		"""
-		K_star = self.kernel(self.x, xtest)
-
-		if self.loss == "squared":
-			ymean = torch.mm(K_star, self.A)
-		elif self.loss == "huber":
-			ymean = self._huber_fit(K_star)
-		else:
-			raise AssertionError("Loss function not implemented.")
-
-		return ymean
-
-	def gradient_mean_var(self, point, hessian=True):
-		"""
-		Can calculate gradient at single point atm.
-
-		:param point:
-		:return:
-		"""
-
-		# mean
-		point.requires_grad_(True)
-		mu = self.mean_std(point)[0]
-		nabla_mu = grad(mu, point, create_graph=True)[0][0]
-
-		if hessian == True:
-			# variance
-			H = self.kernel_object.get_2_der(point)
-			C = self.kernel_object.get_1_der(point, self.x)
-
-			V = H - torch.t(C) @ self.K @ C
-
-			return [nabla_mu, V]
-		else:
-			return nabla_mu
-
-	def mean_gradient_hessian(self, xtest, hessian=False):
-		hessian_mu = torch.zeros(size=(self.d, self.d), dtype=torch.float64)
-		xtest.requires_grad_(True)
-		# xtest.retain_grad()
-		mu = self.mean_std(xtest)[0]
-		# mu.backward(retain_graph=True)
-
-		# nabla_mu = xtest.grad
-		nabla_mu = grad(mu, xtest, create_graph=True)[0][0]
-
-		if hessian == False:
-			return nabla_mu
-		else:
-			for i in range(self.d):
-				hessian_mu[i, :] = grad(nabla_mu[i], xtest, create_graph=True, retain_graph=True)[0][0]
-			return [nabla_mu, hessian_mu]
-
-	def sample(self, xtest, size=1, jitter=10e-8):
-		"""
-			Samples Path from GP, return a numpy array evaluated over grid
-		:param xtest: grid
-		:param size: number of samples
-		:return: numpy array
-		"""
-		nn = list(xtest.size())[0]
-
-		if self.fitted == True:
-			(ymean, yvar) = self.mean_std(xtest, full=True)
-			Cov = yvar + 10e-10 * torch.eye(nn, dtype=torch.float64)
-			L = torch.linalg.cholesky(Cov)
-			# L = torch.from_numpy(np.linalg.cholesky(Cov.numpy()))
-			random_vector = torch.normal(mean=torch.zeros(nn, size, dtype=torch.float64), std=1.)
-			f = ymean + torch.mm(L, random_vector)
-		else:
-			(K_star, K_star_star) = self.execute(xtest)
-			L = torch.linalg.cholesky(K_star_star + jitter * torch.eye(nn, dtype=torch.float64))
-			random_vector = torch.normal(mean=torch.zeros(nn, size, dtype=torch.float64), std=1.)
-			f = self.mu + torch.mm(L, random_vector)
-		return f
-
-	def sample_and_max(self, xtest, size=1):
-		"""
-			Samples Path from GP and takes argmax
-		:param xtest: grid
-		:param size: number of samples
-		:return: (argmax, max)
-		"""
-		f = self.sample(xtest, size=size)
-		self.temp = f
-		val, index = torch.max(f, dim=0)
-		return (xtest[index, :], val)
-
-
-	def log_marginal(self, kernel, X, weight):
-
-		if self.loss == "squared":
-			return self._log_marginal_squared(kernel, X, weight)
-		elif self.loss == "unif_new":
-			return self._log_marginal_unif(kernel, X, weight)
-		else:
-			return self._log_marginal_map(kernel, X, weight)
-
-	def _log_marginal_unif(self,kernel,X,weight):
-		if not self.prepared_log_marginal:
-			self._prepare_log_marginal_unif()
-
-		func = kernel.get_kernel()
-		self.jitter = 10e-4
-		K = func(self.x, self.x, **X) + torch.eye(self.n, dtype=torch.float64) * self.jitter
-		#print ("Kernel")
-		#print (K)
-		L = torch.linalg.cholesky(K)
-		self.L_unif.value = (L.data.numpy())
-
-		self.prob_unif.solve(solver=cp.MOSEK, enforce_dpp=False, warm_start=True)
-
-		solution = torch.zeros(size=(self.n, 1), requires_grad=True).reshape(-1).double()
-		solution.data = torch.from_numpy(self.beta_unif.value)
-		con = 2 * self.total_bound * self.prob / ((1 - self.prob) * np.sqrt(2 * np.pi * self.s ** 2))
-
-		loglikelihood = lambda beta: torch.sum(torch.log(torch.exp( ((L@beta-self.y.view(-1))**2)/(2*self.s**2) + np.log(con) ) + 1 ) ) \
-										  + self.lam * beta.T  @ beta
-
-		H = hessian(loglikelihood)(solution)
-		logdet = - 0.5* torch.slogdet(H)[1] * weight
-		logprob = -0.5* loglikelihood(solution) + logdet
-		logprob = -logprob
-		return logprob
-
-	def _prepare_log_marginal_unif(self):
-
-		self.beta_unif = cp.Variable(self.n)
-		self.L_unif = cp.Parameter((self.n, self.n))
-
-		con = 2 * self.total_bound * self.prob / ((1 - self.prob) * np.sqrt(2 * np.pi * self.s ** 2))
-		#self.objective_unif = cp.Minimize(cp.sum(cp.logistic(cp.square(
-		#	(self.K_unif @ self.alpha_unif - self.y.view(-1).numpy()) / (np.sqrt(2) * self.s)) + np.log(con))) + self.lam * cp.quad_form(
-		#	self.alpha_unif, self.L))
-		self.objective_unif = cp.Minimize(cp.sum(cp.logistic(cp.square(
-			(self.L_unif @ self.beta_unif - self.y.view(-1).numpy()) / (np.sqrt(2) * self.s)) + np.log(con))) + self.lam * cp.sum_squares(self.beta_unif))
-		self.prob_unif = cp.Problem(self.objective_unif)
-		self.prepared_log_marginal  = True
-
-	def _prepare_log_marginal_huber(self):
-		beta = cp.Variable(self.n)
-		L = cp.Parameter((self.n, self.n))
-
-		objective = cp.Minimize(cp.sum(
-			cp.huber((L @ beta - self.y.view(-1).numpy()) / self.s, M=self.huber_delta)) + self.lam * cp.sum_squares(
-			beta))
-
-		prob = cp.Problem(objective)
-		cvxpylayer = CvxpyLayer(prob, parameters=[L], variables=[beta])
-		self.prepared_log_marginal = True
-		print ("cvxpy-layer has been initialized.")
-		return cvxpylayer
-
-	def _log_marginal_huber_cvxpy(self, kernel, X, weight):
-		func = kernel.get_kernel()
-		self.jitter = 10e-4
-		L_tch = torch.linalg.cholesky(func(self.x, self.x, **X) + torch.eye(self.n, dtype=torch.float64) * self.jitter)
-
-		if not self.prepared_log_marginal:
-			self._cvxpylayer = self._prepare_log_marginal_huber()
-		solution = self._cvxpylayer(L_tch)[0]
-
-		huber = lambda beta: torch.nn.functional.huber_loss(L_tch@beta/self.s,self.y.view(-1)/self.s,reduction='sum',delta = self.huber_delta) + self.lam * beta.T @ beta
-		H = torch.autograd.functional.hessian(huber, solution)
-
-		logdet = - 0.5* torch.slogdet(H)[1]* weight
-		logprob = -0.5* huber(solution) +logdet
-		logprob = -logprob
-		return logprob
-
-
-	def _log_marginal_map(self, kernel, X, weight):
-		# this implementation uses Danskin theorem to simplify gradient propagation
-		func = kernel.get_kernel()
-		self.jitter = 10e-4
-		K_tch =func(self.x, self.x, **X) + torch.eye(self.n, dtype=torch.float64) * self.jitter
-
-		# solve
-		solution = torch.zeros(size=(self.n, 1), requires_grad=True).reshape(-1).double()
-		if self.warm_start_solution is None:
-			self.warm_start_solution = solution.clone()
-
-		if self.loss == "huber":
-			alpha = self._huber_fit(None, newK = K_tch).detach()
-			loglikelihood = lambda alpha: torch.nn.functional.huber_loss(K_tch@alpha/self.s,self.y.view(-1)/self.s,
-									reduction='sum',delta = self.huber_delta) + self.lam * alpha.T @K_tch@ alpha
-
-			solution.data = alpha.reshape(-1).data
-			self.warm_start_solution.data = solution.data
-			mask = torch.abs(K_tch @ alpha - self.y)/self.s<self.huber_delta
-			mask = mask.view(-1).double()
-			D = torch.diag(mask)
-			H =  K_tch@D@K_tch+ 2 * self.lam * K_tch
-
-		elif self.loss == "svr":
-			alpha = self._svr_fit(None, newK=K_tch).detach()
-
-			loglikelihood = lambda alpha: torch.sum(torch.abs(K_tch@alpha-self.y.view(-1))*(K_tch@alpha -self.y.view(-1) > self.svr_eps).int()) \
-										 + self.lam * alpha.T @K_tch@ alpha
-
-			solution.data = alpha.reshape(-1).data
-			self.warm_start_solution.data = solution.data
-			H = torch.autograd.functional.hessian(loglikelihood, solution)
-
-		elif self.loss == "unif":
-			alpha = self._unif_fit_torch(None, newK=K_tch).detach()
-			con = 2 * self.total_bound * self.prob / ((1 - self.prob) * np.sqrt(2 * np.pi * self.s ** 2))
-
-
-			loglikelihood = lambda alpha: torch.sum(torch.log(torch.exp( ((K_tch@alpha-self.y.view(-1))**2)/(2*self.s**2) + np.log(con) ) + 1 ) ) \
-										  + self.lam * alpha @ K_tch@ alpha
-			#v = lambda alpha : torch.sum(torch.exp( ((K_tch@alpha-self.y.view(-1))**2)/(2*self.s**2) + np.log(con) ))
-			solution.data = alpha.reshape(-1).data
-			self.warm_start_solution.data = solution.data
-			H = hessian(loglikelihood)(solution)
-
-		logdet = - 0.5* torch.slogdet(H)[1] * weight
-		logprob = -0.5* loglikelihood(solution) + logdet
-		logprob = -logprob
-		return logprob
-
-
-
-	def _log_marginal_squared(self, kernel, X, weight):
-		func = kernel.get_kernel()
-		K = func(self.x, self.x, **X) + torch.eye(self.n, dtype=torch.float64) * self.s * self.s
-		logdet = -0.5 * torch.slogdet(K)[1] * weight
-		alpha = torch.linalg.solve(K, self.y)
-		logprob = -0.5 * torch.mm(torch.t(self.y), alpha) + logdet
-		logprob = -logprob
-		return logprob
-
-	def optimize_params(self, type='bandwidth', restarts=10, regularizer=None,
-						maxiter=1000, mingradnorm=1e-4, verbose=False, optimizer="pymanopt", scale=1., weight=1., save = False,
-								save_name = 'model.np', init_func = None, bounds = None, parallel = False, cores = None):
-
-		# Spectral norm regularizer
-		if regularizer is not None:
-			if regularizer[0] == "spectral_norm":
-				regularizer_func = lambda S: regularizer[1] * torch.norm(1/S[0], p='nuc')
-			elif regularizer[0] == 'lasso':
-				regularizer_func = lambda S: regularizer[1] * torch.norm(1/S[0], p=1)
-			else:
-				regularizer_func = None
-		else:
-			regularizer_func = None
-
-		if type == "bandwidth":
-			params = {}
-			for key, dict2 in self.kernel_object.params_dict.items():
-				if 'gamma' in dict2.keys():
-					params[key] = {'gamma': (init_func, Euclidean(1), bounds)}
-				elif 'ard_gamma' in dict2.keys():
-					params[key] = {'ard_gamma': (init_func, Euclidean(len(dict2['group'])), bounds)}
-
-		elif type == "bandwidth+noise":
-			params = {}
-			init_func_noise = lambda x: self.s
-			for key, dict2 in self.kernel_object.params_dict.items():
-
-				if 'gamma' in dict2.keys():
-					params[key] = {'gamma': (init_func, Euclidean(1), bounds)}
-
-				elif 'ard_gamma' in dict2.keys():
-					params[key] = {'ard_gamma': (init_func, Euclidean(len(dict2['group'])), bounds)}
-
-			params['likelihood'] = {'sigma':(init_func_noise, Euclidean(1), None )}
-
-		elif type == "rots":
-			params = {}
-			d = int(self.kernel_object.d)
-			for key, dict2 in self.kernel_object.params_dict.items():
-				if 'rot' in dict2.keys():
-					params[key] = {'rot': (None, Stiefel(d, d), None)}
-		elif type == "groups":
-			params = {}
-			optimizer = "discrete"
-			d = self.kernel_object.d
-			for key, dict2 in self.kernel_object.params_dict.items():
-				if 'groups' in dict2.keys():
-					params[key] = {'groups': (None, helper.generate_groups(d), None)}
-			pass
-		elif type == "covariance":
-			params = {}
-			d = int(self.kernel_object.d)
-			for key, dict2 in self.kernel_object.params_dict.items():
-				if 'cov' in dict2.keys():
-					params[key] = {'cov': (None, PSDFixedRank(d, d), None)}
-		else:
-			raise AttributeError("This quick-optimization is not implemented.")
-
-		self.optimize_params_general(params=params, restarts=restarts,
-									 optimizer=optimizer, regularizer_func=regularizer_func,
-									 maxiter=maxiter, mingradnorm=mingradnorm, verbose=verbose, scale=scale,
-									 weight=weight, save = save, save_name = save_name, parallel = parallel, cores = cores)
-
-	def log_probability(self, xtest, sample):
-		from scipy.stats import multivariate_normal
-		mu, covar = self.mean_std(xtest, full=True)
-		p = np.log(multivariate_normal.pdf(sample.view(-1).numpy(), mean=mu.view(-1).numpy(), cov=covar.numpy()))
-		return p
-
-	def volume_mean_cvxpy(self, xtest, weights=None, eps=10e-2,
-						  tol=10e-14, max_weight=1, max_iter=1000,
-						  verbose=False, scale=10e-4, slope=1.,
-						  bisections=10, B='auto', optimal_scale=None,
-						  optimize_scale=False, relax='relu'):
-
-		n = self.x.size()[0]
-		K = self.get_kernel()  # (self.x, self.x)
-		Kinv = torch.pinverse(K + eps * torch.eye(K.size()[0]).double()).numpy()
-		if weights is None:
-			weights = torch.ones(self.x.size()[0]) / n
-		if B == 'auto':
-			alpha, _ = torch.lstsq(self.y, K)
-			beta = K @ alpha
-			B = beta.T @ Kinv @ beta
-			print("Auto:B", B)
-
-		def fun(scale_arg):
-			beta = cp.Variable(n)
-			if relax == 'relu':
-				loss_fn_transformed = cp.sum(cp.pos(weights * slope * (
-							cp.abs(beta - self.y.numpy().reshape(-1)) - eps))) + 0.5 * scale_arg * cp.quad_form(beta,
-																												Kinv)
-			elif relax == 'log':
-				loss_fn_transformed = cp.sum(cp.logistic(weights * slope * (
-						cp.abs(beta - self.y.numpy().reshape(-1)) - eps))) + 0.5 * scale_arg * cp.quad_form(beta,
-																											Kinv)
-
-			# loss_fn_transformed = cp.sum(weights*logit(slope*(cp.abs(beta - self.y.numpy().reshape(-1)) -eps))) +  0.5*scale_arg*cp.quad_form(beta, Kinv)-
-
-			prob = cp.Problem(cp.Minimize(loss_fn_transformed))
-			# prob.solve(solver=cp.MOSEK, feastol=tol, verbose=False)
-			prob.solve(solver=cp.MOSEK, verbose=False)
-			if verbose == True:
-				print("scale:", scale_arg, "cond:", np.linalg.cond(Kinv), "sub.", beta.value.T @ Kinv @ beta.value - B,
-					  "B:", B)
-			return beta.value.T @ Kinv @ beta.value - B
-
-		if optimize_scale:
-			return helper.bisection(fun, 0., max_weight, bisections)
-
-		if optimal_scale is None:
-			scale_star = helper.bisection(fun, 0., max_weight, bisections)
-		else:
-			scale_star = optimal_scale
-
-		beta = cp.Variable(n)
-		if relax == 'relu':
-			loss_fn_transformed = cp.sum(weights * cp.pos(
-				slope * (cp.abs(beta - self.y.numpy().reshape(-1)) - eps))) + 0.5 * scale_star * cp.quad_form(beta,
-																											  Kinv)
-		elif relax == 'log':
-			loss_fn_transformed = cp.sum(weights * cp.logistic(
-				slope * (cp.abs(beta - self.y.numpy().reshape(-1)) - eps))) + 0.5 * scale_star * cp.quad_form(beta,
-																											  Kinv)
-		prob = cp.Problem(cp.Minimize(loss_fn_transformed))
-		# prob.solve(solver=cp.CVXOPT, feastol=tol, verbose=verbose)
-		prob.solve(solver=cp.MOSEK, verbose=verbose)
-		beta_torch = torch.from_numpy(beta.value).view(-1, 1)
-		alpha = torch.from_numpy(Kinv) @ beta_torch
-		ytest = self.kernel(self.x, xtest) @ alpha
-		return ytest
-
-	def volume_mean(self, xtest, weights=None, eps=10e-2, tol=10e-6, max_iter=1000, verbose=False, eta_start=0.01,
-					eta_decrease=0.9, scale=1, slope=1., warm=True, relax='relu', norm=False, B='auto'):
-		self.scale = scale
-		self.relax = relax
-
-		K = self.get_kernel()  # (self.x, self.x)
-		Kinv = torch.pinverse(K)
-
-		if weights is None:
-			weights = torch.ones(self.x.size()[0])
-		else:
-			weights[weights < 10e-6] = 0.  # * self.x.size()[0]
-			weights = weights.view(-1)
-		if warm == True:
-			# warm start with L2 fit
-			alpha, _ = torch.lstsq(self.y, K)
-			beta = K @ alpha
-		else:
-			beta = torch.randn(size=(self.n, 1)).double()  # .requires_grad_(True)*0
-
-		# loss_fn_original = lambda alpha: torch.sum(torch.relu(torch.abs(K @ alpha - self.y) -eps)) + 0.5*self.s * alpha.T @ K @ alpha
-		if self.relax == "relu":
-			loss_fn_transformed = lambda beta: torch.sum(
-				torch.relu(torch.abs(beta - self.y) - eps)) + self.scale * 0.5 * self.s * beta.T @ Kinv @ beta
-
-		elif self.relax == "tanh":
-			self.slope = slope
-			tanh = lambda x: (torch.tanh(self.slope * x) + 1) * 0.5
-			loss_fn_transformed = lambda beta: torch.sum(weights * tanh(torch.abs(beta - self.y) - eps).view(
-				-1)) + 0.5 * self.s * self.scale * beta.T @ Kinv @ beta
-
-		elif self.relax == "elu":
-			self.slope = slope
-			elu = lambda x: torch.nn.elu(x, alpha=self.slope)
-			loss_fn_transformed = lambda beta: torch.sum(
-				elu(torch.abs(beta - self.y) - eps)) + 0.5 * self.s * self.scale * beta.T @ Kinv @ beta
-
-		elif self.relax == "relu":
-			return self.volume_mean_cvxpy(xtest, weights=weights, eps=eps, scale=scale, tol=tol)
-		else:
-			raise AssertionError("Unkown relaxation.")
-
-		current_loss = 10e10
-		eta = eta_start
-		for i in range(max_iter):
-			grad = self.s * (Kinv @ beta)
-			beta = self.proximal(beta, grad, eta, eps, weights)
-			past_loss = current_loss
-			current_loss = loss_fn_transformed(beta)
-			if current_loss > past_loss:
-				eta = eta * eta_decrease
-			elif np.abs(current_loss - past_loss) < tol:
-				break
-
-			# print (i, beta.T)
-			if verbose == True:
-				print(i, loss_fn_transformed(beta), eta)
-
-		print("final norm:", beta.T @ Kinv @ beta)
-
-		# alpha = torch.inverse(self.K) @ beta
-		alpha = torch.pinverse(K) @ beta
-		# alpha = torch.lstsq(K,beta)
-		ytest = self.kernel(self.x, xtest) @ alpha
-		# max = torch.max(torch.abs(beta - self.y))
-		if norm == True:
-			return beta.T @ Kinv @ beta
-		# yz = self.kernel(self.x, self.x)  @ alpha
-		# approx_v = torch.sum(torch.relu(torch.abs(beta - self.y) -eps))/max
-		# approx_p = approx_v/self.n
-		# mask = (torch.abs(yz[:,0] - self.y[:,0])) > eps
-		# approx_p = float(torch.sum(mask))/float(self.n)
-		return ytest  # ,approx_p
-
-	def volume_mean_norm(self, xtest, weights=None, eps=10e-2, tol=10e-6, max_iter=1000, verbose=False, eta_start=0.01,
-						 eta_decrease=0.9, scale=1, slope=1., warm=True, relax='relu', B='auto'):
-		K = self.kernel(self.x, self.x)
-		Kinv = torch.pinverse(K)
-		if B == 'auto':
-			alpha, _ = torch.lstsq(self.y, self.K)
-			beta = K @ alpha
-			B = beta.T @ Kinv @ beta
-
-		func = lambda s: self.volume_mean(xtest, weights=weights, eps=eps, tol=tol, max_iter=max_iter, verbose=verbose,
-										  eta_start=eta_start,
-										  eta_decrease=eta_decrease, scale=s, slope=slope, warm=warm, relax=relax,
-										  norm=True) - B
-
-		s_star = stpy.optim.custom_optimizers.bisection(func, 0., 1000., 10)
-
-		return self.volume_mean(xtest, weights=weights, eps=eps, tol=tol, max_iter=max_iter, verbose=verbose,
-								eta_start=eta_start,
-								eta_decrease=eta_decrease, scale=s_star, slope=slope, warm=warm, relax=relax,
-								norm=False)
-
-	def proximal(self, beta, nabla, eta, eps, weights):
-		res = beta
-		for i in range(self.n):
-			from scipy.optimize import minimize
-
-			b = float(beta[i, :])
-			y = float(self.y[i, :])
-			g = float(nabla[i, :])
-			w = float(weights[i])
-			# s = float(self.s)
-
-			tanh = lambda x: (np.tanh(self.slope * x) + 1) * 0.5
-			elu = lambda x: torch.elu(x, alpha=self.slope).numpy()
-
-			if self.relax == "relu":
-				loss_reg = lambda x: w * np.maximum(0, np.abs(x - y) - eps)
-			elif self.relax == "tanh":
-				loss_reg = lambda x: w * tanh(np.abs(x - y) - eps)
-			elif self.relax == "elu":
-				loss_reg = lambda x: w * elu(np.abs(x - y) - eps)
-			else:
-				raise AssertionError("Unkown relaxation.")
-
-			loss_scalar = lambda x: ((1 / (2. * eta)) * (x - (b - eta * g)) ** 2) + loss_reg(x)
-
-			x0 = np.array([0.])
-			# print (minimize(loss_scalar,x0,method ='nelder-mead').x)
-			res[i, :] = float(minimize(loss_scalar, x0, method='nelder-mead').x)
-		return res
-
-	def get_lambdas(self, beta, mean=False):
-		"""
-		Gets lambda function to evaluate acquisiton function and its derivative
-		:param beta: beta in GP-UCB
-		:return: [lambda,lambda]
-		"""
-		mean = lambda x: self.mean_std(x.reshape(1, -1), reuse=True)[0][0][0]
-		sigma = lambda x: self.mean_std(x.reshape(1, -1), reuse=True)[1][0][0]
-
-		if mean == True:
-			return [mean, sigma]
-		else:
-			fun = lambda x: -(mean(x) + np.sqrt(beta) * sigma(x))
-			grad = lambda x: -complex_step_derivative(fun, 1e-10, x.reshape(1, -1))
-
-			return [fun, grad]
-
-	def get_kernel(self):
-		return self.K
-
-	def ucb_optimize(self, beta, multistart=25, lcb=False):
-		"""
-		Optimizes UCB acquisiton function and return next point and its value as output
-		:param beta: beta from GP UCB
-		:param multistart: number of starts
-		:return: (next_point, value at next_point)
-		"""
-
-		mean = lambda x: self.mean_std(x, reuse=True)[0][0][0]
-		sigma = lambda x: self.mean_std(x, reuse=True)[1][0][0]
-
-		ucb = lambda x: torch.dot(torch.Tensor([1.0, np.sqrt(beta)]), torch.Tensor(
-			[self.mean_std(x, reuse=True)[0][0][0], self.mean_std(x, reuse=True)[1][0][0]]))
-		lcb = lambda x: torch.dot(torch.Tensor([1.0, np.sqrt(beta)]), torch.Tensor(
-			[self.mean_std(x, reuse=True)[0][0][0], -self.mean_std(x, reuse=True)[1][0][0]]))
-
-		if lcb == False:
-			fun2 = lambda x: -ucb(torch.from_numpy(x).view(1, -1)).numpy()
-		else:
-			fun2 = lambda x: -lcb(torch.from_numpy(x).view(1, -1)).numpy()
-		fun = lambda x: -(
-					mean(torch.from_numpy(x).view(1, -1)) + np.sqrt(beta) * sigma(torch.from_numpy(x).view(1, -1)))
-
-		self.back_prop = False
-		self.mean_std(self.x)
-
-		mybounds = self.bounds
-
-		results = []
-
-		from scipy.optimize import minimize
-
-		for i in range(multistart):
-			x0 = np.random.randn(self.d)
-			for i in range(self.d):
-				x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
-
-			res = minimize(fun2, x0, method="L-BFGS-B", jac=None, tol=0.000001, bounds=mybounds)
-			solution = res.x
-			results.append([solution, -fun(solution)])
-
-		results = np.array(results)
-		index = np.argmax(results[:, 1])
-		solution = results[index, 0]
-
-		return (torch.from_numpy(solution), -fun(solution))
-
-	def isin(self, xnext):
-		self.epsilon = 0.001
-		for v in self.x:
-			if torch.norm(v - xnext, p=2) < self.epsilon:
-				return True
-
-	def sample_and_condition(self, x):
-		xprobe = x.view(1, -1)
-		fprobe = self.sample(xprobe)
-		if not self.isin(xprobe):
-			self.x = torch.cat((self.x, xprobe), dim=0)
-			self.y = torch.cat((self.y, fprobe), dim=0)
-			self.fit_gp(self.x, self.y)
-		return -fprobe
-
-	def get_lambdas_TH(self):
-		fun = lambda x: self.sample_and_condition(x)
-		grad = None
-		return [fun, grad]
-
-	def sample_iteratively_max(self, xtest, multistart=20, minimizer="coordinate-wise", grid=100):
-		"""
-			Samples Path from GP and takes the maximum iteratively
-			:param xtest: grid
-			:param size: number of samples
-			:return: numpy array
-		"""
-		# print ("Iterative:",multistart,minimizer,grid)
-		from scipy.optimize import minimize
-		# old stuff
-		xold = self.x
-		yold = self.y
-
-		# with fixed grid
-		if xtest is not None:
-			# number of samples
-			nn = xtest.shape[0]
-
-			f = torch.zeros(nn, dtype=torch.float64)
-
-			for j in range(nn):
-				xprobe = xtest[j, :].view(1, -1)
-				(K_star, K_star_star) = self.execute(xprobe)
-				(ymean, yvar) = self.mean_std(xprobe)
-				L = torch.sqrt(K_star_star + self.s * self.s * torch.eye(1, dtype=torch.float64) - yvar)
-				fprobe = ymean + L * torch.randn(1, dtype=torch.float64)
-				# add x and fprobe to the dataset and redo the whole
-				f[j] = fprobe
-				if not self.isin(xprobe):
-					self.x = torch.cat((self.x, xprobe), dim=0)
-					self.y = torch.cat((self.y, fprobe), dim=0)
-
-				self.fit_gp(self.x, self.y)
-
-			val, index = torch.max(f, dim=0)
-			self.fit_gp(xold, yold)
-			return (xtest[index, :], f[index])
-
-		else:
-			# Iterative without grid
-
-			# get bounds
-			if self.bounds == None:
-				mybounds = tuple([(-self.diameter, self.diameter) for i in range(self.d)])
-			else:
-				mybounds = self.bounds
-			[fun, grad] = self.get_lambdas_TH()
-
-			results = []
-			for j in range(multistart):
-
-				# print ("Multistart:",j)
-				x0 = torch.randn(self.d, dtype=torch.float64)
-				for i in range(self.d):
-					x0[i].uniform_(mybounds[i][0], mybounds[i][1])
-
-				# simple coordnate-wise optimization
-				if minimizer == "coordinate-wise":
-					solution = x0
-					for i in range(self.d):
-						xtest = torch.from_numpy(np.tile(x0, (grid, 1)))
-						xtest[:, i] = torch.linspace(mybounds[i][0], mybounds[i][1], grid)
-						sample = self.sample(xtest)
-
-						## Add to the posterior
-						self.x = torch.cat((self.x, xtest), dim=0)
-						self.y = torch.cat((self.y, sample), dim=0)
-
-						# argmax
-						val, index = torch.max(sample, dim=0)
-						out = xtest[index, :]
-
-						# fit new GP
-						self.fit_gp(self.x, self.y)
-						solution[i] = out[0, i]
-
-				elif minimizer == "L-BFGS-B":
-					solution = np.random.randn(self.d)
-					xmax = [b[1] for b in mybounds]
-					xmin = [b[0] for b in mybounds]
-					bounds = MyBounds(xmax=xmax, xmin=xmin)
-					func = lambda x: fun(torch.from_numpy(x)).numpy()[0][0]
-					res = scipy.optimize.basinhopping(func, solution, disp=False, niter=grid, accept_test=bounds)
-					solution = torch.from_numpy(res.x)
-
-				else:
-					raise AssertionError("Wrong optimizer selected.")
-
-				results.append(torch.cat((solution, -fun(solution)[0])))
-				self.x = xold
-				self.y = yold
-				self.fit_gp(self.x, self.y)
-
-			results = torch.stack(results)
-			val, index = torch.max(results[:, -1], dim=0)
-			solution = results[index, 0:self.d].view(1, self.d)
-			self.x = xold
-			self.y = yold
-			self.fit_gp(self.x, self.y)
-
-			return (solution, -fun(solution))
+    def __init__(
+        self,
+        gamma=1,
+        s=0.001,
+        kappa=1.0,
+        kernel_name="squared_exponential",
+        diameter=1.0,
+        groups=None,
+        bounds=None,
+        nu=1.5,
+        kernel=None,
+        d=1,
+        power=2,
+        lam=1.0,
+        loss="squared",
+        huber_delta=1.35,
+        hyper="classical",
+        B=1.0,
+        svr_eps=0.1,
+    ):
+        """
+
+        :param gamma: Smoothness parameter for the squared exponential, Laplace and Matern kernels
+        :param s: level of noise
+        :param kernel: pre-built kernel object; when given, it is used instead of constructing one from kernel_name
+        :param diameter: diameter of the set (deprecated)
+        :param groups: additive groups
+        :param bounds: bounds for the continuous optimization
+        :param nu: parameter for the Matern kernel
+        """
+
+        ## GP properties
+        self.s = s
+        self.d = d
+        self.x = None
+        self.K = np.array([1.0])
+        self.mu = 0.0
+        self.lam = lam
+        self.total_bound = B
+        self.prob = 0.5
+        self.svr_eps = svr_eps
+        self.safe = False
+        self.fitted = False
+        self.diameter = diameter
+        self.bounds = bounds
+        self.admits_first_order = False
+        self.back_prop = True
+        self.loss = loss
+        self.huber_delta = huber_delta
+        self.hyper = hyper
+        self.prepared_log_marginal = False
+        self.warm_start_solution = None
+        self.max_size = 10000
+        ## kernel hyperparameters
+        if kernel is not None:
+            self.kernel_object = kernel
+            self.kernel = kernel.kernel
+            self.d = kernel.d
+        else:
+            self.kernel_object = KernelFunction(
+                kernel_name=kernel_name,
+                gamma=gamma,
+                nu=nu,
+                groups=groups,
+                kappa=kappa,
+                power=power,
+                d=d,
+            )
+            self.kernel = self.kernel_object.kernel
+
+            self.gamma = gamma
+            self.v = nu
+            self.groups = groups
+            self.kappa = kappa
+            self.custom = kernel
+            self.optkernel = kernel_name
+
+    def residuals(self, x, y):
+        res = self.mean(x) - y
+        return res
+
+    def description(self):
+        """
+        Description of GP in text
+        :return: string with description
+        """
+        return self.kernel_object.description() + "\nlambda=" + str(self.s)
+
+    def embed(self, x):
+        return self.kernel_object.embed(x)
+
+    def get_basis_size(self):
+        return self.kernel_object.get_basis_size()
+
+    def make_safe(self, x):
+        """
+        Intended to make the input dataset numerically stable by removing near-duplicate points; currently returns x unchanged.
+        :param x:
+        :return:
+        """
+        self.epsilon = 0.001
+        # remove vectors that are very close to each other
+        return x
+
+    def add_data_point(self, x, y, Sigma=None):
+
+        if self.x is not None:
+            self.x = torch.cat((self.x, x), dim=0)
+            self.y = torch.cat((self.y, y), dim=0)
+            if Sigma is None:
+                self.Sigma = torch.block_diag(
+                    self.Sigma, torch.eye(x.size()[0], dtype=torch.double) * self.s
+                )
+        else:
+            self.x = x
+            self.y = y
+            self.Sigma = Sigma
+        self.fit_gp(self.x, self.y, Sigma=self.Sigma)
+
+    def fit(self, x=None, y=None):
+        if x is not None:
+            self.fit_gp(x, y)
+        else:
+            self.fit_gp(self.x, self.y)
+
+    def lcb(self, xtest):
+        """
+        Lower confidence bound
+        :return:
+        """
+        mu, s = self.mean_std(xtest)
+        return mu - 2 * s
+
+    def ucb(self, xtest):
+        """
+        Upper confidence bound
+        :param xtest:
+        :return:
+        """
+        mu, s = self.mean_std(xtest)
+        return mu + 2 * s
+
+    def fit_gp(self, x, y, Sigma=None, iterative=False, extrapoint=False):
+        """
+        Fits the Gaussian process, possible update is via iterative inverse
+        :param x: data x
+        :param y: values y
+        :param iterative: iterative inverse, where only last point of x is used
+        :param extrapoint: iterative inverse must be allowed, x is the only addition
+        :return:
+        """
+        # first fit
+        try:
+            self.n, self.d = list(x.size())
+        except:
+            self.n, self.d = x.shape
+
+        if Sigma is None:
+            self.Sigma = (self.s) * torch.eye(self.n, dtype=torch.float64)
+        else:
+            self.Sigma = Sigma
+
+        if self.fitted == False or iterative == False:
+
+            if self.safe == True:
+                x = self.make_safe(x)
+
+            self.x = x
+            self.y = y
+            self.K = self.kernel(x, x) + self.Sigma.T @ self.Sigma
+            self.fitted = True
+        else:
+            # iterative inverse
+            if iterative == True:
+                if extrapoint == False:
+                    last_point = self.x[-1, :].view(1, -1)
+                else:
+                    last_point = x
+                old_K = self.K
+                old_Kinv = self.Kinv
+            else:
+                pass
+        self.mean_std(x)
+        return None
+
+    def norm(self):
+        if self.fitted:
+            val = torch.sqrt(self.A.T @ self.kernel(self.x, self.x) @ self.A)
+            return val
+        else:
+            return None
+
+    def beta(self, delta=1e-3, norm=1):
+        """
+        return concentration parameter given the current estimates
+
+        :param delta: failure probability
+        :param norm: norm assumption
+        :return:
+        """
+        beta_value = self.s * norm + torch.sqrt(
+            2 * torch.log(1.0 / delta + torch.log(torch.det(self.K) / self.s**self.n))
+        )
+        return beta_value
+
+    def execute(self, xtest):
+        """
+        Calculates the covariance between data and xtest
+        :param xtest:
+        :return:
+        """
+        if self.fitted == True:
+            K_star = self.kernel(self.x, xtest)
+        else:
+            K_star = None
+        K_star_star = self.kernel(xtest, xtest)
+        return (K_star, K_star_star)
+
+    def _huber_fit(self, K_star, newK=None):
+        """
+        Fit the coefficient vector alpha under the Huber loss with cvxpy/MOSEK.
+
+        :param K_star: kernel block applied to alpha, or None
+        :param newK: optional Gram matrix replacing the jittered k(x, x)
+        :return: K_star @ alpha, or the (n, 1) alpha when K_star is None
+        """
+        weights = cp.Variable(self.n)
+        # 10e-5 equals 1e-4; the jitter is stored on the instance as a side effect.
+        self.jitter = 10e-5
+        if newK is not None:
+            gram = newK.detach()
+        else:
+            gram = self.kernel(self.x, self.x)
+            gram = gram + self.jitter * torch.eye(self.n, dtype=torch.float64)
+        gram = cp.atoms.affine.wraps.psd_wrap(gram)
+        scaled_residual = (gram @ weights - self.y.view(-1).numpy()) / self.s
+        data_term = cp.sum(cp.huber(scaled_residual, M=self.huber_delta))
+        penalty = self.lam * cp.quad_form(weights, gram)
+        problem = cp.Problem(cp.Minimize(data_term + penalty))
+        problem.solve(solver=cp.MOSEK, enforce_dpp=False)
+        solution = torch.from_numpy(weights.value).view(-1, 1)
+        return solution if K_star is None else K_star @ solution
+
+    def _svr_fit(self, K_star, newK=None):
+        """
+        Solve min lam * a^T K a subject to |K a - y| <= svr_eps with cvxpy/MOSEK.
+        Returns K_star @ a, or a itself as an (n, 1) tensor when K_star is None.
+        """
+        coef = cp.Variable(self.n)
+        self.jitter = 10e-5
+        if newK is not None:
+            gram = newK.detach()
+        else:
+            gram = self.kernel(self.x, self.x)
+            gram = gram + self.jitter * torch.eye(self.n, dtype=torch.float64)
+        gram = cp.atoms.affine.wraps.psd_wrap(gram)
+        objective = cp.Minimize(self.lam * cp.quad_form(coef, gram))
+        tube = [cp.abs(gram @ coef - self.y.view(-1).numpy()) <= self.svr_eps]
+        problem = cp.Problem(objective, tube)
+        problem.solve(solver=cp.MOSEK, enforce_dpp=False)
+        solution = torch.from_numpy(coef.value).view(-1, 1)
+        return solution if K_star is None else K_star @ solution
+
+    def _unif_fit(self, K_star, newK=None):
+        """
+        Fit alpha for the logistic-of-squared-residual objective with cvxpy/MOSEK.
+
+        The objective is
+        sum(logistic(((K a - y) / (sqrt(2) s))^2 + log(con))) + lam * a^T K a
+        with con = 2 * total_bound * prob / ((1 - prob) * sqrt(2 pi s^2)).
+
+        :param K_star: kernel block applied to alpha, or None
+        :param newK: optional Gram matrix replacing the jittered k(x, x)
+        :return: K_star @ alpha, or the (n, 1) alpha when K_star is None
+        """
+        coef = cp.Variable((self.n, 1))
+        # 10e-5 equals 1e-4; stored on the instance as a side effect.
+        self.jitter = 10e-5
+        if newK is not None:
+            gram = newK.detach()
+        else:
+            gram = self.kernel(self.x, self.x)
+            gram = gram + self.jitter * torch.eye(self.n, dtype=torch.float64)
+        gram = cp.atoms.affine.wraps.psd_wrap(gram)
+
+        gaussian_norm = np.sqrt(2 * np.pi * self.s**2)
+        con = 2 * self.total_bound * self.prob / ((1 - self.prob) * gaussian_norm)
+        targets = self.y.view(-1, 1).numpy()
+        scaled_residual = (gram @ coef - targets) / (np.sqrt(2) * self.s)
+        data_term = cp.sum(cp.logistic(cp.square(scaled_residual) + np.log(con)))
+        penalty = self.lam * cp.quad_form(coef, gram)
+
+        # No constraints: the problem is the regularised objective alone.
+        problem = cp.Problem(cp.Minimize(data_term + penalty))
+        problem.solve(solver=cp.MOSEK, enforce_dpp=False)
+        solution = torch.from_numpy(coef.value).view(-1, 1)
+        return solution if K_star is None else K_star @ solution
+
+    def _unif_fit_torch(self, K_star, newK=None, warm_start=None):
+        """
+        Fit alpha for the "unif" loss by L-BFGS through minimize_torch.
+
+        :param K_star: kernel block applied to alpha, or None
+        :param newK: optional Gram matrix replacing the jittered k(x, x)
+        :param warm_start: optional initial alpha; zeros are used when None
+        :return: K_star @ alpha, or the (n, 1) alpha when K_star is None
+        """
+        # 10e-5 equals 1e-4; stored on the instance as a side effect.
+        self.jitter = 10e-5
+        if newK is not None:
+            gram = newK.detach()
+        else:
+            gram = self.kernel(self.x, self.x)
+            gram = gram + self.jitter * torch.eye(self.n, dtype=torch.float64)
+
+        # Constant whose log is added inside the exponent of the loss.
+        gaussian_norm = np.sqrt(2 * np.pi * self.s**2)
+        con = 2 * self.total_bound * self.prob / ((1 - self.prob) * gaussian_norm)
+
+        def unif(alpha):
+            # log(1 + exp(r^2 / (2 s^2) + log(con))) summed over the data,
+            # plus the quadratic penalty lam * alpha^T K alpha.
+            residual = gram @ alpha - self.y.view(-1)
+            exponent = (residual**2) / (2 * self.s**2) + np.log(con)
+            data_term = torch.sum(torch.log(torch.exp(exponent) + 1))
+            return data_term + self.lam * alpha @ gram @ alpha
+
+        if warm_start is not None:
+            x_init = warm_start.view(-1)
+        else:
+            x_init = torch.zeros(size=(self.n, 1)).view(-1).double()
+
+        # Minimise with L-BFGS (tolerances 1e-3, at most 200 iterations).
+        res = minimize_torch(
+            unif,
+            x_init,
+            method="l-bfgs",
+            tol=1e-3,
+            disp=0,
+            options={"max_iter": 200, "gtol": 1e-3},
+        )
+
+        solution = res.x.view(-1, 1)
+        return solution if K_star is None else K_star @ solution
+
+    def _huber_fit_torch(self, K_star, newK=None):
+        """
+        Fit alpha under the Huber loss by L-BFGS in whitened coordinates.
+
+        With K = L L^T (Cholesky) the optimisation variable is beta = L^T alpha,
+        so that K alpha = L beta and alpha^T K alpha = beta^T beta.
+        :param K_star: kernel block applied to alpha, or None
+        :param newK: optional Gram matrix replacing the jittered k(x, x)
+        :return: K_star @ alpha, or the (n, 1) alpha when K_star is None
+        """
+        self.jitter = 10e-5
+        if newK is None:
+            K = self.kernel(self.x, self.x)
+            K = K + self.jitter * torch.eye(self.n, dtype=torch.float64)
+        else:
+            K = newK
+        L = torch.linalg.cholesky(K)
+        def huber(beta):
+            data_term = torch.nn.functional.huber_loss(
+                L @ beta / self.s,
+                self.y.view(-1) / self.s,
+                reduction="sum",
+                delta=self.huber_delta,
+            )
+            return data_term + self.lam * beta @ beta
+        x_init = torch.zeros(size=(self.n, 1)).view(-1).double()
+        res = minimize_torch(
+            huber, x_init, method="l-bfgs", tol=1e-4, disp=0,
+            options={"max_iter": 10**3, "gtol": 1e-4},
+        )
+        # Recover alpha from beta = L^T alpha; solving with L itself (as the
+        # previous code did) gives L^{-1} beta, which is not the alpha above.
+        alpha = torch.linalg.solve(L.T, res.x).view(-1, 1)
+        return alpha if K_star is None else K_star @ alpha
+
+    def mean_std(self, xtest, full=False, reuse=False):
+        """
+        Posterior mean and standard deviation, evaluated in chunks if needed.
+
+        Inputs with fewer than self.max_size rows go straight to mean_std_sub.
+        Larger inputs are processed max_size rows at a time without forwarding
+        `full`; the first chunk is computed with reuse=False, the rest with
+        reuse=True.
+
+        :param xtest: test inputs, one row per point
+        :param full, reuse: forwarded to mean_std_sub on the unchunked path
+        :return: mean_std_sub's result, or two (N, 1) tensors when chunked
+        """
+        total = xtest.size()[0]
+        if total < self.max_size:
+            return self.mean_std_sub(xtest, full=full, reuse=reuse)
+
+        stepby = self.max_size
+        mu = torch.zeros(size=(total, 1)).double()
+        std = torch.zeros(size=(total, 1)).double()
+        n_chunks = total // stepby
+        # first chunk
+        mu[0:stepby], std[0:stepby] = self.mean_std_sub(xtest[0:stepby, :], reuse=False)
+        for i in range(1, n_chunks):
+            print(i, "/", n_chunks)
+            rows = slice(i * stepby, (i + 1) * stepby)
+            mu[rows], std[rows] = self.mean_std_sub(xtest[rows, :], reuse=True)
+        # trailing partial chunk
+        leftover = total % stepby
+        if leftover > 0:
+            tail = slice(total - leftover, None)
+            mu[tail], std[tail] = self.mean_std_sub(xtest[tail, :], reuse=True)
+        return mu, std
+
+    def mean_std_sub(self, xtest, full=False, reuse=False):
+        """
+        Posterior mean and std (or full covariance) at xtest, one row per test point.
+        :param full: Instead of just pointwise std, the full covariance can be returned (bool)
+        :param reuse: keep the cached self.A (and LU factors) instead of recomputing them (bool)
+        :return: (mean, std) or, with full=True, (mean, covariance); both tensors
+        """
+        if full:
+            (K_star, K_star_star) = self.execute(xtest)
+        else:
+            K_star = self.kernel(self.x, xtest)
+            diag_K_star_star = torch.hstack(  # k(x_i, x_i) per test row, one kernel call each
+                [
+                    self.kernel(xtest[i, :].view(1, -1), xtest[i, :].view(1, -1)).view(
+                        1
+                    )
+                    for i in range(xtest.size()[0])
+                ]
+            )
+
+        if self.fitted == False:
+            # the process is not fitted: return a zero mean and the prior spread
+
+            if full == False:
+                x = torch.sum(xtest, dim=1)  # only used to shape the zero mean below
+                # first = torch.diag(K_star_star).view(-1, 1)
+                first = diag_K_star_star.view(-1, 1)
+                variance = first
+                yvar = torch.sqrt(variance)
+            else:
+                x = torch.sum(xtest, dim=1)  # only used to shape the zero mean below
+                first = K_star_star
+                yvar = first
+
+            return (0 * x.view(-1, 1), yvar)
+
+        else:
+            # fitted: A solves K A = y, B solves K B^T = K_star^T
+            if self.back_prop == False:
+                if reuse == False:
+                    # factor self.K once; later calls with reuse=True read self.LU / self.pivot
+                    self.LU, self.pivot = torch.linalg.lu_factor(self.K.unsqueeze(0))
+                    # self.A = torch.lu_solve(self.y.unsqueeze(0), *self.decomp)[0, :, :]
+                    self.A = torch.linalg.lu_solve(
+                        self.LU, self.pivot, self.y.unsqueeze(0)
+                    )[0, :, :]
+                self.B = torch.t(
+                    torch.linalg.lu_solve(
+                        self.LU, self.pivot, torch.t(K_star).unsqueeze(0)
+                    )[0, :, :]
+                )
+            else:
+                if reuse == False:
+                    self.A = torch.linalg.lstsq(self.K, self.y)[0]
+                # self.B = torch.t(torch.linalg.solve(self.K, torch.t(K_star)))
+                self.B = torch.t(torch.linalg.lstsq(self.K, torch.t(K_star))[0])
+            # mean: closed form for the squared loss, a fresh solver call for the others
+            if self.loss == "squared":
+                ymean = torch.mm(K_star, self.A)
+            elif self.loss == "huber":
+                ymean = self._huber_fit(K_star)
+            elif self.loss == "svr":
+                ymean = self._svr_fit(K_star)
+            elif self.loss == "unif" or self.loss == "unif_new":
+                ymean = self._unif_fit_torch(K_star)
+            else:
+                raise AssertionError("Loss function not implemented.")
+            # spread: prior term minus B K_star^T (diagonal only when full is False)
+            if full == False:
+                first = diag_K_star_star.view(-1, 1)
+                second = torch.einsum("ij,ji->i", (self.B, torch.t(K_star))).view(-1, 1)
+                variance = first - second
+                yvar = torch.sqrt(variance)  # NOTE(review): a slightly negative variance would give NaN here
+            else:
+                first = K_star_star
+                second = torch.mm(self.B, torch.t(K_star))
+                yvar = first - second
+
+            return (ymean, yvar)
+
+    def mean(self, xtest):
+        """
+        Posterior mean at xtest for the "squared" and "huber" losses.
+
+        :param xtest: input
+        :return: k(x, xtest) @ A for "squared", the Huber refit for "huber"
+        :raises AssertionError: for any other loss
+        """
+        K_star = self.kernel(self.x, xtest)
+
+        # Only these two losses are handled in this method.
+        if self.loss == "squared":
+            return torch.mm(K_star, self.A)
+        if self.loss == "huber":
+            return self._huber_fit(K_star)
+        raise AssertionError("Loss function not implemented.")
+
+    def gradient_mean_var(self, point, hessian=True):
+        """
+        Gradient of the posterior mean at a point, optionally with the matrix V.
+        Can calculate the gradient at a single point at the moment.
+
+        :param point: input location; requires_grad_ is switched on in place
+        :param hessian: when True also return V = H - C^T K C
+        :return: [nabla_mu, V] if hessian is True, otherwise nabla_mu
+        """
+        # mean part: differentiate the posterior mean w.r.t. the input
+        point.requires_grad_(True)
+        mu = self.mean_std(point)[0]
+        nabla_mu = grad(mu, point, create_graph=True)[0][0]
+
+        if hessian != True:
+            return nabla_mu
+
+        # variance part, from the kernel's second and first derivatives
+        H = self.kernel_object.get_2_der(point)
+        C = self.kernel_object.get_1_der(point, self.x)
+        # NOTE(review): this uses self.K, not its inverse; confirm intent.
+        V = H - torch.t(C) @ self.K @ C
+        return [nabla_mu, V]
+
+    def mean_gradient_hessian(self, xtest, hessian=False):
+        """
+        Gradient (and optionally Hessian) of the posterior mean by autograd.
+        :param xtest: input point; requires_grad_ is switched on in place
+        :param hessian: when not False also return the (d, d) Hessian
+        :return: nabla_mu, or [nabla_mu, hessian_mu] when hessian is set
+        """
+        second_order = torch.zeros(size=(self.d, self.d), dtype=torch.float64)
+        xtest.requires_grad_(True)
+        mu = self.mean_std(xtest)[0]
+        nabla_mu = grad(mu, xtest, create_graph=True)[0][0]
+        if hessian == False:
+            return nabla_mu
+        for i in range(self.d):
+            second_order[i, :] = grad(
+                nabla_mu[i], xtest, create_graph=True, retain_graph=True
+            )[0][0]
+        return [nabla_mu, second_order]
+
+    def sample(self, xtest, size=1, jitter=10e-8):
+        """
+        Draw sample paths of the process on the grid xtest.
+
+        :param xtest: grid
+        :param size: number of samples
+        :param jitter: diagonal term used when the model is not fitted; the
+            fitted branch adds a fixed 10e-10 instead
+        :return: tensor (not a numpy array) with one column per sample
+        """
+        nn = list(xtest.size())[0]
+        eye = torch.eye(nn, dtype=torch.float64)
+        fitted = self.fitted == True
+        if fitted:
+            # posterior draw: full covariance from mean_std
+            (ymean, yvar) = self.mean_std(xtest, full=True)
+            Cov = yvar + 10e-10 * eye
+        else:
+            # prior draw: kernel of the grid with itself
+            (_, prior) = self.execute(xtest)
+            Cov = prior + jitter * eye
+
+        L = torch.linalg.cholesky(Cov)
+        noise = torch.normal(
+            mean=torch.zeros(nn, size, dtype=torch.float64), std=1.0
+        )
+        offset = ymean if fitted else self.mu
+        return offset + torch.mm(L, noise)
+
+    def sample_and_max(self, xtest, size=1):
+        """
+        Sample paths on the grid and locate the maximum of each path.
+
+        :param xtest: grid
+        :param size: number of samples
+        :return: (grid rows at the maxima, maximal values); samples kept in self.temp
+        """
+        self.temp = self.sample(xtest, size=size)
+        peak_values, peak_rows = torch.max(self.temp, dim=0)
+        return (xtest[peak_rows, :], peak_values)
+
+    def log_marginal(self, kernel, X, weight):
+        """Dispatch to the log-marginal routine that matches self.loss."""
+        special_cases = {
+            "squared": self._log_marginal_squared,
+            "unif_new": self._log_marginal_unif,
+        }
+        handler = special_cases.get(self.loss, self._log_marginal_map)
+        return handler(kernel, X, weight)
+
+    def _log_marginal_unif(self, kernel, X, weight):  # used by log_marginal for loss "unif_new"
+        if not self.prepared_log_marginal:
+            self._prepare_log_marginal_unif()
+        # kernel function, evaluated below with the keyword arguments in X
+        func = kernel.get_kernel()
+        self.jitter = 10e-4
+        K = (
+            func(self.x, self.x, **X)
+            + torch.eye(self.n, dtype=torch.float64) * self.jitter
+        )
+        # Factor K and copy the numeric factor into the cvxpy parameter, so the
+        # cvxpy solve below works on plain numbers rather than on the torch graph.
+        L = torch.linalg.cholesky(K)
+        self.L_unif.value = L.data.numpy()
+        # solve the prepared problem for beta with MOSEK, warm-started
+        self.prob_unif.solve(solver=cp.MOSEK, enforce_dpp=False, warm_start=True)
+        # tensor that receives the cvxpy solution through .data
+        solution = (
+            torch.zeros(size=(self.n, 1), requires_grad=True).reshape(-1).double()
+        )
+        solution.data = torch.from_numpy(self.beta_unif.value)
+        con = (
+            2
+            * self.total_bound
+            * self.prob
+            / ((1 - self.prob) * np.sqrt(2 * np.pi * self.s**2))
+        )
+        # same objective as the cvxpy problem, written in torch with the differentiable L
+        loglikelihood = (
+            lambda beta: torch.sum(
+                torch.log(
+                    torch.exp(
+                        ((L @ beta - self.y.view(-1)) ** 2) / (2 * self.s**2)
+                        + np.log(con)
+                    )
+                    + 1
+                )
+            )
+            + self.lam * beta.T @ beta
+        )
+        # returned value: 0.5 * loss(solution) + 0.5 * weight * log|det H|
+        H = hessian(loglikelihood)(solution)
+        logdet = -0.5 * torch.slogdet(H)[1] * weight
+        logprob = -0.5 * loglikelihood(solution) + logdet
+        logprob = -logprob
+        return logprob
+
+    def _prepare_log_marginal_unif(self):
+        """
+        Build the parametrised cvxpy problem used by _log_marginal_unif.
+
+        Creates self.beta_unif (variable) and self.L_unif (parameter that later
+        receives the Cholesky factor), the objective
+        sum(logistic(((L b - y) / (sqrt(2) s))^2 + log(con))) + lam * ||b||^2
+        and the problem self.prob_unif, then sets self.prepared_log_marginal.
+        """
+        self.beta_unif = cp.Variable(self.n)
+        self.L_unif = cp.Parameter((self.n, self.n))
+
+        # Constant whose log is added inside the logistic term.
+        gaussian_norm = np.sqrt(2 * np.pi * self.s**2)
+        con = 2 * self.total_bound * self.prob / ((1 - self.prob) * gaussian_norm)
+
+        # Residual uses the parameter L in place of a fixed Gram matrix.
+        targets = self.y.view(-1).numpy()
+        scaled_residual = (self.L_unif @ self.beta_unif - targets) / (
+            np.sqrt(2) * self.s
+        )
+        data_term = cp.sum(cp.logistic(cp.square(scaled_residual) + np.log(con)))
+        penalty = self.lam * cp.sum_squares(self.beta_unif)
+
+        self.objective_unif = cp.Minimize(data_term + penalty)
+        self.prob_unif = cp.Problem(self.objective_unif)
+        # Flag checked by _log_marginal_unif to skip rebuilding.
+        self.prepared_log_marginal = True
+
+    def _prepare_log_marginal_huber(self):
+        """
+        Build a CvxpyLayer that solves the Huber problem for a given factor L.
+
+        :return: layer taking L and returning the minimiser beta of
+            sum(huber((L beta - y) / s, huber_delta)) + lam * ||beta||^2
+        """
+        beta = cp.Variable(self.n)
+        L = cp.Parameter((self.n, self.n))
+        # Scaled residual with L as the only free parameter of the problem.
+        scaled_residual = (L @ beta - self.y.view(-1).numpy()) / self.s
+        data_term = cp.sum(cp.huber(scaled_residual, M=self.huber_delta))
+        objective = cp.Minimize(data_term + self.lam * cp.sum_squares(beta))
+        layer = CvxpyLayer(cp.Problem(objective), parameters=[L], variables=[beta])
+        # Flag checked by _log_marginal_huber_cvxpy before building the layer.
+        self.prepared_log_marginal = True
+        print("cvxpy-layer has been initialized.")
+        return layer
+
+    def _log_marginal_huber_cvxpy(self, kernel, X, weight):
+        """
+        Log-marginal objective for the Huber loss, solved through a CvxpyLayer.
+        :param kernel: object whose get_kernel() returns the kernel function
+        :param X: keyword arguments forwarded to the kernel function
+        :param weight: multiplier of the log-determinant term
+        :return: 0.5 * loss(beta*) + 0.5 * weight * log|det H|
+        """
+        func = kernel.get_kernel()
+        self.jitter = 10e-4
+        gram = func(self.x, self.x, **X)
+        gram = gram + torch.eye(self.n, dtype=torch.float64) * self.jitter
+        L_tch = torch.linalg.cholesky(gram)
+        if not self.prepared_log_marginal:
+            self._cvxpylayer = self._prepare_log_marginal_huber()
+        solution = self._cvxpylayer(L_tch)[0]
+        def huber(beta):
+            data_term = torch.nn.functional.huber_loss(
+                L_tch @ beta / self.s,
+                self.y.view(-1) / self.s,
+                reduction="sum",
+                delta=self.huber_delta,
+            )
+            return data_term + self.lam * beta.T @ beta
+        H = torch.autograd.functional.hessian(huber, solution)
+        logdet = -0.5 * torch.slogdet(H)[1] * weight
+        return -(-0.5 * huber(solution) + logdet)
+
+    def _log_marginal_map(self, kernel, X, weight):
+        """
+        Log-marginal objective built around the MAP estimate of alpha.
+
+        Handles the "huber", "svr" and "unif" losses and returns
+        0.5 * loss(alpha*) + 0.5 * weight * log|det H|.
+
+        :raises AssertionError: if self.loss is not one of the three losses
+        """
+        # this implementation uses Danskin theorem to simplify gradient propagation
+        func = kernel.get_kernel()
+        self.jitter = 10e-4
+        K_tch = (
+            func(self.x, self.x, **X)
+            + torch.eye(self.n, dtype=torch.float64) * self.jitter
+        )
+        solution = (
+            torch.zeros(size=(self.n, 1), requires_grad=True).reshape(-1).double()
+        )
+        if self.warm_start_solution is None:
+            self.warm_start_solution = solution.clone()
+
+        if self.loss == "huber":
+            alpha = self._huber_fit(None, newK=K_tch).detach()
+            loglikelihood = (
+                lambda alpha: torch.nn.functional.huber_loss(
+                    K_tch @ alpha / self.s,
+                    self.y.view(-1) / self.s,
+                    reduction="sum",
+                    delta=self.huber_delta,
+                )
+                + self.lam * alpha.T @ K_tch @ alpha
+            )
+            solution.data = alpha.reshape(-1).data
+            self.warm_start_solution.data = solution.data
+            mask = torch.abs(K_tch @ alpha - self.y) / self.s < self.huber_delta
+            mask = mask.view(-1).double()
+            D = torch.diag(mask)
+            H = K_tch @ D @ K_tch + 2 * self.lam * K_tch
+
+        elif self.loss == "svr":
+            alpha = self._svr_fit(None, newK=K_tch).detach()
+            # NOTE(review): the indicator below is one-sided (no abs); confirm intent.
+            loglikelihood = (
+                lambda alpha: torch.sum(
+                    torch.abs(K_tch @ alpha - self.y.view(-1))
+                    * (K_tch @ alpha - self.y.view(-1) > self.svr_eps).int()
+                )
+                + self.lam * alpha.T @ K_tch @ alpha
+            )
+            solution.data = alpha.reshape(-1).data
+            self.warm_start_solution.data = solution.data
+            H = torch.autograd.functional.hessian(loglikelihood, solution)
+
+        elif self.loss == "unif":
+            alpha = self._unif_fit_torch(None, newK=K_tch).detach()
+            gaussian_norm = np.sqrt(2 * np.pi * self.s**2)
+            con = 2 * self.total_bound * self.prob / ((1 - self.prob) * gaussian_norm)
+            loglikelihood = (
+                lambda alpha: torch.sum(
+                    torch.log(
+                        torch.exp(
+                            ((K_tch @ alpha - self.y.view(-1)) ** 2) / (2 * self.s**2)
+                            + np.log(con)
+                        )
+                        + 1
+                    )
+                )
+                + self.lam * alpha @ K_tch @ alpha
+            )
+            solution.data = alpha.reshape(-1).data
+            self.warm_start_solution.data = solution.data
+            H = hessian(loglikelihood)(solution)
+
+        else:
+            # Previously an unsupported loss fell through to an unbound `H` below.
+            raise AssertionError("Loss function not implemented.")
+
+        logdet = -0.5 * torch.slogdet(H)[1] * weight
+        return -(-0.5 * loglikelihood(solution) + logdet)
+
+    def _log_marginal_squared(self, kernel, X, weight):
+        """
+        Return 0.5 * y^T K^-1 y + 0.5 * weight * log|det K| with K = k(x, x) + s^2 I.
+        """
+        func = kernel.get_kernel()
+        K = func(self.x, self.x, **X)
+        K = K + torch.eye(self.n, dtype=torch.float64) * self.s * self.s
+        logdet = -0.5 * torch.slogdet(K)[1] * weight
+        alpha = torch.linalg.solve(K, self.y)
+        data_fit = -0.5 * torch.mm(torch.t(self.y), alpha)
+        return -(data_fit + logdet)
+
+    def optimize_params(
+        self,
+        type="bandwidth",
+        restarts=10,
+        regularizer=None,
+        maxiter=1000,
+        mingradnorm=1e-4,
+        verbose=False,
+        optimizer="pymanopt",
+        scale=1.0,
+        weight=1.0,
+        save=False,
+        save_name="model.np",
+        init_func=None,
+        bounds=None,
+        parallel=False,
+        cores=None,
+    ):
+        """
+        Quick hyper-parameter optimisation for a predefined parameter family.
+
+        Builds the parameter specification selected by `type` and passes it,
+        with the remaining arguments, to optimize_params_general.
+
+        :param type: "bandwidth", "bandwidth+noise", "rots", "groups" or
+            "covariance"; "groups" also forces optimizer="discrete"
+        :param regularizer: optional pair (name, coefficient) with name
+            "spectral_norm" or "lasso"; any other name disables it
+        :param init_func: placed in the gamma / ard_gamma specification tuples
+        :param bounds: placed in the gamma / ard_gamma specification tuples
+        :param restarts, maxiter, mingradnorm, verbose, optimizer, scale, weight,
+            save, save_name, parallel, cores: forwarded to optimize_params_general
+        :raises AttributeError: for an unknown `type`
+        """
+        # Penalty on 1 / S[0]: nuclear norm or 1-norm, scaled by regularizer[1].
+        regularizer_func = None
+        if regularizer is not None:
+            if regularizer[0] == "spectral_norm":
+                regularizer_func = lambda S: regularizer[1] * torch.norm(
+                    1 / S[0], p="nuc"
+                )
+            elif regularizer[0] == "lasso":
+                regularizer_func = lambda S: regularizer[1] * torch.norm(1 / S[0], p=1)
+
+        params = {}
+
+        def add_bandwidths():
+            # Scalar gamma or per-group ard_gamma, each on a Euclidean manifold.
+            for key, spec in self.kernel_object.params_dict.items():
+                if "gamma" in spec.keys():
+                    params[key] = {"gamma": (init_func, Euclidean(1), bounds)}
+                elif "ard_gamma" in spec.keys():
+                    manifold = Euclidean(len(spec["group"]))
+                    params[key] = {"ard_gamma": (init_func, manifold, bounds)}
+
+        if type == "bandwidth":
+            add_bandwidths()
+        elif type == "bandwidth+noise":
+            add_bandwidths()
+            # The noise parameter is initialised from the current self.s.
+            params["likelihood"] = {"sigma": (lambda x: self.s, Euclidean(1), None)}
+        elif type == "rots":
+            d = int(self.kernel_object.d)
+            for key, spec in self.kernel_object.params_dict.items():
+                if "rot" in spec.keys():
+                    params[key] = {"rot": (None, Stiefel(d, d), None)}
+        elif type == "groups":
+            optimizer = "discrete"
+            d = self.kernel_object.d
+            for key, spec in self.kernel_object.params_dict.items():
+                if "groups" in spec.keys():
+                    params[key] = {"groups": (None, helper.generate_groups(d), None)}
+        elif type == "covariance":
+            d = int(self.kernel_object.d)
+            for key, spec in self.kernel_object.params_dict.items():
+                if "cov" in spec.keys():
+                    params[key] = {"cov": (None, PSDFixedRank(d, d), None)}
+        else:
+            raise AttributeError("This quick-optimization is not implemented.")
+
+        self.optimize_params_general(
+            params=params,
+            restarts=restarts,
+            optimizer=optimizer,
+            regularizer_func=regularizer_func,
+            maxiter=maxiter,
+            mingradnorm=mingradnorm,
+            verbose=verbose,
+            scale=scale,
+            weight=weight,
+            save=save,
+            save_name=save_name,
+            parallel=parallel,
+            cores=cores,
+        )
+
+    def log_probability(self, xtest, sample):
+        """
+        Log-density of `sample` under the Gaussian returned by mean_std(full=True).
+        Uses logpdf directly so that far-out samples do not underflow to -inf.
+        """
+        from scipy.stats import multivariate_normal
+        mu, covar = self.mean_std(xtest, full=True)
+        return multivariate_normal.logpdf(
+            sample.view(-1).numpy(), mean=mu.view(-1).numpy(), cov=covar.numpy()
+        )
+
+    def volume_mean_cvxpy(
+        self,
+        xtest,
+        weights=None,
+        eps=10e-2,
+        tol=10e-14,
+        max_iter=1000,
+        max_weight=1,
+        verbose=False,
+        scale=10e-4,
+        slope=1.0,
+        bisections=10,
+        B="auto",
+        optimal_scale=None,
+        optimize_scale=False,
+        relax="relu",
+    ):
+        """
+        Fit the eps-insensitive estimator with cvxpy/MOSEK and predict at xtest.
+
+        The regularisation scale comes from helper.bisection applied to
+        beta^T Kinv beta - B unless optimal_scale is given; with
+        optimize_scale=True only that scale is returned. tol, max_iter and
+        scale are accepted but unused here.
+        """
+        n = self.x.size()[0]
+        K = self.get_kernel()  # (self.x, self.x)
+        Kinv = torch.pinverse(K + eps * torch.eye(K.size()[0]).double()).numpy()
+        if weights is None:
+            weights = torch.ones(self.x.size()[0]) / n
+        if B == "auto":
+            alpha = torch.linalg.lstsq(K, self.y)[0]  # torch.lstsq is deprecated/removed
+            beta = K @ alpha
+            B = beta.T @ Kinv @ beta
+            print("Auto:B", B)
+        def fun(scale_arg):
+            beta = cp.Variable(n)
+            if relax == "relu":
+                loss_fn_transformed = cp.sum(
+                    cp.pos(
+                        weights
+                        * slope
+                        * (cp.abs(beta - self.y.numpy().reshape(-1)) - eps)
+                    )
+                ) + 0.5 * scale_arg * cp.quad_form(beta, Kinv)
+            elif relax == "log":
+                loss_fn_transformed = cp.sum(
+                    cp.logistic(
+                        weights
+                        * slope
+                        * (cp.abs(beta - self.y.numpy().reshape(-1)) - eps)
+                    )
+                ) + 0.5 * scale_arg * cp.quad_form(beta, Kinv)
+            prob = cp.Problem(cp.Minimize(loss_fn_transformed))
+            prob.solve(solver=cp.MOSEK, verbose=False)
+            if verbose == True:
+                print(
+                    "scale:",
+                    scale_arg,
+                    "cond:",
+                    np.linalg.cond(Kinv),
+                    "sub.",
+                    beta.value.T @ Kinv @ beta.value - B,
+                    "B:",
+                    B,
+                )
+            return beta.value.T @ Kinv @ beta.value - B
+
+        if optimize_scale:
+            return helper.bisection(fun, 0.0, max_weight, bisections)
+
+        if optimal_scale is None:
+            scale_star = helper.bisection(fun, 0.0, max_weight, bisections)
+        else:
+            scale_star = optimal_scale
+
+        beta = cp.Variable(n)
+        if relax == "relu":
+            loss_fn_transformed = cp.sum(
+                weights
+                * cp.pos(slope * (cp.abs(beta - self.y.numpy().reshape(-1)) - eps))
+            ) + 0.5 * scale_star * cp.quad_form(beta, Kinv)
+        elif relax == "log":
+            loss_fn_transformed = cp.sum(
+                weights
+                * cp.logistic(slope * (cp.abs(beta - self.y.numpy().reshape(-1)) - eps))
+            ) + 0.5 * scale_star * cp.quad_form(beta, Kinv)
+        prob = cp.Problem(cp.Minimize(loss_fn_transformed))
+        prob.solve(solver=cp.MOSEK, verbose=verbose)
+        beta_torch = torch.from_numpy(beta.value).view(-1, 1)
+        alpha = torch.from_numpy(Kinv) @ beta_torch
+        ytest = self.kernel(self.x, xtest) @ alpha
+        return ytest
+
+    def volume_mean(
+        self,
+        xtest,
+        weights=None,
+        eps=10e-2,
+        tol=10e-6,
+        max_iter=1000,
+        verbose=False,
+        eta_start=0.01,
+        eta_decrease=0.9,
+        scale=1,
+        slope=1.0,
+        warm=True,
+        relax="relu",
+        norm=False,
+        B="auto",
+    ):
+        """
+        Fit the eps-insensitive estimator by repeated self.proximal steps and predict.
+
+        :param xtest: inputs at which the fitted function is evaluated
+        :param weights: optional per-sample weights; entries below 10e-6 are
+            zeroed in place
+        :param eps: half-width of the insensitive band around y
+        :param tol: stop once the loss changes by less than this
+        :param max_iter: maximum number of proximal steps
+        :param eta_start, eta_decrease: initial step and its shrink factor
+        :param scale, slope: stored as self.scale / self.slope for the loss
+        :param warm: start from the least-squares fit instead of random beta
+        :param relax: "relu", "tanh" or "elu"
+        :param norm: return beta^T Kinv beta instead of the prediction
+        :param B: unused here
+        :return: k(x, xtest) @ alpha, or the norm when norm is True
+        :raises AssertionError: for an unknown relaxation
+        """
+        self.scale = scale
+        self.relax = relax
+
+        K = self.get_kernel()  # (self.x, self.x)
+        Kinv = torch.pinverse(K)
+
+        if weights is None:
+            weights = torch.ones(self.x.size()[0])
+        else:
+            weights[weights < 10e-6] = 0.0
+            weights = weights.view(-1)
+        if warm == True:
+            # warm start with L2 fit (torch.lstsq is deprecated/removed in PyTorch)
+            alpha = torch.linalg.lstsq(K, self.y)[0]
+            beta = K @ alpha
+        else:
+            beta = torch.randn(size=(self.n, 1)).double()
+
+        if self.relax == "relu":
+            loss_fn_transformed = (
+                lambda beta: torch.sum(torch.relu(torch.abs(beta - self.y) - eps))
+                + self.scale * 0.5 * self.s * beta.T @ Kinv @ beta
+            )
+        elif self.relax == "tanh":
+            self.slope = slope
+            tanh = lambda x: (torch.tanh(self.slope * x) + 1) * 0.5
+            loss_fn_transformed = (
+                lambda beta: torch.sum(
+                    weights * tanh(torch.abs(beta - self.y) - eps).view(-1)
+                )
+                + 0.5 * self.s * self.scale * beta.T @ Kinv @ beta
+            )
+        elif self.relax == "elu":
+            self.slope = slope
+            # torch.nn has no `elu` function; the functional form is the callable.
+            elu = lambda x: torch.nn.functional.elu(x, alpha=self.slope)
+            loss_fn_transformed = (
+                lambda beta: torch.sum(elu(torch.abs(beta - self.y) - eps))
+                + 0.5 * self.s * self.scale * beta.T @ Kinv @ beta
+            )
+        else:
+            raise AssertionError("Unkown relaxation.")
+
+        current_loss = 10e10
+        eta = eta_start
+        for i in range(max_iter):
+            gradient = self.s * (Kinv @ beta)
+            beta = self.proximal(beta, gradient, eta, eps, weights)
+            past_loss = current_loss
+            current_loss = loss_fn_transformed(beta)
+            if current_loss > past_loss:
+                eta = eta * eta_decrease
+            elif np.abs(current_loss - past_loss) < tol:
+                break
+            if verbose == True:
+                print(i, loss_fn_transformed(beta), eta)
+
+        print("final norm:", beta.T @ Kinv @ beta)
+        alpha = torch.pinverse(K) @ beta
+        ytest = self.kernel(self.x, xtest) @ alpha
+        if norm == True:
+            return beta.T @ Kinv @ beta
+        return ytest
+
+    def volume_mean_norm(
+        self,
+        xtest,
+        weights=None,
+        eps=10e-2,
+        tol=10e-6,
+        max_iter=1000,
+        verbose=False,
+        eta_start=0.01,
+        eta_decrease=0.9,
+        scale=1,
+        slope=1.0,
+        warm=True,
+        relax="relu",
+        B="auto",
+    ):
+        """
+        Predict with volume_mean after tuning its `scale` argument.
+
+        The scale is the result of bisection(func, 0.0, 1000.0, 10), where func
+        is volume_mean(..., norm=True) minus B.
+
+        :param xtest: inputs at which the fitted function is evaluated
+        :param B: target for beta^T Kinv beta; "auto" takes it from the
+            least-squares fit
+        :param scale: ignored, the scale is chosen by the search
+        :return: volume_mean prediction at the selected scale
+
+        All other parameters are passed through to volume_mean unchanged.
+        """
+        K = self.kernel(self.x, self.x)
+        Kinv = torch.pinverse(K)
+        if B == "auto":
+            # torch.lstsq is deprecated/removed; linalg.lstsq takes (A, B) in that order.
+            alpha = torch.linalg.lstsq(self.K, self.y)[0]
+            beta = K @ alpha
+            B = beta.T @ Kinv @ beta
+
+        # Everything except scale/norm is identical in both volume_mean calls.
+        shared = dict(
+            weights=weights,
+            eps=eps,
+            tol=tol,
+            max_iter=max_iter,
+            verbose=verbose,
+            eta_start=eta_start,
+            eta_decrease=eta_decrease,
+            slope=slope,
+            warm=warm,
+            relax=relax,
+        )
+
+        # Difference between the fitted norm and B as a function of the scale.
+        def func(s):
+            return self.volume_mean(xtest, scale=s, norm=True, **shared) - B
+
+        s_star = stpy.optim.custom_optimizers.bisection(func, 0.0, 1000.0, 10)
+        # Final fit at the selected scale.
+        return self.volume_mean(xtest, scale=s_star, norm=False, **shared)
+
+    def proximal(self, beta, nabla, eta, eps, weights):
+        """
+        One proximal step on every coordinate, each solved by Nelder-Mead.
+
+        :param beta: current iterate, read as ``beta[i, :]`` for i < self.n
+        :param nabla: gradient term, read as ``nabla[i, :]``
+        :param eta: step size of the quadratic proximity term
+        :param eps: width subtracted from ``|x - y_i|`` before the relaxation
+        :param weights: per-point weights, read as ``weights[i]``
+        :return: updated copy of ``beta``; the argument itself is not modified
+        """
+        from scipy.optimize import minimize
+
+        # scipy feeds NumPy arrays to the objective, so ELU is NumPy too (torch.elu broke here).
+        relaxations = {
+            "relu": lambda r: np.maximum(0, r),
+            "tanh": lambda r: (np.tanh(self.slope * r) + 1) * 0.5,
+            "elu": lambda r: np.maximum(r, 0) + self.slope * np.expm1(np.minimum(r, 0)),
+        }
+        if self.relax not in relaxations:
+            raise AssertionError("Unkown relaxation.")
+        relaxed = relaxations[self.relax]
+        res = beta.clone()
+        for i in range(self.n):
+            b, y, g = float(beta[i, :]), float(self.y[i, :]), float(nabla[i, :])
+            w = float(weights[i])
+            prox = lambda x: (
+                (1 / (2.0 * eta)) * (x - (b - eta * g)) ** 2
+            ) + w * relaxed(np.abs(x - y) - eps)
+            res[i, :] = float(minimize(prox, np.array([0.0]), method="nelder-mead").x)
+        return res
+
+    def get_lambdas(self, beta, mean=False):
+        """
+        Gets lambda functions to evaluate the acquisition function and its derivative
+        :param beta: beta in GP-UCB
+        :param mean: if True, return [mean, sigma] lambdas instead of [fun, grad]
+        :return: [lambda,lambda]
+        """
+        # Distinct names: rebinding `mean` here used to hide the flag argument.
+        mean_fn = lambda x: self.mean_std(x.reshape(1, -1), reuse=True)[0][0][0]
+        sigma_fn = lambda x: self.mean_std(x.reshape(1, -1), reuse=True)[1][0][0]
+        if mean:
+            return [mean_fn, sigma_fn]
+
+        fun = lambda x: -(mean_fn(x) + np.sqrt(beta) * sigma_fn(x))
+        grad = lambda x: -complex_step_derivative(fun, 1e-10, x.reshape(1, -1))
+        return [fun, grad]
+
+    def get_kernel(self):
+        return self.K
+
+    def ucb_optimize(self, beta, multistart=25, lcb=False):
+        """
+        Optimizes the UCB acquisition function; returns the next point and its value
+
+        Every start is drawn uniformly inside ``self.bounds`` and refined with
+        L-BFGS-B using numerical gradients (``jac=None``).
+
+        Side effects: sets ``self.back_prop = False`` and calls
+        ``self.mean_std(self.x)`` once before optimizing.
+
+        :param beta: beta from GP UCB
+        :param multistart: number of starts
+        :param lcb: if True, each start optimizes the lower confidence bound instead
+        :return: (next_point, value at next_point)
+        """
+        from scipy.optimize import minimize
+
+        root_beta = np.sqrt(beta)
+        mean = lambda x: self.mean_std(x, reuse=True)[0][0][0]
+        sigma = lambda x: self.mean_std(x, reuse=True)[1][0][0]
+
+        # The bounds get their own names. A local called ``lcb`` used to replace the
+        # flag, so ``lcb == False`` never held and the UCB branch was never taken.
+        ucb_fn = lambda x: torch.dot(
+            torch.tensor([1.0, root_beta]),
+            torch.tensor([mean(x), sigma(x)]),
+        )
+        lcb_fn = lambda x: torch.dot(
+            torch.tensor([1.0, root_beta]),
+            torch.tensor([mean(x), -sigma(x)]),
+        )
+        bound = lcb_fn if lcb else ucb_fn
+        fun2 = lambda x: -bound(torch.from_numpy(x).view(1, -1)).numpy()
+
+        def fun(x):
+            # Negated UCB value, used to rank the candidates and as reported value.
+            xt = torch.from_numpy(x).view(1, -1)
+            return -(mean(xt) + root_beta * sigma(xt))
+
+        # One full mean_std pass first; the lambdas above call it with reuse=True.
+        self.back_prop = False
+        self.mean_std(self.x)
+
+        mybounds = self.bounds
+        best_x, best_val = None, None
+        for _ in range(multistart):
+            x0 = np.empty(self.d)
+            for k in range(self.d):
+                x0[k] = np.random.uniform(mybounds[k][0], mybounds[k][1])
+
+            res = minimize(
+                fun2, x0, method="L-BFGS-B", jac=None, tol=0.000001, bounds=mybounds
+            )
+            value = -fun(res.x)
+            # A running maximum replaces np.array() over [solution, value] pairs,
+            # which are ragged and rejected by NumPy >= 1.24. Ties keep the first.
+            if best_val is None or value > best_val:
+                best_x, best_val = res.x, value
+
+        if best_x is None:
+            raise ValueError("multistart must be at least 1")
+        # NOTE(review): candidates are ranked and reported by the UCB value even
+        # when lcb=True, as before this fix - confirm that this is intended.
+        return (torch.from_numpy(best_x), best_val)
+
+    def isin(self, xnext):
+        """Return True if a row of ``self.x`` lies within ``self.epsilon`` (L2) of ``xnext``."""
+        self.epsilon = 0.001
+        # Always a real bool: the old loop fell off the end and returned None on a miss.
+        return any(bool(torch.norm(v - xnext, p=2) < self.epsilon) for v in self.x)
+
+    def sample_and_condition(self, x):
+        """Sample the GP at ``x``, condition on the draw if ``x`` is new; return its negative."""
+        xp = x.view(1, -1)
+        fp = self.sample(xp)
+        if not self.isin(xp):
+            self.x, self.y = torch.cat((self.x, xp), 0), torch.cat((self.y, fp), 0)
+            self.fit_gp(self.x, self.y)
+        return -fp
+
+    def get_lambdas_TH(self):
+        """Return ``[fun, grad]`` where ``fun`` wraps ``sample_and_condition`` and ``grad`` is None."""
+        # No gradient is provided for the sampled objective.
+        return [lambda x: self.sample_and_condition(x), None]
+
+    def sample_iteratively_max(
+        self, xtest, multistart=20, minimizer="coordinate-wise", grid=100
+    ):
+        """
+        Samples a path from the GP point by point; returns (maximizer, value)
+        :param xtest: grid to sample on; if None, a multistart search is run
+        :param multistart: number of random restarts of that search
+        :param minimizer: "coordinate-wise" or "L-BFGS-B", applied per restart
+        :param grid: points per coordinate sweep, or ``niter`` for basinhopping
+        """
+        from scipy.optimize import minimize
+        # NOTE(review): ``minimize`` is imported but not used in this method.
+        # data on entry, used to restore the model before returning
+        xold = self.x
+        yold = self.y
+
+        # case 1: sample on the fixed grid xtest, one point at a time
+        if xtest is not None:
+            # number of grid points
+            nn = xtest.shape[0]
+
+            f = torch.zeros(nn, dtype=torch.float64)
+
+            for j in range(nn):
+                xprobe = xtest[j, :].view(1, -1)
+                (K_star, K_star_star) = self.execute(xprobe)
+                (ymean, yvar) = self.mean_std(xprobe)
+                L = torch.sqrt(
+                    K_star_star
+                    + self.s * self.s * torch.eye(1, dtype=torch.float64)
+                    - yvar
+                )
+                fprobe = ymean + L * torch.randn(1, dtype=torch.float64)
+                # record the draw; it joins the data only if xprobe is not already there
+                f[j] = fprobe
+                if not self.isin(xprobe):
+                    self.x = torch.cat((self.x, xprobe), dim=0)
+                    self.y = torch.cat((self.y, fprobe), dim=0)
+
+                self.fit_gp(self.x, self.y)
+
+            val, index = torch.max(f, dim=0)
+            self.fit_gp(xold, yold)
+            return (xtest[index, :], f[index])
+
+        else:
+            # case 2: no grid, multistart search inside the bounds
+
+            # box bounds; fall back to (-diameter, diameter) per dimension
+            if self.bounds == None:
+                mybounds = tuple(
+                    [(-self.diameter, self.diameter) for i in range(self.d)]
+                )
+            else:
+                mybounds = self.bounds
+            [fun, grad] = self.get_lambdas_TH()
+
+            results = []
+            for j in range(multistart):
+
+                # random start, redrawn uniformly inside the bounds
+                x0 = torch.randn(self.d, dtype=torch.float64)
+                for i in range(self.d):
+                    x0[i].uniform_(mybounds[i][0], mybounds[i][1])
+
+                # coordinate-wise sweep: sample along each axis on `grid` points
+                if minimizer == "coordinate-wise":
+                    solution = x0
+                    for i in range(self.d):
+                        xtest = torch.from_numpy(np.tile(x0, (grid, 1)))
+                        xtest[:, i] = torch.linspace(
+                            mybounds[i][0], mybounds[i][1], grid
+                        )
+                        sample = self.sample(xtest)
+
+                        # append the sampled sweep to the data
+                        self.x = torch.cat((self.x, xtest), dim=0)
+                        self.y = torch.cat((self.y, sample), dim=0)
+
+                        # row of the largest sampled value along this axis
+                        val, index = torch.max(sample, dim=0)
+                        out = xtest[index, :]
+
+                        # refit on the enlarged data, then fix coordinate i
+                        self.fit_gp(self.x, self.y)
+                        solution[i] = out[0, i]
+                # NOTE(review): despite its name this branch runs scipy basinhopping
+                elif minimizer == "L-BFGS-B":
+                    solution = np.random.randn(self.d)
+                    xmax = [b[1] for b in mybounds]
+                    xmin = [b[0] for b in mybounds]
+                    bounds = MyBounds(xmax=xmax, xmin=xmin)
+                    func = lambda x: fun(torch.from_numpy(x)).numpy()[0][0]
+                    res = scipy.optimize.basinhopping(
+                        func, solution, disp=False, niter=grid, accept_test=bounds
+                    )
+                    solution = torch.from_numpy(res.x)
+
+                else:
+                    raise AssertionError("Wrong optimizer selected.")
+
+                results.append(torch.cat((solution, -fun(solution)[0])))
+                self.x = xold
+                self.y = yold
+                self.fit_gp(self.x, self.y)
+
+            results = torch.stack(results)
+            val, index = torch.max(results[:, -1], dim=0)
+            solution = results[index, 0 : self.d].view(1, self.d)
+            self.x = xold
+            self.y = yold
+            self.fit_gp(self.x, self.y)
+            # NOTE(review): fun() samples again and may re-condition the model - confirm
+            return (solution, -fun(solution))
 
 
 if __name__ == "__main__":
-	from stpy.helpers.helper import interval
-	# domain size
-	L_infinity_ball = 1
-	# dimension
-	d = 1
-	# error variance
-	s = torch.from_numpy(np.array(1.0, dtype=np.float64))
-
-	# grid density
-	n = 1024
-	# number of intial points
-	N = 32
-	# smoothness
-	gamma = 0.1
-	# test problem
-
-	xtest = torch.from_numpy(interval(n, d))
-	# x = torch.from_numpy(np.random.uniform(-L_infinity_ball,L_infinity_ball, size = (N,d)))
-	x = torch.from_numpy(interval(N, 1))
-	f_no_noise = lambda q: torch.sin(torch.sum(q * 4, dim=1)).view(-1, 1)
-	f = lambda q: f_no_noise(q) + torch.normal(mean=torch.zeros(q.size()[0], 1, dtype=torch.float64), std=1.,
-											   out=None) * s * s
-	# targets
-	y = f(x)
-
-	# GP model with squared exponential
-	kernel = KernelFunction(kernel_name = "ard", gamma = torch.ones(d, dtype = torch.float64)*gamma , groups = [[0],[1]])
-	# kernel = KernelFunction(kernel_name="ard", gamma=torch.ones(1, dtype=torch.float64) * gamma, groups=[[0]])
-	GP = GaussianProcess(s=s, d=1)
-
-	# fit GP
-	# x = x.numpy()
-	GP.fit_gp(x, y)
-	# get mean and variance of GP
-	[mu, std] = GP.mean_std(xtest)
-
-	# print ("Log probability:", GP.log_marginal_likelihood() )
-	# mu_inf = GP.chebyshev_mean(xtest)
-	eps = 0.1
-
-	mu_vol = GP.volume_mean_cvxpy(xtest, eps=eps, verbose=True, scale=1., slope=1., tol=10e-9)
-
-	GP.visualize(xtest, f_true=f_no_noise, show=False)
-	plt.plot(xtest.numpy(), mu_vol.detach().numpy(), label="Least-Volume-ReLu", lw=2)
-	for slope in [0.001, 0.01, 0.1, 1., 10., 100., 1000., 10000.]:
-		# mu_vol_log = GP.volume_mean_cvxpy(xtest, eps=eps, verbose=True, scale=1., slope=slope, tol=10e-9, relax = 'log', B = 1000)
-		# plt.plot(xtest.numpy(),mu_vol_log.detach().numpy(), '--',label = "Least-Volume-Log" + str(slope), lw = 2)
-		mu_vol_tanh = GP.volume_mean(xtest, eps=eps, verbose=True, eta_start=0.1, eta_decrease=0.1, scale=1.,
-									 slope=slope,
-									 tol=0.01, warm=True, relax='tanh')
-		plt.plot(xtest.numpy(), mu_vol_tanh.detach().numpy(), '-.', label="Least-Volume-Tanh" + str(slope), lw=2)
-	# print (slope, np.sum(np.abs(mu_vol_log)<eps) )
-	# plt.plot(xtest.numpy(),mu_vol_tanh.detach().numpy(), label = "Least-Volume-Tahn", lw = 2)
-	# plt.plot(xtest.numpy(),mu_vol_tanh2.detach().numpy(), label = "Least-Volume-Tahn2", lw = 2)
-
-	# plt.plot(xtest.numpy(),mu_inf.detach().numpy(), label = "Chebyschev estimate", lw = 2)
-	plt.plot(x.numpy(), y.numpy() + eps, 'ko')
-	plt.plot(x.numpy(), y.numpy() - eps, 'ko')
-	plt.legend()
-	plt.show()
+    from stpy.helpers.helper import interval
+
+    # domain size
+    L_infinity_ball = 1
+    # dimension
+    d = 1
+    # error variance
+    s = torch.from_numpy(np.array(1.0, dtype=np.float64))
+
+    # grid density
+    n = 1024
+    # number of initial points
+    N = 32
+    # smoothness
+    gamma = 0.1
+    # test problem
+
+    xtest = torch.from_numpy(interval(n, d))
+    # x = torch.from_numpy(np.random.uniform(-L_infinity_ball,L_infinity_ball, size = (N,d)))
+    x = torch.from_numpy(interval(N, 1))
+    f_no_noise = lambda q: torch.sin(torch.sum(q * 4, dim=1)).view(-1, 1)
+    f = (
+        lambda q: f_no_noise(q)
+        + torch.normal(
+            mean=torch.zeros(q.size()[0], 1, dtype=torch.float64), std=1.0, out=None
+        )
+        * s
+        * s
+    )
+    # targets
+    y = f(x)
+
+    # GP model with squared exponential
+    kernel = KernelFunction(
+        kernel_name="ard",
+        gamma=torch.ones(d, dtype=torch.float64) * gamma,
+        groups=[[0], [1]],
+    )
+    # kernel = KernelFunction(kernel_name="ard", gamma=torch.ones(1, dtype=torch.float64) * gamma, groups=[[0]])
+    GP = GaussianProcess(s=s, d=1)
+
+    # fit GP
+    # x = x.numpy()
+    GP.fit_gp(x, y)
+    # get mean and variance of GP
+    [mu, std] = GP.mean_std(xtest)
+
+    # print ("Log probability:", GP.log_marginal_likelihood() )
+    # mu_inf = GP.chebyshev_mean(xtest)
+    eps = 0.1
+
+    mu_vol = GP.volume_mean_cvxpy(
+        xtest, eps=eps, verbose=True, scale=1.0, slope=1.0, tol=10e-9
+    )
+
+    GP.visualize(xtest, f_true=f_no_noise, show=False)
+    plt.plot(xtest.numpy(), mu_vol.detach().numpy(), label="Least-Volume-ReLu", lw=2)
+    for slope in [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]:
+        # mu_vol_log = GP.volume_mean_cvxpy(xtest, eps=eps, verbose=True, scale=1., slope=slope, tol=10e-9, relax = 'log', B = 1000)
+        # plt.plot(xtest.numpy(),mu_vol_log.detach().numpy(), '--',label = "Least-Volume-Log" + str(slope), lw = 2)
+        mu_vol_tanh = GP.volume_mean(
+            xtest,
+            eps=eps,
+            verbose=True,
+            eta_start=0.1,
+            eta_decrease=0.1,
+            scale=1.0,
+            slope=slope,
+            tol=0.01,
+            warm=True,
+            relax="tanh",
+        )
+        plt.plot(
+            xtest.numpy(),
+            mu_vol_tanh.detach().numpy(),
+            "-.",
+            label="Least-Volume-Tanh" + str(slope),
+            lw=2,
+        )
+    # print (slope, np.sum(np.abs(mu_vol_log)<eps) )
+    # plt.plot(xtest.numpy(),mu_vol_tanh.detach().numpy(), label = "Least-Volume-Tahn", lw = 2)
+    # plt.plot(xtest.numpy(),mu_vol_tanh2.detach().numpy(), label = "Least-Volume-Tahn2", lw = 2)
+
+    # plt.plot(xtest.numpy(),mu_inf.detach().numpy(), label = "Chebyschev estimate", lw = 2)
+    plt.plot(x.numpy(), y.numpy() + eps, "ko")
+    plt.plot(x.numpy(), y.numpy() - eps, "ko")
+    plt.legend()
+    plt.show()
diff --git a/stpy/continuous_processes/kernelized_features.py b/stpy/continuous_processes/kernelized_features.py
index 0fe4802..d77d677 100755
--- a/stpy/continuous_processes/kernelized_features.py
+++ b/stpy/continuous_processes/kernelized_features.py
@@ -10,573 +10,681 @@
 
 
 class KernelizedFeatures(GaussianProcess):
-	'''
-		Random Fourier Features for Gaussian Kernel
-	'''
-
-	def __init__(self, embedding, m, s=0.001, lam=1., d=1, diameter=1.0, theta_norm=1.0, verbose=True, groups=None,
-				 bounds=None, scale=1.0, kappa=1.0, poly=2, primal=True, beta_fun=None, bound= 1) :
-
-		self.s = s
-		self.lam = lam
-		self.primal = primal
-		self.x = None
-
-		self.K = torch.ones(size=(1, 1)).double()
-		self.mu = 0.0
-
-		self.m = torch.from_numpy(np.array(m))
-		self.fitted = False
-		self.data = False
-
-		self.d = d
-		self.n = 0
-		self.bounds = bounds
-		self.groups = groups
-		self.diameter = diameter
-		self.theta_norm = theta_norm
-
-		self.verbose = verbose
-		self.admits_first_order = True
-
-		self.embedding = embedding
-		self.embedding_map = embedding
-
-		self.kappa = kappa
-		self.scale = scale
-		self.poly = poly
-
-		self.to_add = []
-		self.prior_mean = 0
-		self.linear_kernel = KernelFunction(kernel_name="linear").linear_kernel
-		self.dual = False
-		self.beta_fun = beta_fun
-		self.bound = bound
-
-	def beta(self, delta=0.1, norm=None):
-		# self.K = Z_ + self.s * self.s * self.lam * I
-		if norm is None:
-			norm = self.theta_norm
-
-		if self.beta_fun is None:
-			return 2.0
-
-		elif self.beta_fun == "theory":
-			K = self.kernel(self.x,self.x) + torch.eye(self.x.size()[0]).double()*self.s**2*self.lam
-
-			beta_value = self.bound * self.lam + torch.logdet(K / ((self.s ** 2) * self.lam)) + 2 * np.log(1 / delta)
-			Q = self.embed(self.x)
-			Lam = self.lam * torch.eye(self.get_basis_size()).double()
-			V = Q.T @ Q/(self.s**2) + Lam
-
-			beta_value = self.bound * self.lam + torch.logdet(V) - torch.logdet(Lam) + 2 * np.log(1 / delta)
-			beta_value = beta_value
-		else:
-			return self.beta_fun(self.K, delta=delta, norm=norm)
-		return beta_value
-
-	def description(self):
-		return "Custom Features object"
-
-	def embed(self, x):
-		return self.embedding.embed(x)
-
-	def set_embedding(self, embed):
-		self.embedding_map = embed
-
-	def get_basis_size(self):
-		return int(torch.sum(self.m))
-
-	def set_basis_size(self, m):
-		self.m = m
-
-	def kernel(self, x, y):
-		embedding = self.embed(x)
-		embedding2 = self.embed(y)
-		K = self.linear_kernel(embedding, embedding2)
-		return K
-
-	def logdet_ratio(self):
-		I = torch.eye(int(torch.sum(self.m))).double()
-		return torch.logdet(self.K) - torch.logdet(self.s ** 2 * self.lam * I)
-
-	def effective_dim(self, xtest):
-		Phi = self.embed(xtest)
-		d = torch.trace(torch.solve(Phi.T @ Phi, Phi.T @ Phi + torch.eye(self.get_basis_size()).double() * self.lam)[0])
-		return d
-
-	def add_data_point(self, x, y):
-		if self.n == 0:
-			self.fit_gp(x, y)
-		else:
-			self.to_add.append([x, y])
-			self.fitted = False
-
-	def fit(self,x=None,y=None):
-		self.fit_gp(self.x,self.y)
-
-	def fit_gp(self, x, y):
-		'''
-			Function to Fit GP
-		'''
-		self.x = x
-		self.y = y
-		self.n = list(self.x.size())[0]
-		self.d = list(self.x.size())[1]
-
-		if self.n < self.m:
-			self.dual = True
-		else:
-			self.dual = False
-
-		if self.primal == True:
-			self.dual = False
-
-		self.data = True
-		self.fitted = False
-		self.precompute()
-		return None
-
-	def add_points(self, d):
-		x,y = d
-		if self.x is not None:
-			self.x = torch.cat((self.x, x), dim=0)
-			self.y = torch.cat((self.y, y), dim=0)
-		else:
-			self.x = x
-			self.y = y
-
-	def check_conversion(self):
-		"""
-		Convert between dual and primal form
-		:return:
-		"""
-		if self.primal == False:
-			if self.n == self.m:  # convert do d mode
-				print("Switching mode to primal.")
-				self.dual = False
-
-				I = torch.eye(int(self.m)).double()
-				Z_ = self.linear_kernel(torch.t(self.Q), torch.t(self.Q))
-				self.V = (Z_ + self.s * self.s * self.lam * torch.eye(int(self.m), dtype=torch.float64))
-				self.invV, _ = torch.solve(I, self.V)
-
-	def get_invV(self):
-		self.precompute()
-
-		if self.dual:
-			I = torch.eye(self.m).double()
-			Z_ = self.linear_kernel(torch.t(self.Q), torch.t(self.Q))
-			self.V = (Z_ + self.s * self.s * self.lam * torch.eye(self.m, dtype=torch.float64))
-			self.invV  = torch.linalg.solve(self.V,I)
-			return self.invV
-		else:
-			return self.invV
-
-	def precompute(self):
-
-		if self.fitted == False:
-			if len(self.to_add) > 0:
-				# something to add via low rank update
-				for i in range(len(self.to_add)):
-					newx = self.to_add[i][0]
-					newy = self.to_add[i][1]
-
-					# rank one update
-					emb = self.embed(newx)
-
-					if self.dual:  # via Shur complements
-						newKinv = torch.zeros(size=(self.n + 1, self.n + 1)).double()
-						newK = torch.zeros(size=(self.n + 1, self.n + 1)).double()
-
-						M = self.invK @ self.Q
-						c = 1. / ((self.s ** 2 * self.lam + emb @ emb.T) - emb @ self.Q.T @ M @ emb.T)
-
-						newKinv[0:self.n, 0:self.n] = self.invK + c * M @ emb.T @ emb @ M.T
-						newKinv[0:self.n, self.n] = (- M @ emb.T * c).view(-1)
-						newKinv[self.n, 0:self.n] = (- emb @ M.T * c).view(-1)
-						newKinv[self.n, self.n] = c.view(-1)
-
-						newK[0:self.n, 0:self.n] = self.K
-						newK[0:self.n, self.n] = emb @ self.Q.T
-						newK[self.n, 0:self.n] = emb @ self.Q.T
-						newK[self.n, self.n] = self.s ** 2 * self.lam + emb @ emb.T
-						self.K = newK
-
-						self.invK = newKinv
-
-						self.add_points(newx, newy)
-						self.n = self.n + 1
-						self.Q = self.embed(self.x)
-						self.invK_V = (1. / self.lam) * (-self.Q.T @ self.invK @ self.Q + torch.eye(int(self.m)))
-
-					else:  # via Woodbury
-						c = 1 + emb @ self.invV @ emb.T
-						self.invV = self.invV - (self.invV @ emb.T @ emb @ self.invV) / c
-						self.add_points(newx, newy)
-						self.n = self.n + 1
-						self.Q = self.embed(self.x)
-					# add point
-
-					self.check_conversion()
-
-				self.fitted = True
-				self.to_add = []
-
-
-			elif self.data == True:  # just compute the
-				self.Q = self.embed(self.x)
-				if self.dual:
-					I = torch.eye(self.n).double()
-					Z_ = self.Q @ self.Q.T
-					self.K = Z_ + self.s * self.s * self.lam * I
-					# self.invK, _ = torch.solve(I, self.K)
-					self.invK = torch.pinverse(self.K)
-					self.invK_V = (1. / self.lam) * (-self.Q.T @ self.invK @ self.Q + torch.eye(int(self.m)))
-				else:
-					I = torch.eye(int(self.m)).double()
-					Z_ = self.Q.T @ self.Q
-					self.V = Z_ + self.s ** 2 * self.lam * I
-					self.invV = torch.pinverse(self.V)
-
-				self.fitted = True
-			else:
-				pass
-		else:
-			pass
-
-	def theta_mean(self, var=False, prior=False):
-		self.precompute()
-
-		if self.fitted == True and prior == False:
-			if self.dual:
-				theta_mean = self.Q.T @ self.invK @ self.y
-				Z = self.invK_V
-			else:
-				theta_mean = self.invV @ self.Q.T @ self.y
-				Z = self.s ** 2 * self.invV
-		else:
-			theta_mean = 0 * torch.ones(size=(self.m, 1)).double()
-
-		if var is False:
-			return theta_mean
-		else:
-			return (theta_mean, Z)
-
-	def mean(self, xtest):
-		return self.mean_std(xtest)[0]
-
-	def mean_std(self, xtest):
-		'''
-			Calculate mean and variance for GP at xtest points
-		'''
-		self.precompute()
-		embeding = self.embed(xtest)
-
-		# mean
-		theta_mean = self.theta_mean()
-		# print(torch.norm(theta_mean))
-		ymean = embeding @ theta_mean
-
-		# std
-		if not self.dual or self.primal:
-			diagonal = self.s ** 2 * torch.einsum('ij,jk,ik->i', (embeding, self.invV, embeding)).view(-1, 1)
-		else:
-			diagonal = torch.einsum('ij,jk,ik->i', (embeding, self.invK_V, embeding)).view(-1, 1)
-
-		ystd = torch.sqrt(diagonal)
-		return (ymean, ystd)
-
-	def ucb(self, xtest, delta=0.1):
-		mu, std = self.mean_std(xtest)
-		res = mu + np.sqrt(self.beta(delta=delta)) * std
-		return res
-
-	def lcb(self, xtest, delta=0.1):
-		mu, std = self.mean_std(xtest)
-		res = mu - np.sqrt(self.beta(delta=delta)) * std
-		return res
-
-	def sample_matheron(self, xtest, kernel_object, size=1):
-		basis = self.get_basis_size()
-		zeros = torch.zeros(size=(basis, size), dtype=torch.float64)
-		random_vector = torch.normal(mean=zeros, std=1.)
-
-		Z = self.lam * torch.eye(basis, dtype=torch.float64)
-		L = torch.linalg.cholesky(Z.transpose(-2, -1).conj()).transpose(-2, -1).conj()
-		theta = torch.mm(L, random_vector) + self.prior_mean
-
-		f_prior_xtest = torch.mm(self.embed(xtest), theta)
-		f_prior_x = torch.mm(self.embed(self.x), theta)
-
-		K_star = kernel_object.kernel(self.x, xtest)
-		N = self.x.size()[0]
-		K = kernel_object.kernel(self.x, self.x) + self.s ** 2 * self.lam * torch.eye(N)
-
-		f = f_prior_xtest + K_star @ torch.pinverse(K) @ (self.y - f_prior_x)
-		return f
-
-	def sample_theta(self, size=1, prior=False):
-
-		basis = self.get_basis_size()
-
-		zeros = torch.zeros(size=(basis, size), dtype=torch.float64)
-		random_vector = torch.normal(mean=zeros, std=1.)
-		self.precompute()
-
-		if self.fitted == True and prior == False:
-			self.L = torch.linalg.cholesky(self.get_invV()) * self.s
-			theta = self.theta_mean()
-			theta = theta + torch.mm(self.L, random_vector)
-		else:
-			Z = self.lam * torch.eye(basis, dtype=torch.float64)
-			L = torch.linalg.cholesky(Z.transpose(-2, -1).conj()).transpose(-2, -1).conj()
-			theta = torch.mm(L, random_vector) + self.prior_mean
-
-		return theta
-
-	def theta_mean_constrained(self, weights=None, B=1):
-		if weights is None:
-			weights = torch.ones(self.n).double() / self.n
-
-		Q = self.embed(self.x)
-		theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
-		objective = cp.Minimize(
-			cp.sum(weights @ cp.square(Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy())))
-		zero = np.zeros(int(torch.sum(self.m)))
-		constraints = [cp.SOC(theta @ zero + B, theta)]
-		prob = cp.Problem(objective, constraints)
-		prob.solve(solver=cp.MOSEK)
-		return torch.from_numpy(theta.value).view(-1, 1)
-
-	def theta_absolute_deviation(self, weights=None, reg=None):
-		if weights is None:
-			weights = torch.ones(self.x.size()[0])
-
-		if reg is None:  # standard regularization
-			Q = self.embed(self.x)
-			theta = cp.Variable((int(torch.sum(self.m)), 1))
-			objective = cp.Minimize(
-				cp.sum(weights @ cp.abs(Q.numpy() @ theta - self.y.numpy())) + self.s * self.lam * cp.norm2(theta))
-			prob = cp.Problem(objective)
-			prob.solve()
-			return torch.from_numpy(theta.value)
-		else:  # custom regularization
-			Q = self.embed(self.x)
-			theta = cp.Variable((int(torch.sum(self.m)), 1))
-			objective = cp.Minimize(
-				cp.sum(weights @ cp.abs(Q.numpy() @ theta - self.y.numpy())) + reg * cp.norm2(theta))
-			prob = cp.Problem(objective)
-			prob.solve(solver=cp.MOSEK)
-			return torch.from_numpy(theta.value)
-
-	def theta_absolute_deviation_constrained(self, weights=None, B=1):
-		if weights is None:
-			weights = torch.ones(self.x.size()[0])
-		Q = self.embed(self.x)
-		theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
-
-		objective = cp.Minimize(cp.sum(weights @ cp.abs(Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy())))
-		zero = np.zeros(int(torch.sum(self.m)))
-		constraints = [cp.SOC(theta @ zero + B, theta)]
-		prob = cp.Problem(objective, constraints)
-		prob.solve(solver=cp.MOSEK)
-		return torch.from_numpy(theta.value).view(-1, 1)
-
-	def theta_chebyschev_approximation(self, eps=1.):
-		Q = self.embed(self.x).detach().numpy()
-		y = self.y.view(-1).detach().numpy()
-
-		theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
-		objective = cp.Minimize(cp.sum_squares(theta))
-		constraints = [cp.abs(Q @ theta - y) <= eps]
-
-		prob = cp.Problem(objective, constraints)
-		prob.solve(solver=cp.MOSEK)
-		res = torch.from_numpy(theta.value).view(-1, 1)
-		return res
-
-	def interpolation(self, eps=0.):
-		Q = self.embed(self.x).detach().numpy()
-		y = self.y.view(-1).detach().numpy()
-		theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
-		objective = cp.Minimize(cp.sum_squares(theta))
-		constraints = [Q @ theta == y]
-
-		prob = cp.Problem(objective, constraints)
-		prob.solve()
-		res = torch.from_numpy(theta.value).view(-1, 1)
-
-		return res
-
-	def mean_squared(self, xtest, weights=None, B=None, theta=False, reg=None):
-		embeding = self.embed(xtest)
-
-		if B is not None:
-			theta_mean = self.theta_mean_constrained(weights=weights, B=B)
-		else:
-			theta_mean = self.theta_mean(weights=weights, reg=reg)
-		ymean = torch.mm(embeding, theta_mean)
-		if theta == True:
-			return ymean, theta_mean
-		else:
-			return ymean
-
-	def mean_aboslute_deviation(self, xtest, weights=None, B=None, theta=False):
-		embeding = self.embed(xtest)
-		if B is not None:
-			theta_mean = self.theta_absolute_deviation_constrained(weights=weights, B=B)
-		else:
-			theta_mean = self.theta_absolute_deviation(weights=weights)
-		ymean = torch.mm(embeding, theta_mean)
-		if theta == True:
-			return ymean, theta_mean
-		else:
-			return ymean
-
-	"""
+    """
+    Random Fourier Features for Gaussian Kernel
+    """
+
+    def __init__(
+        self,
+        embedding,
+        m,
+        s=0.001,
+        lam=1.0,
+        d=1,
+        diameter=1.0,
+        theta_norm=1.0,
+        verbose=True,
+        groups=None,
+        bounds=None,
+        scale=1.0,
+        kappa=1.0,
+        poly=2,
+        primal=True,
+        beta_fun=None,
+        bound=1,
+    ):
+        """Store the embedding and hyperparameters; no data is attached yet (``n = 0``)."""
+        self.s = s
+        self.lam = lam
+        self.primal = primal
+        self.x = None
+
+        self.K = torch.ones(size=(1, 1)).double()
+        self.mu = 0.0
+        # m is wrapped in a tensor; get_basis_size() returns its sum as an int
+        self.m = torch.from_numpy(np.array(m))
+        self.fitted = False
+        self.data = False
+
+        self.d = d
+        self.n = 0
+        self.bounds = bounds
+        self.groups = groups
+        self.diameter = diameter
+        self.theta_norm = theta_norm
+
+        self.verbose = verbose
+        self.admits_first_order = True
+        # NOTE(review): embed() reads self.embedding; set_embedding() only rebinds embedding_map
+        self.embedding = embedding
+        self.embedding_map = embedding
+
+        self.kappa = kappa
+        self.scale = scale
+        self.poly = poly
+        # to_add collects the [x, y] pairs queued by add_data_point
+        self.to_add = []
+        self.prior_mean = 0
+        self.linear_kernel = KernelFunction(kernel_name="linear").linear_kernel
+        self.dual = False
+        self.beta_fun = beta_fun
+        self.bound = bound
+
+    def beta(self, delta=0.1, norm=None):
+        """
+        Confidence parameter beta.
+
+        :param delta: enters through the ``2 * log(1 / delta)`` term
+        :param norm: passed to a callable ``beta_fun``; defaults to
+            ``self.theta_norm``
+        :return: 2.0 if ``beta_fun`` is None; for "theory",
+            ``bound * lam + logdet(V) - logdet(Lam) + 2 * log(1 / delta)`` with
+            ``V = Q.T @ Q / s**2 + Lam`` and ``Lam = lam * I``; otherwise
+            ``beta_fun(self.K, delta=delta, norm=norm)``
+        """
+        if norm is None:
+            norm = self.theta_norm
+
+        if self.beta_fun is None:
+            return 2.0
+
+        if self.beta_fun != "theory":
+            return self.beta_fun(self.K, delta=delta, norm=norm)
+
+        # Only the feature-space expression is evaluated. The kernel-space
+        # log-determinant that used to be computed first was overwritten before
+        # use, so it cost a factorisation without affecting the result.
+        Q = self.embed(self.x)
+        Lam = self.lam * torch.eye(self.get_basis_size()).double()
+        V = Q.T @ Q / (self.s**2) + Lam
+        return (
+            self.bound * self.lam
+            + torch.logdet(V)
+            - torch.logdet(Lam)
+            + 2 * np.log(1 / delta)
+        )
+
+    def description(self):
+        return "Custom Features object"
+
+    def embed(self, x):
+        return self.embedding.embed(x)
+
+    def set_embedding(self, embed):
+        self.embedding_map = embed
+
+    def get_basis_size(self):
+        return int(torch.sum(self.m))
+
+    def set_basis_size(self, m):
+        self.m = m
+
+    def kernel(self, x, y):
+        """Apply ``self.linear_kernel`` to the embeddings of ``x`` and ``y``."""
+        # Both arguments go through the same ``embed`` call before the kernel.
+        phi_x, phi_y = self.embed(x), self.embed(y)
+        return self.linear_kernel(phi_x, phi_y)
+
+    def logdet_ratio(self):
+        I = torch.eye(int(torch.sum(self.m))).double()
+        return torch.logdet(self.K) - torch.logdet(self.s**2 * self.lam * I)
+
+    def effective_dim(self, xtest):
+        Phi = self.embed(xtest)
+        d = torch.trace(
+            torch.solve(
+                Phi.T @ Phi,
+                Phi.T @ Phi + torch.eye(self.get_basis_size()).double() * self.lam,
+            )[0]
+        )
+        return d
+
+    def add_data_point(self, x, y):
+        if self.n == 0:
+            self.fit_gp(x, y)
+        else:
+            self.to_add.append([x, y])
+            self.fitted = False
+
+    def fit(self, x=None, y=None):
+        self.fit_gp(self.x, self.y)
+
+    def fit_gp(self, x, y):
+        """
+        Function to Fit GP
+        """
+        self.x = x
+        self.y = y
+        self.n = list(self.x.size())[0]
+        self.d = list(self.x.size())[1]
+
+        if self.n < self.m:
+            self.dual = True
+        else:
+            self.dual = False
+
+        if self.primal == True:
+            self.dual = False
+
+        self.data = True
+        self.fitted = False
+        self.precompute()
+        return None
+
+    def add_points(self, d):
+        x, y = d
+        if self.x is not None:
+            self.x = torch.cat((self.x, x), dim=0)
+            self.y = torch.cat((self.y, y), dim=0)
+        else:
+            self.x = x
+            self.y = y
+
+    def check_conversion(self):
+        """
+        Convert between dual and primal form
+        :return:
+        """
+        if self.primal == False:
+            if self.n == self.m:  # convert do d mode
+                print("Switching mode to primal.")
+                self.dual = False
+
+                I = torch.eye(int(self.m)).double()
+                Z_ = self.linear_kernel(torch.t(self.Q), torch.t(self.Q))
+                self.V = Z_ + self.s * self.s * self.lam * torch.eye(
+                    int(self.m), dtype=torch.float64
+                )
+                self.invV, _ = torch.solve(I, self.V)
+
+    def get_invV(self):
+        self.precompute()
+
+        if self.dual:
+            I = torch.eye(self.m).double()
+            Z_ = self.linear_kernel(torch.t(self.Q), torch.t(self.Q))
+            self.V = Z_ + self.s * self.s * self.lam * torch.eye(
+                self.m, dtype=torch.float64
+            )
+            self.invV = torch.linalg.solve(self.V, I)
+            return self.invV
+        else:
+            return self.invV
+
+    def precompute(self):
+
+        if self.fitted == False:
+            if len(self.to_add) > 0:
+                # something to add via low rank update
+                for i in range(len(self.to_add)):
+                    newx = self.to_add[i][0]
+                    newy = self.to_add[i][1]
+
+                    # rank one update
+                    emb = self.embed(newx)
+
+                    if self.dual:  # via Shur complements
+                        newKinv = torch.zeros(size=(self.n + 1, self.n + 1)).double()
+                        newK = torch.zeros(size=(self.n + 1, self.n + 1)).double()
+
+                        M = self.invK @ self.Q
+                        c = 1.0 / (
+                            (self.s**2 * self.lam + emb @ emb.T)
+                            - emb @ self.Q.T @ M @ emb.T
+                        )
+
+                        newKinv[0 : self.n, 0 : self.n] = (
+                            self.invK + c * M @ emb.T @ emb @ M.T
+                        )
+                        newKinv[0 : self.n, self.n] = (-M @ emb.T * c).view(-1)
+                        newKinv[self.n, 0 : self.n] = (-emb @ M.T * c).view(-1)
+                        newKinv[self.n, self.n] = c.view(-1)
+
+                        newK[0 : self.n, 0 : self.n] = self.K
+                        newK[0 : self.n, self.n] = emb @ self.Q.T
+                        newK[self.n, 0 : self.n] = emb @ self.Q.T
+                        newK[self.n, self.n] = self.s**2 * self.lam + emb @ emb.T
+                        self.K = newK
+
+                        self.invK = newKinv
+
+                        self.add_points(newx, newy)
+                        self.n = self.n + 1
+                        self.Q = self.embed(self.x)
+                        self.invK_V = (1.0 / self.lam) * (
+                            -self.Q.T @ self.invK @ self.Q + torch.eye(int(self.m))
+                        )
+
+                    else:  # via Woodbury
+                        c = 1 + emb @ self.invV @ emb.T
+                        self.invV = (
+                            self.invV - (self.invV @ emb.T @ emb @ self.invV) / c
+                        )
+                        self.add_points(newx, newy)
+                        self.n = self.n + 1
+                        self.Q = self.embed(self.x)
+                    # add point
+
+                    self.check_conversion()
+
+                self.fitted = True
+                self.to_add = []
+
+            elif self.data == True:  # just compute the
+                self.Q = self.embed(self.x)
+                if self.dual:
+                    I = torch.eye(self.n).double()
+                    Z_ = self.Q @ self.Q.T
+                    self.K = Z_ + self.s * self.s * self.lam * I
+                    # self.invK, _ = torch.solve(I, self.K)
+                    self.invK = torch.pinverse(self.K)
+                    self.invK_V = (1.0 / self.lam) * (
+                        -self.Q.T @ self.invK @ self.Q + torch.eye(int(self.m))
+                    )
+                else:
+                    I = torch.eye(int(self.m)).double()
+                    Z_ = self.Q.T @ self.Q
+                    self.V = Z_ + self.s**2 * self.lam * I
+                    self.invV = torch.pinverse(self.V)
+
+                self.fitted = True
+            else:
+                pass
+        else:
+            pass
+
+    def theta_mean(self, var=False, prior=False):
+        self.precompute()
+
+        if self.fitted == True and prior == False:
+            if self.dual:
+                theta_mean = self.Q.T @ self.invK @ self.y
+                Z = self.invK_V
+            else:
+                theta_mean = self.invV @ self.Q.T @ self.y
+                Z = self.s**2 * self.invV
+        else:
+            theta_mean = 0 * torch.ones(size=(self.m, 1)).double()
+
+        if var is False:
+            return theta_mean
+        else:
+            return (theta_mean, Z)
+
+    def mean(self, xtest):
+        return self.mean_std(xtest)[0]
+
+    def mean_std(self, xtest):
+        """
+        Calculate mean and variance for GP at xtest points
+        """
+        self.precompute()
+        embeding = self.embed(xtest)
+
+        # mean
+        theta_mean = self.theta_mean()
+        # print(torch.norm(theta_mean))
+        ymean = embeding @ theta_mean
+
+        # std
+        if not self.dual or self.primal:
+            diagonal = self.s**2 * torch.einsum(
+                "ij,jk,ik->i", (embeding, self.invV, embeding)
+            ).view(-1, 1)
+        else:
+            diagonal = torch.einsum(
+                "ij,jk,ik->i", (embeding, self.invK_V, embeding)
+            ).view(-1, 1)
+
+        ystd = torch.sqrt(diagonal)
+        return (ymean, ystd)
+
+    def ucb(self, xtest, delta=0.1):
+        mu, std = self.mean_std(xtest)
+        res = mu + np.sqrt(self.beta(delta=delta)) * std
+        return res
+
+    def lcb(self, xtest, delta=0.1):
+        mu, std = self.mean_std(xtest)
+        res = mu - np.sqrt(self.beta(delta=delta)) * std
+        return res
+
+    def sample_matheron(self, xtest, kernel_object, size=1):
+        basis = self.get_basis_size()
+        zeros = torch.zeros(size=(basis, size), dtype=torch.float64)
+        random_vector = torch.normal(mean=zeros, std=1.0)
+
+        Z = self.lam * torch.eye(basis, dtype=torch.float64)
+        L = torch.linalg.cholesky(Z.transpose(-2, -1).conj()).transpose(-2, -1).conj()
+        theta = torch.mm(L, random_vector) + self.prior_mean
+
+        f_prior_xtest = torch.mm(self.embed(xtest), theta)
+        f_prior_x = torch.mm(self.embed(self.x), theta)
+
+        K_star = kernel_object.kernel(self.x, xtest)
+        N = self.x.size()[0]
+        K = kernel_object.kernel(self.x, self.x) + self.s**2 * self.lam * torch.eye(N)
+
+        f = f_prior_xtest + K_star @ torch.pinverse(K) @ (self.y - f_prior_x)
+        return f
+
+    def sample_theta(self, size=1, prior=False):
+
+        basis = self.get_basis_size()
+
+        zeros = torch.zeros(size=(basis, size), dtype=torch.float64)
+        random_vector = torch.normal(mean=zeros, std=1.0)
+        self.precompute()
+
+        if self.fitted == True and prior == False:
+            self.L = torch.linalg.cholesky(self.get_invV()) * self.s
+            theta = self.theta_mean()
+            theta = theta + torch.mm(self.L, random_vector)
+        else:
+            Z = self.lam * torch.eye(basis, dtype=torch.float64)
+            L = (
+                torch.linalg.cholesky(Z.transpose(-2, -1).conj())
+                .transpose(-2, -1)
+                .conj()
+            )
+            theta = torch.mm(L, random_vector) + self.prior_mean
+
+        return theta
+
+    def theta_mean_constrained(self, weights=None, B=1):
+        if weights is None:
+            weights = torch.ones(self.n).double() / self.n
+
+        Q = self.embed(self.x)
+        theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
+        objective = cp.Minimize(
+            cp.sum(
+                weights
+                @ cp.square(
+                    Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy()
+                )
+            )
+        )
+        zero = np.zeros(int(torch.sum(self.m)))
+        constraints = [cp.SOC(theta @ zero + B, theta)]
+        prob = cp.Problem(objective, constraints)
+        prob.solve(solver=cp.MOSEK)
+        return torch.from_numpy(theta.value).view(-1, 1)
+
+    def theta_absolute_deviation(self, weights=None, reg=None):
+        if weights is None:
+            weights = torch.ones(self.x.size()[0])
+
+        if reg is None:  # standard regularization
+            Q = self.embed(self.x)
+            theta = cp.Variable((int(torch.sum(self.m)), 1))
+            objective = cp.Minimize(
+                cp.sum(weights @ cp.abs(Q.numpy() @ theta - self.y.numpy()))
+                + self.s * self.lam * cp.norm2(theta)
+            )
+            prob = cp.Problem(objective)
+            prob.solve()
+            return torch.from_numpy(theta.value)
+        else:  # custom regularization
+            Q = self.embed(self.x)
+            theta = cp.Variable((int(torch.sum(self.m)), 1))
+            objective = cp.Minimize(
+                cp.sum(weights @ cp.abs(Q.numpy() @ theta - self.y.numpy()))
+                + reg * cp.norm2(theta)
+            )
+            prob = cp.Problem(objective)
+            prob.solve(solver=cp.MOSEK)
+            return torch.from_numpy(theta.value)
+
+    def theta_absolute_deviation_constrained(self, weights=None, B=1):
+        if weights is None:
+            weights = torch.ones(self.x.size()[0])
+        Q = self.embed(self.x)
+        theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
+
+        objective = cp.Minimize(
+            cp.sum(
+                weights
+                @ cp.abs(Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy())
+            )
+        )
+        zero = np.zeros(int(torch.sum(self.m)))
+        constraints = [cp.SOC(theta @ zero + B, theta)]
+        prob = cp.Problem(objective, constraints)
+        prob.solve(solver=cp.MOSEK)
+        return torch.from_numpy(theta.value).view(-1, 1)
+
+    def theta_chebyschev_approximation(self, eps=1.0):
+        Q = self.embed(self.x).detach().numpy()
+        y = self.y.view(-1).detach().numpy()
+
+        theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
+        objective = cp.Minimize(cp.sum_squares(theta))
+        constraints = [cp.abs(Q @ theta - y) <= eps]
+
+        prob = cp.Problem(objective, constraints)
+        prob.solve(solver=cp.MOSEK)
+        res = torch.from_numpy(theta.value).view(-1, 1)
+        return res
+
+    def interpolation(self, eps=0.0):
+        Q = self.embed(self.x).detach().numpy()
+        y = self.y.view(-1).detach().numpy()
+        theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
+        objective = cp.Minimize(cp.sum_squares(theta))
+        constraints = [Q @ theta == y]
+
+        prob = cp.Problem(objective, constraints)
+        prob.solve()
+        res = torch.from_numpy(theta.value).view(-1, 1)
+
+        return res
+
+    def mean_squared(self, xtest, weights=None, B=None, theta=False, reg=None):
+        embeding = self.embed(xtest)
+
+        if B is not None:
+            theta_mean = self.theta_mean_constrained(weights=weights, B=B)
+        else:
+            theta_mean = self.theta_mean(weights=weights, reg=reg)
+        ymean = torch.mm(embeding, theta_mean)
+        if theta == True:
+            return ymean, theta_mean
+        else:
+            return ymean
+
+    def mean_aboslute_deviation(self, xtest, weights=None, B=None, theta=False):
+        embeding = self.embed(xtest)
+        if B is not None:
+            theta_mean = self.theta_absolute_deviation_constrained(weights=weights, B=B)
+        else:
+            theta_mean = self.theta_absolute_deviation(weights=weights)
+        ymean = torch.mm(embeding, theta_mean)
+        if theta == True:
+            return ymean, theta_mean
+        else:
+            return ymean
+
+    """
 	Hessian 
 	"""
 
-	def mean_gradient_hessian(self, xtest, hessian=False):
-		hessian_mu = torch.zeros(size=(self.d, self.d), dtype=torch.float64)
-		xtest.requires_grad_(True)
-		# xtest.retain_grad()
-		mu = self.mean_std(xtest)[0]
-		# mu.backward(retain_graph=True)
-
-		# nabla_mu = xtest.grad
-		nabla_mu = grad(mu, xtest, create_graph=True)[0][0]
-
-		if hessian == False:
-			return nabla_mu
-		else:
-			for i in range(self.d):
-				hessian_mu[i, :] = grad(nabla_mu[i], xtest, create_graph=True, retain_graph=True)[0][0]
-			return [nabla_mu, hessian_mu]
-
-	""" 
+    def mean_gradient_hessian(self, xtest, hessian=False):
+        hessian_mu = torch.zeros(size=(self.d, self.d), dtype=torch.float64)
+        xtest.requires_grad_(True)
+        # xtest.retain_grad()
+        mu = self.mean_std(xtest)[0]
+        # mu.backward(retain_graph=True)
+
+        # nabla_mu = xtest.grad
+        nabla_mu = grad(mu, xtest, create_graph=True)[0][0]
+
+        if hessian == False:
+            return nabla_mu
+        else:
+            for i in range(self.d):
+                hessian_mu[i, :] = grad(
+                    nabla_mu[i], xtest, create_graph=True, retain_graph=True
+                )[0][0]
+            return [nabla_mu, hessian_mu]
+
+    """ 
 	Optimization
 	"""
 
-	def ucb_optimize(self, beta, multistart=25, lcb=False, minimizer="L-BFGS-B"):
-
-		# precompute important (theta)
-		theta_mean, K = self.theta_mean(var=True)
-
-		if lcb == False:
-			fun = lambda x: - (self.embed(torch.from_numpy(x).view(1, -1)) @ theta_mean + \
-							   beta * torch.sqrt(self.embed(torch.from_numpy(x).view(1, -1)) @ K @ self.embed(
-						torch.from_numpy(x).view(1, -1)).T)).detach().numpy()[0]
-		else:
-			fun = lambda x: - (self.embed(torch.from_numpy(x).view(1, -1)) @ theta_mean - \
-							   beta * torch.sqrt(self.embed(torch.from_numpy(x).view(1, -1)) @ K @ self.embed(
-						torch.from_numpy(x).view(1, -1)).T).detach().numpy()[0]).numpy()[0]
-
-		if self.bounds == None:
-			mybounds = tuple([(-self.diameter, self.diameter) for _ in range(self.d)])
-		else:
-			mybounds = self.bounds
-
-		results = []
-		for j in range(multistart):
-
-			x0 = np.random.randn(self.d)
-			for i in range(self.d):
-				x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
-
-			if minimizer == "L-BFGS-B":
-				res = minimize(fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds)
-				solution = res.x
-			else:
-				raise AssertionError("Wrong optimizer selected.")
-
-			results.append([solution, -fun(solution)])
-
-		results = np.array(results)
-		index = np.argmax(results[:, 1])
-		solution = results[index, 0]
-		return (torch.from_numpy(solution).view(1, -1), -torch.from_numpy(fun(solution)))
-
-	def sample_and_optimize(self, xtest=None, multistart=25, minimizer="L-BFGS-B", grid=100, verbose=0):
-		'''
-			Sample functions from Gaussian Process and take Maximum using
-			first order maximization
-		'''
-
-		# sample linear approximating
-		theta = self.sample_theta()
-
-		# get bounds
-		if self.bounds == None:
-			mybounds = tuple([(-self.diameter, self.diameter) for _ in range(self.d)])
-		else:
-			mybounds = self.bounds
-
-		fun = lambda x: -torch.mm(torch.t(theta), torch.t(self.embed(torch.from_numpy(x).view(1, -1)))).numpy()[0]
-
-		results = []
-		for j in range(multistart):
-			x0 = np.random.randn(self.d)
-			for i in range(self.d):
-				x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
-
-			if minimizer == "L-BFGS-B":
-				res = minimize(fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds)
-				solution = res.x
-			else:
-				raise AssertionError("Wrong optimizer selected.")
-
-			results.append([solution, -fun(solution)])
-		results = np.array(results)
-		index = np.argmax(results[:, 1])
-		solution = results[index, 0]
-
-		return (torch.from_numpy(solution), -torch.from_numpy(fun(solution)))
-
-	def sample(self, xtest, size=1, prior=False):
-		'''
-			Sample functions from Gaussian Process
-		'''
-		theta = self.sample_theta(size=size, prior=prior)
-		f = torch.mm(self.embed(xtest), theta)
-		return f
-
-	def sample_and_max(self, xtest, size=1):
-		'''
-			Sample functions from Gaussian Process and take Maximum
-		'''
-		f = self.sample(xtest, size=size)
-		index = np.argmax(f, axis=0)
-		return (xtest[index, :], f[index, :])
-
-	def get_kernel(self):
-		embeding = self.embed(self.x)
-		Z_ = self.linear_kernel(embeding, embeding)
-		K = (Z_ + self.s * self.s * self.lam * torch.eye(int(self.n), dtype=torch.float64))
-		return K
-
-	def residuals(self):
-		mu, _ = self.mean_std(self.x)
-		out = torch.sum((mu - self.y) ** 2)
-		return out
-if __name__ == "__main__":
-	N = 10
-	s = 0.1
-	n = 256
-	L_infinity_ball = 0.5
-
-	d = 1
-	m = 128
+    def ucb_optimize(self, beta, multistart=25, lcb=False, minimizer="L-BFGS-B"):
+
+        # precompute important (theta)
+        theta_mean, K = self.theta_mean(var=True)
+
+        if lcb == False:
+            fun = (
+                lambda x: -(
+                    self.embed(torch.from_numpy(x).view(1, -1)) @ theta_mean
+                    + beta
+                    * torch.sqrt(
+                        self.embed(torch.from_numpy(x).view(1, -1))
+                        @ K
+                        @ self.embed(torch.from_numpy(x).view(1, -1)).T
+                    )
+                )
+                .detach()
+                .numpy()[0]
+            )
+        else:
+            fun = lambda x: -(
+                self.embed(torch.from_numpy(x).view(1, -1)) @ theta_mean
+                - beta
+                * torch.sqrt(
+                    self.embed(torch.from_numpy(x).view(1, -1))
+                    @ K
+                    @ self.embed(torch.from_numpy(x).view(1, -1)).T
+                )
+                .detach()
+                .numpy()[0]
+            ).numpy()[0]
+
+        if self.bounds == None:
+            mybounds = tuple([(-self.diameter, self.diameter) for _ in range(self.d)])
+        else:
+            mybounds = self.bounds
+
+        results = []
+        for j in range(multistart):
+
+            x0 = np.random.randn(self.d)
+            for i in range(self.d):
+                x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
+
+            if minimizer == "L-BFGS-B":
+                res = minimize(
+                    fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds
+                )
+                solution = res.x
+            else:
+                raise AssertionError("Wrong optimizer selected.")
+
+            results.append([solution, -fun(solution)])
+
+        results = np.array(results)
+        index = np.argmax(results[:, 1])
+        solution = results[index, 0]
+        return (
+            torch.from_numpy(solution).view(1, -1),
+            -torch.from_numpy(fun(solution)),
+        )
+
+    def sample_and_optimize(
+        self, xtest=None, multistart=25, minimizer="L-BFGS-B", grid=100, verbose=0
+    ):
+        """
+        Sample functions from Gaussian Process and take Maximum using
+        first order maximization
+        """
+
+        # sample linear approximating
+        theta = self.sample_theta()
+
+        # get bounds
+        if self.bounds == None:
+            mybounds = tuple([(-self.diameter, self.diameter) for _ in range(self.d)])
+        else:
+            mybounds = self.bounds
+
+        fun = lambda x: -torch.mm(
+            torch.t(theta), torch.t(self.embed(torch.from_numpy(x).view(1, -1)))
+        ).numpy()[0]
+
+        results = []
+        for j in range(multistart):
+            x0 = np.random.randn(self.d)
+            for i in range(self.d):
+                x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
+
+            if minimizer == "L-BFGS-B":
+                res = minimize(
+                    fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds
+                )
+                solution = res.x
+            else:
+                raise AssertionError("Wrong optimizer selected.")
+
+            results.append([solution, -fun(solution)])
+        results = np.array(results)
+        index = np.argmax(results[:, 1])
+        solution = results[index, 0]
+
+        return (torch.from_numpy(solution), -torch.from_numpy(fun(solution)))
+
+    def sample(self, xtest, size=1, prior=False):
+        """
+        Sample functions from Gaussian Process
+        """
+        theta = self.sample_theta(size=size, prior=prior)
+        f = torch.mm(self.embed(xtest), theta)
+        return f
+
+    def sample_and_max(self, xtest, size=1):
+        """
+        Sample functions from Gaussian Process and take Maximum
+        """
+        f = self.sample(xtest, size=size)
+        index = np.argmax(f, axis=0)
+        return (xtest[index, :], f[index, :])
+
+    def get_kernel(self):
+        embeding = self.embed(self.x)
+        Z_ = self.linear_kernel(embeding, embeding)
+        K = Z_ + self.s * self.s * self.lam * torch.eye(
+            int(self.n), dtype=torch.float64
+        )
+        return K
+
+    def residuals(self):
+        mu, _ = self.mean_std(self.x)
+        out = torch.sum((mu - self.y) ** 2)
+        return out
 
-	xtest = torch.from_numpy(interval(n, d, L_infinity_ball=L_infinity_ball))
-	x = torch.from_numpy(np.random.uniform(-L_infinity_ball, L_infinity_ball, N)).view(-1, 1)
 
-	F_true = lambda x: torch.sin(x * 4) ** 2 - 0.1
-	F = lambda x: F_true(x) + s * torch.randn(x.size()[0]).view(-1, 1).double()
-	y = F(x)
-
-	emb = RFFEmbedding(m=m, gamma=0.1)
-	Reggr = KernelizedFeatures(embedding=emb, m=m, d=1)
-	Reggr.fit_gp(x, y)
-	Reggr.visualize(xtest, f_true=F_true)
+if __name__ == "__main__":
+    N = 10
+    s = 0.1
+    n = 256
+    L_infinity_ball = 0.5
+
+    d = 1
+    m = 128
+
+    xtest = torch.from_numpy(interval(n, d, L_infinity_ball=L_infinity_ball))
+    x = torch.from_numpy(np.random.uniform(-L_infinity_ball, L_infinity_ball, N)).view(
+        -1, 1
+    )
+
+    F_true = lambda x: torch.sin(x * 4) ** 2 - 0.1
+    F = lambda x: F_true(x) + s * torch.randn(x.size()[0]).view(-1, 1).double()
+    y = F(x)
+
+    emb = RFFEmbedding(m=m, gamma=0.1)
+    Reggr = KernelizedFeatures(embedding=emb, m=m, d=1)
+    Reggr.fit_gp(x, y)
+    Reggr.visualize(xtest, f_true=F_true)
diff --git a/stpy/continuous_processes/kernelized_features_old.py b/stpy/continuous_processes/kernelized_features_old.py
index 32877f8..62f802c 100755
--- a/stpy/continuous_processes/kernelized_features_old.py
+++ b/stpy/continuous_processes/kernelized_features_old.py
@@ -10,649 +10,757 @@
 
 
 class KernelizedFeatures(GaussianProcess):
-	'''
-		Random Fourier Features for Gaussian Kernel
-	'''
-
-	def __init__(self, embedding, m, s=0.001, lam=1., d=1, diameter=1.0, verbose=True, groups=None,
-				 bounds=None, scale=1.0, kappa=1.0, poly=2, primal=True, beta_fun = None ):
-
-		self.s = s
-		self.lam = lam
-		self.primal = primal
-		self.x = None
-
-		self.K = 0
-		self.mu = 0.0
-
-		self.m = torch.from_numpy(np.array(m))
-		self.fitted = False
-		self.data = False
-
-		self.d = d
-		self.n = 0
-		self.bounds = bounds
-		self.groups = groups
-		self.diameter = diameter
-
-		self.verbose = verbose
-		self.admits_first_order = True
-
-		self.embedding = embedding
-		self.embedding_map = embedding
-
-		self.kappa = kappa
-		self.scale = scale
-		self.poly = poly
-
-		self.to_add = []
-		self.prior_mean = 0
-		self.linear_kernel = KernelFunction(kernel_name="linear").linear_kernel
-		self.dual = False
-
-	def beta(self, delta=1e-2, norm=1, theory=False, variance_only=False):
-		if not theory:
-			beta_value = 2.
-		else:
-			embeding = self.embed(self.x)
-			n = self.x.size()[0]
-			Z_ = self.linear_kernel(embeding, embeding)
-			K = (Z_ + self.lam * torch.eye(int(self.n), dtype=torch.float64))
-			if not variance_only:
-				beta_value = norm * np.sqrt(self.lam) + self.s * np.sqrt(
-					torch.logdet(K) - n * np.log(self.lam) + 2. * np.log(1. / delta))
-			else:
-				beta_value = self.s * np.sqrt(torch.logdet(K) - n * np.log(self.lam) + 2. * np.log(1. / delta))
-		return beta_value
-
-	def description(self):
-		return "Custom Features object"
-
-	def norm(self):
-		if self.fitted:
-			norm = torch.linalg.norm(self.theta_mean())
-			return norm
-		else:
-			return None
-
-	def embed(self, x):
-		return self.embedding.embed(x)
-
-	def set_embedding(self, embed):
-		self.embedding_map = embed
-
-	def get_basis_size(self):
-		return int(torch.sum(self.m))
-
-	def set_basis_size(self, m):
-		self.m = m
-
-	def kernel(self, x, y):
-		embedding = self.embed(x)
-		embedding2 = self.embed(y)
-		K = self.linear_kernel(embedding, embedding2)
-		return K
-
-	def logdet_ratio(self):
-		I = torch.eye(int(torch.sum(self.m))).double()
-		return torch.logdet(self.K) - torch.logdet(self.s ** 2 * self.lam * I)
-
-	def effective_dim(self, xtest):
-		Phi = self.embed(xtest)
-		d = torch.trace(torch.solve(Phi.T @ Phi, Phi.T @ Phi + torch.eye(self.get_basis_size()).double() * self.lam)[0])
-		return d
-
-	def add_data_point(self, x, y):
-		if self.n == 0:
-			self.fit_gp(x, y)
-		else:
-			self.to_add.append([x, y])
-			self.fitted = False
-
-	def fit(self,x= None, y=None):
-		self.fit_gp(self.x,self.y)
-
-	def fit_gp_soft(self, x, y, A, b, std=None):
-		self.fit_gp(x, y)
-		Q = self.embed(self.x)
-		theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
-		if std is not None:
-			P = np.diag(1 / (std ** 2))
-		else:
-			P = np.diag(np.ones(A.shape[0]))
-
-		objective = cp.Minimize(
-			cp.sum(cp.square(Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy()))
-			+ self.s ** 2 * cp.quad_form(A @ theta - b, P) + self.lam * self.s ** 2 * cp.sum_squares(theta))
-		prob = cp.Problem(objective)
-		prob.solve(solver=cp.MOSEK, verbose=False)
-		return torch.from_numpy(theta.value).view(-1, 1)
-
-	def fit_gp_equality_fast(self, x, y, A, b, rcond=1e-2):
-		self.fit_gp(x, y)
-		Q = self.embed(self.x)
-		I = torch.zeros(Q.size()[1]).double()
-
-		V = Q.T @ Q - self.lam * self.s ** 2 * I
-		e = Q.T @ self.y
-
-		R = torch.from_numpy(orth(A.detach().numpy().T)).T
-		b = torch.zeros(size=(R.size()[0], 1)).double()
-		M = torch.vstack([V, R])
-		v = torch.vstack([e, b.view(-1, 1)])
-		theta = torch.linalg.lstsq(M, v.view(-1))[0].view(-1, 1)
-		return theta
-
-	def fit_gp_equality(self, x, y, A, b, eps=1e-6, rcond=1e-6):
-		self.fit_gp(x, y)
-		Q = self.embed(self.x)
-
-		if eps is not None:
-			theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
-			objective = cp.Minimize(
-				cp.sum_squares(Q.detach().numpy() @ theta - self.y.view(
-					-1).detach().numpy()) + self.lam * self.s ** 2 * cp.sum_squares(theta))
-
-			constraints = [A.detach().numpy() @ theta - b.detach().view(-1).numpy() <= np.ones(A.size()[0]) * eps ** 2]
-			constraints += [
-				A.detach().numpy() @ theta - b.detach().view(-1).numpy() >= -np.ones(A.size()[0]) * eps ** 2]
-
-			prob = cp.Problem(objective, constraints)
-			prob.solve(solver=cp.MOSEK, verbose=True)
-			return torch.from_numpy(theta.value).view(-1, 1)
-		else:
-			r = torch.linalg.lstsq(A, b)[0]
-			N = null_space(A.detach().numpy(), rcond=rcond)
-			theta = cp.Variable(N.shape[1])
-
-			objective = cp.Minimize(
-				cp.sum_squares(Q.detach().numpy() @ N @ theta - self.y.view(
-					-1).detach().numpy()) + self.lam * self.s ** 2 * cp.sum_squares(theta))
-
-			prob = cp.Problem(objective)
-			prob.solve(solver=cp.MOSEK, verbose=True)
-			return torch.from_numpy(N @ theta.value + r.numpy()).view(-1, 1)
-
-	def fit_gp(self, x, y):
-		'''
-			Function to Fit GP
-		'''
-		self.x = x
-		self.y = y
-		self.n = list(self.x.size())[0]
-		self.d = list(self.x.size())[1]
-
-		if self.n < self.m:
-			self.dual = True
-		else:
-			self.dual = False
-
-		if self.primal == True:
-			self.dual = False
-
-		self.data = True
-		self.fitted = False
-		return None
-
-	def add_points(self, x, y):
-		if self.x is not None:
-			self.x = torch.cat((self.x, x), dim=0)
-			self.y = torch.cat((self.y, y), dim=0)
-		else:
-			self.x = x
-			self.y = y
-
-	def check_conversion(self):
-		"""
-		Convert between dual and primal form
-		:return:
-		"""
-		if self.primal == False:
-			if self.n == self.m:  # convert do d mode
-				print("Switching mode to primal.")
-				self.dual = False
-
-				I = torch.eye(int(self.m)).double()
-				Z_ = self.linear_kernel(torch.t(self.Q), torch.t(self.Q))
-				self.V = (Z_ + self.s * self.s * self.lam * torch.eye(int(self.m), dtype=torch.float64))
-				self.invV, _ = torch.solve(I, self.V)
-
-	def get_invV(self):
-		self.precompute()
-
-		if self.dual:
-			I = torch.eye(self.m).double()
-			Z_ = self.linear_kernel(torch.t(self.Q), torch.t(self.Q))
-			self.V = (Z_ + self.s * self.s * self.lam * torch.eye(self.m, dtype=torch.float64))
-			self.invV, _ = torch.solve(I, self.V)
-			return self.invV
-		else:
-			return self.invV
-
-	def precompute(self):
-		if self.fitted == False:
-			if len(self.to_add) > 0:
-				# something to add via low rank update
-				for i in range(len(self.to_add)):
-					newx = self.to_add[i][0]
-					newy = self.to_add[i][1]
-
-					# rank one update
-					emb = self.embed(newx)
-
-					if self.dual:  # via Shur complements
-						newKinv = torch.zeros(size=(self.n + 1, self.n + 1)).double()
-						M = self.invK @ self.Q
-						c = 1. / ((self.s ** 2 * self.lam + emb @ emb.T) - emb @ self.Q.T @ M @ emb.T)
-
-						newKinv[0:self.n, 0:self.n] = self.invK + c * M @ emb.T @ emb @ M.T
-						newKinv[0:self.n, self.n] = (- M @ emb.T * c).view(-1)
-						newKinv[self.n, 0:self.n] = (- emb @ M.T * c).view(-1)
-						newKinv[self.n, self.n] = c.view(-1)
-
-						self.invK = newKinv
-
-						self.add_points(newx, newy)
-						self.n = self.n + 1
-						self.Q = self.embed(self.x)
-
-						self.invK_V = (1. / self.lam) * (-self.Q.T @ self.invK @ self.Q + torch.eye(int(self.m)))
-
-					else:  # via Woodbury
-						c = 1 + emb @ self.invV @ emb.T
-						self.invV = self.invV - (self.invV @ emb.T @ emb @ self.invV) / c
-						self.add_points(newx, newy)
-						self.n = self.n + 1
-						self.Q = self.embed(self.x)
-					# add point
-
-					self.check_conversion()
-
-				self.fitted = True
-				self.to_add = []
-
-
-			elif self.data == True:  # just compute the
-				self.Q = self.embed(self.x)
-				if not self.dual:
-					I = torch.eye(int(self.m)).double()
-					Z_ = self.Q.T @ self.Q
-					self.V = Z_ + self.s ** 2 * self.lam * I
-					self.invV = torch.pinverse(self.V, rcond=1e-10)
-				else:
-					I = torch.eye(self.n).double()
-					Z_ = self.Q @ self.Q.T
-					self.K = Z_ + self.s * self.s * self.lam * I
-					# self.invK, _ = torch.solve(I, self.K)
-					self.invK = torch.pinverse(self.K)
-					self.invK_V = (1. / self.lam) * (-self.Q.T @ self.invK @ self.Q + torch.eye(int(self.m)))
-				self.fitted = True
-			else:
-				I = torch.eye(int(self.m)).double()
-				self.V = self.s ** 2 * self.lam * I
-				self.invV = torch.pinverse(self.V, rcond=1e-10)
-		else:
-			pass
-
-	def theta_mean(self, var=False, prior=False):
-
-		self.precompute()
-		if self.fitted == True and prior == False:
-			if self.dual:
-				theta_mean = self.Q.T @ self.invK @ self.y
-				Z = self.invK_V
-			else:
-				theta_mean = self.invV @ self.Q.T @ self.y
-				Z = self.s ** 2 * self.invV
-		else:
-			theta_mean = 0 * torch.ones(size=(self.m, 1)).double()
-
-		if var is False:
-			return theta_mean
-		else:
-			return (theta_mean, Z)
-
-	def mean_std(self, xtest):
-		'''
-			Calculate mean and variance for GP at xtest points
-		'''
-		# self.precompute()
-		embeding = self.embed(xtest)
-
-		# mean
-		theta_mean = self.theta_mean()
-		ymean = embeding @ theta_mean
-
-		# std
-		if not self.dual:
-			diagonal = self.s ** 2 * torch.einsum('ij,jk,ik->i', (embeding, self.invV, embeding)).view(-1, 1)
-		else:
-			diagonal = torch.einsum('ij,jk,ik->i', (embeding, self.invK_V, embeding)).view(-1, 1)
-
-		ystd = torch.sqrt(diagonal)
-		return (ymean, ystd)
-
-	def sample_matheron(self, xtest, kernel_object, size=1):
-		basis = self.get_basis_size()
-		zeros = torch.zeros(size=(basis, size), dtype=torch.float64)
-		random_vector = torch.normal(mean=zeros, std=1.)
-
-		Z = self.lam * torch.eye(basis, dtype=torch.float64)
-		L = torch.linalg.cholesky(Z.transpose(-2, -1).conj()).transpose(-2, -1).conj()
-		theta = torch.mm(L, random_vector) + self.prior_mean
-
-		f_prior_xtest = torch.mm(self.embed(xtest), theta)
-		f_prior_x = torch.mm(self.embed(self.x), theta)
-
-		K_star = kernel_object.kernel(self.x, xtest)
-		N = self.x.size()[0]
-		K = kernel_object.kernel(self.x, self.x) + self.s ** 2 * self.lam * torch.eye(N)
-
-		f = f_prior_xtest + K_star @ torch.pinverse(K) @ (self.y - f_prior_x)
-		return f
-
-	def sample_theta(self, size=1, prior=False):
-
-		basis = self.get_basis_size()
-
-		zeros = torch.zeros(size=(basis, size), dtype=torch.float64)
-		random_vector = torch.normal(mean=zeros, std=1.).double()
-		self.precompute()
-
-		if self.fitted == True and prior == False:
-			self.L = torch.linalg.cholesky(self.get_invV()) * self.s
-			theta = self.theta_mean().view(-1, 1)
-			print(theta.size())
-			print(self.L.size())
-			print(random_vector.size())
-			theta = theta + torch.mm(self.L, random_vector)
-		else:
-			Z = (self.lam) * torch.eye(basis, dtype=torch.float64)
-			L = torch.linalg.cholesky(Z.transpose(-2, -1).conj()).transpose(-2, -1).conj()
-			theta = torch.mm(L, random_vector) + self.prior_mean
-
-		return theta
-
-	def theta_mean_constrained(self, weights=None, B=1):
-		if weights is None:
-			weights = torch.ones(self.n).double() / self.n
-
-		Q = self.embed(self.x)
-		theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
-		objective = cp.Minimize(
-			cp.sum(weights @ cp.square(Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy())))
-		zero = np.zeros(int(torch.sum(self.m)))
-		constraints = [cp.SOC(theta @ zero + B, theta)]
-		prob = cp.Problem(objective, constraints)
-		prob.solve(solver=cp.MOSEK)
-		return torch.from_numpy(theta.value).view(-1, 1)
-
-	def theta_absolute_deviation(self, weights=None, reg=None):
-		if weights is None:
-			weights = torch.ones(self.x.size()[0])
-
-		if reg is None:  # standard regularization
-			Q = self.embed(self.x)
-			theta = cp.Variable((int(torch.sum(self.m)), 1))
-			objective = cp.Minimize(
-				cp.sum(weights @ cp.abs(Q.numpy() @ theta - self.y.numpy())) + self.s * self.lam * cp.norm2(theta))
-			prob = cp.Problem(objective)
-			prob.solve()
-			return torch.from_numpy(theta.value)
-		else:  # custom regularization
-			Q = self.embed(self.x)
-			theta = cp.Variable((int(torch.sum(self.m)), 1))
-			objective = cp.Minimize(
-				cp.sum(weights @ cp.abs(Q.numpy() @ theta - self.y.numpy())) + reg * cp.norm2(theta))
-			prob = cp.Problem(objective)
-			prob.solve(solver=cp.MOSEK)
-			return torch.from_numpy(theta.value)
-
-	def theta_absolute_deviation_constrained(self, weights=None, B=1):
-		if weights is None:
-			weights = torch.ones(self.x.size()[0])
-		Q = self.embed(self.x)
-		theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
-
-		objective = cp.Minimize(cp.sum(weights @ cp.abs(Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy())))
-		zero = np.zeros(int(torch.sum(self.m)))
-		constraints = [cp.SOC(theta @ zero + B, theta)]
-		prob = cp.Problem(objective, constraints)
-		prob.solve(solver=cp.MOSEK)
-		return torch.from_numpy(theta.value).view(-1, 1)
-
-	def theta_chebyschev_approximation(self, eps=1.):
-		Q = self.embed(self.x).detach().numpy()
-		y = self.y.view(-1).detach().numpy()
-
-		theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
-		objective = cp.Minimize(cp.sum_squares(theta))
-		constraints = [cp.abs(Q @ theta - y) <= eps]
-
-		prob = cp.Problem(objective, constraints)
-		prob.solve(solver=cp.MOSEK)
-		res = torch.from_numpy(theta.value).view(-1, 1)
-		return res
-
-	def interpolation(self, eps=0.):
-		Q = self.embed(self.x).detach().numpy()
-		y = self.y.view(-1).detach().numpy()
-		theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
-		objective = cp.Minimize(cp.sum_squares(theta))
-		constraints = [Q @ theta == y]
-
-		prob = cp.Problem(objective, constraints)
-		prob.solve()
-		res = torch.from_numpy(theta.value).view(-1, 1)
-
-		return res
-
-	def mean_squared(self, xtest, weights=None, B=None, theta=False, reg=None):
-		embeding = self.embed(xtest)
-
-		if B is not None:
-			theta_mean = self.theta_mean_constrained(weights=weights, B=B)
-		else:
-			theta_mean = self.theta_mean(weights=weights, reg=reg)
-		ymean = torch.mm(embeding, theta_mean)
-		if theta == True:
-			return ymean, theta_mean
-		else:
-			return ymean
-
-	def mean_aboslute_deviation(self, xtest, weights=None, B=None, theta=False):
-		embeding = self.embed(xtest)
-		if B is not None:
-			theta_mean = self.theta_absolute_deviation_constrained(weights=weights, B=B)
-		else:
-			theta_mean = self.theta_absolute_deviation(weights=weights)
-		ymean = torch.mm(embeding, theta_mean)
-		if theta == True:
-			return ymean, theta_mean
-		else:
-			return ymean
-
-	"""
+    """
+    Random Fourier Features for Gaussian Kernel
+    """
+
+    def __init__(
+        self,
+        embedding,
+        m,
+        s=0.001,
+        lam=1.0,
+        d=1,
+        diameter=1.0,
+        verbose=True,
+        groups=None,
+        bounds=None,
+        scale=1.0,
+        kappa=1.0,
+        poly=2,
+        primal=True,
+        beta_fun=None,  # NOTE(review): accepted but never stored below
+    ):
+        """Store the configuration and start as an empty, unfitted model."""
+        self.s = s
+        self.lam = lam
+        self.primal = primal  # when True, fit_gp() always forces dual = False
+        self.x = None
+
+        self.K = 0
+        self.mu = 0.0
+
+        self.m = torch.from_numpy(np.array(m))  # kept as a tensor
+        self.fitted = False  # set to True by precompute()
+        self.data = False  # set to True by fit_gp()
+
+        self.d = d
+        self.n = 0  # number of stored data rows; updated by fit_gp()/precompute()
+        self.bounds = bounds
+        self.groups = groups
+        self.diameter = diameter
+
+        self.verbose = verbose
+        self.admits_first_order = True
+
+        self.embedding = embedding  # object whose .embed(x) is called by embed()
+        self.embedding_map = embedding
+
+        self.kappa = kappa
+        self.scale = scale
+        self.poly = poly
+
+        self.to_add = []  # [x, y] pairs queued by add_data_point()
+        self.prior_mean = 0
+        self.linear_kernel = KernelFunction(kernel_name="linear").linear_kernel
+        self.dual = False  # fit_gp() may switch this on when n < m
+
+    def beta(self, delta=1e-2, norm=1, theory=False, variance_only=False):
+        if not theory:  # no bound requested: return a fixed constant
+            beta_value = 2.0
+        else:
+            embeding = self.embed(self.x)
+            n = self.x.size()[0]
+            Z_ = self.linear_kernel(embeding, embeding)
+            K = Z_ + self.lam * torch.eye(int(self.n), dtype=torch.float64)
+            if not variance_only:  # include the norm * sqrt(lam) term
+                beta_value = norm * np.sqrt(self.lam) + self.s * np.sqrt(
+                    torch.logdet(K) - n * np.log(self.lam) + 2.0 * np.log(1.0 / delta)
+                )
+            else:  # same square-root term, without the norm * sqrt(lam) offset
+                beta_value = self.s * np.sqrt(
+                    torch.logdet(K) - n * np.log(self.lam) + 2.0 * np.log(1.0 / delta)
+                )
+        return beta_value
+
+    def description(self):  # fixed display label for this object
+        return "Custom Features object"
+
+    def norm(self):  # norm of the fitted coefficient vector, or None
+        if self.fitted:  # fitted is set by precompute(); fit_gp() resets it to False
+            norm = torch.linalg.norm(self.theta_mean())
+            return norm
+        else:
+            return None
+
+    def embed(self, x):  # delegate to the embedding object passed to __init__
+        return self.embedding.embed(x)
+
+    def set_embedding(self, embed):
+        self.embedding_map = embed  # NOTE(review): embed() reads self.embedding, not this
+
+    def get_basis_size(self):  # sum of the entries of self.m as a Python int
+        return int(torch.sum(self.m))
+
+    def set_basis_size(self, m):
+        self.m = m  # NOTE(review): stored as given, whereas __init__ stores a tensor
+
+    def kernel(self, x, y):  # linear kernel evaluated on the embedded inputs
+        embedding = self.embed(x)
+        embedding2 = self.embed(y)
+        K = self.linear_kernel(embedding, embedding2)
+        return K
+
+    def logdet_ratio(self):  # NOTE(review): self.K is 0 until precompute() runs in dual mode
+        I = torch.eye(int(torch.sum(self.m))).double()
+        return torch.logdet(self.K) - torch.logdet(self.s**2 * self.lam * I)
+
+    def effective_dim(self, xtest):  # trace((Phi^T Phi + lam * I)^-1 Phi^T Phi)
+        Phi = self.embed(xtest)
+        d = torch.trace(
+            torch.linalg.solve(  # torch.solve(B, A) was removed; linalg.solve takes (A, B)
+                Phi.T @ Phi + torch.eye(self.get_basis_size()).double() * self.lam,
+                Phi.T @ Phi,
+            )
+        )
+        return d
+
+    def add_data_point(self, x, y):
+        if self.n == 0:  # no data stored yet: same as calling fit_gp()
+            self.fit_gp(x, y)
+        else:
+            self.to_add.append([x, y])  # folded in later by precompute()
+            self.fitted = False  # makes the next precompute() process the queue
+
+    def fit(self, x=None, y=None):  # NOTE(review): the x and y arguments are ignored
+        self.fit_gp(self.x, self.y)  # re-registers the data already stored on self
+
+    def fit_gp_soft(self, x, y, A, b, std=None):  # fit with a soft penalty on A @ theta - b
+        self.fit_gp(x, y)
+        Q = self.embed(self.x)
+        theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
+        if std is not None:
+            P = np.diag(1 / (std**2))  # per-row weight 1 / std^2
+        else:
+            P = np.diag(np.ones(A.shape[0]))  # identity weights
+        # objective: squared residuals + s^2 * quad_form(A theta - b, P) + lam * s^2 * |theta|^2
+        objective = cp.Minimize(
+            cp.sum(
+                cp.square(Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy())
+            )
+            + self.s**2 * cp.quad_form(A @ theta - b, P)
+            + self.lam * self.s**2 * cp.sum_squares(theta)
+        )
+        prob = cp.Problem(objective)
+        prob.solve(solver=cp.MOSEK, verbose=False)
+        return torch.from_numpy(theta.value).view(-1, 1)  # returned as a column
+
+    def fit_gp_equality_fast(self, x, y, A, b, rcond=1e-2):
+        self.fit_gp(x, y)
+        Q = self.embed(self.x)
+        I = torch.zeros(Q.size()[1]).double()
+        # NOTE(review): I is an all-zeros vector, so the lam * s^2 * I term below is zero
+        V = Q.T @ Q - self.lam * self.s**2 * I
+        e = Q.T @ self.y
+        # NOTE(review): the b argument is overwritten with zeros and rcond is never used
+        R = torch.from_numpy(orth(A.detach().numpy().T)).T
+        b = torch.zeros(size=(R.size()[0], 1)).double()
+        M = torch.vstack([V, R])  # rows of V stacked on top of rows of R
+        v = torch.vstack([e, b.view(-1, 1)])
+        theta = torch.linalg.lstsq(M, v.view(-1))[0].view(-1, 1)  # least-squares solve
+        return theta
+
+    def fit_gp_equality(self, x, y, A, b, eps=1e-6, rcond=1e-6):
+        self.fit_gp(x, y)
+        Q = self.embed(self.x)
+        # eps given: bound A theta - b within +/- eps^2; eps None: use the null space of A
+        if eps is not None:
+            theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
+            objective = cp.Minimize(
+                cp.sum_squares(
+                    Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy()
+                )
+                + self.lam * self.s**2 * cp.sum_squares(theta)
+            )
+            # two one-sided elementwise constraints form the +/- eps^2 band
+            constraints = [
+                A.detach().numpy() @ theta - b.detach().view(-1).numpy()
+                <= np.ones(A.size()[0]) * eps**2
+            ]
+            constraints += [
+                A.detach().numpy() @ theta - b.detach().view(-1).numpy()
+                >= -np.ones(A.size()[0]) * eps**2
+            ]
+
+            prob = cp.Problem(objective, constraints)
+            prob.solve(solver=cp.MOSEK, verbose=True)
+            return torch.from_numpy(theta.value).view(-1, 1)
+        else:
+            r = torch.linalg.lstsq(A, b)[0]  # least-squares solution of A r = b
+            N = null_space(A.detach().numpy(), rcond=rcond)
+            theta = cp.Variable(N.shape[1])  # coordinates in the null-space basis N
+            # NOTE(review): the data-fit term below uses Q @ N @ theta only; r is added at return
+            objective = cp.Minimize(
+                cp.sum_squares(
+                    Q.detach().numpy() @ N @ theta - self.y.view(-1).detach().numpy()
+                )
+                + self.lam * self.s**2 * cp.sum_squares(theta)
+            )
+            # NOTE(review): r follows b's shape; confirm it broadcasts with N @ theta.value
+            prob = cp.Problem(objective)
+            prob.solve(solver=cp.MOSEK, verbose=True)
+            return torch.from_numpy(N @ theta.value + r.numpy()).view(-1, 1)
+
+    def fit_gp(self, x, y):
+        """
+        Register training data (x, y) and choose primal/dual mode; nothing is solved here.
+        """
+        self.x = x
+        self.y = y
+        self.n = list(self.x.size())[0]  # number of rows of x
+        self.d = list(self.x.size())[1]  # number of columns of x
+
+        if self.n < self.m:  # fewer data rows than self.m
+            self.dual = True
+        else:
+            self.dual = False
+
+        if self.primal == True:  # primal=True overrides the choice above
+            self.dual = False
+
+        self.data = True
+        self.fitted = False  # cached inverses are rebuilt by the next precompute()
+        return None
+
+    def add_points(self, x, y):  # append rows to the stored data; self.n is not touched
+        if self.x is not None:
+            self.x = torch.cat((self.x, x), dim=0)
+            self.y = torch.cat((self.y, y), dim=0)
+        else:  # nothing stored yet: take the inputs as the data
+            self.x = x
+            self.y = y
+
+    def check_conversion(self):
+        """
+        Switch from dual to primal form once n equals m (only when primal is False).
+        Rebuilds V = linear_kernel(Q^T, Q^T) + s^2 * lam * I and stores its inverse in invV.
+        """
+        if self.primal == False:
+            if self.n == self.m:  # convert to primal mode
+                print("Switching mode to primal.")
+                self.dual = False
+
+                I = torch.eye(int(self.m)).double()
+                Z_ = self.linear_kernel(torch.t(self.Q), torch.t(self.Q))
+                self.V = Z_ + self.s * self.s * self.lam * torch.eye(
+                    int(self.m), dtype=torch.float64
+                )
+                self.invV = torch.linalg.solve(self.V, I)  # torch.solve was removed
+
+    def get_invV(self):
+        self.precompute()
+        # dual mode caches invK rather than invV, so V and its inverse are rebuilt here
+        if self.dual:
+            I = torch.eye(self.m).double()
+            Z_ = self.linear_kernel(torch.t(self.Q), torch.t(self.Q))
+            self.V = Z_ + self.s * self.s * self.lam * torch.eye(
+                self.m, dtype=torch.float64
+            )
+            self.invV = torch.linalg.solve(self.V, I)  # torch.solve was removed
+            return self.invV
+        else:
+            return self.invV
+
+    def precompute(self):  # (re)build cached inverses when the model is not fitted
+        if self.fitted == False:
+            if len(self.to_add) > 0:
+                # something to add via low rank update
+                for i in range(len(self.to_add)):
+                    newx = self.to_add[i][0]
+                    newy = self.to_add[i][1]
+
+                    # rank one update
+                    emb = self.embed(newx)
+
+                    if self.dual:  # via Schur complements
+                        newKinv = torch.zeros(size=(self.n + 1, self.n + 1)).double()
+                        M = self.invK @ self.Q
+                        c = 1.0 / (
+                            (self.s**2 * self.lam + emb @ emb.T)
+                            - emb @ self.Q.T @ M @ emb.T
+                        )
+                        # fill the grown inverse: old block, border column/row, corner
+                        newKinv[0 : self.n, 0 : self.n] = (
+                            self.invK + c * M @ emb.T @ emb @ M.T
+                        )
+                        newKinv[0 : self.n, self.n] = (-M @ emb.T * c).view(-1)
+                        newKinv[self.n, 0 : self.n] = (-emb @ M.T * c).view(-1)
+                        newKinv[self.n, self.n] = c.view(-1)
+
+                        self.invK = newKinv
+
+                        self.add_points(newx, newy)
+                        self.n = self.n + 1
+                        self.Q = self.embed(self.x)
+
+                        self.invK_V = (1.0 / self.lam) * (
+                            -self.Q.T @ self.invK @ self.Q + torch.eye(int(self.m))
+                        )
+
+                    else:  # via Woodbury
+                        c = 1 + emb @ self.invV @ emb.T
+                        self.invV = (
+                            self.invV - (self.invV @ emb.T @ emb @ self.invV) / c
+                        )
+                        self.add_points(newx, newy)
+                        self.n = self.n + 1
+                        self.Q = self.embed(self.x)
+                    # the point is now stored; check whether to switch to primal form
+
+                    self.check_conversion()
+
+                self.fitted = True
+                self.to_add = []
+
+            elif self.data == True:  # nothing queued: compute from the stored data
+                self.Q = self.embed(self.x)
+                if not self.dual:
+                    I = torch.eye(int(self.m)).double()
+                    Z_ = self.Q.T @ self.Q
+                    self.V = Z_ + self.s**2 * self.lam * I
+                    self.invV = torch.pinverse(self.V, rcond=1e-10)
+                else:
+                    I = torch.eye(self.n).double()
+                    Z_ = self.Q @ self.Q.T
+                    self.K = Z_ + self.s * self.s * self.lam * I
+                    # self.invK, _ = torch.solve(I, self.K)
+                    self.invK = torch.pinverse(self.K)
+                    self.invK_V = (1.0 / self.lam) * (
+                        -self.Q.T @ self.invK @ self.Q + torch.eye(int(self.m))
+                    )
+                self.fitted = True
+            else:  # no data at all: V is just s^2 * lam * I
+                I = torch.eye(int(self.m)).double()
+                self.V = self.s**2 * self.lam * I
+                self.invV = torch.pinverse(self.V, rcond=1e-10)  # fitted stays False here
+        else:
+            pass
+
+    def theta_mean(self, var=False, prior=False):
+        """Return the coefficient estimate; with var=True return (theta_mean, Z)."""
+        self.precompute()
+        if self.fitted == True and prior == False:
+            if self.dual:
+                theta_mean = self.Q.T @ self.invK @ self.y
+                Z = self.invK_V
+            else:
+                theta_mean = self.invV @ self.Q.T @ self.y
+                Z = self.s**2 * self.invV
+        else:  # unfitted or prior requested: all-zero column
+            theta_mean = 0 * torch.ones(size=(self.m, 1)).double()
+        # NOTE(review): Z is only assigned in the fitted branch, so var=True fails otherwise
+        if var is False:
+            return theta_mean
+        else:
+            return (theta_mean, Z)
+
+    def mean_std(self, xtest):
+        """
+        Return (ymean, ystd) at the rows of xtest; ystd is the square root of the variance.
+        """
+        # self.precompute()
+        embeding = self.embed(xtest)
+
+        # mean (theta_mean() also runs precompute(), which fills invV / invK_V)
+        theta_mean = self.theta_mean()
+        ymean = embeding @ theta_mean
+
+        # std: row-wise quadratic form phi_i^T M phi_i via einsum
+        if not self.dual:
+            diagonal = self.s**2 * torch.einsum(
+                "ij,jk,ik->i", (embeding, self.invV, embeding)
+            ).view(-1, 1)
+        else:
+            diagonal = torch.einsum(
+                "ij,jk,ik->i", (embeding, self.invK_V, embeding)
+            ).view(-1, 1)
+
+        ystd = torch.sqrt(diagonal)
+        return (ymean, ystd)
+
+    def sample_matheron(self, xtest, kernel_object, size=1):
+        basis = self.get_basis_size()
+        zeros = torch.zeros(size=(basis, size), dtype=torch.float64)
+        random_vector = torch.normal(mean=zeros, std=1.0)
+        # draw theta from the prior: Cholesky factor of lam * I times noise, plus prior_mean
+        Z = self.lam * torch.eye(basis, dtype=torch.float64)
+        L = torch.linalg.cholesky(Z.transpose(-2, -1).conj()).transpose(-2, -1).conj()
+        theta = torch.mm(L, random_vector) + self.prior_mean
+
+        f_prior_xtest = torch.mm(self.embed(xtest), theta)
+        f_prior_x = torch.mm(self.embed(self.x), theta)
+        # kernel quantities come from the supplied kernel_object, not from self.kernel
+        K_star = kernel_object.kernel(self.x, xtest)
+        N = self.x.size()[0]
+        K = kernel_object.kernel(self.x, self.x) + self.s**2 * self.lam * torch.eye(N)
+        # prior draw at xtest, corrected by the residual of the prior draw at the data
+        f = f_prior_xtest + K_star @ torch.pinverse(K) @ (self.y - f_prior_x)
+        return f
+
+    def sample_theta(self, size=1, prior=False):
+        """Draw `size` coefficient vectors (one per column) from posterior or prior."""
+        basis = self.get_basis_size()
+
+        zeros = torch.zeros(size=(basis, size), dtype=torch.float64)
+        random_vector = torch.normal(mean=zeros, std=1.0).double()
+        self.precompute()
+
+        if self.fitted == True and prior == False:
+            self.L = torch.linalg.cholesky(self.get_invV()) * self.s
+            theta = self.theta_mean().view(-1, 1)
+            # posterior draw: mean + s * chol(invV) @ standard-normal columns.
+            # The three tensor-size print() calls that used to sit here were debug
+            # leftovers that wrote to stdout on every call; they have been removed.
+            theta = theta + torch.mm(self.L, random_vector)
+        else:
+            Z = (self.lam) * torch.eye(basis, dtype=torch.float64)
+            L = (
+                torch.linalg.cholesky(Z.transpose(-2, -1).conj())
+                .transpose(-2, -1)
+                .conj()
+            )
+            theta = torch.mm(L, random_vector) + self.prior_mean
+
+        return theta
+
+    def theta_mean_constrained(self, weights=None, B=1):
+        if weights is None:
+            weights = torch.ones(self.n).double() / self.n  # uniform, each 1 / n
+        # weighted squared residuals, with theta constrained through cp.SOC(B, theta)
+        Q = self.embed(self.x)
+        theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
+        objective = cp.Minimize(
+            cp.sum(
+                weights
+                @ cp.square(
+                    Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy()
+                )
+            )
+        )
+        zero = np.zeros(int(torch.sum(self.m)))  # theta @ zero + B is B as an expression
+        constraints = [cp.SOC(theta @ zero + B, theta)]
+        prob = cp.Problem(objective, constraints)
+        prob.solve(solver=cp.MOSEK)
+        return torch.from_numpy(theta.value).view(-1, 1)
+
+    def theta_absolute_deviation(self, weights=None, reg=None):
+        if weights is None:
+            weights = torch.ones(self.x.size()[0])  # one unit weight per data row
+        # NOTE(review): the two branches differ in the penalty weight and in the solver call
+        if reg is None:  # standard regularization
+            Q = self.embed(self.x)
+            theta = cp.Variable((int(torch.sum(self.m)), 1))
+            objective = cp.Minimize(
+                cp.sum(weights @ cp.abs(Q.numpy() @ theta - self.y.numpy()))
+                + self.s * self.lam * cp.norm2(theta)
+            )
+            prob = cp.Problem(objective)
+            prob.solve()  # default solver here; the other branch requests MOSEK
+            return torch.from_numpy(theta.value)
+        else:  # custom regularization
+            Q = self.embed(self.x)
+            theta = cp.Variable((int(torch.sum(self.m)), 1))
+            objective = cp.Minimize(
+                cp.sum(weights @ cp.abs(Q.numpy() @ theta - self.y.numpy()))
+                + reg * cp.norm2(theta)
+            )
+            prob = cp.Problem(objective)
+            prob.solve(solver=cp.MOSEK)
+            return torch.from_numpy(theta.value)
+
+    def theta_absolute_deviation_constrained(self, weights=None, B=1):
+        if weights is None:
+            weights = torch.ones(self.x.size()[0])  # one unit weight per data row
+        Q = self.embed(self.x)
+        theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
+        # weighted absolute residuals, with theta constrained through cp.SOC(B, theta)
+        objective = cp.Minimize(
+            cp.sum(
+                weights
+                @ cp.abs(Q.detach().numpy() @ theta - self.y.view(-1).detach().numpy())
+            )
+        )
+        zero = np.zeros(int(torch.sum(self.m)))  # theta @ zero + B is B as an expression
+        constraints = [cp.SOC(theta @ zero + B, theta)]
+        prob = cp.Problem(objective, constraints)
+        prob.solve(solver=cp.MOSEK)
+        return torch.from_numpy(theta.value).view(-1, 1)
+
+    def theta_chebyschev_approximation(self, eps=1.0):
+        Q = self.embed(self.x).detach().numpy()
+        y = self.y.view(-1).detach().numpy()
+        # smallest sum-of-squares theta whose residuals all stay within eps in absolute value
+        theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
+        objective = cp.Minimize(cp.sum_squares(theta))
+        constraints = [cp.abs(Q @ theta - y) <= eps]
+
+        prob = cp.Problem(objective, constraints)
+        prob.solve(solver=cp.MOSEK)
+        res = torch.from_numpy(theta.value).view(-1, 1)  # returned as a column
+        return res
+
+    def interpolation(self, eps=0.0):  # NOTE(review): eps is not used in the body
+        Q = self.embed(self.x).detach().numpy()
+        y = self.y.view(-1).detach().numpy()
+        theta = cp.Variable(int(torch.sum(self.m).detach().view(-1).numpy()))
+        objective = cp.Minimize(cp.sum_squares(theta))  # smallest sum of squares
+        constraints = [Q @ theta == y]  # must reproduce the stored targets exactly
+
+        prob = cp.Problem(objective, constraints)
+        prob.solve()
+        res = torch.from_numpy(theta.value).view(-1, 1)
+
+        return res
+
+    def mean_squared(self, xtest, weights=None, B=None, theta=False, reg=None):
+        embeding = self.embed(xtest)
+        # NOTE(review): the theta_mean() visible in this file accepts only var and prior
+        if B is not None:
+            theta_mean = self.theta_mean_constrained(weights=weights, B=B)
+        else:
+            theta_mean = self.theta_mean(weights=weights, reg=reg)
+        ymean = torch.mm(embeding, theta_mean)  # predictions at the rows of xtest
+        if theta == True:  # optionally also return the coefficients
+            return ymean, theta_mean
+        else:
+            return ymean
+
+    def mean_aboslute_deviation(self, xtest, weights=None, B=None, theta=False):
+        embeding = self.embed(xtest)
+        if B is not None:  # B given: use the constrained absolute-deviation fit
+            theta_mean = self.theta_absolute_deviation_constrained(weights=weights, B=B)
+        else:
+            theta_mean = self.theta_absolute_deviation(weights=weights)
+        ymean = torch.mm(embeding, theta_mean)  # predictions at the rows of xtest
+        if theta == True:  # optionally also return the coefficients
+            return ymean, theta_mean
+        else:
+            return ymean
+
+    """
 	Hessian 
 	"""
 
-	def mean_gradient_hessian(self, xtest, hessian=False):
-		hessian_mu = torch.zeros(size=(self.d, self.d), dtype=torch.float64)
-		xtest.requires_grad_(True)
+    def mean_gradient_hessian(self, xtest, hessian=False):
+        hessian_mu = torch.zeros(size=(self.d, self.d), dtype=torch.float64)
+        xtest.requires_grad_(True)
 
-		# xtest.retain_grad()
-		mu = self.mean_std(xtest)[0]
-		# mu.backward(retain_graph=True)
+        # xtest.retain_grad()
+        mu = self.mean_std(xtest)[0]
+        # mu.backward(retain_graph=True)
 
-		# nabla_mu = xtest.grad
-		nabla_mu = grad(mu, xtest, create_graph=True)[0][0]
+        # nabla_mu = xtest.grad
+        nabla_mu = grad(mu, xtest, create_graph=True)[0][0]
 
-		if hessian == False:
-			return nabla_mu
-		else:
-			for i in range(self.d):
-				hessian_mu[i, :] = grad(nabla_mu[i], xtest, create_graph=True, retain_graph=True)[0][0]
-			return [nabla_mu, hessian_mu]
+        if hessian == False:
+            return nabla_mu
+        else:
+            for i in range(self.d):
+                hessian_mu[i, :] = grad(
+                    nabla_mu[i], xtest, create_graph=True, retain_graph=True
+                )[0][0]
+            return [nabla_mu, hessian_mu]
 
-	""" 
+    """ 
 	Optimization
 	"""
 
-	def ucb(self, xtest, beta = lambda :2., bound = None, lcb = False):
-
-		if bound is not None:
-			mu, V = self.theta_mean(var = True)
-			mu = mu.T
-			Phi = self.embed(xtest)
-			ucb = torch.zeros(size = (xtest.size()[0],1)).double()
-
-			theta = cp.Variable(self.get_basis_size())
-			for i in range(xtest.size()[0]):
-				phi = Phi[i,:]
-				if lcb:
-					objective = cp.Minimize(phi @ theta)
-				else:
-					objective = cp.Maximize(phi @ theta)
-
-				constraints = []
-				constraints += [cp.quad_form(mu.view(-1)-theta,V) <= beta()]
-				constraints += [cp.sum_squares(theta) <= bound]
-				prob = cp.Problem(objective, constraints)
-				prob.solve()
-				ucb[i,0] = prob.value
-			return ucb
-		else:
-			mu, sigma = self.mean_std(xtest)
-			if lcb:
-				return mu - beta()*sigma
-			else:
-				return mu + beta() * sigma
-
-
-	def lcb(self, xtest, beta = lambda :2, bound = None):
-		return self.ucb(xtest, beta = beta, bound = bound, lcb = True)
-
-
-	def ucb_optimize(self, beta, multistart=25, lcb=False, minimizer="L-BFGS-B"):
-
-		# precompute important (theta)
-		theta_mean, K = self.theta_mean(var=True)
-
-		if lcb == False:
-			fun = lambda x: - (self.embed(torch.from_numpy(x).view(1, -1)) @ theta_mean + \
-							   beta * torch.sqrt(self.embed(torch.from_numpy(x).view(1, -1)) @ K @ self.embed(
-						torch.from_numpy(x).view(1, -1)).T)).detach().numpy()[0]
-		else:
-			fun = lambda x: - (self.embed(torch.from_numpy(x).view(1, -1)) @ theta_mean - \
-							   beta * torch.sqrt(self.embed(torch.from_numpy(x).view(1, -1)) @ K @ self.embed(
-						torch.from_numpy(x).view(1, -1)).T).detach().numpy()[0]).numpy()[0]
-
-		if self.bounds == None:
-			mybounds = tuple([(-self.diameter, self.diameter) for _ in range(self.d)])
-		else:
-			mybounds = self.bounds
-
-		results = []
-		for j in range(multistart):
-
-			x0 = np.random.randn(self.d)
-			for i in range(self.d):
-				x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
-
-			if minimizer == "L-BFGS-B":
-				res = minimize(fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds)
-				solution = res.x
-			else:
-				raise AssertionError("Wrong optimizer selected.")
-
-			results.append([solution, -fun(solution)])
-
-		results = np.array(results)
-		index = np.argmax(results[:, 1])
-		solution = results[index, 0]
-		return (torch.from_numpy(solution).view(1, -1), -torch.from_numpy(fun(solution)))
-
-	def sample_and_optimize(self, xtest=None, multistart=25, minimizer="L-BFGS-B", grid=100, verbose=0):
-		'''
-			Sample functions from Gaussian Process and take Maximum using
-			first order maximization
-		'''
-
-		# sample linear approximating
-		theta = self.sample_theta()
-
-		# get bounds
-		if self.bounds == None:
-			mybounds = tuple([(-self.diameter, self.diameter) for _ in range(self.d)])
-		else:
-			mybounds = self.bounds
-
-		fun = lambda x: -torch.mm(torch.t(theta), torch.t(self.embed(torch.from_numpy(x).view(1, -1)))).numpy()[0]
-
-		results = []
-		for j in range(multistart):
-			x0 = np.random.randn(self.d)
-			for i in range(self.d):
-				x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
-
-			if minimizer == "L-BFGS-B":
-				res = minimize(fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds)
-				solution = res.x
-			else:
-				raise AssertionError("Wrong optimizer selected.")
-
-			results.append([solution, -fun(solution)])
-		results = np.array(results)
-		index = np.argmax(results[:, 1])
-		solution = results[index, 0]
-
-		return (torch.from_numpy(solution), -torch.from_numpy(fun(solution)))
-
-	def sample(self, xtest, size=1, prior=False):
-		'''
-			Sample functions from Gaussian Process
-		'''
-		theta = self.sample_theta(size=size, prior=prior)
-		f = torch.mm(self.embed(xtest), theta)
-		return f
-
-	def sample_and_max(self, xtest, size=1):
-		'''
-			Sample functions from Gaussian Process and take Maximum
-		'''
-		f = self.sample(xtest, size=size)
-		index = np.argmax(f, axis=0)
-		return (xtest[index, :], f[index, :])
-
-	def get_kernel(self):
-		embeding = self.embed(self.x)
-		Z_ = self.linear_kernel(embeding, embeding)
-		K = (Z_ + self.s * self.s * self.lam * torch.eye(int(self.n), dtype=torch.float64))
-		return K
-
-	def residuals(self):
-		mu, _ = self.mean_std(self.x)
-		out = torch.sum((mu - self.y) ** 2)
-		return out
+    def ucb(self, xtest, beta=lambda: 2.0, bound=None, lcb=False):
+        """Confidence bound at the rows of xtest: upper by default, lower if lcb."""
+        if bound is not None:
+            mu, V = self.theta_mean(var=True)
+            mu = mu.T
+            Phi = self.embed(xtest)
+            ucb = torch.zeros(size=(xtest.size()[0], 1)).double()
+            # one convex program per test row, all sharing the same variable theta
+            theta = cp.Variable(self.get_basis_size())
+            for i in range(xtest.size()[0]):
+                phi = Phi[i, :]
+                if lcb:
+                    objective = cp.Minimize(phi @ theta)
+                else:
+                    objective = cp.Maximize(phi @ theta)
+                # feasible set: quad_form(mu - theta, V) <= beta() and |theta|^2 <= bound
+                constraints = []
+                constraints += [cp.quad_form(mu.view(-1) - theta, V) <= beta()]
+                constraints += [cp.sum_squares(theta) <= bound]
+                prob = cp.Problem(objective, constraints)
+                prob.solve()
+                ucb[i, 0] = prob.value
+            return ucb
+        else:
+            mu, sigma = self.mean_std(xtest)  # beta is a callable, hence beta()
+            if lcb:
+                return mu - beta() * sigma
+            else:
+                return mu + beta() * sigma
+
+    def lcb(self, xtest, beta=lambda: 2, bound=None):
+        return self.ucb(xtest, beta=beta, bound=bound, lcb=True)  # lower bound: delegate to ucb
+
+    def ucb_optimize(self, beta, multistart=25, lcb=False, minimizer="L-BFGS-B"):
+        """
+        Maximize the confidence bound over the domain with random restarts.
+
+        The objective at a point x is embed(x) @ theta_mean plus (or minus, when
+        ``lcb`` is true) ``beta`` times sqrt(embed(x) @ K @ embed(x).T), where
+        ``theta_mean, K = self.theta_mean(var=True)``.
+
+        Args:
+            beta: scalar multiplier of the square-root term.
+            multistart: number of random initial points for the local optimizer.
+            lcb: if true, subtract the square-root term instead of adding it.
+            minimizer: name of the local optimizer; only "L-BFGS-B" is supported.
+
+        Returns:
+            Tuple ``(x, value)``: the best point found, reshaped to a single row,
+            and the objective value there, both as tensors.
+
+        Raises:
+            AssertionError: if ``minimizer`` is not "L-BFGS-B".
+        """
+        # precompute important (theta)
+        theta_mean, K = self.theta_mean(var=True)
+        sign = -1.0 if lcb else 1.0
+
+        def fun(x):
+            # negated bound, because scipy minimizes; embed x only once
+            phi = self.embed(torch.from_numpy(x).view(1, -1))
+            bound = phi @ theta_mean + sign * beta * torch.sqrt(phi @ K @ phi.T)
+            return -bound.detach().numpy()[0]
+
+        if self.bounds is None:
+            mybounds = tuple([(-self.diameter, self.diameter) for _ in range(self.d)])
+        else:
+            mybounds = self.bounds
+
+        results = []
+        for j in range(multistart):
+
+            x0 = np.random.randn(self.d)
+            for i in range(self.d):
+                x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
+
+            if minimizer == "L-BFGS-B":
+                res = minimize(
+                    fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds
+                )
+                solution = res.x
+            else:
+                raise AssertionError("Wrong optimizer selected.")
+
+            results.append((solution, -fun(solution)))
+
+        # pick the best restart without np.array(results): the (point, value)
+        # pairs are ragged whenever the point has more than one coordinate
+        index = int(np.argmax([value[0] for _, value in results]))
+        solution = results[index][0]
+        return (
+            torch.from_numpy(solution).view(1, -1),
+            -torch.from_numpy(fun(solution)),
+        )
+
+    def sample_and_optimize(
+        self, xtest=None, multistart=25, minimizer="L-BFGS-B", grid=100, verbose=0
+    ):
+        """
+        Draw one parameter sample and maximize the resulting function with
+        multistart L-BFGS-B; returns (argmax, value) as tensors. The arguments
+        xtest, grid and verbose are accepted but not used here.
+        """
+        # sample linear approximating
+        theta = self.sample_theta()
+
+        # get bounds
+        if self.bounds is None:
+            mybounds = tuple([(-self.diameter, self.diameter) for _ in range(self.d)])
+        else:
+            mybounds = self.bounds
+
+        fun = lambda x: -torch.mm(
+            torch.t(theta), torch.t(self.embed(torch.from_numpy(x).view(1, -1)))
+        ).numpy()[0]
+
+        results = []
+        for j in range(multistart):
+            x0 = np.random.randn(self.d)
+            for i in range(self.d):
+                x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
+
+            if minimizer == "L-BFGS-B":
+                res = minimize(
+                    fun, x0, method="L-BFGS-B", jac=None, tol=0.0001, bounds=mybounds
+                )
+                solution = res.x
+            else:
+                raise AssertionError("Wrong optimizer selected.")
+
+            results.append((solution, -fun(solution)))
+        # avoid np.array(results): ragged (point, value) pairs fail for d > 1
+        index = int(np.argmax([value[0] for _, value in results]))
+        solution = results[index][0]
+
+        return (torch.from_numpy(solution), -torch.from_numpy(fun(solution)))
+
+    def sample(self, xtest, size=1, prior=False):
+        """
+        Evaluate sampled functions at xtest: embed(xtest) @ theta, theta from sample_theta
+        """
+        theta = self.sample_theta(size=size, prior=prior)
+        f = torch.mm(self.embed(xtest), theta)
+        return f
+
+    def sample_and_max(self, xtest, size=1):
+        """
+        Sample functions at xtest and return (maximizing rows of xtest, values at them)
+        """
+        f = self.sample(xtest, size=size)
+        index = np.argmax(f, axis=0)  # row index of the maximum along axis 0
+        return (xtest[index, :], f[index, :])
+
+    def get_kernel(self):
+        embeding = self.embed(self.x)  # features of the stored inputs self.x
+        Z_ = self.linear_kernel(embeding, embeding)
+        K = Z_ + self.s * self.s * self.lam * torch.eye(  # add s^2 * lam on the diagonal
+            int(self.n), dtype=torch.float64
+        )
+        return K
+
+    def residuals(self):
+        mu, _ = self.mean_std(self.x)  # mean at the stored inputs self.x
+        out = torch.sum((mu - self.y) ** 2)  # sum of squared differences to self.y
+        return out
 
 
 if __name__ == "__main__":
-	N = 10
-	s = 0.1
-	n = 256
-	L_infinity_ball = 0.5
-
-	d = 1
-	m = 128
-
-	xtest = torch.from_numpy(interval(n, d, L_infinity_ball=L_infinity_ball))
-	x = torch.from_numpy(np.random.uniform(-L_infinity_ball, L_infinity_ball, N)).view(-1, 1)
-
-	F_true = lambda x: torch.sin(x * 4) ** 2 - 0.1
-	F = lambda x: F_true(x) + s * torch.randn(x.size()[0]).view(-1, 1).double()
-	y = F(x)
-
-	emb = RFFEmbedding(m=m, gamma=0.1)
-	Reggr = KernelizedFeatures(embedding=emb, m=m, d=1)
-	Reggr.fit_gp(x, y)
-	Reggr.visualize(xtest, f_true=F_true)
+    N = 10
+    s = 0.1
+    n = 256
+    L_infinity_ball = 0.5
+
+    d = 1
+    m = 128
+
+    xtest = torch.from_numpy(interval(n, d, L_infinity_ball=L_infinity_ball))
+    x = torch.from_numpy(np.random.uniform(-L_infinity_ball, L_infinity_ball, N)).view(
+        -1, 1
+    )
+
+    F_true = lambda x: torch.sin(x * 4) ** 2 - 0.1
+    F = lambda x: F_true(x) + s * torch.randn(x.size()[0]).view(-1, 1).double()
+    y = F(x)
+
+    emb = RFFEmbedding(m=m, gamma=0.1)
+    Reggr = KernelizedFeatures(embedding=emb, m=m, d=1)
+    Reggr.fit_gp(x, y)
+    Reggr.visualize(xtest, f_true=F_true)
diff --git a/stpy/continuous_processes/mkl_estimator.py b/stpy/continuous_processes/mkl_estimator.py
index 59e1ca8..639541d 100755
--- a/stpy/continuous_processes/mkl_estimator.py
+++ b/stpy/continuous_processes/mkl_estimator.py
@@ -7,213 +7,271 @@
 from stpy.regularization.regularizer import Regularizer
 from stpy.regularization.simplex_regularizer import DirichletRegularizer, SupRegularizer
 
+
 class MultipleKernelLearner(GaussianProcess):
 
-	def __init__(self, kernel_objects,
-				 lam: float =1.0,
-				 s:  float = 0.01,
-				 opt: str = 'closed',
-				 regularizer: Regularizer = None):
-
-		self.kernel_objects = kernel_objects
-		self.no_models = len(kernel_objects)
-		self.regularizer = regularizer
-		self.s = s
-		self.lam = lam
-		self.opt = opt
-
-		self.var = 'fixed'
-
-	def fit(self):
-		self.fit_gp(self.x,self.y)
-
-	def fit_gp(self, x, y):
-		self.x = x
-		self.y = y
-		(self.n, self.d) = self.x.size()
-
-		self.Ks = []
-		for i in range(self.no_models):
-			self.Ks.append(self.kernel_objects[i].kernel(x,x))
-
-		if self.opt == 'sdp':
-			alpha = cp.Variable(self.no_models)
-			u = cp.Variable(1)
-			A = None
-			for i in range(self.no_models):
-				if A is None:
-					A = self.Ks[i] * alpha[i]
-				else:
-					A += self.Ks[i] * alpha[i]
-			A = A + np.eye(self.n)*self.lam*self.s**2
-			constraints = []
-			l = cp.reshape(u, (1, 1))
-			G = cp.bmat([[A, y.numpy()], [y.numpy().T, l]])
-			constraints += [G >> 0]
-			constraints += [alpha >= 0.]
-			constraints += [cp.sum(alpha) == 1.]
-
-			objective = cp.Minimize( u)
-			prob = cp.Problem(objective, constraints)
-			prob.solve( solver = cp.MOSEK,verbose = True)
-
-		elif self.opt == "closed":
-			alpha = cp.Variable(self.no_models, nonneg=True)
-			A = sum([self.Ks[i] * alpha[i] for i in range(self.no_models)])+ np.eye(self.n) * self.lam * self.s ** 2
-			constraints = [cp.sum(alpha)==1, alpha<=1]
-			objective = cp.matrix_frac(self.y.numpy(), A)
-			if self.regularizer is not None and self.regularizer.is_convex():
-				objective = objective + self.regularizer.get_regularizer_cvxpy()(alpha)
-				prob = cp.Problem(cp.Minimize(objective), constraints)
-				prob.solve(solver=cp.MOSEK, verbose=False)
-
-			elif self.regularizer is not None and not self.regularizer.is_convex():
-				obj,con,vars = self.regularizer.get_cvxpy_objectives_constraints_variables(self.no_models)
-				no_problems = len(con)
-				vals = []
-				args = []
-				for i in range(no_problems):
-					prob = cp.Problem(cp.Minimize(objective+obj[i](alpha,*vars)), constraints + con[i](alpha, *vars))
-					prob.solve(solver=cp.MOSEK, verbose=False)
-					vals.append(prob.value)
-					args.append(alpha.value)
-				alpha.value = args[np.argmin(vals)]
-			else:
-				prob = cp.Problem(cp.Minimize(objective), constraints)
-				prob.solve(solver=cp.MOSEK, verbose=False)
-
-		self.alphas = torch.from_numpy(alpha.value)
-		if self.regularizer is not None:
-			print (self.regularizer.name, self.alphas)
-		else:
-			print("No", self.alphas)
-		self.K = torch.sum(torch.stack([alpha*K for alpha,K in zip(self.alphas, self.Ks)]), dim = 0) + np.eye(self.n)*self.lam*self.s**2
-		self.fitted = True
-
-	def execute(self, xtest):
-		if self.fitted == True:
-			Ks = [self.kernel_objects[i].kernel(self.x, xtest) for i in range(self.no_models)]
-			K_star = torch.sum(torch.stack([alpha * K for alpha, K in zip(self.alphas, Ks)]), dim=0)
-		else:
-			K_star = None
-		Ks = [self.kernel_objects[i].kernel(xtest, xtest) for i in range(self.no_models)]
-		K_star_star = torch.sum(torch.stack([alpha * K for alpha, K in zip(self.alphas, Ks)]), dim=0)
-		return (K_star, K_star_star)
-
-	# def log_marginal(self, kernel, X, weight):
-	# 	pass
-
-	def mean(self, xtest):
-		K_star, K_star_star = self.execute(xtest)
-		self.A = torch.linalg.lstsq(self.K, self.y)[0]
-		ymean = torch.mm(K_star, self.A)
-		return ymean
-
-	def mean_std(self, xtest, full=False, reuse=False):
-		K_star, K_star_star = self.execute(xtest)
-		self.A = torch.linalg.lstsq(self.K, self.y)[0]
-		ymean = torch.mm(K_star, self.A)
-
-		if self.var == 'fixed':
-			ystd = self.std_fixed(xtest)
-		elif self.var == 'true':
-			ystd = self.std_opt(xtest)
-		return (ymean, ystd)
-
-	def lcb(self, xtest: torch.Tensor, type=None, arg=False, sign=1.):
-		theta = cp.Variable((self.alpha, 1))
-		args = []
-		n = xtest.size()[0]
-		values = torch.zeros(size=(n, 1)).double()
-		Phi = self.embed(xtest)
-
-		for j in range(n):
-			objective = sign * Phi[j, :] @ theta
-			if (self.constraints is not None and not self.constraints.is_convex()):
-				value, theta_lcb = self.objective_on_non_convex_confidence_set(theta, objective, type=type)
-			elif not self.regularizer.is_convex():
-				value, theta_lcb = self.objective_on_non_convex_confidence_set_bisection(theta, objective,
-																						 type=type)
-			else:
-				value, theta_lcb = self.objective_on_confidence_set(theta, objective, type=type)
-
-			values[j] = sign * value
-			if arg:
-				args.append(theta_lcb)
-
-		if args:
-			return values, args
-		else:
-			return values
-
-	def ucb(self, xtest):
-		pass
-
-	def std_opt(self, xtest):
-		N = xtest.size()[0]
-		for i in range(N):
-			x = xtest[i,:]
-			theta = cp.Variable(self.n*self.no_models)
-			M = torch.block_diag(self.Ks)
-			cp.norm(theta,p=2)*theta[i]
-
-	def std_fixed(self, xtest):
-		K_star, K_star_star = self.execute(xtest)
-		self.B = torch.t(torch.linalg.solve(self.K, torch.t(K_star)))
-		first = torch.diag(K_star_star).view(-1, 1)
-		second = torch.einsum('ij,ji->i', (self.B, torch.t(K_star))).view(-1, 1)
-		variance = first - second
-		ystd = torch.sqrt(variance)
-		return ystd
-
-	def sample(self, xtest, size=1):
-		pass
+    def __init__(
+        self,
+        kernel_objects,
+        lam: float = 1.0,
+        s: float = 0.01,
+        opt: str = "closed",
+        regularizer: Regularizer = None,
+    ):
+        """Store the candidate kernels and settings; kernel weights are computed in fit_gp."""
+        self.kernel_objects = kernel_objects
+        self.no_models = len(kernel_objects)
+        self.regularizer = regularizer
+        self.s = s
+        self.lam = lam
+        self.opt = opt  # fit_gp handles "sdp" and "closed"
+
+        self.var = "fixed"  # mean_std handles "fixed" and "true"
+
+    def fit(self):
+        self.fit_gp(self.x, self.y)
+
+    def fit_gp(self, x, y):
+        self.x = x
+        self.y = y
+        (self.n, self.d) = self.x.size()
+
+        self.Ks = []
+        for i in range(self.no_models):
+            self.Ks.append(self.kernel_objects[i].kernel(x, x))
+
+        if self.opt == "sdp":
+            alpha = cp.Variable(self.no_models)
+            u = cp.Variable(1)
+            A = None
+            for i in range(self.no_models):
+                if A is None:
+                    A = self.Ks[i] * alpha[i]
+                else:
+                    A += self.Ks[i] * alpha[i]
+            A = A + np.eye(self.n) * self.lam * self.s**2
+            constraints = []
+            l = cp.reshape(u, (1, 1))
+            G = cp.bmat([[A, y.numpy()], [y.numpy().T, l]])
+            constraints += [G >> 0]
+            constraints += [alpha >= 0.0]
+            constraints += [cp.sum(alpha) == 1.0]
+
+            objective = cp.Minimize(u)
+            prob = cp.Problem(objective, constraints)
+            prob.solve(solver=cp.MOSEK, verbose=True)
+
+        elif self.opt == "closed":
+            alpha = cp.Variable(self.no_models, nonneg=True)
+            A = (
+                sum([self.Ks[i] * alpha[i] for i in range(self.no_models)])
+                + np.eye(self.n) * self.lam * self.s**2
+            )
+            constraints = [cp.sum(alpha) == 1, alpha <= 1]
+            objective = cp.matrix_frac(self.y.numpy(), A)
+            if self.regularizer is not None and self.regularizer.is_convex():
+                objective = objective + self.regularizer.get_regularizer_cvxpy()(alpha)
+                prob = cp.Problem(cp.Minimize(objective), constraints)
+                prob.solve(solver=cp.MOSEK, verbose=False)
+
+            elif self.regularizer is not None and not self.regularizer.is_convex():
+                obj, con, vars = (
+                    self.regularizer.get_cvxpy_objectives_constraints_variables(
+                        self.no_models
+                    )
+                )
+                no_problems = len(con)
+                vals = []
+                args = []
+                for i in range(no_problems):
+                    prob = cp.Problem(
+                        cp.Minimize(objective + obj[i](alpha, *vars)),
+                        constraints + con[i](alpha, *vars),
+                    )
+                    prob.solve(solver=cp.MOSEK, verbose=False)
+                    vals.append(prob.value)
+                    args.append(alpha.value)
+                alpha.value = args[np.argmin(vals)]
+            else:
+                prob = cp.Problem(cp.Minimize(objective), constraints)
+                prob.solve(solver=cp.MOSEK, verbose=False)
+
+        self.alphas = torch.from_numpy(alpha.value)
+        if self.regularizer is not None:
+            print(self.regularizer.name, self.alphas)
+        else:
+            print("No", self.alphas)
+        self.K = (
+            torch.sum(
+                torch.stack([alpha * K for alpha, K in zip(self.alphas, self.Ks)]),
+                dim=0,
+            )
+            + np.eye(self.n) * self.lam * self.s**2
+        )
+        self.fitted = True
+
+    def execute(self, xtest):
+        if self.fitted == True:  # NOTE(review): within this class, self.fitted is assigned only in fit_gp
+            Ks = [
+                self.kernel_objects[i].kernel(self.x, xtest)
+                for i in range(self.no_models)
+            ]
+            K_star = torch.sum(  # weighted sum of the per-kernel train/test matrices
+                torch.stack([alpha * K for alpha, K in zip(self.alphas, Ks)]), dim=0
+            )
+        else:
+            K_star = None
+        Ks = [  # NOTE(review): the unfitted path also reads self.alphas below — confirm it is set
+            self.kernel_objects[i].kernel(xtest, xtest) for i in range(self.no_models)
+        ]
+        K_star_star = torch.sum(  # weighted sum of the per-kernel test/test matrices
+            torch.stack([alpha * K for alpha, K in zip(self.alphas, Ks)]), dim=0
+        )
+        return (K_star, K_star_star)
+
+    # def log_marginal(self, kernel, X, weight):
+    # 	pass
+
+    def mean(self, xtest):
+        K_star, K_star_star = self.execute(xtest)
+        self.A = torch.linalg.lstsq(self.K, self.y)[0]
+        ymean = torch.mm(K_star, self.A)
+        return ymean
+
+    def mean_std(self, xtest, full=False, reuse=False):
+        K_star, K_star_star = self.execute(xtest)
+        self.A = torch.linalg.lstsq(self.K, self.y)[0]  # least-squares solution of K A = y
+        ymean = torch.mm(K_star, self.A)
+
+        if self.var == "fixed":
+            ystd = self.std_fixed(xtest)
+        elif self.var == "true":
+            ystd = self.std_opt(xtest)  # NOTE(review): std_opt has no return statement
+        return (ymean, ystd)  # NOTE(review): ystd is unbound if self.var is neither "fixed" nor "true"
+
+    def lcb(self, xtest: torch.Tensor, type=None, arg=False, sign=1.0):
+        theta = cp.Variable((self.alpha, 1))  # NOTE(review): this class assigns self.alphas, not self.alpha — confirm
+        args = []
+        n = xtest.size()[0]
+        values = torch.zeros(size=(n, 1)).double()
+        Phi = self.embed(xtest)  # NOTE(review): embed, constraints and objective_on_* are not defined in this class
+
+        for j in range(n):
+            objective = sign * Phi[j, :] @ theta
+            if self.constraints is not None and not self.constraints.is_convex():
+                value, theta_lcb = self.objective_on_non_convex_confidence_set(
+                    theta, objective, type=type
+                )
+            elif not self.regularizer.is_convex():
+                value, theta_lcb = (
+                    self.objective_on_non_convex_confidence_set_bisection(
+                        theta, objective, type=type
+                    )
+                )
+            else:
+                value, theta_lcb = self.objective_on_confidence_set(
+                    theta, objective, type=type
+                )
+
+            values[j] = sign * value
+            if arg:
+                args.append(theta_lcb)
+
+        if args:
+            return values, args  # reached only when arg is true and xtest has rows
+        else:
+            return values
+
+    def ucb(self, xtest):
+        pass
+
+    def std_opt(self, xtest):
+        N = xtest.size()[0]  # NOTE(review): looks unfinished — builds expressions but never solves or returns
+        for i in range(N):
+            x = xtest[i, :]
+            theta = cp.Variable(self.n * self.no_models)
+            M = torch.block_diag(self.Ks)
+            cp.norm(theta, p=2) * theta[i]
+
+    def std_fixed(self, xtest):
+        K_star, K_star_star = self.execute(xtest)
+        self.B = torch.t(torch.linalg.solve(self.K, torch.t(K_star)))  # B = (K^{-1} K_star^T)^T
+        first = torch.diag(K_star_star).view(-1, 1)  # diagonal of the test/test matrix
+        second = torch.einsum("ij,ji->i", (self.B, torch.t(K_star))).view(-1, 1)  # diag(B @ K_star^T)
+        variance = first - second
+        ystd = torch.sqrt(variance)
+        return ystd
+
+    def sample(self, xtest, size=1):
+        pass
+
 
 if __name__ == "__main__":
-	from stpy.continuous_processes.gauss_procc import GaussianProcess
-	from stpy.helpers.helper import interval_torch
-	import matplotlib.pyplot as plt
-	n = 512
-	N = 5
-	s = 0.1
-	d = 1
-
-	xtest = interval_torch(n,d)
-	x = interval_torch(N,d)
-
-	kernel1 = KernelFunction(gamma = 0.05)
-	kernel2 = KernelFunction(kernel_name="polynomial", power = 5)
-	kernel3 = KernelFunction(kernel_name="polynomial", power=3)
-	kernel4 = KernelFunction(kernel_name="polynomial", power=2)
-	kernel5 = KernelFunction(kernel_name="polynomial", power=1)
-	kernel6 = KernelFunction(kernel_name="polynomial", power=1)
-
-	kernels = [kernel1, kernel2,kernel3, kernel4, kernel5, kernel6]
-
-	GP = GaussianProcess(kernel=kernel1)
-	torch.manual_seed(2)
-	y = GP.sample(x)
-
-	# sup inverse barrier
-	for lam in [0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.99,0.9999]:
-		regularizer = SupRegularizer(d = len(kernels), lam = lam, constrained=True, version='1')
-		mkl = MultipleKernelLearner(kernels, regularizer= regularizer)
-		mkl.fit_gp(x,y)
-		mkl.visualize(xtest, size = 0, show = False, fig = False, color = 'tab:blue', label = " sup:"+str(lam))
-		regularizer = SupRegularizer(d=len(kernels), lam=lam, constrained=True, version='2')
-		mkl = MultipleKernelLearner(kernels, regularizer=regularizer)
-		mkl.fit_gp(x, y)
-		mkl.visualize(xtest, size=0, show=False, fig=False, color='tab:green', label=" sup:" + str(lam))
-
-	# dirichlet mixture
-	regularizer = DirichletRegularizer(d=len(kernels), lam=lam, constrained=True)
-	mkl = MultipleKernelLearner(kernels, regularizer=regularizer)
-	mkl.fit_gp(x, y)
-	mkl.visualize(xtest, size=0, show=False, fig=False, color='tab:red', label = " dirichlet")
-
-	# no regularizer
-	mkl = MultipleKernelLearner(kernels, regularizer=None)
-	mkl.fit_gp(x, y)
-	mkl.visualize(xtest, size=0, show=False, fig=False, color='tab:orange', label = " no")
-
-	plt.show()
+    from stpy.continuous_processes.gauss_procc import GaussianProcess
+    from stpy.helpers.helper import interval_torch
+    import matplotlib.pyplot as plt
+
+    n = 512
+    N = 5
+    s = 0.1
+    d = 1
+
+    xtest = interval_torch(n, d)
+    x = interval_torch(N, d)
+
+    kernel1 = KernelFunction(gamma=0.05)
+    kernel2 = KernelFunction(kernel_name="polynomial", power=5)
+    kernel3 = KernelFunction(kernel_name="polynomial", power=3)
+    kernel4 = KernelFunction(kernel_name="polynomial", power=2)
+    kernel5 = KernelFunction(kernel_name="polynomial", power=1)
+    kernel6 = KernelFunction(kernel_name="polynomial", power=1)
+
+    kernels = [kernel1, kernel2, kernel3, kernel4, kernel5, kernel6]
+
+    GP = GaussianProcess(kernel=kernel1)
+    torch.manual_seed(2)
+    y = GP.sample(x)
+
+    # sup inverse barrier
+    for lam in [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.9999]:
+        regularizer = SupRegularizer(
+            d=len(kernels), lam=lam, constrained=True, version="1"
+        )
+        mkl = MultipleKernelLearner(kernels, regularizer=regularizer)
+        mkl.fit_gp(x, y)
+        mkl.visualize(
+            xtest,
+            size=0,
+            show=False,
+            fig=False,
+            color="tab:blue",
+            label=" sup:" + str(lam),
+        )
+        regularizer = SupRegularizer(
+            d=len(kernels), lam=lam, constrained=True, version="2"
+        )
+        mkl = MultipleKernelLearner(kernels, regularizer=regularizer)
+        mkl.fit_gp(x, y)
+        mkl.visualize(
+            xtest,
+            size=0,
+            show=False,
+            fig=False,
+            color="tab:green",
+            label=" sup:" + str(lam),
+        )
+
+    # dirichlet mixture
+    regularizer = DirichletRegularizer(d=len(kernels), lam=lam, constrained=True)
+    mkl = MultipleKernelLearner(kernels, regularizer=regularizer)
+    mkl.fit_gp(x, y)
+    mkl.visualize(
+        xtest, size=0, show=False, fig=False, color="tab:red", label=" dirichlet"
+    )
+
+    # no regularizer
+    mkl = MultipleKernelLearner(kernels, regularizer=None)
+    mkl.fit_gp(x, y)
+    mkl.visualize(xtest, size=0, show=False, fig=False, color="tab:orange", label=" no")
+
+    plt.show()
diff --git a/stpy/continuous_processes/mkl_features.py b/stpy/continuous_processes/mkl_features.py
index 8b690c0..42b6868 100755
--- a/stpy/continuous_processes/mkl_features.py
+++ b/stpy/continuous_processes/mkl_features.py
@@ -11,188 +11,196 @@
 
 class MKL(Estimator):
 
-	def __init__(self, embeddings, init_weights=None, lam=0.0, s=0.1):
-		self.embeddings = embeddings
-		self.init_weights = init_weights
-		self.no_models = len(embeddings)
-		self.s = s
-		self.lam = lam
-		if self.init_weights is None:
-			self.init_weights = torch.ones(self.no_models)
-		self.weights = self.init_weights
-		if not isinstance(self.lam, list):
-			self.lam = [lam for i in range(self.no_models)]
-
-	def get_emebed_dims(self):
-		self.dims = []
-		for embedding in self.embeddings:
-			self.dims.append(embedding.get_basis_size())
-		return self.dims
-
-	def total_embed_dim(self):
-		sum = np.sum(self.get_emebed_dims())
-		return sum
-
-	def fit_gp(self, x, y):
-
-		self.x = x
-		self.y = y
-		(self.n, self.d) = self.x.size()
-		self.total_m = self.total_embed_dim()
-
-		self.Reggr = KernelizedFeatures(embeding=self, m=self.total_m, d=d, s=self.s)
-		self.Reggr.fit_gp(x, y)
-
-	# def mean_vector(self):
-	# 	theta = torch.zeros(size = (self.total_embed_dim()))
-	# 	dims_index = torch.cumsum(torch.Tensor([0] + self.get_emebed_dims()),dim = 0).int()
-	# 	for index, emb in enumerate(self.embeddings):
-	# 		theta_small = emb.sample_theta()
-	# 		theta[dims_index[index]:dims_index[index + 1]] = theta_small.view(-1)
-	# 	return theta
-
-	def mean_vector(self):
-		return self.Reggr.theta_mean()
-
-	def mean_var(self, xtest):
-		# mu_avg = torch.zeros(size = (xtest.size()[0],1),dtype = torch.float64)
-		# var_avg = torch.zeros(size = (xtest.size()[0],1),dtype = torch.float64)
-		#
-		# for index, emb in enumerate(self.embeddings):
-		# 	mu,var = emb.mean_var(xtest)
-		# 	mu_avg = mu_avg + self.weights[index]*mu
-		# 	var_avg = var_avg + self.weights[index]*var
-		# return [mu_avg,var_avg]
-
-		return self.Reggr.mean_std(xtest)
-
-	def sample(self, xtest, size=1):
-		# sample_avg = torch.zeros(size = (xtest.size()[0],1),dtype = torch.float64)
-		#
-		# for index, emb in enumerate(self.embeddings):
-		# 	sample = emb.sample(xtest, size = size)
-		# 	sample_avg = sample_avg + self.weights[index]*sample
-		return self.Reggr.sample(xtest, size=size)
-
-	def embed(self, xtest):
-		n = xtest.size()[0]
-		Phi = torch.zeros(size=(n, int(self.total_embed_dim())), dtype=torch.float64)
-		dims_index = torch.cumsum(torch.Tensor([0] + self.get_emebed_dims()), dim=0).int()
-
-		for index, embedding in enumerate(self.embeddings):
-			Phi[:, dims_index[index]:dims_index[index + 1]] = embedding.embed_internal(xtest)
-
-		return Phi
-
-	def selector_matrix(self):
-		dims = []
-		for embedding in self.embeddings:
-			dims.append(embedding.get_basis_size())
-		total_dim = self.total_embed_dim()
-		selector = torch.zeros(size=(int(total_dim), self.no_models), dtype=torch.float64)
-		z = 0
-		for i in range(len(self.embeddings)):
-			selector[z:z + dims[i], i] = 1.0
-			z = z + dims[i]
-		return torch.t(selector)
-
-	###
-	def evaluate_design(self, C, Phi):
-		n = Phi.size()[0]
-
-		A = torch.lstsq(torch.t(C), torch.t(Phi))[0]
-		B = torch.t(A[0:n, :])
-
-		delta = torch.norm(B @ Phi - C, p=2)  # /torch.norm(B, p = 2) #relative error
-
-		pinv = torch.pinverse(torch.t(Phi) @ Phi)
-		W = C @ pinv @ torch.t(C)
-
-		rank = torch.matrix_rank(B)
-		lambda_max = torch.symeig(W)[0][-1]  # largest eigenvalue
-
-		upper_bound = lambda_max * (self.s * self.s * 2 + delta)
-
-		return [upper_bound.detach(), rank]
-
-	def acquisiton_function(self, C, Phi, candidates):
-		values = []
-		ranks = []
-		for candidate_point in candidates:
-			newPhi = torch.cat((Phi, candidate_point.view(1, -1)))
-			values.append(self.evaluate_design(C, newPhi)[0])
-			ranks.append(self.evaluate_design(C, newPhi)[1])
-
-		return [torch.Tensor(values), torch.Tensor(ranks)]
+    def __init__(self, embeddings, init_weights=None, lam=0.0, s=0.1):
+        self.embeddings = embeddings
+        self.init_weights = init_weights
+        self.no_models = len(embeddings)
+        self.s = s
+        self.lam = lam
+        if self.init_weights is None:
+            self.init_weights = torch.ones(self.no_models)
+        self.weights = self.init_weights
+        if not isinstance(self.lam, list):
+            self.lam = [lam for i in range(self.no_models)]
+
+    def get_emebed_dims(self):
+        self.dims = []
+        for embedding in self.embeddings:
+            self.dims.append(embedding.get_basis_size())
+        return self.dims
+
+    def total_embed_dim(self):
+        sum = np.sum(self.get_emebed_dims())
+        return sum
+
+    def fit_gp(self, x, y):
+        """Fit one KernelizedFeatures regressor on the concatenated embedding of all models."""
+        self.x = x
+        self.y = y
+        (self.n, self.d) = self.x.size()
+        self.total_m = self.total_embed_dim()
+        # use the input dimension read from x; a bare ``d`` is not defined in this method
+        self.Reggr = KernelizedFeatures(embeding=self, m=self.total_m, d=self.d, s=self.s)
+        self.Reggr.fit_gp(x, y)
+
+    # def mean_vector(self):
+    # 	theta = torch.zeros(size = (self.total_embed_dim()))
+    # 	dims_index = torch.cumsum(torch.tensor([0] + self.get_emebed_dims()),dim = 0).int()
+    # 	for index, emb in enumerate(self.embeddings):
+    # 		theta_small = emb.sample_theta()
+    # 		theta[dims_index[index]:dims_index[index + 1]] = theta_small.view(-1)
+    # 	return theta
+
+    def mean_vector(self):
+        return self.Reggr.theta_mean()
+
+    def mean_var(self, xtest):
+        # mu_avg = torch.zeros(size = (xtest.size()[0],1),dtype = torch.float64)
+        # var_avg = torch.zeros(size = (xtest.size()[0],1),dtype = torch.float64)
+        #
+        # for index, emb in enumerate(self.embeddings):
+        # 	mu,var = emb.mean_var(xtest)
+        # 	mu_avg = mu_avg + self.weights[index]*mu
+        # 	var_avg = var_avg + self.weights[index]*var
+        # return [mu_avg,var_avg]
+
+        return self.Reggr.mean_std(xtest)
+
+    def sample(self, xtest, size=1):
+        # sample_avg = torch.zeros(size = (xtest.size()[0],1),dtype = torch.float64)
+        #
+        # for index, emb in enumerate(self.embeddings):
+        # 	sample = emb.sample(xtest, size = size)
+        # 	sample_avg = sample_avg + self.weights[index]*sample
+        return self.Reggr.sample(xtest, size=size)
+
+    def embed(self, xtest):
+        n = xtest.size()[0]
+        Phi = torch.zeros(size=(n, int(self.total_embed_dim())), dtype=torch.float64)
+        dims_index = torch.cumsum(
+            torch.tensor([0] + self.get_emebed_dims()), dim=0
+        ).int()
+
+        for index, embedding in enumerate(self.embeddings):
+            Phi[:, dims_index[index] : dims_index[index + 1]] = (
+                embedding.embed_internal(xtest)
+            )
+
+        return Phi
+
+    def selector_matrix(self):
+        dims = []
+        for embedding in self.embeddings:
+            dims.append(embedding.get_basis_size())
+        total_dim = self.total_embed_dim()
+        selector = torch.zeros(
+            size=(int(total_dim), self.no_models), dtype=torch.float64
+        )
+        z = 0
+        for i in range(len(self.embeddings)):
+            selector[z : z + dims[i], i] = 1.0
+            z = z + dims[i]
+        return torch.t(selector)
+
+    ###
+    def evaluate_design(self, C, Phi):
+        """Return [upper_bound, rank] for the design matrix Phi with respect to C."""
+        # torch.lstsq / matrix_rank / symeig are gone from current PyTorch: use torch.linalg
+        # least-squares solution X of Phi^T X = C^T (no zero-padding rows to slice off)
+        A = torch.linalg.lstsq(torch.t(Phi), torch.t(C)).solution
+        B = torch.t(A)
+
+        delta = torch.norm(B @ Phi - C, p=2)  # /torch.norm(B, p = 2) #relative error
+        pinv = torch.pinverse(torch.t(Phi) @ Phi)
+        W = C @ pinv @ torch.t(C)
+
+        rank = torch.linalg.matrix_rank(B)
+        lambda_max = torch.linalg.eigvalsh(W)[-1]  # largest eigenvalue (ascending order)
+
+        upper_bound = lambda_max * (self.s * self.s * 2 + delta)
+
+        return [upper_bound.detach(), rank]
+
+    def acquisiton_function(self, C, Phi, candidates):
+        """Evaluate the design with each candidate row appended to Phi; return [values, ranks]."""
+        values, ranks = [], []
+        for candidate_point in candidates:
+            newPhi = torch.cat((Phi, candidate_point.view(1, -1)))
+            value, rank = self.evaluate_design(C, newPhi)  # evaluate once per candidate
+            values.append(value)
+            ranks.append(rank)
+        return [torch.tensor(values), torch.tensor(ranks)]
 
 
 if __name__ == "__main__":
 
-	n = 16
-	N = 4
-	s = 0.00000001
-	d = 1
-	TestFunction = MultiRKHS()
-	xtest = TestFunction.interval(n)
-	x = TestFunction.initial_guess(N)
-	y = TestFunction.eval(x, sigma=s)
-	bounds = TestFunction.bounds()
-
-	p = 2
-	embedding2 = PolynomialEmbedding(d, p, groups=None)
-	GP1 = KernelizedFeatures(embeding=embedding2, m=embedding2.size, d=d, s=s,
-							 groups=None, bounds=bounds)
-
-	map = lambda x: torch.abs(x)
-	embedding3 = CustomEmbedding(d, map, 1, groups=None)
-
-	GP2 = KernelizedFeatures(embeding=embedding3, m=embedding3.size, d=d, s=s,
-							 groups=None, bounds=bounds)
-
-	m = 2
-	gamma = 0.2
-	GP3 = GaussianProcessFF(d=d, s=s, m=m, gamma=gamma, bounds=bounds, groups=None)
-	GP4 = GaussianProcessFF(d=d, s=s, m=m, gamma=gamma, bounds=bounds, groups=None)
-
-	MKL = MKL([GP1, GP2], s=s)
-
-	C = MKL.selector_matrix()
-	Candidates = MKL.embed(xtest)
-	eps = 1
-	N = 1
-	x = TestFunction.initial_guess(N)
-
-	plt.close('all')
-
-	while eps > 10e-3:
-		# print (x,eps)
-		Phi = MKL.embed(x)
-		# print (C.size(), Phi.size())
-		print(N, MKL.evaluate_design(C, Phi))
-		eps = MKL.evaluate_design(C, Phi)[0]
-		# N = N + 1
-		score, rank = MKL.acquisiton_function(C, Phi, Candidates)
-		score = score + 1. / (rank - 1)
-		index_min = torch.argmin(score)
-		x_min = xtest[index_min]
-
-		plt.plot(xtest.numpy(), torch.log(score).numpy(), 'g')
-		plt.plot(xtest.numpy(), rank.numpy(), 'r--')
-		plt.plot(x, x * 0, 'ro')
-		plt.plot(xtest[index_min].numpy(), torch.log(score[index_min]).numpy(), 'go')
-		plt.show()
-
-		x = torch.cat((x, x_min.view(1, -1)))
-
-	y = TestFunction.eval(x, sigma=s)
-	print(x)
-	print(y)
-
-	MKL.fit_gp(x, y)
-	print("Projection:")
-	print("--------------")
-	print(C @ MKL.mean_vector())
-	print("--------------")
-
-	MKL.visualize(xtest, f_true=TestFunction.eval_noiseless)
-	plt.show()
+    n = 16
+    N = 4
+    s = 0.00000001
+    d = 1
+    TestFunction = MultiRKHS()
+    xtest = TestFunction.interval(n)
+    x = TestFunction.initial_guess(N)
+    y = TestFunction.eval(x, sigma=s)
+    bounds = TestFunction.bounds()
+
+    p = 2
+    embedding2 = PolynomialEmbedding(d, p, groups=None)
+    GP1 = KernelizedFeatures(
+        embeding=embedding2, m=embedding2.size, d=d, s=s, groups=None, bounds=bounds
+    )
+
+    map = lambda x: torch.abs(x)
+    embedding3 = CustomEmbedding(d, map, 1, groups=None)
+
+    GP2 = KernelizedFeatures(
+        embeding=embedding3, m=embedding3.size, d=d, s=s, groups=None, bounds=bounds
+    )
+
+    m = 2
+    gamma = 0.2
+    GP3 = GaussianProcessFF(d=d, s=s, m=m, gamma=gamma, bounds=bounds, groups=None)
+    GP4 = GaussianProcessFF(d=d, s=s, m=m, gamma=gamma, bounds=bounds, groups=None)
+
+    MKL = MKL([GP1, GP2], s=s)
+
+    C = MKL.selector_matrix()
+    Candidates = MKL.embed(xtest)
+    eps = 1
+    N = 1
+    x = TestFunction.initial_guess(N)
+
+    plt.close("all")
+
+    while eps > 10e-3:
+        # print (x,eps)
+        Phi = MKL.embed(x)
+        # print (C.size(), Phi.size())
+        print(N, MKL.evaluate_design(C, Phi))
+        eps = MKL.evaluate_design(C, Phi)[0]
+        # N = N + 1
+        score, rank = MKL.acquisiton_function(C, Phi, Candidates)
+        score = score + 1.0 / (rank - 1)
+        index_min = torch.argmin(score)
+        x_min = xtest[index_min]
+
+        plt.plot(xtest.numpy(), torch.log(score).numpy(), "g")
+        plt.plot(xtest.numpy(), rank.numpy(), "r--")
+        plt.plot(x, x * 0, "ro")
+        plt.plot(xtest[index_min].numpy(), torch.log(score[index_min]).numpy(), "go")
+        plt.show()
+
+        x = torch.cat((x, x_min.view(1, -1)))
+
+    y = TestFunction.eval(x, sigma=s)
+    print(x)
+    print(y)
+
+    MKL.fit_gp(x, y)
+    print("Projection:")
+    print("--------------")
+    print(C @ MKL.mean_vector())
+    print("--------------")
+
+    MKL.visualize(xtest, f_true=TestFunction.eval_noiseless)
+    plt.show()
diff --git a/stpy/continuous_processes/nystrom_fea.py b/stpy/continuous_processes/nystrom_fea.py
index a209c47..fa30fff 100755
--- a/stpy/continuous_processes/nystrom_fea.py
+++ b/stpy/continuous_processes/nystrom_fea.py
@@ -1,347 +1,460 @@
 import matplotlib.pyplot as plt
-from scipy.interpolate import LinearNDInterpolator
+from scipy.interpolate import LinearNDInterpolator, NearestNDInterpolator
 from scipy.interpolate import interp1d
 
 from stpy.continuous_processes.gauss_procc import GaussianProcess
 from stpy.embeddings.embedding import *
 from stpy.helpers.helper import *
+from stpy.helpers.posterior_sampling import tmg
 from stpy.kernels import KernelFunction
 
 
 class NystromFeatures(Embedding):
-	'''
-		Nystrom Features for Gaussian Kernel
-	'''
-
-	def __init__(self, kernel_object, m=100, approx="uniform", s=1., samples=100):
-
-		self.fit = False
-		self.m = m
-		try:
-			self.ms = int(torch.sum(m))
-		except:
-			self.ms = m
-		self.samples = samples
-		self.kernel_object = kernel_object
-		self.kernel = kernel_object.kernel
-		self.approx = approx
-		self.s = s
-
-	def description(self):
-		"""
-		Description of GP in text
-		:return: string with description
-		"""
-		return "Nystrom\n" + "Appprox: " + self.approx
-
-	def subsample(self, x, y):
-		if self.approx == "uniform":
-			C, w = self.uniform_subsampling(x, y)
-		elif self.approx == "leverage":
-			C, w = self.leverage_score_subsampling(x, y)
-		elif self.approx == "online_leverage":
-			C, w = self.sequential_leverage_score_subsampling(x, y)
-		return (C, w)
-
-	def uniform_subsampling(self, x, y):
-		N = x.size()[0]
-		C = np.random.choice(N, int(self.ms))
-		weights = torch.ones(self.ms)
-		return (C, weights)
-
-	def leverage_score_subsampling(self, x, y):
-		N = x.size()[0]
-		from stpy.continuous_processes.gauss_procc import GaussianProcess
-		GP = GaussianProcess(kernel_custom=self.kernel_object, s=self.s)
-		GP.fit_gp(x, y)
-		mean, leverage_scores = GP.mean_std(x)
-		weights = torch.ones(self.ms)
-
-		args = [0]
-		size = 1
-
-		for j in range(N):
-			point = x[j, :]
-			if size < self.ms:
-				leverage_score = float(leverage_scores[j, :])
-				q = np.random.binomial(self.ms, leverage_score)
-				if q > 0:
-					args.append(j)
-					weights[size] = (q / float(self.ms)) / leverage_score
-					size = size + 1
-				else:
-					pass
-
-		print(args, weights)
-		return (args, weights)
-
-	def sequential_leverage_score_subsampling(self, x, y):
-		N = x.size()[0]
-		d = x.size()[1]
-		from stpy.continuous_processes.gauss_procc import GaussianProcess
-		GP = GaussianProcess(kernel_custom=self.kernel_object, s=self.s)
-
-		dts = torch.zeros(self.ms, d, dtype=torch.float64)
-		dts[0, :] = x[0, :]
-		args = [0]
-		size = 1
-		weights = torch.ones(self.ms)
-
-		for j in range(N):
-			point = x[j, :]
-			# print (size,x.size())
-			if size < self.ms:
-				GP.fit_gp(dts[0:size, :], y[0:size, :])
-				mean, leverage_score = GP.mean_std(point.view(1, d))
-				q = np.random.binomial(self.ms, float(leverage_score))
-				if q > 0:
-					args.append(j)
-					dts[size, :] = point
-					weights[size] = (q / float(self.ms)) / leverage_score
-					size = size + 1
-				else:
-					pass
-		return (args, weights)
-
-	def fit_gp(self, x, y, eps=1e-14):
-		'''
-			Function to Fit GP
-		'''
-		self.x = x
-		self.y = y
-		self.d = x.size()[1]
-		self.N = x.size()[0]
-		assert (self.ms <= self.N)
-		self.linear_kernel = KernelFunction(kernel_name="linear").linear_kernel
-		if self.approx == "svd":
-			self.xs = x
-			K = self.kernel(x, x)
-			if 3 * self.ms > self.N:
-				(D, V) = torch.linalg.eigh(K, UPLO='U')
-				V = torch.t(V)[self.N - self.ms:self.N, :].T
-				D = D[self.N - self.ms:self.N]
-				D[D <= eps] = 0
-
-			else:
-				(D, V) = torch.lobpcg(K, k=self.ms, niter=-1)
-
-			# Dinv = torch.diag(1./D[self.N-self.ms:self.N])
-			# Dinv[Dinv <=0 ] = 0
-			# Dinv = torch.sqrt(Dinv)
-			self.eigs = D
-			Dinv = torch.diag(torch.sqrt(1. / D))
-			# self.M = (torch.t(V)[self.N-self.ms:self.N,:]).T @ Dinv.T
-			self.M = V @ Dinv
-			# self.embed = lambda q: torch.t(torch.mm(Dinv, torch.mm(torch.t(V)[self.N-self.ms:self.N,:], self.kernel(q, self.x)   )))
-			self.embed = lambda q: self.kernel(q, self.xs).T @ self.M
-			self.C = []
-		elif self.approx == 'nothing':
-			self.xs = self.x[0:self.ms, :]
-			self.M = torch.eye(self.ms).double()
-			self.embed = lambda q: self.kernel(q, self.xs).T @ self.M
-
-		elif self.approx == 'positive_svd':
-			from sklearn.decomposition import NMF
-			GP = GaussianProcess(kernel=self.kernel_object)
-			ysample = GP.sample(x, size=self.samples) ** 2
-			X = ysample
-			model = NMF(n_components=self.ms, max_iter=8000, tol=1e-12)
-			W = torch.from_numpy(model.fit_transform(X))
-			H = torch.from_numpy(model.components_)
-			l = torch.norm(W, dim=1)
-			l = 1. / l
-
-			if x.size()[1] == 1:
-				fs = []
-				for j in range(self.ms):
-					fs.append(interp1d(x.view(-1).numpy(), (W.T @ torch.diag(l))[j, :].numpy()))
-				self.embed = lambda q: torch.cat([torch.from_numpy(fs[j](q)).view(-1, 1) for j in range(self.ms)],
-												 dim=1)
-
-			elif x.size()[1] == 2:
-				fs = []
-				for j in range(self.ms):
-					W_j = (W.T @ torch.diag(l))[j, :].numpy()
-					fs.append(LinearNDInterpolator(x, W_j))
-				self.embed = lambda q: torch.cat(
-					[torch.from_numpy(fs[j](q[:, 0], q[:, 1])).view(-1, 1) for j in range(self.ms)], dim=1)
-			# elif x.size()[1] == 2:
-			# 	fs = []
-			# 	for j in range(self.ms):
-			# 		W_j = (W.T @ torch.diag(l))[j, :].numpy()
-			# 		fs.append(Rbf(x[:,0],x[:,1], W_j))
-			# 	self.embed = lambda q: torch.cat([torch.from_numpy(fs[j](q[:,0],q[:,1])).view(-1, 1) for j in range(self.ms)],
-			# 									 dim=1)
-
-			self.C = []
-
-		elif self.approx == "cover":
-			K = self.kernel(x, x)  # + self.s * self.s * torch.eye(self.N, dtype=torch.float64)
-			Khalf = torch.from_numpy(np.real(scipy.linalg.sqrtm(K.numpy())))
-			Khalfinv = torch.pinverse(Khalf)
-			self.embed = lambda q: torch.t(
-				torch.mm(Khalfinv, self.kernel(q, self.x)))
-		else:
-			self.C, self.weights = self.subsample(x, y)
-			xs = x[self.C, :]
-			self.Dweights = torch.diag(self.weights).double()
-			K = torch.mm(torch.mm(self.Dweights, self.kernel(xs, xs)),
-						 self.Dweights)  # + self.s*self.s * torch.eye(self.ms, dtype=torch.float64)
-			#(D, V) = torch.symeig(K, eigenvectors=True)
-			(D, V) = torch.linalg.eigh(K)
-			Dinv = torch.diag(1. / D)
-			Dinv[Dinv <= 0] = 0
-			Dinv = torch.sqrt(Dinv)
-			# Dinv = torch.diag(torch.pow(D[:],-0.5))
-			self.embed = lambda q: torch.t(
-				torch.mm(Dinv, torch.mm(torch.t(V), torch.mm(self.Dweights, self.kernel(q, xs)))))
-		# self.embed = lambda x: torch.t(torch.mm(torch.sqrt(Dinv),torch.mm(V, self.kernel(x, xs))))
-		embeding = self.embed(x)
-		self.Z_ = embeding.T @ embeding + self.s * self.s * torch.eye(self.ms).double()
-
-		# self.K = (self.Z_ + self.s * self.s * torch.eye(self.ms, dtype=torch.float64))
-		self.K = self.Z_
-		self.Q = torch.t(embeding)
-
-		self.fit = True
-		return None
-
-	def mean_std(self, xtest):
-		if self.fit == False:
-			raise AssertionError("First fit")
-		else:
-			embeding = self.embed(xtest)
-			Q = self.embed(self.x)
-			theta_mean, _ = torch.solve(torch.mm(torch.t(Q), self.y), self.K)
-			ymean = torch.mm(embeding, theta_mean)
-			temp = torch.t(torch.solve(torch.t(embeding), self.K)[0])
-			diagonal = self.s * self.s * torch.einsum('ij,ji->i', (temp, torch.t(embeding))).view(-1, 1)
-			yvar = torch.sqrt(diagonal)
-
-		return (ymean, yvar)
-
-	def outer_kernel(self):
-		embeding = self.embed(self.x)
-		# print (embeding.size())
-		K = torch.mm(embeding, torch.t(embeding))
-		# Z = self.linear_kernel(embeding, (embeding))
-		K = (K + self.s * self.s * torch.eye(self.N, dtype=torch.float64))
-		# K = self.kernel(self.x,self.x) + self.s*self.s*torch.eye(self.N, dtype=torch.float64)
-		# print ("kernel:",K)
-		# print ("approximate:",Z)
-		return K
-
-	def sample_theta(self, size=1):
-		basis = int(int(torch.sum(self.m)))
-		zeros = torch.zeros(basis, size, dtype=torch.float64)
-		random_vector = torch.normal(mean=zeros, std=1.)
-
-		if self.fit == True:
-			# random vector
-			Z = torch.pinverse(self.K, rcond=10e-6)
-			self.L = torch.cholesky(Z, upper=False)
-			theta_mean = torch.mm(Z, torch.mm(self.Q, self.y))
-			theta = torch.mm(self.s * self.L, random_vector)
-			theta = theta + theta_mean
-		else:
-			theta_mean = 0
-			Z = (1. + self.s * self.s) * torch.eye(basis, dtype=torch.float64)
-			L = torch.cholesky(Z, upper=False)
-			theta = torch.mm(L, random_vector) + theta_mean
-		return theta
-
-	def sample(self, xtest, size=1):
-		'''
-			Sample functions from Gaussian Process
-		'''
-		theta = self.sample_theta(size=size)
-		f = torch.mm(self.embed(xtest), theta)
-		return f
-
-	def visualize(self, xtest, f_true=None, points=True, show=True):
-		[mu, std] = self.mean_std(xtest)
-		if self.d == 1:
-
-			plt.figure(figsize=(15, 7))
-			plt.clf()
-			plt.plot(self.x.numpy(), self.y.numpy(), 'r+', ms=10, marker="o")
-			plt.plot(self.x[self.C, :].numpy(), self.y[self.C, :].numpy(), 'g+', ms=10, marker="o")
-			# plt.plot(xtest.numpy(), self.sample(xtest, size=2).numpy(), 'k--', lw=2, label="sample")
-			plt.fill_between(xtest.numpy().flat, (mu - 2 * std).numpy().flat, (mu + 2 * std).numpy().flat,
-							 color="#dddddd")
-			if f_true is not None:
-				plt.plot(xtest.numpy(), f_true(xtest).numpy(), 'b-', lw=2)
-			plt.plot(xtest.numpy(), mu.numpy(), 'r-', lw=2, label="posterior mean")
-			plt.title('Posterior mean prediction plus 2 st.deviation')
-			plt.legend()
-			if show == True:
-				plt.show()
-
-		elif self.d == 2:
-			from scipy.interpolate import griddata
-			plt.figure(figsize=(15, 7))
-			plt.clf()
-			ax = plt.axes(projection='3d')
-			xx = xtest[:, 0].numpy()
-			yy = xtest[:, 1].numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), mu[:, 0].numpy(), (grid_x, grid_y), method='linear')
-			if f_true is not None:
-				grid_z = griddata((xx, yy), f_true(xtest)[:, 0].numpy(), (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z, color='b', alpha=0.4)
-			if points == True:
-				ax.scatter(self.x[:, 0].numpy(), self.x[:, 1].numpy(), self.y[:, 0].numpy(), c='r', s=100, marker="o",
-						   depthshade=False)
-			ax.plot_surface(grid_x, grid_y, grid_z_mu, color='r', alpha=0.4)
-			plt.title('Posterior mean prediction plus 2 st.deviation')
-			plt.show()
-
-		else:
-			print("Visualization not implemented")
+    """
+    Nystrom Features for Gaussian Kernel
+    """
+
+    def __init__(
+        self, kernel_object, m=100, approx="uniform", s=1.0, samples=100, fast=False
+    ):
+        """
+        fast, optional
+            If it is true, the samples from the truncated Gaussian are approximated by squared samples of a Gaussian, by default False
+        """
+
+        self.fit = False
+        self.m = m
+        try:
+            self.ms = int(torch.sum(m))
+        except:
+            self.ms = m
+        self.samples = samples
+        self.kernel_object = kernel_object
+        self.kernel = kernel_object.kernel
+        self.approx = approx
+        self.s = s
+        self.fast = fast
+
+    def description(self):
+        """
+        Description of GP in text
+        :return: string with description
+        """
+        return "Nystrom\n" + "Appprox: " + self.approx
+
+    def subsample(self, x, y):
+        if self.approx == "uniform":
+            C, w = self.uniform_subsampling(x, y)
+        elif self.approx == "leverage":
+            C, w = self.leverage_score_subsampling(x, y)
+        elif self.approx == "online_leverage":
+            C, w = self.sequential_leverage_score_subsampling(x, y)
+        return (C, w)
+
+    def uniform_subsampling(self, x, y):
+        N = x.size()[0]
+        C = np.random.choice(N, int(self.ms))
+        weights = torch.ones(self.ms)
+        return (C, weights)
+
+    def leverage_score_subsampling(self, x, y):
+        N = x.size()[0]
+        from stpy.continuous_processes.gauss_procc import GaussianProcess
+
+        GP = GaussianProcess(kernel_custom=self.kernel_object, s=self.s)
+        GP.fit_gp(x, y)
+        mean, leverage_scores = GP.mean_std(x)
+        weights = torch.ones(self.ms)
+
+        args = [0]
+        size = 1
+
+        for j in range(N):
+            point = x[j, :]
+            if size < self.ms:
+                leverage_score = float(leverage_scores[j, :])
+                q = np.random.binomial(self.ms, leverage_score)
+                if q > 0:
+                    args.append(j)
+                    weights[size] = (q / float(self.ms)) / leverage_score
+                    size = size + 1
+                else:
+                    pass
+
+        print(args, weights)
+        return (args, weights)
+
+    def sequential_leverage_score_subsampling(self, x, y):
+        N = x.size()[0]
+        d = x.size()[1]
+        from stpy.continuous_processes.gauss_procc import GaussianProcess
+
+        GP = GaussianProcess(kernel_custom=self.kernel_object, s=self.s)
+
+        dts = torch.zeros(self.ms, d, dtype=torch.float64)
+        dts[0, :] = x[0, :]
+        args = [0]
+        size = 1
+        weights = torch.ones(self.ms)
+
+        for j in range(N):
+            point = x[j, :]
+            # print (size,x.size())
+            if size < self.ms:
+                GP.fit_gp(dts[0:size, :], y[0:size, :])
+                mean, leverage_score = GP.mean_std(point.view(1, d))
+                q = np.random.binomial(self.ms, float(leverage_score))
+                if q > 0:
+                    args.append(j)
+                    dts[size, :] = point
+                    weights[size] = (q / float(self.ms)) / leverage_score
+                    size = size + 1
+                else:
+                    pass
+        return (args, weights)
+
+    def fit_gp(self, x, y, eps=1e-14):
+        """
+        Function to Fit GP
+        """
+        self.x = x
+        self.y = y
+        self.d = x.size()[1]
+        self.N = x.size()[0]
+        assert self.ms <= self.N
+        self.linear_kernel = KernelFunction(kernel_name="linear").linear_kernel
+        if self.approx == "svd":
+            self.xs = x
+            K = self.kernel(x, x)
+            if 3 * self.ms > self.N:
+                (D, V) = torch.linalg.eigh(K, UPLO="U")
+                V = torch.t(V)[self.N - self.ms : self.N, :].T
+                D = D[self.N - self.ms : self.N]
+                D[D <= eps] = 0
+
+            else:
+                (D, V) = torch.lobpcg(K, k=self.ms, niter=-1)
+
+            # Dinv = torch.diag(1./D[self.N-self.ms:self.N])
+            # Dinv[Dinv <=0 ] = 0
+            # Dinv = torch.sqrt(Dinv)
+            self.eigs = D
+            Dinv = torch.diag(torch.sqrt(1.0 / D))
+            # self.M = (torch.t(V)[self.N-self.ms:self.N,:]).T @ Dinv.T
+            self.M = V @ Dinv
+            # self.embed = lambda q: torch.t(torch.mm(Dinv, torch.mm(torch.t(V)[self.N-self.ms:self.N,:], self.kernel(q, self.x)   )))
+            self.embed = lambda q: self.kernel(q, self.xs).T @ self.M
+            self.C = []
+        elif self.approx == "nothing":
+            self.xs = self.x[0 : self.ms, :]
+            self.M = torch.eye(self.ms).double()
+            self.embed = lambda q: self.kernel(q, self.xs).T @ self.M
+
+        elif self.approx == "positive_svd":
+            from sklearn.decomposition import NMF
+
+            if self.fast:
+                GP = GaussianProcess(kernel=self.kernel_object)
+                ysample = GP.sample(x, size=self.samples) ** 2
+                X = ysample
+            else:
+                burn_in = 30
+                ysample = tmg(
+                    self.samples,
+                    np.zeros(len(x)),
+                    self.kernel_object.kernel(x, x).cpu().numpy()
+                    + 1e-7 * np.eye(len(x)),
+                    torch.ones(len(x)).cpu().numpy(),
+                    np.eye(len(x)),
+                    np.zeros(len(x)),
+                    burn_in,
+                    True,
+                )
+                X = torch.tensor(ysample.T)
+
+            model = NMF(n_components=self.ms, max_iter=8000, tol=1e-12)
+            W = torch.tensor(model.fit_transform(X.cpu()))
+            H = torch.tensor(model.components_)
+            W_norm = W / torch.linalg.norm(W, dim=0)
+
+            if x.size()[1] == 1:
+                fs = []
+                for j in range(self.ms):
+                    fs.append(
+                        interp1d(
+                            x.view(-1).cpu().numpy(),
+                            W_norm[:, j].cpu().numpy(),
+                        )
+                    )
+                self.embed = lambda q: torch.cat(
+                    [torch.tensor(fs[j](q)).view(-1, 1) for j in range(self.ms)],
+                    dim=1,
+                )
+
+            elif x.size()[1] == 2:
+
+                fs = []
+                for j in range(self.ms):
+                    # each column of W_norm is one \phi_i, normalized so that \|\phi_i\|_2 = 1
+                    W_j = W_norm[:, j].cpu().numpy()
+                    fs.append(
+                        (
+                            LinearNDInterpolator(x.cpu().numpy(), W_j),
+                            NearestNDInterpolator(x.cpu().numpy(), W_j),
+                        )
+                    )
+
+                def embed(q):
+                    out_list = []
+                    # Linear interpolation where it is defined; entries it returns as NaN (outside the convex hull of x) fall back to nearest neighbor
+                    for j in range(self.ms):
+                        cur = fs[j][0](q[:, 0].cpu().numpy(), q[:, 1].cpu().numpy())
+                        mask = np.isnan(cur)
+                        cur[mask] = fs[j][1](
+                            q[:, 0].cpu().numpy()[mask], q[:, 1].cpu().numpy()[mask]
+                        )
+                        out_list.append(torch.tensor(cur).view(-1, 1))
+                    return torch.cat(out_list, dim=1)
+
+                self.embed = embed
+
+                # self.embed = lambda q: torch.cat(
+                #     [
+                #         torch.tensor(
+                #             fs[j](q[:, 0].cpu().numpy(), q[:, 1].cpu().numpy())
+                #         ).view(-1, 1)
+                #         for j in range(self.ms)
+                #     ],
+                #     dim=1,
+                # )
+
+            # elif x.size()[1] == 2:
+            # 	fs = []
+            # 	for j in range(self.ms):
+            # 		W_j = (W.T @ torch.diag(l))[j, :].cpu().numpy()
+            # 		fs.append(Rbf(x[:,0],x[:,1], W_j))
+            # 	self.embed = lambda q: torch.cat([torch.tensor(fs[j](q[:,0],q[:,1])).view(-1, 1) for j in range(self.ms)],
+            # 									 dim=1)
+
+            self.C = []
+
+        elif self.approx == "cover":
+            K = self.kernel(
+                x, x
+            )  # + self.s * self.s * torch.eye(self.N, dtype=torch.float64)
+            Khalf = torch.tensor(np.real(scipy.linalg.sqrtm(K.cpu().numpy())))
+            Khalfinv = torch.pinverse(Khalf)
+            self.embed = lambda q: torch.t(torch.mm(Khalfinv, self.kernel(q, self.x)))
+        else:
+            self.C, self.weights = self.subsample(x, y)
+            xs = x[self.C, :]
+            self.Dweights = torch.diag(self.weights).double()
+            K = torch.mm(
+                torch.mm(self.Dweights, self.kernel(xs, xs)), self.Dweights
+            )  # + self.s*self.s * torch.eye(self.ms, dtype=torch.float64)
+            # (D, V) = torch.symeig(K, eigenvectors=True)
+            (D, V) = torch.linalg.eigh(K)
+            Dinv = torch.diag(1.0 / D)
+            Dinv[Dinv <= 0] = 0
+            Dinv = torch.sqrt(Dinv)
+            # Dinv = torch.diag(torch.pow(D[:],-0.5))
+            self.embed = lambda q: torch.t(
+                torch.mm(
+                    Dinv,
+                    torch.mm(torch.t(V), torch.mm(self.Dweights, self.kernel(q, xs))),
+                )
+            )
+        # self.embed = lambda x: torch.t(torch.mm(torch.sqrt(Dinv),torch.mm(V, self.kernel(x, xs))))
+        embeding = self.embed(x)
+        self.Z_ = embeding.T @ embeding + self.s * self.s * torch.eye(self.ms).double()
+
+        # self.K = (self.Z_ + self.s * self.s * torch.eye(self.ms, dtype=torch.float64))
+        self.K = self.Z_
+        self.Q = torch.t(embeding)
+
+        self.fit = True
+        return None
+
+    def mean_std(self, xtest):
+        if self.fit == False:
+            raise AssertionError("First fit")
+        else:
+            embeding = self.embed(xtest)
+            Q = self.embed(self.x)
+            theta_mean, _ = torch.solve(torch.mm(torch.t(Q), self.y), self.K)
+            ymean = torch.mm(embeding, theta_mean)
+            temp = torch.t(torch.solve(torch.t(embeding), self.K)[0])
+            diagonal = (
+                self.s
+                * self.s
+                * torch.einsum("ij,ji->i", (temp, torch.t(embeding))).view(-1, 1)
+            )
+            yvar = torch.sqrt(diagonal)
+
+        return (ymean, yvar)
+
+    def outer_kernel(self):
+        embeding = self.embed(self.x)
+        # print (embeding.size())
+        K = torch.mm(embeding, torch.t(embeding))
+        # Z = self.linear_kernel(embeding, (embeding))
+        K = K + self.s * self.s * torch.eye(self.N, dtype=torch.float64)
+        # K = self.kernel(self.x,self.x) + self.s*self.s*torch.eye(self.N, dtype=torch.float64)
+        # print ("kernel:",K)
+        # print ("approximate:",Z)
+        return K
+
+    def sample_theta(self, size=1):
+        basis = int(int(torch.sum(self.m)))
+        zeros = torch.zeros(basis, size, dtype=torch.float64)
+        random_vector = torch.normal(mean=zeros, std=1.0)
+
+        if self.fit == True:
+            # random vector
+            Z = torch.pinverse(self.K, rcond=10e-6)
+            self.L = torch.cholesky(Z, upper=False)
+            theta_mean = torch.mm(Z, torch.mm(self.Q, self.y))
+            theta = torch.mm(self.s * self.L, random_vector)
+            theta = theta + theta_mean
+        else:
+            theta_mean = 0
+            Z = (1.0 + self.s * self.s) * torch.eye(basis, dtype=torch.float64)
+            L = torch.cholesky(Z, upper=False)
+            theta = torch.mm(L, random_vector) + theta_mean
+        return theta
+
+    def sample(self, xtest, size=1):
+        """
+        Sample functions from Gaussian Process
+        """
+        theta = self.sample_theta(size=size)
+        f = torch.mm(self.embed(xtest), theta)
+        return f
+
+    def visualize(self, xtest, f_true=None, points=True, show=True):
+        [mu, std] = self.mean_std(xtest)
+        if self.d == 1:
+
+            plt.figure(figsize=(15, 7))
+            plt.clf()
+            plt.plot(
+                self.x.cpu().numpy(), self.y.cpu().numpy(), "r+", ms=10, marker="o"
+            )
+            plt.plot(
+                self.x[self.C, :].cpu().numpy(),
+                self.y[self.C, :].cpu().numpy(),
+                "g+",
+                ms=10,
+                marker="o",
+            )
+            # plt.plot(xtest.cpu().numpy(), self.sample(xtest, size=2).cpu().numpy(), 'k--', lw=2, label="sample")
+            plt.fill_between(
+                xtest.cpu().numpy().flat,
+                (mu - 2 * std).cpu().numpy().flat,
+                (mu + 2 * std).cpu().numpy().flat,
+                color="#dddddd",
+            )
+            if f_true is not None:
+                plt.plot(xtest.cpu().numpy(), f_true(xtest).cpu().numpy(), "b-", lw=2)
+            plt.plot(
+                xtest.cpu().numpy(),
+                mu.cpu().numpy(),
+                "r-",
+                lw=2,
+                label="posterior mean",
+            )
+            plt.title("Posterior mean prediction plus 2 st.deviation")
+            plt.legend()
+            if show == True:
+                plt.show()
+
+        elif self.d == 2:
+            from scipy.interpolate import griddata
+
+            plt.figure(figsize=(15, 7))
+            plt.clf()
+            ax = plt.axes(projection="3d")
+            xx = xtest[:, 0].cpu().numpy()
+            yy = xtest[:, 1].cpu().numpy()
+            grid_x, grid_y = np.mgrid[
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+            grid_z_mu = griddata(
+                (xx, yy), mu[:, 0].cpu().numpy(), (grid_x, grid_y), method="linear"
+            )
+            if f_true is not None:
+                grid_z = griddata(
+                    (xx, yy),
+                    f_true(xtest)[:, 0].cpu().numpy(),
+                    (grid_x, grid_y),
+                    method="linear",
+                )
+                ax.plot_surface(grid_x, grid_y, grid_z, color="b", alpha=0.4)
+            if points == True:
+                ax.scatter(
+                    self.x[:, 0].cpu().numpy(),
+                    self.x[:, 1].cpu().numpy(),
+                    self.y[:, 0].cpu().numpy(),
+                    c="r",
+                    s=100,
+                    marker="o",
+                    depthshade=False,
+                )
+            ax.plot_surface(grid_x, grid_y, grid_z_mu, color="r", alpha=0.4)
+            plt.title("Posterior mean prediction plus 2 st.deviation")
+            plt.show()
+
+        else:
+            print("Visualization not implemented")
 
 
 if __name__ == "__main__":
-	# domain size
-	L_infinity_ball = 1
-	# dimension
-	d = 1
-	# error variance
-	s = 0.1
-	# grid density
-	n = 1024
-	# number of intial points
-	N = 100
-	# smoothness
-	gamma = torch.from_numpy(np.array([0.4, 0.4]))
-	# test problem
-
-	xtest = torch.from_numpy(interval(n, d))
-	x = torch.from_numpy(np.random.uniform(-L_infinity_ball, L_infinity_ball, size=(N, d)))
-
-	f_no_noise = lambda q: torch.sin(torch.sum(q * 4, dim=1)).view(-1, 1)
-	# f_no_noise = lambda q: torch.sin((q[:,0] * 4)).view(-1, 1)
-
-	f = lambda q: f_no_noise(q) + torch.normal(mean=torch.zeros(q.size()[0], 1, dtype=torch.float64), std=1.,
-											   out=None) * s
-	# targets
-	y = f(x)
-
-	# GP model with squared exponential
-
-	kernel = KernelFunction(gamma=0.05)
-	GP0 = GaussianProcess(kernel_custom=kernel, s=s)
-	GP0.fit_gp(x, y)
-	GP0.visualize(xtest, f_true=f_no_noise)
-
-	GP = NystromFeatures(kernel, m=torch.Tensor([30]), s=s, approx="uniform")
-	GP.fit_gp(x, y)
-	GP.visualize(xtest, f_true=f_no_noise)
-
-	GP = NystromFeatures(kernel, m=torch.Tensor([30]), s=s, approx="online_leverage")
-	GP.fit_gp(x, y)
-	GP.visualize(xtest, f_true=f_no_noise)
-
-	GP = NystromFeatures(kernel, m=torch.Tensor([30]), s=s, approx="svd")
-	GP.fit_gp(x, y)
-	print(GP0.K, GP.outer_kernel())
-	GP.visualize(xtest, f_true=f_no_noise)
+    # domain size
+    L_infinity_ball = 1
+    # dimension
+    d = 1
+    # noise standard deviation (the variance is s * s)
+    s = 0.1
+    # grid density
+    n = 1024
+    # number of initial points
+    N = 100
+    # smoothness
+    gamma = torch.tensor(np.array([0.4, 0.4]))
+    # test problem
+
+    xtest = torch.tensor(interval(n, d))
+    x = torch.tensor(np.random.uniform(-L_infinity_ball, L_infinity_ball, size=(N, d)))
+
+    f_no_noise = lambda q: torch.sin(torch.sum(q * 4, dim=1)).view(-1, 1)
+    # f_no_noise = lambda q: torch.sin((q[:,0] * 4)).view(-1, 1)
+
+    f = (
+        lambda q: f_no_noise(q)
+        + torch.normal(
+            mean=torch.zeros(q.size()[0], 1, dtype=torch.float64), std=1.0, out=None
+        )
+        * s
+    )
+    # targets
+    y = f(x)
+
+    # GP model with squared exponential
+
+    kernel = KernelFunction(gamma=0.05)
+    GP0 = GaussianProcess(kernel_custom=kernel, s=s)
+    GP0.fit_gp(x, y)
+    GP0.visualize(xtest, f_true=f_no_noise)
+
+    GP = NystromFeatures(kernel, m=torch.tensor([30]), s=s, approx="uniform")
+    GP.fit_gp(x, y)
+    GP.visualize(xtest, f_true=f_no_noise)
+
+    GP = NystromFeatures(kernel, m=torch.tensor([30]), s=s, approx="online_leverage")
+    GP.fit_gp(x, y)
+    GP.visualize(xtest, f_true=f_no_noise)
+
+    GP = NystromFeatures(kernel, m=torch.tensor([30]), s=s, approx="svd")
+    GP.fit_gp(x, y)
+    print(GP0.K, GP.outer_kernel())
+    GP.visualize(xtest, f_true=f_no_noise)
diff --git a/stpy/continuous_processes/primal_mkl.py b/stpy/continuous_processes/primal_mkl.py
index 4944f9b..84965c0 100755
--- a/stpy/continuous_processes/primal_mkl.py
+++ b/stpy/continuous_processes/primal_mkl.py
@@ -3,200 +3,215 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-class PrimalMKL(RandomProcess):
-
-	def __init__(self,embeddings,init_weights = None, lam = 0.0, s = 0):
-		self.embeddings = embeddings
-		self.init_weights = init_weights
-		self.no_models = len(embeddings)
-		self.s = s
-		self.lam = lam
-		if not isinstance(self.lam,list):
-			self.lam = [lam for i in range(self.no_models)]
-
-	def total_embed_dim(self):
-		self.dims = []
-		for embedding in self.embeddings:
-			self.dims.append(embedding.get_basis_size())
-		sum = torch.sum(torch.Tensor(self.dims))
-		return sum
-
-	def get_emebed_dims(self):
-		self.total_embed_dim()
-		return self.dims
-
-	# def fit_gp(self, x, y):
-	# 	"""
-	# 		In this function we are fitting
-	# 		In this function we are fitting
-	#
-	#
-	#
-	# 	:param x:
-	# 	:param y:
-	# 	:return:
-	# 	"""
-	#
-	#
-	# 	self.x = x
-	# 	self.y = y
-	# 	(self.n, self.d) = self.x.size()
-	# 	self.total_m = self.total_embed_dim()
-	# 	dims_index = torch.cumsum(torch.Tensor([0] + self.get_emebed_dims()),dim = 0).int()
-	# 	self.w = [torch.ones(size = (i,1), dtype = torch.float64,requires_grad = True)  for i in self.get_emebed_dims()]
-	#
-	# 	self.theta = torch.ones(size = (self.no_models,1), dtype = torch.float64,requires_grad = True)
-	#
-	#
-
-
-	def fit_gp(self,x,y):
-		self.x = x
-		self.y = y
-		(self.n,self.d) = self.x.size()
-		self.total_m = self.total_embed_dim()
-		dims_index = torch.cumsum(torch.Tensor([0] + self.get_emebed_dims()),dim = 0).int()
-
-		self.w = [torch.ones(size = (i,1), dtype = torch.float64,requires_grad = True)  for i in self.get_emebed_dims()]
-
-		self.theta = torch.ones(size = (self.no_models,1), dtype = torch.float64,requires_grad = True)
-
-		# def cost(theta,w):
-		#
-		# 	Phi = torch.zeros(size = (self.n,int(self.total_m)), dtype = torch.float64)
-		# 	reg = 0.0
-		# 	for index,embedding in enumerate(self.embeddings):
-		# 		Phi[:,dims_index[index]:dims_index[index+1]] = embedding.embed(self.x)*torch.sqrt(theta[index])
-		# 		reg = reg + torch.sqrt(torch.sum((torch.sqrt(theta[index])*w[index])**2))
-		# 	wvector = torch.cat(w, 0)
-		# 	cost = torch.sum((torch.mm(Phi,wvector) - self.y)**2)
-		# 	cost = cost + self.lam*reg
-		# 	return cost
-
-		def regularizers(w):
-			reg = torch.zeros(self.no_models,dtype=torch.float64)
-			for index, embedding in enumerate(self.embeddings):
-				reg[index] = torch.sqrt(torch.sum(w[index] ** 2))
-			return reg
-
-		def cost(w):
-			Phi = torch.zeros(size = (self.n,int(self.total_m)), dtype = torch.float64)
-			reg = 0.0
-			for index,embedding in enumerate(self.embeddings):
-				Phi[:,dims_index[index]:dims_index[index+1]] = embedding.embed_internal(self.x)
-				reg = reg + self.lam[index]*torch.sqrt(torch.sum(w[index])**2)
-			wvector = torch.cat(w, 0)
-			cost = torch.sum((torch.mm(Phi,wvector) - self.y)**2)
-			cost = cost + reg**2 + self.s*torch.norm(wvector)**2
-			return cost
-
-
-
-		## optimizer objective
-		loss = torch.zeros(1,1,requires_grad = True,dtype = torch.float64)
-		loss = loss + cost(self.w)
-
-
-
-
-		#loss.requires_grad_(True)
-
-
-		from pymanopt.manifolds import Euclidean, Product
-		from pymanopt import Problem
-		from pymanopt.solvers import ConjugateGradient
-		from stpy.cost_functions import CostFunction
-
-		# define cost function
-		C = CostFunction(cost, number_args=self.no_models)
-		[cost_numpy, egrad_numpy, ehess_numpy] = C.define()
-		x = [np.ones(shape = (i,1))  for i in self.get_emebed_dims()]
-
-
-
-		# Optimization with Conjugate Gradient Descent
-		#print (cost_numpy(x))
-		manifold = Product( [Euclidean(i) for i in self.get_emebed_dims()])
-		problem = Problem(manifold=manifold, cost=cost_numpy, egrad=egrad_numpy, ehess=ehess_numpy, verbosity=10)
-		#solver = SteepestDescent(maxiter=1000, mingradnorm=1e-8, minstepsize=1e-10)
-		solver = ConjugateGradient(maxiter=1000, mingradnorm=1e-8, minstepsize=1e-20)
-		Xopt = solver.solve(problem, x=x)
 
+class PrimalMKL(RandomProcess):
 
-
-
-
-		self.w = [torch.from_numpy(w) for w in Xopt]
-		self.theta =  torch.sum(regularizers(self.w),dim = 0)/regularizers(self.w) + self.s
-		self.theta = 1./self.theta
-
-		print (self.theta)
-
-
-	def mean_var(self,xtest):
-		n = xtest.size()[0]
-		dims_index = torch.cumsum(torch.Tensor([0] + self.get_emebed_dims()),dim = 0).int()
-		Phi = torch.zeros(size=(n, int(self.total_m)), dtype=torch.float64)
-
-		for index, embedding in enumerate(self.embeddings):
-			Phi[:, dims_index[index]:dims_index[index + 1]] = embedding.embed_internal(xtest)
-
-		wvector = torch.cat(self.w, 0)
-		mu = torch.mm(Phi, wvector)
-
-		K = (torch.mm(torch.t(Phi),Phi) + self.s * torch.eye(int(self.total_m), dtype=torch.float64))
-		temp = torch.t(torch.solve(torch.t(Phi),K)[0])
-		var = torch.sqrt(self.s*self.s*torch.einsum('ij,ji->i', (temp, torch.t(Phi) )).view(-1, 1))
-
-		mu = mu.detach()
-		var = var.detach()
-
-		return (mu,var)
-
-	def sample(self,xtest, size =1):
-		mu, var = self.mean_var(xtest)
-		sample = mu + var
-		return sample
-
-	def visualize(self,xtest,f_true = None, points = True, show = True):
-		super().visualize(xtest,f_true = f_true, points = points, show = False)
-		## histogram of weights
-		plt.figure(2)
-		plt.bar(np.arange(len(self.embeddings)), self.theta.detach().numpy().flatten(), np.ones(len(self.embeddings)) * 0.5)
-		plt.show()
+    def __init__(self, embeddings, init_weights=None, lam=0.0, s=0):  # keep models and penalties
+        self.embeddings = embeddings  # each entry is asked for get_basis_size() / embed_internal()
+        self.init_weights = init_weights  # stored only; the methods shown here never read it
+        self.no_models = len(embeddings)
+        self.s = s  # squared-norm penalty weight in fit_gp; ridge term in mean_var
+        self.lam = lam
+        if not isinstance(self.lam, list):
+            self.lam = [lam for i in range(self.no_models)]  # scalar -> one penalty per model
+
+    def total_embed_dim(self):
+        # Refresh the cached per-embedding basis sizes on every call.
+        self.dims = [embedding.get_basis_size() for embedding in self.embeddings]
+        # The combined size comes back as a 0-dim tensor; callers wrap it in int().
+        total = torch.sum(torch.tensor(self.dims))
+        return total
+
+    def get_emebed_dims(self):  # list of basis sizes, one entry per embedding
+        self.total_embed_dim()  # called for its side effect: it rebuilds self.dims
+        return self.dims
+
+    # def fit_gp(self, x, y):
+    # 	"""
+    # 		In this function we are fitting
+    # 		In this function we are fitting
+    #
+    #
+    #
+    # 	:param x:
+    # 	:param y:
+    # 	:return:
+    # 	"""
+    #
+    #
+    # 	self.x = x
+    # 	self.y = y
+    # 	(self.n, self.d) = self.x.size()
+    # 	self.total_m = self.total_embed_dim()
+    # 	dims_index = torch.cumsum(torch.tensor([0] + self.get_emebed_dims()),dim = 0).int()
+    # 	self.w = [torch.ones(size = (i,1), dtype = torch.float64,requires_grad = True)  for i in self.get_emebed_dims()]
+    #
+    # 	self.theta = torch.ones(size = (self.no_models,1), dtype = torch.float64,requires_grad = True)
+    #
+    #
+
+    def fit_gp(self, x, y):  # fit every weight block jointly on (x, y) with a group penalty
+        self.x = x
+        self.y = y
+        (self.n, self.d) = self.x.size()
+        self.total_m = self.total_embed_dim()
+        dims_index = torch.cumsum(
+            torch.tensor([0] + self.get_emebed_dims()), dim=0
+        ).int()  # column offsets: block k occupies dims_index[k]:dims_index[k + 1]
+
+        self.w = [
+            torch.ones(size=(i, 1), dtype=torch.float64, requires_grad=True)
+            for i in self.get_emebed_dims()
+        ]
+
+        self.theta = torch.ones(
+            size=(self.no_models, 1), dtype=torch.float64, requires_grad=True
+        )
+
+        # Objective minimised below, with w_k the weight block of embedding k and
+        # Phi the column-wise concatenation of every embedding of self.x:
+        #
+        #   ||Phi w - y||^2 + (sum_k lam_k * ||w_k||_2)^2 + s * ||w||_2^2
+        #
+        # The per-block term must be the Euclidean norm of w_k, i.e. the same
+        # quantity regularizers() returns and self.theta is derived from after
+        # the solve. Mind where the square sits: sqrt(sum(w_k) ** 2) is
+        # |sum(w_k)|, which leaves a block whose entries cancel unpenalised.
+        # NOTE(review): the square root has no finite gradient at w_k == 0;
+        # the all-ones starting point below avoids that at initialisation.
+
+        def regularizers(w):  # Euclidean norm of every weight block, as a 1-D tensor
+            reg = torch.zeros(self.no_models, dtype=torch.float64)
+            for index, embedding in enumerate(self.embeddings):
+                reg[index] = torch.sqrt(torch.sum(w[index] ** 2))
+            return reg
+
+        def cost(w):  # scalar objective for a list of weight blocks
+            Phi = torch.zeros(size=(self.n, int(self.total_m)), dtype=torch.float64)
+            reg = 0.0
+            for index, embedding in enumerate(self.embeddings):
+                Phi[:, dims_index[index] : dims_index[index + 1]] = (
+                    embedding.embed_internal(self.x)
+                )
+                reg = reg + self.lam[index] * torch.sqrt(torch.sum(w[index] ** 2))
+            wvector = torch.cat(w, 0)
+            cost = torch.sum((torch.mm(Phi, wvector) - self.y) ** 2)
+            cost = cost + reg**2 + self.s * torch.norm(wvector) ** 2
+            return cost
+
+        ## optimizer objective
+        loss = torch.zeros(1, 1, requires_grad=True, dtype=torch.float64)
+        loss = loss + cost(self.w)  # evaluated once at the start point; not read afterwards
+
+        # loss.requires_grad_(True)
+
+        from pymanopt.manifolds import Euclidean, Product
+        from pymanopt import Problem
+        from pymanopt.solvers import ConjugateGradient
+        from stpy.cost_functions import CostFunction
+
+        # define cost function
+        C = CostFunction(cost, number_args=self.no_models)
+        [cost_numpy, egrad_numpy, ehess_numpy] = C.define()
+        x = [np.ones(shape=(i, 1)) for i in self.get_emebed_dims()]
+
+        # Optimization with Conjugate Gradient Descent
+        # print (cost_numpy(x))
+        manifold = Product([Euclidean(i) for i in self.get_emebed_dims()])
+        problem = Problem(
+            manifold=manifold,
+            cost=cost_numpy,
+            egrad=egrad_numpy,
+            ehess=ehess_numpy,
+            verbosity=10,
+        )
+        # solver = SteepestDescent(maxiter=1000, mingradnorm=1e-8, minstepsize=1e-10)
+        solver = ConjugateGradient(maxiter=1000, mingradnorm=1e-8, minstepsize=1e-20)
+        Xopt = solver.solve(problem, x=x)
+
+        self.w = [torch.from_numpy(w) for w in Xopt]
+        self.theta = (
+            torch.sum(regularizers(self.w), dim=0) / regularizers(self.w) + self.s
+        )
+        self.theta = 1.0 / self.theta
+
+        print(self.theta)
+
+    def mean_var(self, xtest):  # returns (mu, var) for the rows of xtest, both detached
+        n = xtest.size()[0]
+        dims_index = torch.cumsum(
+            torch.tensor([0] + self.get_emebed_dims()), dim=0
+        ).int()  # column offsets of each embedding inside Phi
+        Phi = torch.zeros(size=(n, int(self.total_m)), dtype=torch.float64)
+
+        for index, embedding in enumerate(self.embeddings):
+            Phi[:, dims_index[index] : dims_index[index + 1]] = (
+                embedding.embed_internal(xtest)
+            )
+
+        wvector = torch.cat(self.w, 0)
+        mu = torch.mm(Phi, wvector)
+
+        K = torch.mm(torch.t(Phi), Phi) + self.s * torch.eye(
+            int(self.total_m), dtype=torch.float64
+        )  # NOTE(review): built from the *test* features Phi, not the training data - confirm
+        temp = torch.t(torch.linalg.solve(K, torch.t(Phi)))  # torch.solve(B, A) was removed
+        var = torch.sqrt(
+            self.s * self.s * torch.einsum("ij,ji->i", (temp, torch.t(Phi))).view(-1, 1)
+        )  # despite the name this is a square root, i.e. on the scale of a std
+
+        mu = mu.detach()
+        var = var.detach()
+
+        return (mu, var)
+
+    def sample(self, xtest, size=1):  # NOTE(review): deterministic; `size` is never read
+        mu, var = self.mean_var(xtest)
+        sample = mu + var  # mean plus the second value of mean_var; no random draw is made
+        return sample
+
+    def visualize(self, xtest, f_true=None, points=True, show=True):  # parent plot + theta bars
+        super().visualize(xtest, f_true=f_true, points=points, show=False)
+        # bar chart of self.theta in a second figure, one bar per embedding, bar width 0.5
+        plt.figure(2)
+        plt.bar(
+            np.arange(len(self.embeddings)),
+            self.theta.detach().numpy().flatten(),
+            np.ones(len(self.embeddings)) * 0.5,
+        )
+        plt.show()  # NOTE(review): unconditional; the `show` argument is not consulted
 
 
 if __name__ == "__main__":
-	from stpy.continuous_processes.fourier_fea import GaussianProcessFF
-	from stpy.continuous_processes.gauss_procc import GaussianProcess
-	from stpy.test_functions.benchmarks import MultiRKHS
-
-	n = 1024
-	N = 100
-	s = 0.01
-	TestFunction = MultiRKHS()
-	xtest = TestFunction.interval(n)
-	x = TestFunction.initial_guess(N)
-	y = TestFunction.eval(x,sigma = s)
-	#TestFunction.visualize(xtest)
-
-
-	GP1 = GaussianProcess(s=0, kernel="linear")
-	GP2 = GaussianProcessFF(s=s, m=100, approx="hermite")
-
-	MKL = PrimalMKL([GP1,GP2], lam=[0.1, 0.1], s = s)
-	MKL.fit_gp(x, y)
-
-	print ("Importance Weights:",MKL.theta)
-
-	print("Slope of linear line:", MKL.w[0])
-
-	MKL.visualize(xtest, f_true=TestFunction.eval_noiseless)
-
-	# MKL = PrimalMKL(GPs, lam=0.01)
-	# MKL.fit_gp(x,y)
-	# MKL.visualize(xtest,f_true=TestFunction.eval_noiseless)
-	#
-	# MKL = PrimalMKL(GPs, lam=0.0001)
-	# MKL.fit_gp(x,y)
-	# MKL.visualize(xtest,f_true=TestFunction.eval_noiseless)
+    from stpy.continuous_processes.fourier_fea import GaussianProcessFF
+    from stpy.continuous_processes.gauss_procc import GaussianProcess
+    from stpy.test_functions.benchmarks import MultiRKHS
+
+    n = 1024  # argument passed to TestFunction.interval
+    N = 100  # argument passed to TestFunction.initial_guess
+    s = 0.01  # passed as sigma to eval() and as s to the models below
+    TestFunction = MultiRKHS()
+    xtest = TestFunction.interval(n)
+    x = TestFunction.initial_guess(N)
+    y = TestFunction.eval(x, sigma=s)
+    # TestFunction.visualize(xtest)
+
+    GP1 = GaussianProcess(s=0, kernel="linear")
+    GP2 = GaussianProcessFF(s=s, m=100, approx="hermite")
+
+    MKL = PrimalMKL([GP1, GP2], lam=[0.1, 0.1], s=s)  # two models, one penalty each
+    MKL.fit_gp(x, y)
+
+    print("Importance Weights:", MKL.theta)
+
+    print("Slope of linear line:", MKL.w[0])  # weight block of the first model (GP1)
+
+    MKL.visualize(xtest, f_true=TestFunction.eval_noiseless)
+
+    # MKL = PrimalMKL(GPs, lam=0.01)
+    # MKL.fit_gp(x,y)
+    # MKL.visualize(xtest,f_true=TestFunction.eval_noiseless)
+    #
+    # MKL = PrimalMKL(GPs, lam=0.0001)
+    # MKL.fit_gp(x,y)
+    # MKL.visualize(xtest,f_true=TestFunction.eval_noiseless)
diff --git a/stpy/continuous_processes/trace_features.py b/stpy/continuous_processes/trace_features.py
index 6f8a8ab..a8f5f26 100644
--- a/stpy/continuous_processes/trace_features.py
+++ b/stpy/continuous_processes/trace_features.py
@@ -7,117 +7,125 @@
 
 class TraceFeatures(KernelizedFeatures):
 
-	def __init__(self, *args, PSD=False, **kwargs):
-		super().__init__(*args, **kwargs)
-		self.m = int(self.m)
-		self.PSD = PSD
-
-	def construct_covariance(self):
-		emb = self.emb
-		X = torch.flatten(torch.einsum('ij,ik->jki', emb, emb).permute(1, 0, 2), end_dim=1)
-		V = torch.einsum('ik,jk->ij', X, X)
-		# Z = torch.einsum('ij,j->i',X,y.reshape(-1)).reshape(-1,1)
-		self.V = V + self.lam * self.s ** 2 * torch.eye(self.m ** 2).double()
-
-	# self.A_new,_ = torch.solve(Z,self.V)
-	# self.A_new = self.A_new.reshape(self.m,self.m)
-
-	def fit_gp(self, x, y):
-		self.n, self.d = x.size()
-		self.x = x
-		self.y = y
-
-		self.emb = self.embed(x)
-		self.construct_covariance()
-
-		emb = self.emb.numpy()
-		A = cp.Variable((self.m, self.m), symmetric=True)
-		cost = cp.sum_squares \
-				   (cp.diag(emb @ A @ emb.T) - y.view(-1).numpy()) / (self.s ** 2) + (self.lam) * cp.norm(A, "fro")
-
-		if self.PSD == True:
-			constraints = [A >> 0]
-		else:
-			constraints = []
-
-		prob = cp.Problem(cp.Minimize(cost), constraints)
-		prob.solve(solver=cp.MOSEK, verbose=True)
-		self.A = torch.from_numpy(A.value)
-		self.fit = True
-
-	def mean_std(self, xtest, std=True):
-		emb = self.embed(xtest)
-		mu = torch.einsum('ij,jk,ik->i', emb, self.A, emb).view(-1, 1)
-		if std == True:
-			# invV = torch.inverse(self.V)
-			X = torch.flatten(torch.einsum('ij,ik->jki', emb, emb), end_dim=1)
-			Z, _ = torch.solve(X, self.V)
-			# diagonal = self.lam*self.s ** 2 * torch.einsum('ji,jk,ki->i', (X, invV, X)).view(-1, 1)
-			diagonal = self.lam * self.s ** 2 * torch.einsum('ij,ij->j', X, Z).view(-1, 1)
-			return mu, torch.sqrt(diagonal).view(-1, 1)
-		else:
-			return mu
-
-	def band(self, xtest, sqrtbeta=2., maximization=True):
-		emb = self.embed(xtest)
-		X = torch.einsum('ij,ik->ijk', emb, emb)
-		n = emb.size()[0]
-		ucb = torch.zeros(size=(n, 1)).double()
-
-		for i in range(n):
-			A = cp.Variable((self.m, self.m), symmetric=True)
-			cost = cp.trace(A @ X[i, :, :])
-
-			Z = torch.cholesky(self.V, upper=True)
-			zero = np.zeros(self.m ** 2)
-			constraints = [cp.SOC(zero.T @ cp.vec(A) + self.s * sqrtbeta, Z @ (cp.vec(A) - cp.vec(self.A.numpy())))]
-
-			if self.PSD == True:
-				constraints += [A >> 0]
-
-			if maximization == True:
-				prob = cp.Problem(cp.Maximize(cost), constraints)
-			else:
-				prob = cp.Problem(cp.Minimize(cost), constraints)
-
-			prob.solve(solver=cp.MOSEK, verbose=False)
-			ucb[i] = torch.trace(torch.from_numpy(A.value) @ X[i, :, :])
-		return ucb
-
-	def lcb(self, xtest, sqrtbeta=2.):
-		return self.band(xtest, sqrtbeta=sqrtbeta, maximization=False)
-
-	def ucb(self, xtest, sqrtbeta=2.):
-		return self.band(xtest, sqrtbeta=sqrtbeta, maximization=True)
+    def __init__(self, *args, PSD=False, **kwargs):  # PSD=True adds A >> 0 to every solve
+        super().__init__(*args, **kwargs)
+        self.m = int(self.m)  # plain int: used as a cvxpy variable shape and as m**2
+        self.PSD = PSD
+
+    def construct_covariance(self):  # builds self.V from the embedded training points
+        emb = self.emb
+        X = torch.flatten(
+            torch.einsum("ij,ik->jki", emb, emb).permute(1, 0, 2), end_dim=1
+        )  # column i holds the flattened outer product of row i of emb with itself
+        V = torch.einsum("ik,jk->ij", X, X)  # X @ X.T
+        # Z = torch.einsum('ij,j->i',X,y.reshape(-1)).reshape(-1,1)
+        self.V = V + self.lam * self.s**2 * torch.eye(self.m**2).double()
+
+    # self.A_new,_ = torch.solve(Z,self.V)
+    # self.A_new = self.A_new.reshape(self.m,self.m)
+
+    def fit_gp(self, x, y):  # fits the symmetric matrix self.A to (x, y) via cvxpy + MOSEK
+        self.n, self.d = x.size()
+        self.x = x
+        self.y = y
+
+        self.emb = self.embed(x)
+        self.construct_covariance()
+
+        emb = self.emb.numpy()
+        A = cp.Variable((self.m, self.m), symmetric=True)
+        cost = cp.sum_squares(cp.diag(emb @ A @ emb.T) - y.view(-1).numpy()) / (
+            self.s**2
+        ) + (self.lam) * cp.norm(A, "fro")  # residuals of emb_i^T A emb_i plus Frobenius penalty
+
+        if self.PSD == True:
+            constraints = [A >> 0]
+        else:
+            constraints = []
+
+        prob = cp.Problem(cp.Minimize(cost), constraints)
+        prob.solve(solver=cp.MOSEK, verbose=True)
+        self.A = torch.from_numpy(A.value)
+        self.fit = True  # NOTE(review): sets `fit`, not `fitted`; would hide an inherited fit() - confirm
+
+    def mean_std(self, xtest, std=True):  # returns mu, or (mu, std) when std is True
+        emb = self.embed(xtest)
+        mu = torch.einsum("ij,jk,ik->i", emb, self.A, emb).view(-1, 1)  # emb_i^T A emb_i
+        if std == True:
+            # column i of X is the flattened outer product of test embedding i
+            X = torch.flatten(torch.einsum("ij,ik->jki", emb, emb), end_dim=1)
+            Z = torch.linalg.solve(self.V, X)  # V Z = X; the old torch.solve(B, A) is removed
+            # einsum below is diag(X^T V^{-1} X), one entry per test point
+            diagonal = self.lam * self.s**2 * torch.einsum("ij,ij->j", X, Z).view(-1, 1)
+            return mu, torch.sqrt(diagonal).view(-1, 1)
+        else:
+            return mu
+
+    def band(self, xtest, sqrtbeta=2.0, maximization=True):  # one conic program per test point
+        emb = self.embed(xtest)
+        X = torch.einsum("ij,ik->ijk", emb, emb)  # X[i] is the outer product of test embedding i
+        n = emb.size()[0]
+        ucb = torch.zeros(size=(n, 1)).double()
+
+        Z = torch.linalg.cholesky(self.V, upper=True)  # identical for every point: factor once
+        zero = np.zeros(self.m**2)  # makes the SOC bound the constant s * sqrtbeta
+        for i in range(n):
+            A = cp.Variable((self.m, self.m), symmetric=True)
+            cost = cp.trace(A @ X[i, :, :])
+
+            constraints = [
+                cp.SOC(
+                    zero.T @ cp.vec(A) + self.s * sqrtbeta,
+                    Z @ (cp.vec(A) - cp.vec(self.A.numpy())),
+                )  # ||Z (vec(A) - vec(self.A))||_2 <= s * sqrtbeta
+            ]
+
+            if self.PSD == True:
+                constraints += [A >> 0]
+
+            if maximization == True:
+                prob = cp.Problem(cp.Maximize(cost), constraints)
+            else:
+                prob = cp.Problem(cp.Minimize(cost), constraints)
+
+            prob.solve(solver=cp.MOSEK, verbose=False)
+            ucb[i] = torch.trace(torch.from_numpy(A.value) @ X[i, :, :])
+        return ucb  # (n, 1); holds lower bounds when maximization is False
+
+    def lcb(self, xtest, sqrtbeta=2.0):  # lower band: band() solved as a minimisation
+        return self.band(xtest, sqrtbeta=sqrtbeta, maximization=False)
+
+    def ucb(self, xtest, sqrtbeta=2.0):  # upper band: band() solved as a maximisation
+        return self.band(xtest, sqrtbeta=sqrtbeta, maximization=True)
 
 
 if __name__ == "__main__":
-	from stpy.embeddings.embedding import HermiteEmbedding
-	import matplotlib.pyplot as plt
+    from stpy.embeddings.embedding import HermiteEmbedding
+    import matplotlib.pyplot as plt
 
-	m = 32
-	n = 16
-	s = 0.01
-	N = 5
+    m = 32
+    n = 16
+    s = 0.01
+    N = 5
 
-	func = lambda x: torch.sin(x * np.pi) ** 2 + 0.5
-	x = torch.from_numpy(np.random.uniform(-1, 1, size=(N, 1)))
-	y = func(x)
+    func = lambda x: torch.sin(x * np.pi) ** 2 + 0.5
+    x = torch.from_numpy(np.random.uniform(-1, 1, size=(N, 1)))
+    y = func(x)
 
-	embedding = HermiteEmbedding(m=m, gamma=0.5)
-	xtest = torch.from_numpy(np.linspace(-1, 1, n)).view(-1, 1)
+    embedding = HermiteEmbedding(m=m, gamma=0.5)
+    xtest = torch.from_numpy(np.linspace(-1, 1, n)).view(-1, 1)
 
-	F = TraceFeatures(s=s, embedding=embedding, m=m, PSD=True)
-	F.fit_gp(x, y)
+    F = TraceFeatures(s=s, embedding=embedding, m=m, PSD=True)
+    F.fit_gp(x, y)
 
-	F.visualize(xtest, f_true=func, size=0, show=False)
+    F.visualize(xtest, f_true=func, size=0, show=False)
 
-	lcb = F.lcb(xtest)
-	ucb = F.ucb(xtest)
-	plt.plot(xtest, lcb, '-s', color='lightblue', label='lcb')
-	plt.plot(xtest, ucb, '-s', color='gray', label='ucb')
-	plt.legend()
-	plt.show()
+    lcb = F.lcb(xtest)
+    ucb = F.ucb(xtest)
+    plt.plot(xtest, lcb, "-s", color="lightblue", label="lcb")
+    plt.plot(xtest, ucb, "-s", color="gray", label="ucb")
+    plt.legend()
+    plt.show()
 #
 # mu, std = F.mean_std(xtest)
 # plt.plot(xtest,func(xtest),'r',label = 'true')
diff --git a/stpy/continuous_processes/truncated_gp.py b/stpy/continuous_processes/truncated_gp.py
new file mode 100644
index 0000000..91d05a6
--- /dev/null
+++ b/stpy/continuous_processes/truncated_gp.py
@@ -0,0 +1,84 @@
+import numpy as np
+from stpy.continuous_processes.gauss_procc import GaussianProcess
+from stpy.helpers.posterior_sampling import tmg
+import torch
+
+
+class TruncatedGP:
+    """
+    A truncated Gaussian Process that can serve as a ground truth model
+    for the PPP estimators. Values are cached per input row, so repeated
+    queries return the same draw. Sampling is very slow at the moment.
+    """
+
+    def __init__(self, kernel, d):  # kernel and d are forwarded to GaussianProcess
+        self.gp = GaussianProcess(kernel=kernel, d=d)
+        self.x_acc = None  # every input row sampled so far
+        self.y_acc = None  # the value drawn for each row of x_acc
+
+    def __call__(self, x: torch.Tensor, dt: float = 1.0, burn_in=30):
+        N = len(x)
+        # Initialize sample array
+        sample = torch.zeros(N)
+
+        if self.x_acc is None:
+            x_new = x
+        else:
+            # Find indices of x that are already in self.x_acc
+            matching = torch.all(
+                x.unsqueeze(1) == self.x_acc.unsqueeze(0), dim=2
+            )  # (N, M)
+            matching_indices = torch.nonzero(matching, as_tuple=False)  # (K, 2)
+            idx_cached_in_x = matching_indices[:, 0]  # Indices in x
+            idx_cached_in_acc = matching_indices[:, 1]  # Indices in self.x_acc
+
+            # Determine which indices are new
+            mask_cached = torch.zeros(N, dtype=torch.bool)
+            mask_cached[idx_cached_in_x] = True
+            idx_new = torch.nonzero(~mask_cached).squeeze(1)
+            # Retrieve cached function values
+            sample[idx_cached_in_x] = self.y_acc[idx_cached_in_acc]
+            x_new = x[idx_new]
+
+        # Compute function values for new points
+        if len(x_new) > 0:
+            if self.gp.fitted:
+                mean_new, cov_new = self.gp.mean_std_sub(x_new, full=True)
+                mean_new = mean_new.squeeze(1)
+            else:
+                # No data yet: zero mean and the kernel matrix as covariance.
+                # Both must be indexed by x_new, like the jitter and tmg inputs.
+                mean_new = torch.zeros(len(x_new))
+                # Using all of x here would give an (N, N) matrix that no longer
+                # matches the len(x_new)-sized terms below once some rows of x
+                # are served from the cache.
+                cov_new = self.gp.kernel(x_new, x_new)
+
+            # Sample truncated GP for new points
+            factor = torch.eye(len(x_new))
+            summand = torch.zeros(len(x_new))
+            cov_new = cov_new.cpu().numpy() + 1e-7 * np.eye(len(x_new))  # diagonal jitter
+            sample_new = tmg(
+                1,
+                mean_new.cpu().numpy(),
+                cov_new,
+                torch.ones(len(x_new)).cpu().numpy(),
+                factor.cpu().numpy(),
+                summand.cpu().numpy(),
+                burn_in,
+                True,
+            )  # presumably one draw constrained to factor @ f + summand >= 0 - confirm in tmg
+            sample_new = torch.tensor(sample_new[0])
+
+            # Update sample array and caches
+            if self.x_acc is None:
+                sample = sample_new
+                self.x_acc = x_new
+                self.y_acc = sample_new
+            else:
+                sample[idx_new] = sample_new
+                self.x_acc = torch.cat([self.x_acc, x_new])
+                self.y_acc = torch.cat([self.y_acc, sample_new])
+
+            self.gp.fit(self.x_acc, self.y_acc.unsqueeze(1))  # condition later draws on the cache
+
+        return sample * dt
diff --git a/stpy/continuous_processes/truncated_kernelized_features.py b/stpy/continuous_processes/truncated_kernelized_features.py
index 4fd76c3..01fc298 100644
--- a/stpy/continuous_processes/truncated_kernelized_features.py
+++ b/stpy/continuous_processes/truncated_kernelized_features.py
@@ -1,61 +1,96 @@
 from stpy.continuous_processes.kernelized_features import KernelizedFeatures
-import torch 
+import torch
+
 
 class TruncatedKernelizedFeatures(KernelizedFeatures):
 
-	def __init__(self, embedding, m, s=0.001, lam=1, d=1, diameter=1, verbose=True, groups=None, bounds=None, scale=1, kappa=1, poly=2,
-	 primal=True, beta_fun=None, alpha_score=lambda t: t**(1/4), default_alpha_score=1., bound = 1.):
-		super().__init__(embedding, m, s =s, lam=lam,d= d,diameter= diameter, verbose=verbose,
-			  groups = groups, bounds=bounds, scale=scale, kappa=kappa, poly=poly, primal=primal, beta_fun = beta_fun, bound = bound)
-		primal = True
-		self.bound = bound
-		self.alpha_score = alpha_score
-		self.default_alpha_score = default_alpha_score
-
-	def theta_mean(self, var=False, prior=False):
-		self.precompute()
-
-		if self.fitted == True and prior == False:
-			theta_mean = self.invV@self.Q.T@self.y_truncated
-			Z = self.s**2 * self.invV
-		else:
-			theta_mean = 0*torch.ones(size=(self.m, 1)).double()
-
-		if var is False:
-			return theta_mean
-		else:
-			return (theta_mean, Z)
-
-	def fit(self, x=None, y=None):
-		self.alphas = self.y*0 + self.default_alpha_score
-		super().fit(x= x, y= y)
-
-	def add_points(self,d):
-		x, y = d
-		if self.x is not None:
-			self.x = torch.cat((self.x, x), dim=0)
-			self.y = torch.cat((self.y, y), dim=0)
-			new_alpha =torch.Tensor( [self.alpha_score(self.x.size()[0])]).view(1,1)
-			self.alphas = torch.cat((self.alphas,new_alpha),dim=0)
-		else:
-			self.x = x
-			self.y = y
-			self.alphas = self.default_alpha_score
-		self.fitted = False
-
-	def add_data_point(self,x,y):
-		self.add_points(x,y)
-		
-	def precompute(self):
-		if self.fitted == False:
-			self.Q = self.embed(self.x)
-			I = torch.eye(int(self.m)).double()
-			Z_ = self.Q.T@self.Q
-			self.V = Z_ + (self.s **2) * self.lam *I
-			self.invV = torch.pinverse(self.V)
-			self.y_truncated = self.y.view(-1)*(torch.abs(self.y) < self.alphas).view(-1).double()
-			self.y_truncated = self.y_truncated.view(-1,1)
-			self.fitted = True
-		else:
-			pass
+    def __init__(
+        self,
+        embedding,
+        m,
+        s=0.001,
+        lam=1,
+        d=1,
+        diameter=1,
+        verbose=True,
+        groups=None,
+        bounds=None,
+        scale=1,
+        kappa=1,
+        poly=2,
+        primal=True,
+        beta_fun=None,
+        alpha_score=lambda t: t ** (1 / 4),  # truncation level as a function of the sample count
+        default_alpha_score=1.0,  # level used by fit() and for the first batch in add_points()
+        bound=1.0,
+    ):
+        super().__init__(
+            embedding,
+            m,
+            s=s,
+            lam=lam,
+            d=d,
+            diameter=diameter,
+            verbose=verbose,
+            groups=groups,
+            bounds=bounds,
+            scale=scale,
+            kappa=kappa,
+            poly=poly,
+            primal=primal,
+            beta_fun=beta_fun,
+            bound=bound,
+        )
+        primal = True  # NOTE(review): rebinds the local after super().__init__ used it; no effect
+        self.bound = bound
+        self.alpha_score = alpha_score
+        self.default_alpha_score = default_alpha_score
+
+    def theta_mean(self, var=False, prior=False):  # returns theta, or (theta, Z) when var is set
+        self.precompute()
+
+        if self.fitted == True and prior == False:
+            theta_mean = self.invV @ self.Q.T @ self.y_truncated  # ridge fit on truncated y
+            Z = self.s**2 * self.invV
+        else:
+            theta_mean = 0 * torch.ones(size=(self.m, 1)).double()
+            Z = torch.eye(int(self.m)).double() / self.lam  # s^2 * inv(V) with V = s^2 * lam * I
+        if var is False:
+            return theta_mean
+        else:
+            return (theta_mean, Z)
+
+    def fit(self, x=None, y=None):  # reset all truncation levels, then defer to the parent fit
+        self.alphas = self.y * 0 + self.default_alpha_score  # NOTE(review): reads self.y before super().fit gets y - confirm
+        super().fit(x=x, y=y)
+
+    def add_points(self, d):  # d is an (x, y) pair of tensors appended to the stored data
+        x, y = d
+        if self.x is not None:
+            self.x = torch.cat((self.x, x), dim=0)
+            self.y = torch.cat((self.y, y), dim=0)
+            new_alpha = y * 0 + self.alpha_score(self.x.size()[0])  # one level per added row
+            self.alphas = torch.cat((self.alphas, new_alpha), dim=0)
+        else:
+            self.x = x
+            self.y = y
+            self.alphas = y * 0 + self.default_alpha_score  # a tensor, so later torch.cat works
+        self.fitted = False  # forces precompute() to rebuild on next use
+
+    def add_data_point(self, x, y):  # convenience wrapper: add_points expects one (x, y) pair
+        self.add_points((x, y))
 
+    def precompute(self):  # rebuilds Q, V, invV and y_truncated unless already fitted
+        if self.fitted == False:
+            self.Q = self.embed(self.x)
+            I = torch.eye(int(self.m)).double()
+            Z_ = self.Q.T @ self.Q
+            self.V = Z_ + (self.s**2) * self.lam * I
+            self.invV = torch.pinverse(self.V)
+            self.y_truncated = (
+                self.y.view(-1) * (torch.abs(self.y) < self.alphas).view(-1).double()
+            )  # responses with |y| >= their alpha are replaced by 0
+            self.y_truncated = self.y_truncated.view(-1, 1)
+            self.fitted = True
+        else:
+            pass
diff --git a/stpy/dimred/sri.py b/stpy/dimred/sri.py
index 6904037..04f6d75 100644
--- a/stpy/dimred/sri.py
+++ b/stpy/dimred/sri.py
@@ -5,130 +5,138 @@
 from sklearn.cluster import KMeans
 
 
-class SRI():
-
-	def __init__(self):
-		"""
-		:param X: X values
-		:param y: response variables
-		:param relative: relative to number of samples
-		:param buckets:
-		"""
-
-	def standardize(self, X):
-		(n, d) = X.size()
-		Sigma_x = np.cov(self.X.numpy().T)
-		E_x = np.mean(self.X.numpy(), axis=0)
-		Sigma_x_half_inv = np.linalg.pinv(scipy.linalg.sqrtm(Sigma_x))
-		Z = (X.numpy() - np.outer(np.ones(n), E_x)) @ Sigma_x_half_inv
-
-		return Sigma_x_half_inv, Z
-
-	def slice_kmeans(self, y):
-		indices = []
-		kmeans = KMeans(n_clusters=self.buckets).fit(y.numpy().reshape(-1, 1))
-
-		for label in range(self.buckets):
-			ind = kmeans.labels_ == label
-			indices.append(ind)
-		return indices
-
-	def fit_sri(self, X, y, buckets=10):
-		self.X = X
-		self.y = y
-		self.buckets = buckets
-		(n, d) = self.X.size()
-		Sigma_x_half_inv, Z = self.standardize(self.X)
-
-		if isinstance(self.buckets, int):
-			indices = self.slice_kmeans(self.y)
-
-			zs = []
-			ns = []
-			for ind in indices:
-				if np.sum(ind) > 1:
-					z = np.mean(Z[ind, :].reshape(-1, d), axis=0)
-					ns.append(np.sum(ind))
-					zs.append(z)
-			Zn = np.array(zs)
-			V = (Zn.T @ np.diag(ns) @ Zn) / self.buckets
-
-		else:
-			raise AssertionError("Unknown bucketing rule.")
-
-		lams, eignv = np.linalg.eig(V)
-		betas = Sigma_x_half_inv @ eignv
-		return torch.from_numpy(lams), torch.from_numpy(betas)
-
-	def fit_save(self, X, y, buckets=10):
-		self.X = X
-		self.y = y
-		self.buckets = buckets
-		(n, d) = self.X.size()
-		Sigma_x_half_inv, Z = self.standardize(self.X)
-
-		if isinstance(self.buckets, int):
-			indices = self.slice_kmeans(self.y)
-
-			V = np.zeros(shape=(d, d))
-			I = np.eye(d)
-			for ind in indices:
-				ns = np.sum(ind)
-				if ns > 1:
-					Covar_slice = np.cov(Z[ind, :].reshape(-1, d).T)
-					V = V + ((I - Covar_slice) @ (I - Covar_slice)) * (float(ns) / float(n))
-
-		else:
-			raise AssertionError("Unknown bucketing rule.")
-
-		lams, eignv = np.linalg.eig(V)
-		betas = Sigma_x_half_inv @ eignv
-		return torch.from_numpy(lams), torch.from_numpy(betas)
-
-	def gradient_design(self, d, k, nablaF, eps=10e-4):
-		Sigma = torch.eye(d).double() * eps
-		x0 = torch.rand(size=(k, d)).double()
-		subspace = nablaF(x0)
-		Sigma = Sigma + subspace.T @ subspace
-		return x0, Sigma, subspace
-
-	def sample_dir(self, n, x0, subspace, eps=10e-4):
-		indices = np.arange(0, x0.size()[0], 1)
-		choice = np.random.choice(indices, n, replace=True)
-		magnitude = np.diag(np.random.randn(n))
-		sample = x0.numpy()[choice] + magnitude @ subspace[choice].numpy() + eps * np.random.randn(n, d)
-		return torch.from_numpy(sample)
+class SRI:
+
+    def __init__(self):
+        """
+        :param X: X values
+        :param y: response variables
+        :param relative: relative to number of samples
+        :param buckets:
+        """
+
+    def standardize(self, X):
+        (n, d) = X.size()
+        Sigma_x = np.cov(self.X.numpy().T)
+        E_x = np.mean(self.X.numpy(), axis=0)
+        Sigma_x_half_inv = np.linalg.pinv(scipy.linalg.sqrtm(Sigma_x))
+        Z = (X.numpy() - np.outer(np.ones(n), E_x)) @ Sigma_x_half_inv
+
+        return Sigma_x_half_inv, Z
+
+    def slice_kmeans(self, y):
+        indices = []
+        kmeans = KMeans(n_clusters=self.buckets).fit(y.numpy().reshape(-1, 1))
+
+        for label in range(self.buckets):
+            ind = kmeans.labels_ == label
+            indices.append(ind)
+        return indices
+
+    def fit_sri(self, X, y, buckets=10):
+        self.X = X
+        self.y = y
+        self.buckets = buckets
+        (n, d) = self.X.size()
+        Sigma_x_half_inv, Z = self.standardize(self.X)
+
+        if isinstance(self.buckets, int):
+            indices = self.slice_kmeans(self.y)
+
+            zs = []
+            ns = []
+            for ind in indices:
+                if np.sum(ind) > 1:
+                    z = np.mean(Z[ind, :].reshape(-1, d), axis=0)
+                    ns.append(np.sum(ind))
+                    zs.append(z)
+            Zn = np.array(zs)
+            V = (Zn.T @ np.diag(ns) @ Zn) / self.buckets
+
+        else:
+            raise AssertionError("Unknown bucketing rule.")
+
+        lams, eignv = np.linalg.eig(V)
+        betas = Sigma_x_half_inv @ eignv
+        return torch.from_numpy(lams), torch.from_numpy(betas)
+
+    def fit_save(self, X, y, buckets=10):
+        self.X = X
+        self.y = y
+        self.buckets = buckets
+        (n, d) = self.X.size()
+        Sigma_x_half_inv, Z = self.standardize(self.X)
+
+        if isinstance(self.buckets, int):
+            indices = self.slice_kmeans(self.y)
+
+            V = np.zeros(shape=(d, d))
+            I = np.eye(d)
+            for ind in indices:
+                ns = np.sum(ind)
+                if ns > 1:
+                    Covar_slice = np.cov(Z[ind, :].reshape(-1, d).T)
+                    V = V + ((I - Covar_slice) @ (I - Covar_slice)) * (
+                        float(ns) / float(n)
+                    )
+
+        else:
+            raise AssertionError("Unknown bucketing rule.")
+
+        lams, eignv = np.linalg.eig(V)
+        betas = Sigma_x_half_inv @ eignv
+        return torch.from_numpy(lams), torch.from_numpy(betas)
+
+    def gradient_design(self, d, k, nablaF, eps=10e-4):
+        Sigma = torch.eye(d).double() * eps
+        x0 = torch.rand(size=(k, d)).double()
+        subspace = nablaF(x0)
+        Sigma = Sigma + subspace.T @ subspace
+        return x0, Sigma, subspace
+
+    def sample_dir(self, n, x0, subspace, eps=10e-4):
+        # n points: random x0 rows + N(0,1) steps along matching subspace rows + eps noise
+        choice = np.random.choice(x0.size()[0], n, replace=True)
+        magnitude = np.diag(np.random.randn(n))
+        sample = (
+            x0.numpy()[choice]
+            + magnitude @ subspace[choice].numpy()
+            + eps * np.random.randn(n, x0.size()[1])  # `d` was undefined here
+        )
+        return torch.from_numpy(sample)
 
 
 if __name__ == "__main__":
-	d = 3
-	p = 2
+    d = 3
+    p = 2
 
-	sigma = 0.
-	A = torch.from_numpy(np.random.randn(d, p))
-	A = torch.from_numpy(np.eye(d, p))
-	print(A)
-	# exampel function
-	f = lambda x: torch.sum((x @ A) ** 2, dim=1) + sigma * torch.randn(x.size()[0], dtype=torch.double)
-	f_no_noise = lambda x: torch.sum((x @ A) ** 2, dim=1)
+    sigma = 0.0
+    A = torch.from_numpy(np.random.randn(d, p))
+    A = torch.from_numpy(np.eye(d, p))
+    print(A)
+    # example function
+    f = lambda x: torch.sum((x @ A) ** 2, dim=1) + sigma * torch.randn(
+        x.size()[0], dtype=torch.double
+    )
+    f_no_noise = lambda x: torch.sum((x @ A) ** 2, dim=1)
 
-	nablaF = lambda x: x @ A @ A.T
+    nablaF = lambda x: x @ A @ A.T
 
-	DimRed = SRI()
-	N = 100
-	x0, Sigma, subspace = DimRed.gradient_design(d, d, nablaF)
-	X0 = DimRed.sample_dir(N, x0, subspace)
-	y0 = f(X0)
+    DimRed = SRI()
+    N = 100
+    x0, Sigma, subspace = DimRed.gradient_design(d, d, nablaF)
+    X0 = DimRed.sample_dir(N, x0, subspace)
+    y0 = f(X0)
 
-	plt.scatter(X0[:, 0], X0[:, 1], c=y0.view(-1))
-	plt.show()
+    plt.scatter(X0[:, 0], X0[:, 1], c=y0.view(-1))
+    plt.show()
 
-	lams, betas = DimRed.fit_sri(X0, y0, buckets=20)
+    lams, betas = DimRed.fit_sri(X0, y0, buckets=20)
 
-	print(lams / torch.sum(lams))
-	print(betas)
+    print(lams / torch.sum(lams))
+    print(betas)
 
-	lams2, betas2 = DimRed.fit_save(X0, y0, buckets=20)
+    lams2, betas2 = DimRed.fit_save(X0, y0, buckets=20)
 
-	print(lams2 / torch.sum(lams2))
-	print(betas2)
+    print(lams2 / torch.sum(lams2))
+    print(betas2)
diff --git a/stpy/embeddings/bernstein_embedding.py b/stpy/embeddings/bernstein_embedding.py
index 3653911..68467c6 100644
--- a/stpy/embeddings/bernstein_embedding.py
+++ b/stpy/embeddings/bernstein_embedding.py
@@ -7,105 +7,109 @@
 
 class BernsteinEmbedding(PositiveEmbedding):
 
-	def __init__(self, *args, **kwargs):
-		super().__init__(*args, **kwargs)
-
-	def basis_fun(self, x, j):  # 1d
-		"""
-		Return the value of basis function \phi_j(x)
-		:param x: double, need to be in the interval
-		:param j: integer, index of hat functions, 0 <= j <= m-1
-		:return: \phi_j(x)
-		"""
-		lim = [self.interval[0], self.interval[1]]
-		c = np.zeros(shape=(self.m, 1))
-		c[j] = 1
-		bp = BPoly(c, lim, extrapolate=False)
-		res = bp(x.numpy())
-		value = torch.from_numpy(np.nan_to_num(res))
-		return value
-
-	# return torch.from_numpy(bp(x.numpy()))
-
-	def get_polynomial(self, j):
-		if self.d == 1:
-			lim = [self.interval[0], self.interval[1]]
-			c = np.zeros(shape=(self.m, 1))
-			c[j] = 1
-			roots = PPoly.from_bernstein_basis(BPoly(c, lim)).roots()
-			poly = np.polynomial.polynomial.Polynomial(np.polynomial.polynomial.polyfromroots(roots),
-													   domain=np.array(lim))
-
-		elif self.d == 2:
-			lim = [self.interval[0], self.interval[1]]
-			k = j // self.m
-			l = j % self.m
-			c = np.zeros(shape=(self.m, 1))
-			c[k] = 1
-			bp = BPoly(c, lim)
-			c = np.zeros(shape=(self.m, 1))
-			c[l] = 1
-			bp2 = BPoly(c, lim)
-			roots1 = PPoly.from_bernstein_basis(bp).roots()
-			roots2 = PPoly.from_bernstein_basis(bp2).roots()
-			poly1 = np.polynomial.polynomial.Polynomial(np.polynomial.polynomial.polyfromroots(roots1),
-														domain=np.array(lim))
-			poly2 = np.polynomial.polynomial.Polynomial(np.polynomial.polynomial.polyfromroots(roots2),
-														domain=np.array(lim))
-			poly = poly1 * poly2
-		return poly
-
-	def integral(self, S):
-		assert (S.d == self.d)
-		psi = torch.zeros(self.get_m()).double()
-
-		if self.d == 1:
-			a, b = float(S.bounds[0, 0]), float(S.bounds[0, 1])
-			for j in range(self.get_m()):
-				lim = [self.interval[0], self.interval[1]]
-				c = np.zeros(shape=(self.m, 1))
-				c[j] = 1
-				bp = BPoly(c, lim)
-				xa = np.maximum(self.interval[0], a)
-				xb = np.minimum(self.interval[1], b)
-				psi[j] = bp.integrate(xa, xb, extrapolate=False)
-
-		elif self.d == 2:
-			xa, xb = S.bounds[0, 0], S.bounds[0, 1]
-			ya, yb = S.bounds[1, 0], S.bounds[1, 1]
-			for j in range(self.get_m()):
-				lim = [self.interval[0], self.interval[1]]
-
-				k = j // self.m
-				l = j % self.m
-
-				c = np.zeros(shape=(self.m, 1))
-				c[k] = 1
-				bp = BPoly(c, lim)
-				vol1 = bp.integrate(xa, xb)
-				c = np.zeros(shape=(self.m, 1))
-				c[l] = 1
-				bp = BPoly(c, lim)
-				vol2 = bp.integrate(ya, yb)
-				psi[j] = vol1 * vol2
-
-		Gamma_half = self.cov()
-		return psi @ Gamma_half
-
-	def product_integral(self, S):
-		m = self.get_m()
-		Psi = torch.zeros(size=(m, m)).double()
-		a, b = S.bounds[0, 0], S.bounds[0, 1]
-		for i in range(m):
-			for j in range(m):
-				P = self.get_polynomial(i) * self.get_polynomial(j)
-				new_p = P.integ()
-				xb = np.minimum(new_p.domain[1], b)
-				xa = np.maximum(new_p.domain[0], a)
-				Psi[i, j] = new_p(xb) - new_p(xa)
-				print(i, j, Psi[i, j])
-		Gamma_half = self.cov()
-		return Gamma_half @ Psi @ Gamma_half.T
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def basis_fun(self, x, j):  # 1d
+        r"""
+        Return the value of basis function \phi_j(x)
+        :param x: torch tensor of points; values BPoly reports as NaN (no extrapolation) become 0
+        :param j: integer, index of the Bernstein basis polynomial, 0 <= j <= m-1
+        :return: \phi_j(x) as a torch tensor
+        """
+        lim = [self.interval[0], self.interval[1]]
+        c = np.zeros(shape=(self.m, 1))
+        c[j] = 1
+        bp = BPoly(c, lim, extrapolate=False)
+        res = bp(x.numpy())
+        value = torch.from_numpy(np.nan_to_num(res))
+        return value
+
+    # return torch.from_numpy(bp(x.numpy()))
+
+    def get_polynomial(self, j):
+        if self.d == 1:
+            lim = [self.interval[0], self.interval[1]]
+            c = np.zeros(shape=(self.m, 1))
+            c[j] = 1
+            roots = PPoly.from_bernstein_basis(BPoly(c, lim)).roots()
+            poly = np.polynomial.polynomial.Polynomial(
+                np.polynomial.polynomial.polyfromroots(roots), domain=np.array(lim)
+            )
+
+        elif self.d == 2:
+            lim = [self.interval[0], self.interval[1]]
+            k = j // self.m
+            l = j % self.m
+            c = np.zeros(shape=(self.m, 1))
+            c[k] = 1
+            bp = BPoly(c, lim)
+            c = np.zeros(shape=(self.m, 1))
+            c[l] = 1
+            bp2 = BPoly(c, lim)
+            roots1 = PPoly.from_bernstein_basis(bp).roots()
+            roots2 = PPoly.from_bernstein_basis(bp2).roots()
+            poly1 = np.polynomial.polynomial.Polynomial(
+                np.polynomial.polynomial.polyfromroots(roots1), domain=np.array(lim)
+            )
+            poly2 = np.polynomial.polynomial.Polynomial(
+                np.polynomial.polynomial.polyfromroots(roots2), domain=np.array(lim)
+            )
+            poly = poly1 * poly2
+        return poly
+
+    def integral(self, S):
+        assert S.d == self.d
+        psi = torch.zeros(self.get_m()).double()
+
+        if self.d == 1:
+            a, b = float(S.bounds[0, 0]), float(S.bounds[0, 1])
+            for j in range(self.get_m()):
+                lim = [self.interval[0], self.interval[1]]
+                c = np.zeros(shape=(self.m, 1))
+                c[j] = 1
+                bp = BPoly(c, lim)
+                xa = np.maximum(self.interval[0], a)
+                xb = np.minimum(self.interval[1], b)
+                psi[j] = bp.integrate(xa, xb, extrapolate=False)
+
+        elif self.d == 2:
+            xa, xb = S.bounds[0, 0], S.bounds[0, 1]
+            ya, yb = S.bounds[1, 0], S.bounds[1, 1]
+            for j in range(self.get_m()):
+                lim = [self.interval[0], self.interval[1]]
+
+                k = j // self.m
+                l = j % self.m
+
+                c = np.zeros(shape=(self.m, 1))
+                c[k] = 1
+                bp = BPoly(c, lim)
+                vol1 = bp.integrate(xa, xb)
+                c = np.zeros(shape=(self.m, 1))
+                c[l] = 1
+                bp = BPoly(c, lim)
+                vol2 = bp.integrate(ya, yb)
+                psi[j] = vol1 * vol2
+
+        Gamma_half = self.cov()
+        return psi @ Gamma_half
+
+    def product_integral(self, S):
+        m = self.get_m()
+        Psi = torch.zeros(size=(m, m)).double()
+        a, b = S.bounds[0, 0], S.bounds[0, 1]
+        for i in range(m):
+            for j in range(m):
+                P = self.get_polynomial(i) * self.get_polynomial(j)
+                new_p = P.integ()
+                xb = np.minimum(new_p.domain[1], b)
+                xa = np.maximum(new_p.domain[0], a)
+                Psi[i, j] = new_p(xb) - new_p(xa)
+                print(i, j, Psi[i, j])
+        Gamma_half = self.cov()
+        return Gamma_half @ Psi @ Gamma_half.T
+
 
 # def cov(self, inverse = False):
 # 	if self.precomp == False:
@@ -127,272 +131,299 @@ def product_integral(self, S):
 
 class BernsteinSplinesOverlapping(PositiveEmbedding):
 
-	def __init__(self, *args, degree=4, **kwargs):
-		super().__init__(*args, **kwargs)
-		self.degree = degree
-
-	def basis_fun(self, x, q, derivative=0, extrapolate=False):  # 1d
-		"""
-		Return the value of basis function \phi_j(x)
-
-		:param x: double, need to be in the interval
-		:param j: integer, index of hat functions, 0 <= j <= m-1
-		:return: \phi_j(x)
-		"""
-
-		j = q // (self.degree // 2)
-		k = q % (self.degree // 2)
-
-		dm = (self.interval[1] - self.interval[0]) / ((self.m // (self.degree // 2)))  # delta m
-		tj = self.interval[0] + j * dm
-		lim = [tj, tj + 2 * dm]
-
-		c = np.zeros(shape=(self.degree // 2, 1))
-		c[k] = 1.
-		bp = BPoly(c, lim)
-		res = bp(x.numpy(), nu=derivative, extrapolate=extrapolate)
-
-		if extrapolate == False:
-			mask = x.numpy() == (tj + dm / 2)
-			res[mask] = np.nan
-		value = torch.from_numpy(np.nan_to_num(res))
-		return value
-
-	def integral(self, S):
-		assert (S.d == self.d)
-		psi = torch.zeros(self.get_m()).double()
-
-		if self.d == 1:
-			a, b = float(S.bounds[0, 0]), float(S.bounds[0, 1])
-			for q in range(self.get_m()):
-				j = q // self.degree
-				k = q % self.degree
-
-				dm = (self.interval[1] - self.interval[0]) / ((self.m // self.degree))  # delta m
-				tj = self.interval[0] + j * dm
-				lim = [tj, tj + dm]
-				c = np.zeros(shape=(self.degree, 1))
-				c[k] = 1.
-				bp = BPoly(c, lim)
-				xa = np.maximum(tj, a)
-				xb = np.minimum(tj + dm, b)
-				psi[q] = np.nan_to_num(bp.integrate(xa, xb, extrapolate=False))
-
-		elif self.d == 2:
-			xa, xb = S.bounds[0, 0], S.bounds[0, 1]
-			ya, yb = S.bounds[1, 0], S.bounds[1, 1]
-			for z in range(self.get_m()):
-				q1 = z // self.m
-				q2 = z % self.m
-
-				j1 = q1 // self.degree
-				k1 = q1 % self.degree
-				j2 = q2 // self.degree
-				k2 = q2 % self.degree
-
-				dm = (self.interval[1] - self.interval[0]) / ((self.m // self.degree))  # delta m
-				tj1 = self.interval[0] + j1 * dm
-				tj2 = self.interval[0] + j2 * dm
-				lim1 = [tj1, tj1 + dm]
-				lim2 = [tj2, tj2 + dm]
-				c = np.zeros(shape=(self.degree, 1))
-				c[k1] = 1.
-				bp = BPoly(c, lim1)
-				vol1 = bp.integrate(xa, xb)
-				c = np.zeros(shape=(self.degree, 1))
-				c[k2] = 1.
-				bp = BPoly(c, lim2)
-				vol2 = bp.integrate(ya, yb)
-				psi[z] = vol1 * vol2
-
-		Gamma_half = self.cov()
-		return psi @ Gamma_half
+    def __init__(self, *args, degree=4, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.degree = degree
+
+    def basis_fun(self, x, q, derivative=0, extrapolate=False):  # 1d
+        r"""
+        Return the value of basis function \phi_q(x), or its derivative of order `derivative`
+
+        :param x: torch tensor of evaluation points
+        :param q: integer, flat index; piece q // (degree // 2), local index q % (degree // 2)
+        :return: \phi_q(x) as a torch tensor, with NaN entries replaced by 0
+        """
+
+        j = q // (self.degree // 2)
+        k = q % (self.degree // 2)
+
+        dm = (self.interval[1] - self.interval[0]) / (
+            (self.m // (self.degree // 2))
+        )  # delta m
+        tj = self.interval[0] + j * dm
+        lim = [tj, tj + 2 * dm]
+
+        c = np.zeros(shape=(self.degree // 2, 1))
+        c[k] = 1.0
+        bp = BPoly(c, lim)
+        res = bp(x.numpy(), nu=derivative, extrapolate=extrapolate)
+
+        if extrapolate == False:
+            mask = x.numpy() == (tj + dm / 2)
+            res[mask] = np.nan  # points exactly at tj + dm / 2 are zeroed (NaN -> 0 below)
+        value = torch.from_numpy(np.nan_to_num(res))
+        return value
+
+    def integral(self, S):
+        assert S.d == self.d
+        psi = torch.zeros(self.get_m()).double()
+
+        if self.d == 1:
+            a, b = float(S.bounds[0, 0]), float(S.bounds[0, 1])
+            for q in range(self.get_m()):
+                j = q // self.degree
+                k = q % self.degree
+
+                dm = (self.interval[1] - self.interval[0]) / (
+                    (self.m // self.degree)
+                )  # delta m
+                tj = self.interval[0] + j * dm
+                lim = [tj, tj + dm]
+                c = np.zeros(shape=(self.degree, 1))
+                c[k] = 1.0
+                bp = BPoly(c, lim)
+                xa = np.maximum(tj, a)
+                xb = np.minimum(tj + dm, b)
+                psi[q] = np.nan_to_num(bp.integrate(xa, xb, extrapolate=False))
+
+        elif self.d == 2:
+            xa, xb = S.bounds[0, 0], S.bounds[0, 1]
+            ya, yb = S.bounds[1, 0], S.bounds[1, 1]
+            for z in range(self.get_m()):
+                q1 = z // self.m
+                q2 = z % self.m
+
+                j1 = q1 // self.degree
+                k1 = q1 % self.degree
+                j2 = q2 // self.degree
+                k2 = q2 % self.degree
+
+                dm = (self.interval[1] - self.interval[0]) / (
+                    (self.m // self.degree)
+                )  # delta m
+                tj1 = self.interval[0] + j1 * dm
+                tj2 = self.interval[0] + j2 * dm
+                lim1 = [tj1, tj1 + dm]
+                lim2 = [tj2, tj2 + dm]
+                c = np.zeros(shape=(self.degree, 1))
+                c[k1] = 1.0
+                bp = BPoly(c, lim1)
+                vol1 = bp.integrate(xa, xb)
+                c = np.zeros(shape=(self.degree, 1))
+                c[k2] = 1.0
+                bp = BPoly(c, lim2)
+                vol2 = bp.integrate(ya, yb)
+                psi[z] = vol1 * vol2
+
+        Gamma_half = self.cov()
+        return psi @ Gamma_half
 
 
 class BernsteinSplinesEmbedding(PositiveEmbedding):
 
-	def __init__(self, *args, degree=4, derivatives=2, **kwargs):
-		super().__init__(*args, **kwargs)
-		self.degree = degree
-		self.derivatives = derivatives
-
-	# def basis_fun(self, x, j, k, derivative = 0, extrapolate = False): #1d
-	def basis_fun(self, x, q, derivative=0, extrapolate=False):  # 1d
-		"""
-		Return the value of basis function \phi_j(x)
-
-		:param x: double, need to be in the interval
-		:param j: integer, index of hat functions, 0 <= j <= m-1
-		:return: \phi_j(x)
-		"""
-
-		j = q // self.degree
-		k = q % self.degree
-
-		dm = (self.interval[1] - self.interval[0]) / ((self.m // self.degree))  # delta m
-		tj = self.interval[0] + j * dm
-
-		lim = [tj, tj + dm]
-		c = np.zeros(shape=(self.degree, 1))
-		c[k] = 1.
-		bp = BPoly(c, lim)
-		res = bp(x.numpy(), nu=derivative, extrapolate=extrapolate)
-
-		if extrapolate == False:
-			mask = x.numpy() == (tj + dm)
-			res[mask] = np.nan
-		value = torch.from_numpy(np.nan_to_num(res))
-		return value
-
-	def embed_internal_derivative(self, x, l=1, extrapolate=False):
-		if self.d == 1:
-			out = torch.zeros(size=(x.size()[0], self.m), dtype=torch.float64)
-			for j in range(0, self.m, 1):
-				out[:, j] = self.basis_fun(x, j, derivative=l, extrapolate=extrapolate).view(-1)
-			return out
-
-	def get_constraints(self):
-		s = self.m ** self.d
-
-		# positivity constraints
-		l = np.full(s, self.b)
-		u = np.full(s, self.B)
-		I = np.identity(s)
-
-		# pointwise fix
-		Zs = []
-		vs = []
-		for j in range(self.derivatives + 1):
-			no_nodes = (self.m // self.degree) - 1
-			Z = np.zeros(shape=(no_nodes, s))
-			dm = (self.interval[1] - self.interval[0]) / ((self.m // self.degree))  # delta m
-
-			for i in range(no_nodes):
-				ti = torch.from_numpy(np.array(self.interval[0] + (i + 1) * dm)).view(1, -1)
-				Z[i, i * self.degree:i * self.degree + self.degree] = self.embed_internal_derivative(ti, l=j,
-																									 extrapolate=True).numpy().reshape(
-					-1)[i * self.degree:i * self.degree + self.degree]
-				Z[i, (i + 1) * self.degree:(i + 1) * self.degree + self.degree] = -self.embed_internal_derivative(ti,
-																												  l=j,
-																												  extrapolate=True).numpy().reshape(
-					-1)[(i + 1) * self.degree:(i + 1) * self.degree + self.degree]
-			v = np.zeros(self.m // self.degree - 1)
-			Zs.append(Z)
-			vs.append(v)
-
-		Lambda = np.concatenate([I] + Zs)
-		l = np.concatenate([l] + vs)
-		u = np.concatenate([u] + vs)
-		return (l, Lambda, u)
-
-	def integral(self, S):
-		assert (S.d == self.d)
-		psi = torch.zeros(self.get_m()).double()
-
-		if self.d == 1:
-			a, b = float(S.bounds[0, 0]), float(S.bounds[0, 1])
-			for q in range(self.get_m()):
-				j = q // self.degree
-				k = q % self.degree
-
-				dm = (self.interval[1] - self.interval[0]) / ((self.m // self.degree))  # delta m
-				tj = self.interval[0] + j * dm
-				lim = [tj, tj + dm]
-				c = np.zeros(shape=(self.degree, 1))
-				c[k] = 1.
-				bp = BPoly(c, lim)
-				xa = np.maximum(tj, a)
-				xb = np.minimum(tj + dm, b)
-				psi[q] = np.nan_to_num(bp.integrate(xa, xb, extrapolate=False))
-
-		elif self.d == 2:
-			xa, xb = S.bounds[0, 0], S.bounds[0, 1]
-			ya, yb = S.bounds[1, 0], S.bounds[1, 1]
-			for z in range(self.get_m()):
-				q1 = z // self.m
-				q2 = z % self.m
-
-				j1 = q1 // self.degree
-				k1 = q1 % self.degree
-				j2 = q2 // self.degree
-				k2 = q2 % self.degree
-
-				dm = (self.interval[1] - self.interval[0]) / ((self.m // self.degree))  # delta m
-				tj1 = self.interval[0] + j1 * dm
-				tj2 = self.interval[0] + j2 * dm
-				lim1 = [tj1, tj1 + dm]
-				lim2 = [tj2, tj2 + dm]
-				c = np.zeros(shape=(self.degree, 1))
-				c[k1] = 1.
-				bp = BPoly(c, lim1)
-				vol1 = bp.integrate(xa, xb)
-				c = np.zeros(shape=(self.degree, 1))
-				c[k2] = 1.
-				bp = BPoly(c, lim2)
-				vol2 = bp.integrate(ya, yb)
-				psi[z] = vol1 * vol2
-
-		Gamma_half = self.cov()
-		return psi @ Gamma_half
-
-	def product_integral(self, S):
-		pass
+    def __init__(self, *args, degree=4, derivatives=2, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.degree = degree
+        self.derivatives = derivatives
+
+    # def basis_fun(self, x, j, k, derivative = 0, extrapolate = False): #1d
+    def basis_fun(self, x, q, derivative=0, extrapolate=False):  # 1d
+        r"""
+        Return the value of basis function \phi_q(x), or its derivative of order `derivative`
+
+        :param x: torch tensor of evaluation points
+        :param q: integer, flat index; piece q // degree, local index q % degree
+        :return: \phi_q(x) as a torch tensor, with NaN entries replaced by 0
+        """
+
+        j = q // self.degree
+        k = q % self.degree
+
+        dm = (self.interval[1] - self.interval[0]) / (
+            (self.m // self.degree)
+        )  # delta m
+        tj = self.interval[0] + j * dm
+
+        lim = [tj, tj + dm]
+        c = np.zeros(shape=(self.degree, 1))
+        c[k] = 1.0
+        bp = BPoly(c, lim)
+        res = bp(x.numpy(), nu=derivative, extrapolate=extrapolate)
+
+        if extrapolate == False:
+            mask = x.numpy() == (tj + dm)
+            res[mask] = np.nan  # right endpoint of the piece is zeroed (NaN -> 0 below)
+        value = torch.from_numpy(np.nan_to_num(res))
+        return value
+
+    def embed_internal_derivative(self, x, l=1, extrapolate=False):
+        if self.d == 1:
+            out = torch.zeros(size=(x.size()[0], self.m), dtype=torch.float64)
+            for j in range(0, self.m, 1):
+                out[:, j] = self.basis_fun(
+                    x, j, derivative=l, extrapolate=extrapolate
+                ).view(-1)
+            return out
+
+    def get_constraints(self):
+        s = self.m**self.d
+
+        # positivity constraints
+        l = np.full(s, self.b)
+        u = np.full(s, self.B)
+        I = np.identity(s)
+
+        # pointwise fix
+        Zs = []
+        vs = []
+        for j in range(self.derivatives + 1):
+            no_nodes = (self.m // self.degree) - 1
+            Z = np.zeros(shape=(no_nodes, s))
+            dm = (self.interval[1] - self.interval[0]) / (
+                (self.m // self.degree)
+            )  # delta m
+
+            for i in range(no_nodes):
+                ti = torch.from_numpy(np.array(self.interval[0] + (i + 1) * dm)).view(
+                    1, -1
+                )
+                Z[i, i * self.degree : i * self.degree + self.degree] = (
+                    self.embed_internal_derivative(ti, l=j, extrapolate=True)
+                    .numpy()
+                    .reshape(-1)[i * self.degree : i * self.degree + self.degree]
+                )
+                Z[i, (i + 1) * self.degree : (i + 1) * self.degree + self.degree] = (
+                    -self.embed_internal_derivative(ti, l=j, extrapolate=True)
+                    .numpy()
+                    .reshape(-1)[
+                        (i + 1) * self.degree : (i + 1) * self.degree + self.degree
+                    ]
+                )
+            v = np.zeros(self.m // self.degree - 1)
+            Zs.append(Z)
+            vs.append(v)
+
+        Lambda = np.concatenate([I] + Zs)
+        l = np.concatenate([l] + vs)
+        u = np.concatenate([u] + vs)
+        return (l, Lambda, u)
+
+    def integral(self, S):
+        assert S.d == self.d
+        psi = torch.zeros(self.get_m()).double()
+
+        if self.d == 1:
+            a, b = float(S.bounds[0, 0]), float(S.bounds[0, 1])
+            for q in range(self.get_m()):
+                j = q // self.degree
+                k = q % self.degree
+
+                dm = (self.interval[1] - self.interval[0]) / (
+                    (self.m // self.degree)
+                )  # delta m
+                tj = self.interval[0] + j * dm
+                lim = [tj, tj + dm]
+                c = np.zeros(shape=(self.degree, 1))
+                c[k] = 1.0
+                bp = BPoly(c, lim)
+                xa = np.maximum(tj, a)
+                xb = np.minimum(tj + dm, b)
+                psi[q] = np.nan_to_num(bp.integrate(xa, xb, extrapolate=False))
+
+        elif self.d == 2:
+            xa, xb = S.bounds[0, 0], S.bounds[0, 1]
+            ya, yb = S.bounds[1, 0], S.bounds[1, 1]
+            for z in range(self.get_m()):
+                q1 = z // self.m
+                q2 = z % self.m
+
+                j1 = q1 // self.degree
+                k1 = q1 % self.degree
+                j2 = q2 // self.degree
+                k2 = q2 % self.degree
+
+                dm = (self.interval[1] - self.interval[0]) / (
+                    (self.m // self.degree)
+                )  # delta m
+                tj1 = self.interval[0] + j1 * dm
+                tj2 = self.interval[0] + j2 * dm
+                lim1 = [tj1, tj1 + dm]
+                lim2 = [tj2, tj2 + dm]
+                c = np.zeros(shape=(self.degree, 1))
+                c[k1] = 1.0
+                bp = BPoly(c, lim1)
+                vol1 = bp.integrate(xa, xb)
+                c = np.zeros(shape=(self.degree, 1))
+                c[k2] = 1.0
+                bp = BPoly(c, lim2)
+                vol2 = bp.integrate(ya, yb)
+                psi[z] = vol1 * vol2
+
+        Gamma_half = self.cov()
+        return psi @ Gamma_half
+
+    def product_integral(self, S):
+        pass
 
 
 if __name__ == "__main__":
-	from stpy.continuous_processes.gauss_procc import GaussianProcess
-	from stpy.helpers.helper import interval
-	import matplotlib.pyplot as plt
-	from stpy.kernels import KernelFunction
-	from stpy.embeddings.bump_bases import FaberSchauderEmbedding
-
-	d = 1
-	m = 32
-	n = 64
-	N = 10
-
-	sqrtbeta = 2
-	s = 0.001
-	b = 0.0
-	B = 200
-
-	gamma = 0.1
-	kernel_object = KernelFunction(gamma=gamma)
-
-	# Emb = BernsteinSplinesEmbedding(d, m,kernel_object=kernel_object, offset=0.5,b=b,B=B,s = s)
-	EmbBern = BernsteinEmbedding(d, m, kernel_object=kernel_object, offset=0.5, b=b, B=B, s=s)
-	EmbFaber = FaberSchauderEmbedding(d, m, kernel_object=kernel_object, offset=0.5, b=b, B=B, s=s)
-	GP = GaussianProcess(d=d, s=s, kernel=kernel_object)
-	# GPNyst = KernelizedFeatures(embedding=EmbNys.GP,m = m, s = s,)
-
-	xtest = torch.from_numpy(interval(n, d, L_infinity_ball=1.1))
-	x = torch.from_numpy(np.random.uniform(-1, 1, N)).view(-1, 1)
-
-	F_true = lambda x: torch.sin(x) ** 2 - 0.1
-	F = lambda x: F_true(x) + s * torch.randn(x.size()[0]).view(-1, 1).double()
-	y = F(x)
-
-	# Emb.fit_gp(x,y)
-	EmbBern.fit(x, y)
-	EmbFaber.fit(x, y)
-
-	GP.fit_gp(x, y)
-
-	# mu = Emb.mean_std(xtest)
-	mu_true, _ = GP.mean_std(xtest)
-	mu_bern = EmbBern.mean_std(xtest)
-	mu_faber = EmbFaber.mean_std(xtest)
-
-	plt.plot(xtest, xtest * 0 + b, 'k--')
-	# plt.plot(xtest, xtest * 0 + B, 'k--')
-
-	plt.plot(xtest, F_true(xtest), 'r', label='true')
-	# plt.plot(xtest,mu_true_nyst,color = 'lightblue', label = 'Nystrom')
-	plt.plot(xtest, mu_true, 'b--', label='no-constraints')
-
-	plt.plot(x, y, 'ro')
-	# plt.plot(xtest, mu, 'g-x', label = 'splines Bernstein')
-	plt.plot(xtest, mu_bern, 'y-o', label='Bernstein basis')
-	plt.plot(xtest, mu_faber, 'g-o', label='Faber basis')
-	plt.legend()
-	plt.show()
+    from stpy.continuous_processes.gauss_procc import GaussianProcess
+    from stpy.helpers.helper import interval
+    import matplotlib.pyplot as plt
+    from stpy.kernels import KernelFunction
+    from stpy.embeddings.bump_bases import FaberSchauderEmbedding
+
+    d = 1
+    m = 32
+    n = 64
+    N = 10
+
+    sqrtbeta = 2
+    s = 0.001
+    b = 0.0
+    B = 200
+
+    gamma = 0.1
+    kernel_object = KernelFunction(gamma=gamma)
+
+    # Emb = BernsteinSplinesEmbedding(d, m,kernel_object=kernel_object, offset=0.5,b=b,B=B,s = s)
+    EmbBern = BernsteinEmbedding(
+        d, m, kernel_object=kernel_object, offset=0.5, b=b, B=B, s=s
+    )
+    EmbFaber = FaberSchauderEmbedding(
+        d, m, kernel_object=kernel_object, offset=0.5, b=b, B=B, s=s
+    )
+    GP = GaussianProcess(d=d, s=s, kernel=kernel_object)
+    # GPNyst = KernelizedFeatures(embedding=EmbNys.GP,m = m, s = s,)
+
+    xtest = torch.from_numpy(interval(n, d, L_infinity_ball=1.1))
+    x = torch.from_numpy(np.random.uniform(-1, 1, N)).view(-1, 1)
+
+    F_true = lambda x: torch.sin(x) ** 2 - 0.1
+    F = lambda x: F_true(x) + s * torch.randn(x.size()[0]).view(-1, 1).double()
+    y = F(x)
+
+    # Emb.fit_gp(x,y)
+    EmbBern.fit(x, y)
+    EmbFaber.fit(x, y)
+
+    GP.fit_gp(x, y)
+
+    # mu = Emb.mean_std(xtest)
+    mu_true, _ = GP.mean_std(xtest)
+    mu_bern = EmbBern.mean_std(xtest)
+    mu_faber = EmbFaber.mean_std(xtest)
+
+    plt.plot(xtest, xtest * 0 + b, "k--")
+    # plt.plot(xtest, xtest * 0 + B, 'k--')
+
+    plt.plot(xtest, F_true(xtest), "r", label="true")
+    # plt.plot(xtest,mu_true_nyst,color = 'lightblue', label = 'Nystrom')
+    plt.plot(xtest, mu_true, "b--", label="no-constraints")
+
+    plt.plot(x, y, "ro")
+    # plt.plot(xtest, mu, 'g-x', label = 'splines Bernstein')
+    plt.plot(xtest, mu_bern, "y-o", label="Bernstein basis")
+    plt.plot(xtest, mu_faber, "g-o", label="Faber basis")
+    plt.legend()
+    plt.show()
diff --git a/stpy/embeddings/bump_bases.py b/stpy/embeddings/bump_bases.py
index 3e71555..5d86e2c 100644
--- a/stpy/embeddings/bump_bases.py
+++ b/stpy/embeddings/bump_bases.py
@@ -11,120 +11,119 @@
 
 class TriangleEmbedding(PositiveEmbedding):
 
-	def __init__(self, *args, **kwargs):
-
-		super().__init__(*args, **kwargs)
-
-	def basis_fun(self, x, j):
-		"""
-		Return the value of basis function \phi_j(x)
-
-		:param x: double, need to be in the interval
-		:param j: integer, index of hat functions, 0 <= j <= m-1
-		:return: \phi_j(x)
-		"""
-
-		dm = (self.interval[1] - self.interval[0]) / (self.m - 1)  # delta m
-		tj = self.interval[0] + (j) * dm
-		res = 1 - torch.abs((x - tj) / dm)
-		res[res < 0] = 0
-		return res
-
-	def integrate_1d(self, a, b, tj, dm):
-		"""
-		:param a: from
-		:param b: to
-		:param tj: node
-		:param dm: width
-		:return:
-		"""
-		if a <= tj - dm and b >= tj + dm:  # contained
-			vol = 1. * dm
-
-		elif a >= tj + dm or b <= tj - dm:  # outside
-			vol = 0.
-
-		elif a <= tj - dm and b >= tj and b <= tj + dm:  # a out , b inside second half
-			res = max(1. - np.abs((b - tj) / dm), 0)
-			vol = dm * 0.5 + (b - tj) * (1. + res) / 2.
-
-		elif b >= tj + dm and a <= tj and a >= tj - dm:  # b out, a inside first half
-			res = max(1. - np.abs((a - tj) / dm), 0)
-			vol = dm * 0.5 + (tj - a) * (1. + res) / 2.
-
-		elif a <= tj - dm and b <= tj and b >= tj - dm:  # a out, b inside first half
-			res = max(1. - np.abs((b - tj) / dm), 0)
-			vol = 0.5 * (b - (tj - dm)) * res
-
-		elif b >= tj + dm and a >= tj and a <= tj + dm:  # b out, a inside second half
-			res = max(1. - np.abs((a - tj) / dm), 0)
-			vol = 0.5 * ((tj + dm) - a) * res
-
-
-		else:  # inside
-			resa = max(1. - np.abs((a - tj) / dm), 0)
-			resb = max(1. - np.abs((b - tj) / dm), 0)
-
-			if b <= tj:
-				vol = (b - a) * (resb + resa) / 2.
-			elif a >= tj:
-				vol = (b - a) * (resa + resb) / 2.
-			else:
-				vol = (tj - a) * (1 + resa) / 2. + (b - tj) * (resb + 1) / 2.
-
-		return vol
-
-	def integral(self, S):
-		"""
-		Integrate the Phi(x) over S
-		:param S: borel set
-		:return:
-		"""
-		if S in self.procomp_integrals.keys():
-			return self.procomp_integrals[S]
-
-
-		else:
-			assert (S.d == self.d)
-			psi = torch.zeros(self.get_m()).double()
-			if S.type == "box":
-				if self.d == 1:
-					dm = (self.interval[1] - self.interval[0]) / (self.m - 1)  # delta m
-					a, b = S.bounds[0, 0], S.bounds[0, 1]
-					for j in range(self.get_m()):
-						tj = self.interval[0] + j * dm
-						vol = self.integrate_1d(a.numpy(), b.numpy(), tj, dm)
-						psi[j] = vol
-
-				elif self.d == 2:
-					dm = (self.interval[1] - self.interval[0]) / (self.m - 1)  # delta m
-
-					xa, xb = S.bounds[0, 0], S.bounds[0, 1]
-					ya, yb = S.bounds[1, 0], S.bounds[1, 1]
-
-					for j in range(self.get_m()):
-						tj = self.interval[0] + (j % self.m) * dm
-						tk = self.interval[0] + (j // self.m) * dm
-
-						# triangle center point
-						# center_point = torch.Tensor( [tj,tk]).view(-1,1)
-						vol = self.integrate_1d(xa.numpy(), xb.numpy(), tk, dm)
-						vol2 = self.integrate_1d(ya.numpy(), yb.numpy(), tj, dm)
-						psi[j] = vol * vol2
-				# if torch.sum(S.is_inside(center_point)):
-				# psi[j] = (dm**2)/3.
-				else:
-					raise ("more than 2D not implemented.")
-
-			elif S.type == "round":
-				weights, nodes = S.return_legendre_discretization(30)
-				vals = self.embed_internal(nodes)
-				psi = weights.view(1, -1) @ vals
-
-			Gamma_half = self.cov()
-			emb = psi @ Gamma_half
-			self.procomp_integrals[S] = emb
-			return emb
+    def __init__(self, *args, **kwargs):
+        """Forward all positional and keyword arguments to the parent class."""
+        super().__init__(*args, **kwargs)
+
+    def basis_fun(self, x, j):
+        r"""
+        Evaluate the j-th hat function \phi_j at the points x.
+
+        :param x: torch tensor of evaluation points
+        :param j: integer index of the hat function, 0 <= j <= m-1
+        :return: tensor shaped like x holding max(0, 1 - |x - t_j| / dm)
+        """
+        lower, upper = self.interval[0], self.interval[1]
+        spacing = (upper - lower) / (self.m - 1)  # distance between neighbouring nodes
+        node = lower + j * spacing  # peak location of the j-th hat
+        hat = 1 - torch.abs((x - node) / spacing)
+        # outside the support the linear expression is negative; cut it at zero
+        return torch.clamp(hat, min=0)
+
+    def integrate_1d(self, a, b, tj, dm):
+        """Integrate the hat max(0, 1 - |x - tj| / dm) over the interval [a, b].
+        :param a: lower integration limit
+        :param b: upper integration limit
+        :param tj: node, i.e. the location of the peak of the hat
+        :param dm: half-width of the support of the hat
+        :return: value of the integral (the case analysis presumably expects a <= b)
+        """
+        if a <= tj - dm and b >= tj + dm:  # contained
+            vol = 1.0 * dm
+
+        elif a >= tj + dm or b <= tj - dm:  # outside
+            vol = 0.0
+
+        elif a <= tj - dm and b >= tj and b <= tj + dm:  # a out , b inside second half
+            res = max(1.0 - np.abs((b - tj) / dm), 0)
+            vol = dm * 0.5 + (b - tj) * (1.0 + res) / 2.0
+
+        elif b >= tj + dm and a <= tj and a >= tj - dm:  # b out, a inside first half
+            res = max(1.0 - np.abs((a - tj) / dm), 0)
+            vol = dm * 0.5 + (tj - a) * (1.0 + res) / 2.0
+
+        elif a <= tj - dm and b <= tj and b >= tj - dm:  # a out, b inside first half
+            res = max(1.0 - np.abs((b - tj) / dm), 0)
+            vol = 0.5 * (b - (tj - dm)) * res
+
+        elif b >= tj + dm and a >= tj and a <= tj + dm:  # b out, a inside second half
+            res = max(1.0 - np.abs((a - tj) / dm), 0)
+            vol = 0.5 * ((tj + dm) - a) * res
+
+        else:  # inside
+            resa = max(1.0 - np.abs((a - tj) / dm), 0)
+            resb = max(1.0 - np.abs((b - tj) / dm), 0)
+
+            if b <= tj:
+                vol = (b - a) * (resb + resa) / 2.0
+            elif a >= tj:
+                vol = (b - a) * (resa + resb) / 2.0
+            else:
+                vol = (tj - a) * (1 + resa) / 2.0 + (b - tj) * (resb + 1) / 2.0
+
+        return vol
+
+    def integral(self, S):
+        r"""
+        Integrate the embedding Phi(x) over S; results are cached per set S.
+        :param S: borel set, either a 1D/2D "box" or a "round" set (quadrature)
+        :return: $\int_S \Phi(x) dx$ right-multiplied by self.cov()
+        """
+        if S in self.precomp_integral:
+            return self.precomp_integral[S]
+
+        else:
+            assert S.d == self.d
+            psi = torch.zeros(self.get_m()).double()
+            if S.type == "box":
+                if self.d == 1:
+                    dm = (self.interval[1] - self.interval[0]) / (self.m - 1)  # delta m
+                    a, b = S.bounds[0, 0], S.bounds[0, 1]
+                    for j in range(self.get_m()):
+                        tj = self.interval[0] + j * dm
+                        vol = self.integrate_1d(a.numpy(), b.numpy(), tj, dm)
+                        psi[j] = float(vol)
+
+                elif self.d == 2:
+                    dm = (self.interval[1] - self.interval[0]) / (self.m - 1)  # delta m
+
+                    xa, xb = S.bounds[0, 0], S.bounds[0, 1]
+                    ya, yb = S.bounds[1, 0], S.bounds[1, 1]
+
+                    for j in range(self.get_m()):
+                        tj = self.interval[0] + (j % self.m) * dm
+                        tk = self.interval[0] + (j // self.m) * dm
+
+                        # the 2D integral is taken as a product of two 1D integrals
+                        # (x-range against node tk, y-range against node tj)
+                        vol = self.integrate_1d(xa.numpy(), xb.numpy(), tk, dm)
+                        vol2 = self.integrate_1d(ya.numpy(), yb.numpy(), tj, dm)
+                        psi[j] = float(vol * vol2)
+                # if torch.sum(S.is_inside(center_point)):
+                # psi[j] = (dm**2)/3.
+                else:
+                    raise NotImplementedError("more than 2D not implemented.")
+
+            elif S.type == "round":
+                weights, nodes = S.return_legendre_discretization(30)
+                vals = self.embed_internal(nodes)
+                psi = weights.view(1, -1) @ vals
+
+            Gamma_half = self.cov()
+            emb = psi @ Gamma_half
+            self.precomp_integral[S] = emb
+            return emb
+
 
 # def product_integral(self, S):
 # 	assert( S.d == self.d)
@@ -145,286 +144,303 @@ def integral(self, S):
 
 class FaberSchauderEmbedding(TriangleEmbedding):
 
-	def __init__(self, *args, **kwargs):
-		super().__init__(*args, **kwargs)
-		if int(np.log2(self.m)) != np.log2(self.m):
-			raise AssertionError("This basis works only with log_2(n) is integer.")
-
-	def basis_fun(self, x, j):
-		"""
-		Return the value of basis function \phi_j(x)
-
-		:param x: double, need to be in the interval
-		:param j: integer, index of hat functions, 0 <= j <= m-1
-		:return: \phi_j(x)
-		"""
-		if j == 0:
-			res = x * 0 + 1
-		elif j == 1:
-			dm = (self.interval[1] - self.interval[0]) / 2  # delta m
-			res = 1 - torch.abs((x) / dm)
-			res[res < 0] = 0
-		else:
-			level = np.floor(np.log2(j))
-			start = 2 ** level
-			dm = (self.interval[1] - self.interval[0]) / (2 * start)
-			tj = self.interval[0] + (j - start) * 2 * dm + dm
-			res = 1 - torch.abs((x - tj) / dm)
-			res[res < 0] = 0
-		return res
-
-	def integral(self, S):
-		assert (S.d == self.d)
-		psi = torch.zeros(self.get_m()).double()
-
-		if self.d == 1:
-			a, b = S.bounds[0, 0], S.bounds[0, 1]
-			for j in range(self.get_m()):
-				if j == 0:
-					vol = (b - a)
-				elif j == 1:
-					dm = (self.interval[1] - self.interval[0]) / 2  # delta m
-					vol = self.integrate_1d(a.numpy(), b.numpy(), 0, dm)
-				else:
-					level = np.floor(np.log2(j))
-					start = 2 ** level
-					dm = (self.interval[1] - self.interval[0]) / (2 * start)
-					tj = self.interval[0] + (j - start) * 2 * dm + dm
-					vol = self.integrate_1d(a.numpy(), b.numpy(), tj, dm)
-				psi[j] = vol
-		return psi
-
-	def hierarchical_mask(self):
-		mask = [0]
-		for i in range(int(np.log2(self.m))):
-			for j in range(2**i):
-				mask.append(i+1)
-		return torch.Tensor(mask)
-
-	def product_integral(self):
-		raise NotImplementedError("Not implemented.")
-		pass
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if int(np.log2(self.m)) != np.log2(self.m):  # m has to be a power of two
+            raise AssertionError("This basis works only when log_2(m) is an integer.")
+
+    def basis_fun(self, x, j):
+        r"""
+        Evaluate the j-th Faber-Schauder basis function \phi_j at the points x.
+
+        :param x: torch tensor of evaluation points
+        :param j: integer index, 0 <= j <= m-1; j = 0 is the constant function
+        :return: tensor shaped like x with the values \phi_j(x)
+        """
+        if j == 0:
+            return x * 0 + 1
+
+        width = self.interval[1] - self.interval[0]
+        if j == 1:
+            dm = width / 2  # half-width of the single coarsest hat
+            tj = 0  # NOTE(review): peak fixed at 0, i.e. assumes a symmetric interval
+        else:
+            level = np.floor(np.log2(j))
+            start = 2**level  # index of the first hat on this level
+            dm = width / (2 * start)
+            tj = self.interval[0] + (j - start) * 2 * dm + dm
+        # hat of half-width dm centred at tj, cut at zero outside its support
+        return torch.clamp(1 - torch.abs((x - tj) / dm), min=0)
+
+    def integral(self, S):
+        """Integrate every basis function over the 1D set S (zeros unless d == 1)."""
+        assert S.d == self.d
+        psi = torch.zeros(self.get_m()).double()
+        if self.d == 1:
+            a, b = S.bounds[0, 0], S.bounds[0, 1]
+            lo, hi = a.numpy(), b.numpy()  # hoisted: identical for every j
+            width = self.interval[1] - self.interval[0]
+            for j in range(self.get_m()):
+                if j == 0:
+                    vol = b - a  # constant function integrates to the box length
+                elif j == 1:
+                    vol = self.integrate_1d(lo, hi, 0, width / 2)  # hat centred at 0
+                else:
+                    start = 2 ** np.floor(np.log2(j))  # first index on this level
+                    dm = width / (2 * start)
+                    tj = self.interval[0] + (j - start) * 2 * dm + dm
+                    vol = self.integrate_1d(lo, hi, tj, dm)
+                psi[j] = float(vol)  # plain float, as in TriangleEmbedding.integral
+        return psi
+
+    def hierarchical_mask(self):
+        """Level of every basis function: 0 for the constant, i + 1 for 2**i hats."""
+        depth = int(np.log2(self.m))
+        mask = [0]
+        mask.extend(level + 1 for level in range(depth) for _ in range(2**level))
+        return torch.tensor(mask)
+
+    def product_integral(self):
+        """Not available for this basis; always raises NotImplementedError."""
+        raise NotImplementedError("Not implemented.")
 
 
 class KuhnExponentialEmbedding(PositiveEmbedding):
-	"""
-	Basis from: Covering numbers of Gaussian reproducing kernel Hilbert spaces
-	by Thomas Kuhn
+    """
+    Basis from: Covering numbers of Gaussian reproducing kernel Hilbert spaces
+    by Thomas Kuhn
 
-	"""
+    """
 
-	def __init__(self, *args, gamma=0.1, **kwargs):
-		super().__init__(self, *args, **kwargs)
-		self.gamma = gamma
+    def __init__(self, *args, gamma=0.1, **kwargs):
+        super().__init__(*args, **kwargs)  # super() binds self; do not pass it again
+        self.gamma = gamma
 
-	def basis_fun(self, x, j):
-		k = np.exp(j / 2 * np.log(1. / self.gamma) - (j / 2) * scipy.special.gammaln(j + 1))
-		res = k * (x ** j) * torch.exp(- (x ** 2) / (2 * self.gamma ** 2))
-		mask1 = x < 0
-		mask2 = x > 1
-		res[mask1] = 0.
-		res[mask2] = 0.
-		return res
+    def basis_fun(self, x, j):
+        # exponent of the scalar prefactor, evaluated in log-space
+        log_scale = j / 2 * np.log(1.0 / self.gamma) - (j / 2) * scipy.special.gammaln(
+            j + 1
+        )
+        scale = np.exp(log_scale)
+        values = scale * (x**j) * torch.exp(-(x**2) / (2 * self.gamma**2))
+        # the function is set to zero outside of [0, 1]
+        values[(x < 0) | (x > 1)] = 0.0
+        return values
 
 
 class CustomHaarBumps(PositiveEmbedding):
-	"""
+    """
 
-	Custom Haar basis that cover different sized pockets of domain
+    Custom Haar basis that cover different sized pockets of domain
 
-	"""
+    """
 
-	# def __init__(self, *args, **kwargs):
-	# 	super().__init__(self,*args, **kwargs)
-	# 	nodes = None
-	# 	widths = None
-	# 	self.nodes = nodes
-	# 	self.widths = widths
+    # def __init__(self, *args, **kwargs):
+    # 	super().__init__(self,*args, **kwargs)
+    # 	nodes = None
+    # 	widths = None
+    # 	self.nodes = nodes
+    # 	self.widths = widths
 
-	def __init__(self, d, m, nodes, widths, weights, **kwargs):
-		super().__init__(d, m, **kwargs)
-		self.nodes = nodes
-		self.widths = widths
-		self.weights = weights
+    def __init__(self, d, m, nodes, widths, weights, **kwargs):
+        """Store per-function centres, half-widths and heights of the bumps."""
+        super().__init__(d, m, **kwargs)
+        self.nodes, self.widths = nodes, widths
+        self.weights = weights
 
-	def basis_fun(self, x, j):
+    def basis_fun(self, x, j):
 
-		if self.nodes is None or self.widths is None:
-			super().basis_fun(x, j)
-		else:
-			mask = np.abs(x - self.nodes[j]) < self.widths[j]
-			out = x * 0
-			out[mask] = self.weights[j]
-			return out
+        if self.nodes is None or self.widths is None:
+            return super().basis_fun(x, j)  # no custom bumps: defer to the parent
+        else:
+            mask = np.abs(x - self.nodes[j]) < self.widths[j]
+            out = x * 0
+            out[mask] = self.weights[j]  # constant height inside the bump, 0 outside
+            return out
 
 
 class BumpsEmbedding(PositiveEmbedding):
 
-	def __init__(self, *args, **kwargs):
-		super().__init__(*args, **kwargs)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)  # plain passthrough to the parent class
 
-	def integrate(self, a, b, j):
-		vol = 0.
-		return vol
+    def integrate(self, a, b, j):
+        # NOTE(review): looks like a placeholder - the result is 0.0 for any input
+        return 0.0
 
-	def integral(self, S):
-		"""
-		Integrate the Phi(x) over S
-		:param S: borel set
-		:return:
-		"""
-		assert (S.d == self.d)
-		psi = torch.zeros(self.get_m()).double()
+    def integral(self, S):
+        """
+        Integrate the Phi(x) over S
+        :param S: borel set
+        :return:
+        """
+        assert S.d == self.d
+        psi = torch.zeros(self.get_m()).double()
 
-		a, b = S.bounds[0, 0], S.bounds[0, 1]
-		for j in range(self.get_m()):
-			vol = self.integrate(a.numpy(), b.numpy(), j)
-			psi[j] = vol
+        a, b = S.bounds[0, 0], S.bounds[0, 1]
+        for j in range(self.get_m()):
+            psi[j] = self.integrate(a.numpy(), b.numpy(), j)
+        return psi  # hand the per-function integrals back to the caller
 
-	def basis_fun(self, x, j):  # 1d
-		"""
-		Return the value of basis function \phi_j(x)
+    def basis_fun(self, x, j):  # 1d
+        r"""
+        Return the value of basis function \phi_j(x)
 
-		:param x: double, need to be in the interval
-		:param j: integer, index of hat functions, 0 <= j <= m-1
-		:return: \phi_j(x)
-		"""
+        :param x: double, need to be in the interval
+        :param j: integer, index of hat functions, 0 <= j <= m-1
+        :return: \phi_j(x)
+        """
 
-		dm = (self.interval[1] - self.interval[0]) / (self.m - 1)  # delta m
-		tj = self.interval[0] + (j) * dm
-		res = -(x - tj) * (x - (tj + (2 * dm))) * (1. / (dm ** 2))
-		res[res < 0] = 0
-		return res
+        dm = (self.interval[1] - self.interval[0]) / (self.m - 1)  # node spacing
+        tj = self.interval[0] + j * dm
+        # downward parabola with roots tj and tj + 2 * dm, peaking at 1 in between
+        bump = -(x - tj) * (x - (tj + (2 * dm))) * (1.0 / (dm**2))
+        return torch.clamp(bump, min=0)
 
 
 class PositiveNystromEmbeddingBump(PositiveEmbedding):
 
-	def __init__(self, *args, samples=300, **kwargs):
-		super().__init__(*args, **kwargs)
-		self.samples = np.maximum(samples, self.m)
-
-		B = BorelSet(1, torch.Tensor([[self.interval[0], self.interval[1]]]).double())
-		x = B.return_discretization(256)
-		y = x[:, 0].view(-1, 1) * 0
-
-		print("Starting optimal basis construction, with m =", self.m)
-		self.new_kernel_object = KernelFunction(kernel_name=self.kernel_object.optkernel,
-												gamma=self.kernel_object.gamma)
-		self.GP = NystromFeatures(self.new_kernel_object, m=self.m, approx='positive_svd',
-								  samples=self.samples)
-		self.GP.fit_gp(x, y)
-		print("Optimal basis constructed.")
-		if torch.sum(torch.isnan(self.GP.embed(x))) > 0:
-			print("Failed basis? (zero is good):", torch.sum(torch.isnan(self.GP.embed(x))))
-
-		self.precomp_integral = {}
-
-	def basis_fun(self, x, j):
-		return self.GP.embed(x)[:, j].view(-1, 1)
-
-	def get_constraints(self):
-		s = self.m ** self.d
-		l = np.full(s, 0.0).astype(float)
-		u = np.full(s, 10e10)
-		Lambda = np.identity(s)
-		return (l, Lambda, u)
-
-	def integral(self, S):
-		assert (S.d == self.d)
-
-		if S in self.precomp_integral.keys():
-			return self.precomp_integral[S]
-		else:
-			if S.d == 1:
-				weights, nodes = S.return_legendre_discretization(n=256)
-				psi = torch.sum(torch.diag(weights) @ self.GP.embed(nodes), dim=0)
-				Gamma_half = self.cov()
-				psi = Gamma_half.T @ psi
-				self.precomp_integral[S] = psi
-			elif S.d == 2:
-				weights, nodes = S.return_legendre_discretization(n=50)
-				vals = self.embed_internal(nodes)
-				psi = torch.sum(torch.diag(weights) @ vals, dim=0)
-				Gamma_half = self.cov()
-				psi = Gamma_half.T @ psi
-				self.precomp_integral[S] = psi
-				if torch.sum(torch.isnan(psi)) > 0:
-					print("Failed integrals? (0 is good):", torch.sum(torch.isnan(psi)))
-
-			else:
-				raise NotImplementedError("Higher dimension not implemented.")
-			return psi
-
-	def cov(self, inverse=False):
-
-		if self.precomp == False:
-
-			x = torch.linspace(self.interval[0], self.interval[1], 256)
-			vals = self.GP.embed(x)
-			indices = torch.argmax(vals, dim=0)
-			t = x[indices]
-
-			if self.d == 1:
-				t = t.view(-1, 1).double()
-			elif self.d == 2:
-				t = torch.from_numpy(cartesian([t.numpy(), t.numpy()])).double()
-			elif self.d == 3:
-				t = torch.from_numpy(cartesian([t.numpy(), t.numpy(), t.numpy()])).double()
-
-			self.Gamma = self.kernel(t, t)
-			Z = self.embed_internal(t)
-
-			M = torch.pinverse(Z.T @ Z + (self.s) * torch.eye(self.Gamma.size()[0]))
-			self.M = torch.from_numpy(np.real(scipy.linalg.sqrtm(M.numpy())))
-
-			# self.Gamma_half = torch.cholesky(Gamma \
-			#	+ self.s * self.s * torch.eye(Gamma.size()[0]).double(), upper = True	)
-
-			self.Gamma_half = torch.from_numpy(
-				np.real(scipy.linalg.sqrtm(self.Gamma.numpy() + (self.s ** 2) * np.eye(self.Gamma.size()[0]))))
-			self.Gamma_half = self.M @ self.Gamma_half
-			self.invGamma_half = torch.pinverse(self.Gamma_half)
-			self.precomp = True
-		else:
-			pass
-
-		if inverse == True:
-			return self.Gamma_half, self.invGamma_half
-		else:
-			return self.Gamma_half
+    def __init__(self, *args, samples=300, **kwargs):
+        """Fit NystromFeatures (approx="positive_svd") with m features on self.interval."""
+        super().__init__(*args, **kwargs)
+        self.samples = np.maximum(samples, self.m)
+
+        B = BorelSet(1, torch.tensor([[self.interval[0], self.interval[1]]]).double())
+        x = B.return_discretization(256)
+        y = x[:, 0].view(-1, 1) * 0
+
+        print("Starting optimal basis construction, with m =", self.m)
+        self.new_kernel_object = KernelFunction(
+            kernel_name=self.kernel_object.optkernel, gamma=self.kernel_object.gamma
+        )
+        self.GP = NystromFeatures(
+            self.new_kernel_object,
+            m=self.m,
+            approx="positive_svd",
+            samples=self.samples,
+        )
+        self.GP.fit_gp(x, y)
+        print("Optimal basis constructed.")
+        # embed once and reuse the NaN count for both the test and the message
+        nan_count = torch.sum(torch.isnan(self.GP.embed(x)))
+        if nan_count > 0:
+            print("Failed basis? (zero is good):", nan_count)
+
+        self.precomp_integral = {}
+
+    def basis_fun(self, x, j):
+        return self.GP.embed(x)[:, j].view(-1, 1)  # j-th feature as a column
+
+    def get_constraints(self):
+        """Return (l, Lambda, u): zero lower bounds, identity matrix, bounds 10e10."""
+        size = self.m**self.d
+        lower = np.full(size, 0.0).astype(float)
+        upper = np.full(size, 10e10)
+        return (lower, np.identity(size), upper)
+
+    def integral(self, S):
+        """
+        Integrate the basis over S by quadrature; results are cached per set S.
+        """
+        assert S.d == self.d
+        if S in self.precomp_integral:
+            return self.precomp_integral[S]
+
+        # pick the Legendre discretization and the feature map by dimension
+        if S.d == 1:
+            weights, nodes = S.return_legendre_discretization(n=256)
+            vals = self.GP.embed(nodes)
+        elif S.d == 2:
+            weights, nodes = S.return_legendre_discretization(n=50)
+            vals = self.embed_internal(nodes)
+        else:
+            raise NotImplementedError("Higher dimension not implemented.")
+
+        # weighted sum over the quadrature nodes, then map through Gamma_half^T
+        psi = torch.sum(torch.diag(weights) @ vals, dim=0)
+        psi = self.cov().T @ psi
+        self.precomp_integral[S] = psi
+        if S.d == 2 and torch.sum(torch.isnan(psi)) > 0:
+            print("Failed integrals? (0 is good):", torch.sum(torch.isnan(psi)))
+        return psi
+
+    def cov(self, inverse=False):
+        """Return Gamma_half (plus its pseudo-inverse if inverse); computed once."""
+        if self.precomp == False:
+            # nodes t: for each feature, the grid point where its value is largest
+            x = torch.linspace(self.interval[0], self.interval[1], 256)
+            vals = self.GP.embed(x)
+            indices = torch.argmax(vals, dim=0)
+            t = x[indices]
+            # lift the 1D nodes to a d-dimensional grid (only d = 1, 2, 3 handled)
+            if self.d == 1:
+                t = t.view(-1, 1).double()
+            elif self.d == 2:
+                t = torch.from_numpy(cartesian([t.numpy(), t.numpy()])).double()
+            elif self.d == 3:
+                t = torch.from_numpy(
+                    cartesian([t.numpy(), t.numpy(), t.numpy()])
+                ).double()
+
+            self.Gamma = self.kernel(t, t)
+            Z = self.embed_internal(t)
+            # M = pinv(Z^T Z + s I); self.M keeps the real part of its matrix sqrt
+            M = torch.pinverse(Z.T @ Z + (self.s) * torch.eye(self.Gamma.size()[0]))
+            self.M = torch.from_numpy(np.real(scipy.linalg.sqrtm(M.numpy())))
+
+            # Gamma_half = self.M @ sqrtm(Gamma + s^2 I); any imaginary part
+            # returned by sqrtm is discarded via np.real
+
+            self.Gamma_half = torch.from_numpy(
+                np.real(
+                    scipy.linalg.sqrtm(
+                        self.Gamma.numpy() + (self.s**2) * np.eye(self.Gamma.size()[0])
+                    )
+                )
+            )
+            self.Gamma_half = self.M @ self.Gamma_half
+            self.invGamma_half = torch.pinverse(self.Gamma_half)
+            self.precomp = True
+        else:
+            pass
+
+        if inverse == True:
+            return self.Gamma_half, self.invGamma_half
+        else:
+            return self.Gamma_half
 
 
 if __name__ == "__main__":
-	from stpy.continuous_processes.gauss_procc import GaussianProcess
-	from stpy.helpers.helper import interval
-	import matplotlib.pyplot as plt
-
-	d = 1
-	m = 32
-	n = 64
-	N = 20
-	sqrtbeta = 2
-	s = 0.01
-	b = 0
-	gamma = 0.1
-	k = KernelFunction(gamma=gamma)
-
-	Emb = FaberSchauderEmbedding(d, m, offset=0.2, s=s, b=b, B=1000., kernel_object=k)
-	GP = GaussianProcess(d=d, s=s)
-	xtest = torch.from_numpy(interval(n, d))
-
-	x = torch.from_numpy(np.random.uniform(-1, 1, N)).view(-1, 1)
-
-	F_true = lambda x: torch.sin(x) ** 2 - 0.1
-	F = lambda x: F_true(x) + s * torch.randn(x.size()[0]).view(-1, 1).double()
-	y = F(x)
-	Emb.fit(x, y)
-	GP.fit_gp(x, y)
-	mu = Emb.mean_std(xtest)
-	mu_true, _ = GP.mean_std(xtest)
-	plt.plot(xtest, F_true(xtest), 'b', label='true')
-	plt.plot(xtest, mu_true, 'b--', label='GP')
-	plt.plot(x, y, 'ro')
-	plt.plot(xtest, mu, 'g-', label='positive basis ')
-	plt.legend()
-	plt.show()
+    from stpy.continuous_processes.gauss_procc import GaussianProcess
+    from stpy.helpers.helper import interval
+    import matplotlib.pyplot as plt
+    # m basis functions, n test points, N noisy observations with noise level s
+    d = 1
+    m = 32
+    n = 64
+    N = 20
+    sqrtbeta = 2
+    s = 0.01
+    b = 0
+    gamma = 0.1
+    k = KernelFunction(gamma=gamma)
+    # the embedding under test and a plain GP used as the reference
+    Emb = FaberSchauderEmbedding(d, m, offset=0.2, s=s, b=b, B=1000.0, kernel_object=k)
+    GP = GaussianProcess(d=d, s=s)
+    xtest = torch.from_numpy(interval(n, d))
+    # N training inputs drawn uniformly from [-1, 1]
+    x = torch.from_numpy(np.random.uniform(-1, 1, N)).view(-1, 1)
+    # noisy observations of sin(x)^2 - 0.1
+    F_true = lambda x: torch.sin(x) ** 2 - 0.1
+    F = lambda x: F_true(x) + s * torch.randn(x.size()[0]).view(-1, 1).double()
+    y = F(x)
+    Emb.fit(x, y)
+    GP.fit_gp(x, y)
+    mu = Emb.mean_std(xtest)
+    mu_true, _ = GP.mean_std(xtest)
+    plt.plot(xtest, F_true(xtest), "b", label="true")
+    plt.plot(xtest, mu_true, "b--", label="GP")
+    plt.plot(x, y, "ro")
+    plt.plot(xtest, mu, "g-", label="positive basis ")
+    plt.legend()
+    plt.show()
diff --git a/stpy/embeddings/embedding.py b/stpy/embeddings/embedding.py
index f8a5394..3d2ac08 100755
--- a/stpy/embeddings/embedding.py
+++ b/stpy/embeddings/embedding.py
@@ -6,7 +6,7 @@
 __email__ = "mojmir.mutny@inf.ethz.ch"
 __status__ = "DEV"
 
-"""
+r"""
 This file implements code used in paper:
 
 	Mojmir Mutny & Andreas Krause, "Efficient High Dimensional Bayesian Optimization 
@@ -50,83 +50,105 @@
 import stpy.helpers.quadrature_helper as quad_help
 
 
-class Embedding():
-	"""
-	Base class for Embeddings to approximate kernels with a higher dimensional linear product.
-	"""
-
-	def __init__(self, gamma=0.1, nu=0.5, m=100, d=1, diameter=1.0, groups=None, kappa=1.0,
-				 kernel="squared_exponential", cosine=False, approx="rff", **kwargs):
-		"""
-		Called to calculate the embedding weights (either via sampling or deterministically)
-
-		Args:
-			gamma: (positional, 0.1) bandwidth of the squared exponential kernel
-			nu: (positional, 0.5) the parameter of Matern family
-			m: (positional, 1)
-			d: dimension of the
-
-		Returns:
-			None
-		"""
-		self.gamma = float(gamma)
-		self.n = nu
-		self.m = int(m)
-		self.d = int(d)
-		self.nu = nu
-		self.kappa = kappa
-		self.cosine = cosine
-		self.diameter = diameter
-		self.groups = groups
-		self.kernel = kernel
-		self.approx = approx
-		self.gradient_avail = 0
-		if self.m % 2 == 1:
-			raise AssertionError("Number of random features has to be even.")
-
-	def sample(self):
-		"""
-		Called to calculate the embedding weights (either via sampling or deterministically)
-
-		Args:
-		    None
-
-		Returns:
-			None
-		"""
-		raise AttributeError("Only derived classes can call this method.")
-
-	def embed(self, x):
-		"""
-		Called to calculate the embedding weights (either via sampling or deterministically)
-
-		Args:
-		    x: numpy array containing the points to be embedded in the format (n,d)
-
-		Returns:
-			y: numpy array containg the embedded points (n,m), where m is the embedding dimension
-		"""
-
-		raise AttributeError("Only derived classes can call this method.")
-
-	def get_m(self):
-		"""
-
-		:return:
-
-		"""
-		return self.m
-
-	def integral(self, S):
-		a = S.bounds[:, 0]
-		b = S.bounds[:, 1]
-		psi = torch.zeros(self.m).double()
-
-		for i in range(self.m // 2):
-			omegas = self.W[i, :].view(-1)
-			psi[i] = quad_help.integrate_cos_multidimensional(a.numpy(), b.numpy(), omegas.numpy())
-			psi[self.m // 2 + i] = quad_help.integrate_sin_multidimensional(a.numpy(), b.numpy(), omegas.numpy())
-		return psi
+class Embedding:
+    """
+    Base class for embeddings that approximate kernels with a higher dimensional linear product.
+    """
+
+    def __init__(
+        self,
+        gamma=0.1,
+        nu=0.5,
+        m=100,
+        d=1,
+        diameter=1.0,
+        groups=None,
+        kappa=1.0,
+        kernel="squared_exponential",
+        cosine=False,
+        approx="rff",
+        **kwargs
+    ):
+        """
+        Store the hyperparameters shared by all embeddings; m must be even.
+
+        Args:
+                gamma: (default 0.1) bandwidth of the squared exponential kernel
+                nu: (default 0.5) the parameter of Matern family
+                m: (default 100) number of features, stored as an int
+                d: (default 1) dimension of the embedded points, stored as an int
+
+        Raises:
+                AssertionError: if m is odd
+        """
+        self.gamma = float(gamma)
+        self.n = nu  # NOTE(review): holds the same value as self.nu - confirm it is needed
+        self.m = int(m)
+        self.d = int(d)
+        self.nu = nu
+        self.kappa = kappa
+        self.cosine = cosine
+        self.diameter = diameter
+        self.groups = groups
+        self.kernel = kernel
+        self.approx = approx
+        self.gradient_avail = 0
+        if self.m % 2 == 1:
+            raise AssertionError("Number of random features has to be even.")
+
+    def sample(self):
+        """
+        Called to calculate the embedding weights (either via sampling or deterministically)
+
+        Args:
+            None
+
+        Raises:
+                AttributeError: always in this base class; derived classes override it
+        """
+        raise AttributeError("Only derived classes can call this method.")
+
+    def embed(self, x):
+        """
+        Map the points x to their embedding (implemented by derived classes only)
+
+        Args:
+            x: numpy array containing the points to be embedded in the format (n,d)
+
+        Returns:
+                y: numpy array containing the embedded points (n,m), where m is the embedding dimension
+        """
+
+        raise AttributeError("Only derived classes can call this method.")
+
+    def get_m(self):
+        """
+        Number of features of the embedding.
+
+        :return: self.m
+        """
+        return self.m
+
+    def integral(self, S):
+        """
+        Integrate every feature over the set S, using the rows of self.W as frequencies
+
+        :param S: Borel set; S.bounds[:, 0] and S.bounds[:, 1] are used as integration limits
+        :return: tensor of length self.m: cosine integrals first, then the sine integrals
+        """
+        a = S.bounds[:, 0]
+        b = S.bounds[:, 1]
+        psi = torch.zeros(self.m).double()
+
+        for i in range(self.m // 2):
+            omegas = self.W[i, :].view(-1)
+            psi[i] = quad_help.integrate_cos_multidimensional(
+                a.numpy(), b.numpy(), omegas.numpy()
+            )
+            psi[self.m // 2 + i] = quad_help.integrate_sin_multidimensional(
+                a.numpy(), b.numpy(), omegas.numpy()
+            )
+        return psi
 
 
 """
@@ -137,108 +159,124 @@ def integral(self, S):
 
 
 class RFFEmbedding(Embedding):
-	"""
-		Random Fourier Features emebedding
-	"""
-
-	def __init__(self, biased=False, **kwargs):
-		super().__init__(**kwargs)
-		self.biased = biased
-		self.sample()
-
-	def sampler(self, size):
-		"""
-			Defines the sampler object
-
-		Args:
-		 	size:
-
-		Return:
-		"""
-		if self.kernel == "squared_exponential":
-			distribution = lambda size: np.random.normal(size=size) * (1. / self.gamma)
-			inv_cum_dist = lambda x: norm.ppf(x) * (1. / self.gamma)
-
-		elif self.kernel == "laplace":
-			distribution = None
-			inv_cum_dist = lambda x: (np.tan(np.pi * x - np.pi) / self.gamma)
-
-		elif self.kernel == "modified_matern":
-			if self.nu == 2:
-				distribution = None
-				inv_cum_dist = None
-				pdf = lambda x: np.prod(2 * (self.gamma) / (np.power((1. + self.gamma ** 2 * x ** 2), 2) * np.pi),
-										axis=1)
-			elif self.nu == 3:
-				distribution = None
-				inv_cum_dist = None
-				pdf = lambda x: np.prod((8. * self.gamma) / (np.power((1. + self.gamma ** 2 * x ** 2), 3) * 3 * np.pi),
-										axis=1)
-			elif self.nu == 4:
-				distribution = None
-				inv_cum_dist = None
-				pdf = lambda x: np.prod((16. * self.gamma) / (np.power((1. + self.gamma ** 2 * x ** 2), 4) * 5 * np.pi),
-										axis=1)
-
-		# Random Fourier Features
-		if self.approx == "rff":
-			if distribution == None:
-				if inv_cum_dist == None:
-					self.W = helper.rejection_sampling(pdf, size=size)
-				else:
-					self.W = helper.sample_custom(inv_cum_dist, size=size)
-			else:
-				self.W = distribution(size)
-
-		# Quasi Fourier Features
-		elif self.approx == "halton":
-			if inv_cum_dist != None:
-				self.W = helper.sample_qmc_halton(inv_cum_dist, size=size)
-			else:
-				raise AssertionError("Inverse Cumulative Distribution could not be deduced")
-
-		elif self.approx == "orf":
-			distribution = lambda size: np.random.normal(size=size) * (1.)
-			self.W = distribution(size)
-
-			# QR decomposition
-			self.Q, _ = np.linalg.qr(self.W)
-			# df and size
-			self.S = np.diag(chi.rvs(size[1], size=size[0]))
-			self.W = np.dot(self.S, self.Q) / self.gamma ** 2
-
-		return self.W
-
-	def sample(self):
-		"""
-			Samples Random Fourier Features
-		"""
-		self.W = self.sampler(size=(self.m, self.d))
-		self.W = torch.from_numpy(self.W)
-
-		if self.biased == True:
-			self.b = 2. * np.pi * np.random.uniform(size=(self.m))
-			self.bs = self.b.reshape(self.m, 1)
-			self.b = torch.from_numpy(self.b)
-			self.bs = torch.from_numpy(self.bs)
-
-	def embed(self, x):
-		"""
-		:param x: torch array
-		:return: embeded vector
-		"""
-		(times, d) = x.shape
-		if self.biased == True:
-			z = np.sqrt(2. / self.m) * torch.t(torch.cos(self.W[:, 0:d].mm(torch.t(x)) + self.b.view(self.m, 1)))
-		else:
-			q = self.W[:, 0:d].mm(torch.t(x))
-			# z[0:int(self.m / 2), :] = \
-			z1 = np.sqrt(2. / float(self.m)) * torch.cos(q[0:int(self.m / 2), :])
-			# z[int(self.m / 2):self.m, :] = np.sqrt(2. / float(self.m)) * torch.sin(q[int(self.m / 2):self.m, :])
-			z2 = np.sqrt(2. / float(self.m)) * torch.sin(q[int(self.m / 2):self.m, :])
-			z = torch.cat([z1, z2])
-
-		return torch.t(z) * np.sqrt(self.kappa)
+    """
+    Random Fourier Features embedding
+    """
+
+    def __init__(self, biased=False, **kwargs):
+        super().__init__(**kwargs)
+        self.biased = biased
+        self.sample()
+
+    def sampler(self, size):
+        """
+                Defines the sampler object
+
+        Args:
+                size:
+
+        Return:
+        """
+        if self.kernel == "squared_exponential":
+            distribution = lambda size: np.random.normal(size=size) * (1.0 / self.gamma)
+            inv_cum_dist = lambda x: norm.ppf(x) * (1.0 / self.gamma)
+
+        elif self.kernel == "laplace":
+            distribution = None
+            inv_cum_dist = lambda x: (np.tan(np.pi * x - np.pi) / self.gamma)
+
+        elif self.kernel == "modified_matern":
+            if self.nu == 2:
+                distribution = None
+                inv_cum_dist = None
+                pdf = lambda x: np.prod(
+                    2
+                    * (self.gamma)
+                    / (np.power((1.0 + self.gamma**2 * x**2), 2) * np.pi),
+                    axis=1,
+                )
+            elif self.nu == 3:
+                distribution = None
+                inv_cum_dist = None
+                pdf = lambda x: np.prod(
+                    (8.0 * self.gamma)
+                    / (np.power((1.0 + self.gamma**2 * x**2), 3) * 3 * np.pi),
+                    axis=1,
+                )
+            elif self.nu == 4:
+                distribution = None
+                inv_cum_dist = None
+                pdf = lambda x: np.prod(
+                    (16.0 * self.gamma)
+                    / (np.power((1.0 + self.gamma**2 * x**2), 4) * 5 * np.pi),
+                    axis=1,
+                )
+
+        # Random Fourier Features
+        if self.approx == "rff":
+            if distribution == None:
+                if inv_cum_dist == None:
+                    self.W = helper.rejection_sampling(pdf, size=size)
+                else:
+                    self.W = helper.sample_custom(inv_cum_dist, size=size)
+            else:
+                self.W = distribution(size)
+
+        # Quasi Fourier Features
+        elif self.approx == "halton":
+            if inv_cum_dist != None:
+                self.W = helper.sample_qmc_halton(inv_cum_dist, size=size)
+            else:
+                raise AssertionError(
+                    "Inverse Cumulative Distribution could not be deduced"
+                )
+
+        elif self.approx == "orf":
+            distribution = lambda size: np.random.normal(size=size) * (1.0)
+            self.W = distribution(size)
+
+            # QR decomposition
+            self.Q, _ = np.linalg.qr(self.W)
+            # df and size
+            self.S = np.diag(chi.rvs(size[1], size=size[0]))
+            self.W = np.dot(self.S, self.Q) / self.gamma**2
+
+        return self.W
+
+    def sample(self):
+        """
+        Samples Random Fourier Features
+        """
+        self.W = self.sampler(size=(self.m, self.d))
+        self.W = torch.from_numpy(self.W)
+
+        if self.biased == True:
+            self.b = 2.0 * np.pi * np.random.uniform(size=(self.m))
+            self.bs = self.b.reshape(self.m, 1)
+            self.b = torch.from_numpy(self.b)
+            self.bs = torch.from_numpy(self.bs)
+
+    def embed(self, x):
+        """
+        :param x: torch array
+        :return: embedded vector
+        """
+        (times, d) = x.shape
+        if self.biased == True:
+            z = np.sqrt(2.0 / self.m) * torch.t(
+                torch.cos(self.W[:, 0:d].mm(torch.t(x)) + self.b.view(self.m, 1))
+            )
+        else:
+            q = self.W[:, 0:d].mm(torch.t(x))
+            # z[0:int(self.m / 2), :] = \
+            z1 = np.sqrt(2.0 / float(self.m)) * torch.cos(q[0 : int(self.m / 2), :])
+            # z[int(self.m / 2):self.m, :] = np.sqrt(2. / float(self.m)) * torch.sin(q[int(self.m / 2):self.m, :])
+            z2 = np.sqrt(2.0 / float(self.m)) * torch.sin(
+                q[int(self.m / 2) : self.m, :]
+            )
+            z = torch.cat([z1, z2])
+
+        return torch.t(z) * np.sqrt(self.kappa)
 
 
 """
@@ -249,523 +287,611 @@ def embed(self, x):
 
 
 class QuadratureEmbedding(Embedding):
-	"""
-		General quadrature embedding
-	"""
-
-	def __init__(self, scale=1.0, **kwargs):
-		Embedding.__init__(self, **kwargs)
-		self.scale = scale
-		self.compute()
-
-	def reorder_complexity(self, omegas, weights):
-		abs_omegas = np.abs(omegas)
-		order = np.argsort(abs_omegas)
-		new_omegas = omegas[order]
-		new_weights = weights[order]
-		return new_omegas, new_weights
-
-	def derivative_1(self, x):
-		(times, d) = tuple(x.size())
-		# z = torch.from_numpy(np.zeros(shape=(self.m, times),dtype=x.dtype))
-		z = torch.zeros(self.d, self.m, times, dtype=x.dtype)
-		q = torch.mm(self.W[:, 0:d], torch.t(x))  # (m,d)x(d,n)
-
-		omegas = self.W[:, 0:d]  # (m,d)
-
-		if self.cosine == False:
-			z[:, 0:int(self.m / 2), :] = -torch.einsum('ij,ik->jik', omegas,
-													   torch.sqrt(self.weights.view(-1, 1)) * torch.sin(
-														   q))  # (m,d)  (m,n)
-			z[:, int(self.m / 2):self.m, :] = torch.einsum('ij,ik->jik', omegas,
-														   torch.sqrt(self.weights.view(-1, 1)) * torch.cos(q))
-		else:
-			raise NotImplementedError("Cosine only features derivative not implemented")
-
-		return np.sqrt(self.kappa) * z
-
-	def derivative_2(self, x):
-		(times, d) = tuple(x.size())
-		# z = torch.from_numpy(np.zeros(shape=(self.m, times),dtype=x.dtype))
-		z = torch.zeros(self.d, self.d, self.m, times, dtype=x.dtype)
-		q = torch.mm(self.W[:, 0:d], torch.t(x))  # (m,d)x(d,n)
-
-		omegas = self.W[:, 0:d]  # (m,d)
-
-		if self.cosine == False:
-			z[:, :, 0:int(self.m / 2), :] = -torch.einsum('il,ij,ik->jlik', omegas, omegas,
-														  torch.sqrt(self.weights.view(-1, 1)) * torch.cos(
-															  q))  # (m,d)  (m,d)  (m,n)
-			z[:, :, int(self.m / 2):self.m, :] = -torch.einsum('il,ij,ik->jlik', omegas, omegas,
-															   torch.sqrt(self.weights.view(-1, 1)) * torch.sin(q))
-		else:
-			raise NotImplementedError("Cosine only features derivative not implemented")
-
-		return np.sqrt(self.kappa) * z
-
-	def product_integral(self, S):
-		"""
-		Compute the outer product integral
-		:param S: Borel set
-		:return: m times m matrix with integrate entries
-		"""
-		assert S.d == 1 or S.d == 2
-		if S.d == 1:
-			a = S.bounds[0, 0]
-			b = S.bounds[0, 1]
-			h = self.m // 2
-			Psi = torch.zeros(size=(self.m, self.m)).double()
-			for i in range(h):
-				for j in range(h):
-					Psi[i, j] = torch.sqrt(self.weights[i] * self.weights[j]) * quad_help.integrate_cos_cos(a, b,
-																											self.W[
-																												i, 0],
-																											self.W[
-																												j, 0])  # cos cos
-					Psi[i, j + h] = torch.sqrt(self.weights[i] * self.weights[j]) * quad_help.integrate_sin_cos(a, b,
-																												self.W[
-																													i, 0],
-																												self.W[
-																													j, 0])  # cos sin
-					Psi[i + h, j] = torch.sqrt(self.weights[j] * self.weights[i]) * quad_help.integrate_sin_cos(a, b,
-																												self.W[
-																													j, 0],
-																												self.W[
-																													i, 0])  # sin cos
-					Psi[i + h, j + h] = torch.sqrt(self.weights[i] * self.weights[j]) * quad_help.integrate_sin_sin(a,
-																													b,
-																													self.W[
-																														i, 0],
-																													self.W[
-																														j, 0])  # sin sin
-		elif S.d == 2:
-			xa = S.bounds[0, 0]
-			xb = S.bounds[0, 1]
-			ya = S.bounds[1, 0]
-			yb = S.bounds[1, 1]
-			h = self.m // 2
-			Psi = torch.zeros(size=(self.m, self.m)).double()
-			for i in range(h):
-				for j in range(h):
-					Psi[i, j] = torch.sqrt(self.weights[i] * self.weights[j]) \
-								* quad_help.integrate2d_cos_cos(xa, ya, xb, yb, self.W[i, 0], self.W[i, 1],
-																self.W[j, 0], self.W[j, 1])  # cos cos
-					Psi[i, j + h] = torch.sqrt(self.weights[i] * self.weights[j]) \
-									* quad_help.integrate2d_sin_cos(xa, ya, xb, yb, self.W[i, 0], self.W[i, 1],
-																	self.W[j, 0], self.W[j, 1])  # cos cos
-					Psi[i + h, j] = torch.sqrt(self.weights[j] * self.weights[i]) \
-									* quad_help.integrate2d_sin_cos(xa, ya, xb, yb, self.W[j, 0], self.W[j, 1],
-																	self.W[i, 0], self.W[i, 1])  # cos cos
-					Psi[i + h, j + h] = torch.sqrt(self.weights[i] * self.weights[j]) \
-										* quad_help.integrate2d_sin_sin(xa, ya, xb, yb, self.W[i, 0], self.W[i, 1],
-																		self.W[j, 0], self.W[j, 1])  # cos cos
-		return self.kappa * Psi
-
-	def compute(self, complexity_reorder=True):
-		"""
-			Computes the tensor grid for Fourier features
-		:return:
-		"""
-
-		if self.cosine == False:
-			self.q = int(np.power(self.m // 2, 1. / self.d))
-			self.m = self.q ** self.d
-		else:
-			self.q = int(np.power(self.m, 1. / self.d))
-			self.m = self.q ** self.d
-
-		(omegas, weights) = self.nodesAndWeights(self.q)
-
-		if complexity_reorder == True:
-			(omegas, weights) = self.reorder_complexity(omegas, weights)
-
-		self.weights = helper.cartesian([weights for weight in range(self.d)])
-		self.weights = np.prod(self.weights, axis=1)
-
-		v = [omegas for omega in range(self.d)]
-		self.W = helper.cartesian(v)
-
-		if self.cosine == False:
-			self.m = self.m * 2
-		else:
-			pass
-
-		self.W = torch.from_numpy(self.W)
-		self.weights = torch.from_numpy(self.weights)
-
-	def transform(self):
-		"""
-
-		:return: spectral density of a kernel
-		"""
-		if self.kernel == "squared_exponential":
-			p = lambda omega: np.exp(-np.sum(omega ** 2, axis=1).reshape(-1, 1) / 2 * (self.gamma ** 2)) * np.power(
-				(self.gamma / np.sqrt(2 * np.pi)), 1.) * np.power(np.pi / 2, 1.)
-
-		elif self.kernel == "laplace":
-			p = lambda omega: np.prod(1. / ((self.gamma ** 2) * (omega ** 2) + 1.), axis=1).reshape(-1, 1) * np.power(
-				self.gamma / 2., 1.)
-
-		elif self.kernel == "modified_matern":
-			if self.nu == 2:
-				p = lambda omega: np.prod(1. / ((self.gamma ** 2) * (omega ** 2) + 1.) ** self.nu, axis=1).reshape(-1,
-																												   1) * np.power(
-					self.gamma * 1, 1.)
-			elif self.nu == 3:
-				p = lambda omega: np.prod(1. / ((self.gamma ** 2) * (omega ** 2) + 1.) ** self.nu, axis=1).reshape(-1,
-																												   1) * np.power(
-					self.gamma * 4 / 3, 1.)
-			elif self.nu == 4:
-				p = lambda omega: np.prod(1. / ((self.gamma ** 2) * (omega ** 2) + 1.) ** self.nu, axis=1).reshape(-1,
-																												   1) * np.power(
-					self.gamma * 8 / 5, 1.)
-
-		return p
-
-	def nodesAndWeights(self, q):
-		"""
-		Compute nodes and weights of the quadrature scheme in 1D
-
-		:param q: degree of quadrature
-		:return: tuple of (nodes, weights)
-		"""
-
-		# For osciallatory integrands even this has good properties.
-		# weights = np.ones(self.q) * self.scale * np.pi / (self.q + 1)
-		# omegas = (np.linspace(0, self.q - 1, self.q)) + 1
-		# omegas = omegas * (np.pi / (self.q + 1))
-
-		(omegas, weights) = np.polynomial.legendre.leggauss(2 * q)
-
-		omegas = omegas[q:]
-		weights = 2 * weights[q:]
-
-		omegas = ((omegas + 1.) / 2.) * np.pi
-		sine_scale = (1. / (np.sin(omegas) ** 2))
-		omegas = self.scale / np.tan(omegas)
-		prob = self.transform()
-		weights = self.scale * sine_scale * weights * prob(omegas.reshape(-1, 1)).flatten()
-		return (omegas, weights)
-
-	def embed(self, x):
-		"""
-		:param x: torch array
-		:return: embeding of the x
-		"""
-		(times, d) = tuple(x.size())
-		# z = torch.from_numpy(np.zeros(shape=(self.m, times),dtype=x.dtype))
-		z = torch.zeros(self.m, times, dtype=x.dtype)
-		q = torch.mm(self.W[:, 0:d], torch.t(x))
-
-		if self.cosine == False:
-			z[0:int(self.m / 2), :] = torch.sqrt(self.weights.view(-1, 1)) * torch.cos(q)
-			z[int(self.m / 2):self.m, :] = torch.sqrt(self.weights.view(-1, 1)) * torch.sin(q)
-		else:
-			z = torch.sqrt(self.weights.view(-1, 1)) * torch.cos(q)
-
-		return torch.t(z) * np.sqrt(self.kappa)
-
-	def get_sub_indices(self, group):
-		"""
-		:param group: group part of the embeding to embed
-		:return: embeding of x in group
-		"""
-		m2 = self.m
-		mhalf = int(np.power(self.m // 2, 1. / self.d))
-
-		m = 2 * mhalf
-		mquater = mhalf // 2
-
-		if group == 0:
-			ind = np.arange(mquater * mhalf, (mquater + 1) * mhalf, 1).tolist() + np.arange(m2 // 2 + (mquater * mhalf),
-																							m2 // 2 + (
-																										mquater + 1) * mhalf,
-																							1).tolist()
-			return ind
-		else:
-			ind = np.arange(mquater, m2 // 2, mhalf).tolist() + np.arange(m2 // 2 + mquater, m2, mhalf).tolist()
-			return ind
-
-	def get_sum_sub_indices(self, group):
-
-		# idenitfy unique values
-		arr = self.W[:, group]
-		values = np.unique(arr)
-		# find indices of each unique value
-		ind = []
-		for value in values:
-			ind_inside = []
-			for index, elem in enumerate(arr):
-				if elem == value:
-					ind_inside.append(index)
-			ind.append(ind_inside)
-			ind_inside2 = [i + self.m // 2 for i in ind_inside]
-			ind.append(ind_inside2)
-		return ind
+    """
+    General quadrature embedding
+    """
+
+    def __init__(self, scale=1.0, **kwargs):
+        Embedding.__init__(self, **kwargs)
+        self.scale = scale
+        self.compute()
+
+    def reorder_complexity(self, omegas, weights):
+        abs_omegas = np.abs(omegas)
+        order = np.argsort(abs_omegas)
+        new_omegas = omegas[order]
+        new_weights = weights[order]
+        return new_omegas, new_weights
+
+    def derivative_1(self, x):
+        (times, d) = tuple(x.size())
+        # z = torch.from_numpy(np.zeros(shape=(self.m, times),dtype=x.dtype))
+        z = torch.zeros(self.d, self.m, times, dtype=x.dtype)
+        q = torch.mm(self.W[:, 0:d], torch.t(x))  # (m,d)x(d,n)
+
+        omegas = self.W[:, 0:d]  # (m,d)
+
+        if self.cosine == False:
+            z[:, 0 : int(self.m / 2), :] = -torch.einsum(
+                "ij,ik->jik",
+                omegas,
+                torch.sqrt(self.weights.view(-1, 1)) * torch.sin(q),
+            )  # (m,d)  (m,n)
+            z[:, int(self.m / 2) : self.m, :] = torch.einsum(
+                "ij,ik->jik",
+                omegas,
+                torch.sqrt(self.weights.view(-1, 1)) * torch.cos(q),
+            )
+        else:
+            raise NotImplementedError("Cosine only features derivative not implemented")
+
+        return np.sqrt(self.kappa) * z
+
+    def derivative_2(self, x):
+        (times, d) = tuple(x.size())
+        # z = torch.from_numpy(np.zeros(shape=(self.m, times),dtype=x.dtype))
+        z = torch.zeros(self.d, self.d, self.m, times, dtype=x.dtype)
+        q = torch.mm(self.W[:, 0:d], torch.t(x))  # (m,d)x(d,n)
+
+        omegas = self.W[:, 0:d]  # (m,d)
+
+        if self.cosine == False:
+            z[:, :, 0 : int(self.m / 2), :] = -torch.einsum(
+                "il,ij,ik->jlik",
+                omegas,
+                omegas,
+                torch.sqrt(self.weights.view(-1, 1)) * torch.cos(q),
+            )  # (m,d)  (m,d)  (m,n)
+            z[:, :, int(self.m / 2) : self.m, :] = -torch.einsum(
+                "il,ij,ik->jlik",
+                omegas,
+                omegas,
+                torch.sqrt(self.weights.view(-1, 1)) * torch.sin(q),
+            )
+        else:
+            raise NotImplementedError("Cosine only features derivative not implemented")
+
+        return np.sqrt(self.kappa) * z
+
+    def product_integral(self, S):
+        """
+        Compute the outer product integral
+        :param S: Borel set
+        :return: m times m matrix with integrate entries
+        """
+        assert S.d == 1 or S.d == 2
+        if S.d == 1:
+            a = S.bounds[0, 0]
+            b = S.bounds[0, 1]
+            h = self.m // 2
+            Psi = torch.zeros(size=(self.m, self.m)).double()
+            for i in range(h):
+                for j in range(h):
+                    Psi[i, j] = torch.sqrt(
+                        self.weights[i] * self.weights[j]
+                    ) * quad_help.integrate_cos_cos(
+                        a, b, self.W[i, 0], self.W[j, 0]
+                    )  # cos cos
+                    Psi[i, j + h] = torch.sqrt(
+                        self.weights[i] * self.weights[j]
+                    ) * quad_help.integrate_sin_cos(
+                        a, b, self.W[i, 0], self.W[j, 0]
+                    )  # cos sin
+                    Psi[i + h, j] = torch.sqrt(
+                        self.weights[j] * self.weights[i]
+                    ) * quad_help.integrate_sin_cos(
+                        a, b, self.W[j, 0], self.W[i, 0]
+                    )  # sin cos
+                    Psi[i + h, j + h] = torch.sqrt(
+                        self.weights[i] * self.weights[j]
+                    ) * quad_help.integrate_sin_sin(
+                        a, b, self.W[i, 0], self.W[j, 0]
+                    )  # sin sin
+        elif S.d == 2:
+            xa = S.bounds[0, 0]
+            xb = S.bounds[0, 1]
+            ya = S.bounds[1, 0]
+            yb = S.bounds[1, 1]
+            h = self.m // 2
+            Psi = torch.zeros(size=(self.m, self.m)).double()
+            for i in range(h):
+                for j in range(h):
+                    Psi[i, j] = torch.sqrt(
+                        self.weights[i] * self.weights[j]
+                    ) * quad_help.integrate2d_cos_cos(
+                        xa,
+                        ya,
+                        xb,
+                        yb,
+                        self.W[i, 0],
+                        self.W[i, 1],
+                        self.W[j, 0],
+                        self.W[j, 1],
+                    )  # cos cos
+                    Psi[i, j + h] = torch.sqrt(
+                        self.weights[i] * self.weights[j]
+                    ) * quad_help.integrate2d_sin_cos(
+                        xa,
+                        ya,
+                        xb,
+                        yb,
+                        self.W[i, 0],
+                        self.W[i, 1],
+                        self.W[j, 0],
+                        self.W[j, 1],
+                    )  # cos sin
+                    Psi[i + h, j] = torch.sqrt(
+                        self.weights[j] * self.weights[i]
+                    ) * quad_help.integrate2d_sin_cos(
+                        xa,
+                        ya,
+                        xb,
+                        yb,
+                        self.W[j, 0],
+                        self.W[j, 1],
+                        self.W[i, 0],
+                        self.W[i, 1],
+                    )  # sin cos
+                    Psi[i + h, j + h] = torch.sqrt(
+                        self.weights[i] * self.weights[j]
+                    ) * quad_help.integrate2d_sin_sin(
+                        xa,
+                        ya,
+                        xb,
+                        yb,
+                        self.W[i, 0],
+                        self.W[i, 1],
+                        self.W[j, 0],
+                        self.W[j, 1],
+                    )  # sin sin
+        return self.kappa * Psi
+
+    def compute(self, complexity_reorder=True):
+        """
+                Computes the tensor grid for Fourier features
+        :return:
+        """
+
+        if self.cosine == False:
+            self.q = int(np.power(self.m // 2, 1.0 / self.d))
+            self.m = self.q**self.d
+        else:
+            self.q = int(np.power(self.m, 1.0 / self.d))
+            self.m = self.q**self.d
+
+        (omegas, weights) = self.nodesAndWeights(self.q)
+
+        if complexity_reorder == True:
+            (omegas, weights) = self.reorder_complexity(omegas, weights)
+
+        self.weights = helper.cartesian([weights for weight in range(self.d)])
+        self.weights = np.prod(self.weights, axis=1)
+
+        v = [omegas for omega in range(self.d)]
+        self.W = helper.cartesian(v)
+
+        if self.cosine == False:
+            self.m = self.m * 2
+        else:
+            pass
+
+        self.W = torch.from_numpy(self.W)
+        self.weights = torch.from_numpy(self.weights)
+
+    def transform(self):
+        """
+
+        :return: spectral density of a kernel
+        """
+        if self.kernel == "squared_exponential":
+            p = (
+                lambda omega: np.exp(
+                    -np.sum(omega**2, axis=1).reshape(-1, 1) / 2 * (self.gamma**2)
+                )
+                * np.power((self.gamma / np.sqrt(2 * np.pi)), 1.0)
+                * np.power(np.pi / 2, 1.0)
+            )
+
+        elif self.kernel == "laplace":
+            p = lambda omega: np.prod(
+                1.0 / ((self.gamma**2) * (omega**2) + 1.0), axis=1
+            ).reshape(-1, 1) * np.power(self.gamma / 2.0, 1.0)
+
+        elif self.kernel == "modified_matern":
+            if self.nu == 2:
+                p = lambda omega: np.prod(
+                    1.0 / ((self.gamma**2) * (omega**2) + 1.0) ** self.nu, axis=1
+                ).reshape(-1, 1) * np.power(self.gamma * 1, 1.0)
+            elif self.nu == 3:
+                p = lambda omega: np.prod(
+                    1.0 / ((self.gamma**2) * (omega**2) + 1.0) ** self.nu, axis=1
+                ).reshape(-1, 1) * np.power(self.gamma * 4 / 3, 1.0)
+            elif self.nu == 4:
+                p = lambda omega: np.prod(
+                    1.0 / ((self.gamma**2) * (omega**2) + 1.0) ** self.nu, axis=1
+                ).reshape(-1, 1) * np.power(self.gamma * 8 / 5, 1.0)
+
+        return p
+
+    def nodesAndWeights(self, q):
+        """
+        Compute nodes and weights of the quadrature scheme in 1D
+
+        :param q: degree of quadrature
+        :return: tuple of (nodes, weights)
+        """
+
+        # For oscillatory integrands even this has good properties.
+        # weights = np.ones(self.q) * self.scale * np.pi / (self.q + 1)
+        # omegas = (np.linspace(0, self.q - 1, self.q)) + 1
+        # omegas = omegas * (np.pi / (self.q + 1))
+
+        (omegas, weights) = np.polynomial.legendre.leggauss(2 * q)
+
+        omegas = omegas[q:]
+        weights = 2 * weights[q:]
+
+        omegas = ((omegas + 1.0) / 2.0) * np.pi
+        sine_scale = 1.0 / (np.sin(omegas) ** 2)
+        omegas = self.scale / np.tan(omegas)
+        prob = self.transform()
+        weights = (
+            self.scale * sine_scale * weights * prob(omegas.reshape(-1, 1)).flatten()
+        )
+        return (omegas, weights)
+
+    def embed(self, x):
+        """
+        :param x: torch array
+        :return: embedding of x
+        """
+        (times, d) = tuple(x.size())
+        # z = torch.from_numpy(np.zeros(shape=(self.m, times),dtype=x.dtype))
+        z = torch.zeros(self.m, times, dtype=x.dtype)
+        q = torch.mm(self.W[:, 0:d], torch.t(x))
+
+        if self.cosine == False:
+            z[0 : int(self.m / 2), :] = torch.sqrt(
+                self.weights.view(-1, 1)
+            ) * torch.cos(q)
+            z[int(self.m / 2) : self.m, :] = torch.sqrt(
+                self.weights.view(-1, 1)
+            ) * torch.sin(q)
+        else:
+            z = torch.sqrt(self.weights.view(-1, 1)) * torch.cos(q)
+
+        return torch.t(z) * np.sqrt(self.kappa)
+
+    def get_sub_indices(self, group):
+        """
+        :param group: group part of the embedding to embed
+        :return: list of sub-indices of the embedding for the given group
+        """
+        m2 = self.m
+        mhalf = int(np.power(self.m // 2, 1.0 / self.d))
+
+        m = 2 * mhalf
+        mquater = mhalf // 2
+
+        if group == 0:
+            ind = (
+                np.arange(mquater * mhalf, (mquater + 1) * mhalf, 1).tolist()
+                + np.arange(
+                    m2 // 2 + (mquater * mhalf), m2 // 2 + (mquater + 1) * mhalf, 1
+                ).tolist()
+            )
+            return ind
+        else:
+            ind = (
+                np.arange(mquater, m2 // 2, mhalf).tolist()
+                + np.arange(m2 // 2 + mquater, m2, mhalf).tolist()
+            )
+            return ind
+
+    def get_sum_sub_indices(self, group):
+
+        # identify unique values
+        arr = self.W[:, group]
+        values = np.unique(arr)
+        # find indices of each unique value
+        ind = []
+        for value in values:
+            ind_inside = []
+            for index, elem in enumerate(arr):
+                if elem == value:
+                    ind_inside.append(index)
+            ind.append(ind_inside)
+            ind_inside2 = [i + self.m // 2 for i in ind_inside]
+            ind.append(ind_inside2)
+        return ind
 
 
 class TrapezoidalEmbedding(QuadratureEmbedding):
 
-	def __init__(self, **kwargs):
-		QuadratureEmbedding.__init__(self, **kwargs)
-		if self.kernel != "squared_exponential":
-			raise AssertionError("This embeding is allowed only with Squared Exponential Kernel")
+    def __init__(self, **kwargs):
+        QuadratureEmbedding.__init__(self, **kwargs)
+        if self.kernel != "squared_exponential":
+            raise AssertionError(
+                "This embeding is allowed only with Squared Exponential Kernel"
+            )
 
-	def nodesAndWeights(self, q):
-		prob = self.transform()
-		# prob = lambda x:
-		h = np.sqrt(np.pi / q) / self.gamma ** 2
+    def nodesAndWeights(self, q):
+        prob = self.transform()
+        # prob = lambda x:
+        h = np.sqrt(np.pi / q) / self.gamma**2
 
-		nodes = np.linspace(-q // 2, q // 2, q) * h
-		# print (nodes)
+        nodes = np.linspace(-q // 2, q // 2, q) * h
+        # print (nodes)
 
-		weights = h * prob(nodes.reshape(-1, 1)).flatten() * (2 / np.pi)
+        weights = h * prob(nodes.reshape(-1, 1)).flatten() * (2 / np.pi)
 
-		# nodes = np.sqrt(2) * nodes / self.gamma
+        # nodes = np.sqrt(2) * nodes / self.gamma
 
-		return (nodes, weights)
+        return (nodes, weights)
 
 
 class ClenshawCurtisEmbedding(QuadratureEmbedding):
 
-	def __init__(self, **kwargs):
-		QuadratureEmbedding.__init__(self, **kwargs)
-		if self.kernel != "squared_exponential":
-			raise AssertionError("This embeding is allowed only with Squared Exponential Kernel")
+    def __init__(self, **kwargs):
+        QuadratureEmbedding.__init__(self, **kwargs)
+        if self.kernel != "squared_exponential":
+            raise AssertionError(
+                "This embeding is allowed only with Squared Exponential Kernel"
+            )
 
-	def nodesAndWeights(self, q):
-		L = 1. / self.gamma
-		prob = self.transform()
-		# prob = lambda x:
+    def nodesAndWeights(self, q):
+        L = 1.0 / self.gamma
+        prob = self.transform()
+        # prob = lambda x:
 
-		nodes_0 = np.linspace(0, q + 1, q + 2)
-		nodes_0 = np.pi * nodes_0[1:-1] / (q + 2)
-		nodes = L / np.tan(nodes_0)
+        nodes_0 = np.linspace(0, q + 1, q + 2)
+        nodes_0 = np.pi * nodes_0[1:-1] / (q + 2)
+        nodes = L / np.tan(nodes_0)
 
-		weights = L * (np.pi / (q + 2)) * (1. / np.sin(nodes_0) ** 2)
-		weights = weights * prob(nodes.reshape(-1, 1)).flatten() * (2. / np.pi)
+        weights = L * (np.pi / (q + 2)) * (1.0 / np.sin(nodes_0) ** 2)
+        weights = weights * prob(nodes.reshape(-1, 1)).flatten() * (2.0 / np.pi)
 
-		return (nodes, weights)
+        return (nodes, weights)
 
-	def nodesAndWeights2(self, q):
-		prob = self.transform()
+    def nodesAndWeights2(self, q):
+        prob = self.transform()
 
-		nodes_0 = np.linspace(0, q + 1, q + 2)
-		nodes_0 = nodes_0[1:-1] / (q + 2) * np.pi
+        nodes_0 = np.linspace(0, q + 1, q + 2)
+        nodes_0 = nodes_0[1:-1] / (q + 2) * np.pi
 
-		nodes = np.sqrt(-np.log(np.sin(nodes_0[0:q // 2])))
-		nodes2 = -np.sqrt(-np.log(np.sin(nodes_0[q // 2:])))
+        nodes = np.sqrt(-np.log(np.sin(nodes_0[0 : q // 2])))
+        nodes2 = -np.sqrt(-np.log(np.sin(nodes_0[q // 2 :])))
 
-		n1 = nodes_0[0:q // 2]
-		n2 = nodes_0[q // 2:]
+        n1 = nodes_0[0 : q // 2]
+        n2 = nodes_0[q // 2 :]
 
-		weights = (1. / np.tan(n1)) * (1. / np.sqrt(-np.log(np.sin(n1)))) * prob(
-			nodes.reshape(-1, 1)).flatten() * np.pi / (q + 2)
-		weights2 = -(1. / np.tan(n2)) * (1. / np.sqrt(-np.log(np.sin(n2)))) * prob(
-			nodes.reshape(-1, 1)).flatten() * np.pi / (q + 2)
+        weights = (
+            (1.0 / np.tan(n1))
+            * (1.0 / np.sqrt(-np.log(np.sin(n1))))
+            * prob(nodes.reshape(-1, 1)).flatten()
+            * np.pi
+            / (q + 2)
+        )
+        weights2 = (
+            -(1.0 / np.tan(n2))
+            * (1.0 / np.sqrt(-np.log(np.sin(n2))))
+            * prob(nodes.reshape(-1, 1)).flatten()
+            * np.pi
+            / (q + 2)
+        )
 
-		nodes = np.concatenate((nodes, nodes2))
-		weights = np.concatenate((weights, weights2))
+        nodes = np.concatenate((nodes, nodes2))
+        weights = np.concatenate((weights, weights2))
 
-		return (nodes, weights)
+        return (nodes, weights)
 
 
 class HermiteEmbedding(QuadratureEmbedding):
-	"""
-		Hermite Quadrature Fourier Features for squared exponential kernel
-	"""
-
-	def __init__(self, ones=False, cosine=False, **kwargs):
-		self.ones = ones
-		self.cosine = cosine
-		QuadratureEmbedding.__init__(self, **kwargs)
-		if self.kernel != "squared_exponential":
-			raise AssertionError("Hermite Embedding is allowed only with Squared Exponential Kernel")
-
-	def nodesAndWeights(self, q):
-		"""
-		Compute nodes and weights of the quadrature scheme in 1D
-
-		:param q: degree of quadrature
-		:return: tuple of (nodes, weights)
-		"""
-		(nodes, weights) = np.polynomial.hermite.hermgauss(2 * q)
-		# print (nodes)
-		nodes = nodes[q:]
-		weights = 2 * weights[q:]
-
-		if self.ones == True:
-			weights = np.ones(q)
-
-		nodes = np.sqrt(2) * nodes / self.gamma
-		weights = weights / np.sqrt(np.pi)
-		return (nodes, weights)
+    """
+    Hermite Quadrature Fourier Features for squared exponential kernel
+    """
+
+    def __init__(self, ones=False, cosine=False, **kwargs):
+        self.ones = ones
+        self.cosine = cosine
+        QuadratureEmbedding.__init__(self, **kwargs)
+        if self.kernel != "squared_exponential":
+            raise AssertionError(
+                "Hermite Embedding is allowed only with Squared Exponential Kernel"
+            )
+
+    def nodesAndWeights(self, q):
+        """
+        Compute nodes and weights of the quadrature scheme in 1D
+
+        :param q: degree of quadrature
+        :return: tuple of (nodes, weights)
+        """
+        (nodes, weights) = np.polynomial.hermite.hermgauss(2 * q)
+        # print (nodes)
+        nodes = nodes[q:]
+        weights = 2 * weights[q:]
+
+        if self.ones == True:
+            weights = np.ones(q)
+
+        nodes = np.sqrt(2) * nodes / self.gamma
+        weights = weights / np.sqrt(np.pi)
+        return (nodes, weights)
 
 
 class OverCompleteHermiteEmbedding(HermiteEmbedding):
 
-	def nodesAndWeights(self, q):
-		"""
-		Compute nodes and weights of the quadrature scheme in 1D
+    def nodesAndWeights(self, q):
+        """
+        Compute nodes and weights of the quadrature scheme in 1D
 
-		:param q: degree of quadrature
-		:return: tuple of (nodes, weights)
-		"""
-		(nodes, weights) = np.polynomial.hermite.hermgauss(q)
-		nodes = nodes
-		weights = weights
+        :param q: degree of quadrature
+        :return: tuple of (nodes, weights)
+        """
+        (nodes, weights) = np.polynomial.hermite.hermgauss(q)
+        nodes = nodes
+        weights = weights
 
-		nodes = np.sqrt(2) * nodes / self.gamma
-		weights = weights / np.sqrt(np.pi)
-		return (nodes, weights)
+        nodes = np.sqrt(2) * nodes / self.gamma
+        weights = weights / np.sqrt(np.pi)
+        return (nodes, weights)
 
 
 class MaternEmbedding(QuadratureEmbedding):
-	"""
-		Matern specific quadrature based Fourier Features
-	"""
+    """
+    Matern specific quadrature based Fourier Features
+    """
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
-		if self.kernel != "modified_matern" and self.kernel != "laplace":
-			raise AssertionError("Matern Embedding is allowed only with Matern Kernel")
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if self.kernel != "modified_matern" and self.kernel != "laplace":
+            raise AssertionError("Matern Embedding is allowed only with Matern Kernel")
 
-	def nodesAndWeights(self, q):
-		"""
-		Compute nodes and weights of the quadrature scheme in 1D
+    def nodesAndWeights(self, q):
+        """
+        Compute nodes and weights of the quadrature scheme in 1D
 
-		:param q: degree of quadrature
-		:return: tuple of (nodes, weights)
-		"""
-		(nodes, weights) = np.polynomial.hermite.hermgauss(q)
-		nodes = np.sqrt(2) * nodes / self.gamma
-		weights = weights / np.sqrt(np.pi)
-		return (nodes, weights)
+        :param q: degree of quadrature
+        :return: tuple of (nodes, weights)
+        """
+        (nodes, weights) = np.polynomial.hermite.hermgauss(q)
+        nodes = np.sqrt(2) * nodes / self.gamma
+        weights = weights / np.sqrt(np.pi)
+        return (nodes, weights)
 
 
 class QuadPeriodicEmbedding(QuadratureEmbedding):
-	"""
-		General class implementing
-	"""
+    """
+    Quadrature embedding with nodes scale / tan(k * pi / (q + 1)) for k = 1, ..., q
+    """
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
 
-	def nodesAndWeights(self, q):
-		"""
-		Compute nodes and weights of the quadrature scheme in 1D
+    def nodesAndWeights(self, q):
+        """
+        Compute nodes and weights of the quadrature scheme in 1D
 
-		:param q: degree of quadrature
-		:return: tuple of (nodes, weights)
-		"""
-		weights = np.ones(self.q) * self.scale * 2 / (self.q + 1)
-		omegas = (np.linspace(0, self.q - 1, self.q)) + 1
-		omegas = omegas * (np.pi / (self.q + 1))
+        :param q: degree of quadrature
+        :return: tuple of (nodes, weights)
+        """
+        weights = np.ones(self.q) * self.scale * 2 / (self.q + 1)
+        omegas = (np.linspace(0, self.q - 1, self.q)) + 1
+        omegas = omegas * (np.pi / (self.q + 1))
 
-		sine_scale = (1. / (np.sin(omegas) ** 2))
-		omegas = self.scale / np.tan(omegas)
-		prob = self.transform()
-		weights = self.scale * sine_scale * weights * prob(omegas.reshape(-1, 1)).flatten()
-		return (omegas, weights)
+        sine_scale = 1.0 / (np.sin(omegas) ** 2)
+        omegas = self.scale / np.tan(omegas)
+        prob = self.transform()
+        weights = (
+            self.scale * sine_scale * weights * prob(omegas.reshape(-1, 1)).flatten()
+        )
+        return (omegas, weights)
 
 
 class KLEmbedding(QuadratureEmbedding):
-	"""
-		General class implementing Karhunen-Loeve expansion
-	"""
+    """
+    General class implementing Karhunen-Loeve expansion
+    """
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
 
 
 class LatticeEmbedding(QuadratureEmbedding):
-	"""
-		Class for standard basis indexed by natural numbers
-	"""
+    """
+    Class for standard basis indexed by natural numbers
+    """
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
 
-	# if self.kernel != "modified_matern" and self.kernel !="laplace":
-	#	raise AssertionError("Matern Embedding is allowed only with Matern Kernel")
+    # if self.kernel != "modified_matern" and self.kernel !="laplace":
+    # 	raise AssertionError("Matern Embedding is allowed only with Matern Kernel")
 
-	def nodesAndWeights(self, q):
-		"""
-		Compute nodes and weights of the quadrature scheme in 1D
+    def nodesAndWeights(self, q):
+        """
+        Compute nodes and weights of the quadrature scheme in 1D
 
-		:param q: degree of quadrature
-		:return: tuple of (nodes, weights)
-		"""
-		nodes = np.arange(1, q + 1, 1)
-		nodes = np.sqrt(2) * nodes / self.gamma
-		weights = np.ones(q) / (2 * q)
-		return (nodes, weights)
+        :param q: degree of quadrature
+        :return: tuple of (nodes, weights)
+        """
+        nodes = np.arange(1, q + 1, 1)
+        nodes = np.sqrt(2) * nodes / self.gamma
+        weights = np.ones(q) / (2 * q)
+        return (nodes, weights)
 
 
 class ConcatEmbedding(Embedding):
 
-	def __init__(self, embeddings: List[Embedding]):
+    def __init__(self, embeddings: List[Embedding]):
 
-		self.embeddings = embeddings
-		self.m = sum([emb.get_m() for emb in embeddings])
+        self.embeddings = embeddings
+        self.m = sum([emb.get_m() for emb in embeddings])
 
-	def embed(self, xtest):
-		return torch.hstack([emb.embed(xtest) for emb in self.embeddings])
+    def embed(self, xtest):
+        return torch.hstack([emb.embed(xtest) for emb in self.embeddings])
 
 
 class MaskedEmbedding(Embedding):
 
-	def __init__(self, embedding: Embedding, mask: Callable):
-		self.embedding = embedding
-		self.m = self.embedding.get_m()
-		self.mask = mask
-
-	def embed(self, xtest):
-		return torch.diag(self.mask(xtest))@self.embedding.embed(xtest)
-
-
-class AdditiveEmbeddings():
-
-	def __init__(self, embeddings, ms, groups=None, scaling=None, additive=True):
-		self.emebeddings = embeddings
-		if scaling is None:
-			self.scaling = torch.ones(len(self.emebeddings)).double()  # /np.sqrt(len(self.emebeddings))
-		else:
-			self.scaling = scaling
-		self.additive = additive
-
-		if groups is not None:
-			self.groups = groups
-		else:
-			self.groups = [[i] for i in range(len(self.emebeddings))]
-
-		try:
-			self.ms = torch.Tensor(ms)
-		except:
-			self.ms = ms
-
-		self.no_emb = len(self.emebeddings)
-		self.m  = torch.sum(self.ms)
-
-	def embed(self, x):
-		if self.additive:
-			r = torch.zeros(size=(x.size()[0], int(torch.sum(self.ms)))).double()
-			count = 0
-			for index, embedding in enumerate(self.emebeddings):
-				r[:, count:count + int(self.ms[index])] = \
-					embedding.embed(x[:, self.groups[index]].view(-1, len(self.groups[index]))) * self.scaling[index]
-				count = count + int(self.ms[index])
-			return r
-		else:
-			pass
-
-
-class ProjectiveEmbeddings():
-
-	def __init__(self, embedding, project):
-		self.embedding = embedding
-		self.project = project
-
-	def embed(self, x):
-		r = self.embedding.embed(self.project(x))
-		return r
+    def __init__(self, embedding: Embedding, mask: Callable):
+        self.embedding = embedding
+        self.m = self.embedding.get_m()
+        self.mask = mask
+
+    def embed(self, xtest):
+        return torch.diag(self.mask(xtest)) @ self.embedding.embed(xtest)
+
+
+class AdditiveEmbeddings:
+
+    def __init__(self, embeddings, ms, groups=None, scaling=None, additive=True):
+        self.emebeddings = embeddings
+        if scaling is None:
+            self.scaling = torch.ones(
+                len(self.emebeddings)
+            ).double()  # /np.sqrt(len(self.emebeddings))
+        else:
+            self.scaling = scaling
+        self.additive = additive
+
+        if groups is not None:
+            self.groups = groups
+        else:
+            self.groups = [[i] for i in range(len(self.emebeddings))]
+
+        try:
+            self.ms = torch.tensor(ms)
+        except:
+            self.ms = ms
+
+        self.no_emb = len(self.emebeddings)
+        self.m = torch.sum(self.ms)
+
+    def embed(self, x):
+        if self.additive:
+            r = torch.zeros(size=(x.size()[0], int(torch.sum(self.ms)))).double()
+            count = 0
+            for index, embedding in enumerate(self.emebeddings):
+                r[:, count : count + int(self.ms[index])] = (
+                    embedding.embed(
+                        x[:, self.groups[index]].view(-1, len(self.groups[index]))
+                    )
+                    * self.scaling[index]
+                )
+                count = count + int(self.ms[index])
+            return r
+        else:
+            pass
+
+
+class ProjectiveEmbeddings:
+
+    def __init__(self, embedding, project):
+        self.embedding = embedding
+        self.project = project
+
+    def embed(self, x):
+        r = self.embedding.embed(self.project(x))
+        return r
diff --git a/stpy/embeddings/onehot_embedding.py b/stpy/embeddings/onehot_embedding.py
index cdfdda7..1b32a63 100644
--- a/stpy/embeddings/onehot_embedding.py
+++ b/stpy/embeddings/onehot_embedding.py
@@ -6,30 +6,34 @@
 
 class OnehotEmbedding(Embedding):
 
-	def __init__(self, p, d):
-		self.p = p # max value
-		self.d = d # sites
-		self.m = p*d
+    def __init__(self, p, d):
+        self.p = p  # max value
+        self.d = d  # sites
+        self.m = p * d
 
-	def get_m(self):
-		return self.p*self.d
+    def get_m(self):
+        return self.p * self.d
 
+    def apply(self, x, f):
+        return torch.stack(
+            [f(x_i) for i, x_i in enumerate(torch.unbind(x, dim=0), 0)], dim=0
+        )
 
-	def apply(self,x,f):
-		return torch.stack([f(x_i) for i, x_i in enumerate(torch.unbind(x, dim=0), 0)], dim=0)
+    def embed(self, x):
+        n, d = x.size()
+        out = torch.zeros(n, self.p * self.d).double()
 
-	def embed(self, x):
-		n,d = x.size()
-		out = torch.zeros(n,self.p*self.d).double()
+        f = lambda x: torch.from_numpy(
+            np.array([x[i] + 20 * i for i in range(self.d)])
+        ).int()
+        indices = self.apply(x, f).long()
+        for i in range(n):
+            out[i, indices[i]] = 1.0
 
-		f = lambda x: torch.from_numpy(np.array([x[i]+20*i for i in range(self.d)])).int()
-		indices = self.apply(x,f).long()
-		for i in range(n):
-			out[i,indices[i]] = 1.
+        return out
 
-		return out
 
 if __name__ == "__main__":
-	emb = OnehotEmbedding(20,2)
-	x = torch.Tensor([[2,3],[4,5],[10,19]])
-	print (emb.embed(x))
\ No newline at end of file
+    emb = OnehotEmbedding(20, 2)
+    x = torch.tensor([[2, 3], [4, 5], [10, 19]])
+    print(emb.embed(x))
diff --git a/stpy/embeddings/optimal_positive_basis.py b/stpy/embeddings/optimal_positive_basis.py
index 170d018..2768527 100644
--- a/stpy/embeddings/optimal_positive_basis.py
+++ b/stpy/embeddings/optimal_positive_basis.py
@@ -1,187 +1,430 @@
-import pickle
+from typing import Literal
 
 import numpy as np
 import scipy
+from stpy.helpers.voxel_grid import voxel_grid
+from stpy.helpers.parallel_interpolation import InterpolatorArray
 import torch
 
 from stpy.borel_set import BorelSet
-from stpy.continuous_processes.nystrom_fea import NystromFeatures
 from stpy.embeddings.positive_embedding import PositiveEmbedding
 from stpy.kernels import KernelFunction
+from sklearn.decomposition import NMF
+from nmf import run_nmf
+from stpy.helpers.posterior_sampling import tmg
+from fast_pytorch_kmeans import KMeans
 
 
 class OptimalPositiveBasis(PositiveEmbedding):
 
-	def __init__(self, *args, samples=300, discretization_size=30, saved=False, **kwargs):
-		super().__init__(*args, **kwargs)
-		self.samples = np.maximum(samples, self.m)
-
-		B = BorelSet(self.d, torch.Tensor([[self.interval[0], self.interval[1]] for _ in range(self.d)]).double())
-		self.discretized_domain = B.return_discretization(discretization_size)
-
-		y = self.discretized_domain[:, 0].view(-1, 1) * 0
-
-		print("Optimal basis with arbitrary dimension, namely d =", self.d)
-		print("Starting optimal basis construction, with m =", self.m)
-		# self.new_kernel_object = KernelFunction(kernel_name=self.kernel_object.optkernel,
-		#										gamma = self.kernel_object.gamma, d = self.kernel_object.d)
-
-		self.new_kernel_object = self.kernel_object
-		if saved == True:
-			print("Did not load GP object, it needs to loaded")
-		else:
-			self.GP = NystromFeatures(self.new_kernel_object, m=self.m, approx='positive_svd',
-									  samples=self.samples)
-			self.GP.fit_gp(self.discretized_domain, y)
-			print("Optimal basis constructed.")
-			if torch.sum(torch.isnan(self.GP.embed(self.discretized_domain))) > 0:
-				print("Failed basis? (zero is good):", torch.sum(torch.isnan(self.GP.embed(self.discretized_domain))))
-		self.precomp_integral = {}
-
-	def get_m(self):
-		return self.m
-
-	def basis_fun(self, x, j):
-		return self.GP.embed(x)[:, j].view(-1, 1)
-
-	def embed_internal(self, x):
-		out = torch.zeros(size=(x.size()[0], self.m), dtype=torch.float64)
-		for j in range(self.m):
-			out[:, j] = self.basis_fun(x, j).view(-1)
-		return out
-
-	def save_embedding(self, filename):
-		filehandler = open(filename, 'w')
-		pickle.dump(self.GP, filehandler)
-
-	def load_embedding(self, filename):
-		file_pi2 = open(filename, 'r')
-		self.GP = pickle.load(file_pi2)
-
-	def get_constraints(self):
-		s = self.get_m()
-		l = np.full(s, 0.0).astype(float)
-		u = np.full(s, 10e10)
-		Lambda = np.identity(s)
-		return (l, Lambda, u)
-
-	def integral(self, S):
-		assert (S.d == self.d)
-
-		if S in self.precomp_integral.keys():
-			return self.precomp_integral[S]
-		else:
-			if S.d == 1:
-				weights, nodes = S.return_legendre_discretization(n=256)
-				psi = torch.sum(torch.diag(weights) @ self.GP.embed(nodes), dim=0)
-				Gamma_half = self.cov()
-				psi = Gamma_half.T @ psi
-				self.precomp_integral[S] = psi
-			elif S.d == 2:
-				weights, nodes = S.return_legendre_discretization(n=50)
-				vals = self.embed_internal(nodes)
-				psi = torch.sum(torch.diag(weights) @ vals, dim=0)
-				Gamma_half = self.cov()
-				psi = Gamma_half.T @ psi
-				self.precomp_integral[S] = psi
-				if torch.sum(torch.isnan(psi)) > 0:
-					print("Failed integrals? (0 is good):", torch.sum(torch.isnan(psi)))
-
-			else:
-				raise NotImplementedError("Higher dimension not implemented.")
-			return psi
-
-	def cov(self, inverse=False):
-
-		if self.precomp == False:
-
-			x = self.discretized_domain
-			vals = self.GP.embed(x)
-			indices = torch.argmax(vals, dim=0)  # the nodes are the maxima of the bump functions
-			t = x[indices]
-			print("nodes of functions", t.size())
-
-			self.Gamma = self.kernel(t, t)
-			Z = self.embed_internal(t)
-
-			M = torch.pinverse(Z.T @ Z + (self.s) * torch.eye(self.Gamma.size()[0]))
-			self.M = torch.from_numpy(np.real(scipy.linalg.sqrtm(M.numpy())))
-
-			self.Gamma_half = torch.from_numpy(
-				np.real(scipy.linalg.sqrtm(self.Gamma.numpy() + (self.s ** 2) * np.eye(self.Gamma.size()[0]))))
-			self.Gamma_half = self.M @ self.Gamma_half
-			self.invGamma_half = torch.pinverse(self.Gamma_half)
-			self.precomp = True
-		else:
-			pass
-
-		if inverse == True:
-			return self.Gamma_half, self.invGamma_half
-		else:
-			return self.Gamma_half
+    def __init__(
+        self,
+        *args,
+        samples=300,
+        discretization_size=30,
+        data: torch.Tensor | BorelSet,
+        fast_sampling=True,  # samples using squared Gaussian instead of truncated Gaussian
+        memory_limit=5,  # memory budget in units of 1e9 bytes; caps the number of points used for optimal basis construction
+        sample_algorithm: Literal[
+            "grid", "kmeans"
+        ] = "grid",  # How to subsample if points are limited
+        **kwargs,
+    ):
+        # `data` is the set of points that the basis is optimal for if it is a tensor;
+        # otherwise it is the region that the basis is optimal for, which is discretized
+        # by discretization_size. If it is None, the entire domain is used.
+        super().__init__(*args, **kwargs)
+        self.sample_algorithm = sample_algorithm
+        self.num_samples = np.maximum(samples, self.m)
+        self.fast = fast_sampling
+        self.memory_limit = memory_limit if memory_limit is not None else 40
+        self.interpolators = None
+
+        if data is None:
+            B = BorelSet(
+                self.d,
+                torch.tensor(
+                    [[self.interval[0], self.interval[1]] for _ in range(self.d)]
+                ).double(),
+            )
+            self.discretized_domain = B.return_discretization(discretization_size)
+        elif isinstance(data, BorelSet):
+            self.discretized_domain = data.return_discretization(discretization_size)
+        else:
+            self.discretized_domain = data
+
+        y = self.discretized_domain[:, 0].view(-1, 1) * 0
+
+        print("Optimal basis with arbitrary dimension, namely d =", self.d)
+        print("Starting optimal basis construction, with m =", self.m)
+        # self.new_kernel_object = KernelFunction(kernel_name=self.kernel_object.optkernel,
+        # 										gamma = self.kernel_object.gamma, d = self.kernel_object.d)
+
+        self.new_kernel_object = self.kernel_object
+        self._fit_data(data=data)
+        print("Optimal basis constructed.")
+        if torch.sum(torch.isnan(self.embed_internal(self.discretized_domain))) > 0:
+            print(
+                "Failed basis? (zero is good):",
+                torch.sum(torch.isnan(self.embed_internal(self.discretized_domain))),
+            )
+        self.precomp_integral = {}
+
+    def get_m(self):
+        return self.m
+
+    def embed_internal(self, x):
+        out = torch.zeros([len(x), self.m], dtype=torch.float64)
+        for j in range(self.m):
+            out[:, j] = self.basis_fun(x, j).view(-1)
+        return out
+
+    def basis_fun(self, x, j):
+        raise Exception("Fit on data before using")
+
+    def get_constraints(self):
+        s = self.get_m()
+        l = np.full(s, 0.0).astype(float)
+        u = np.full(s, 10e10)
+        Lambda = np.identity(s)
+        return (l, Lambda, u)
+
+    def integral(self, S):
+        assert S.d == self.d
+
+        if S in self.precomp_integral.keys():
+            return self.precomp_integral[S]
+        else:
+            if S.d == 1:
+                weights, nodes = S.return_legendre_discretization(n=256)
+                psi = torch.sum(torch.diag(weights) @ self.embed_internal(nodes), dim=0)
+                Gamma_half = self.cov()
+                psi = Gamma_half.T @ psi
+                self.precomp_integral[S] = psi
+            elif S.d == 2:
+                weights, nodes = S.return_legendre_discretization(n=50)
+                vals = self.embed_internal(nodes)
+                psi = torch.sum(torch.diag(weights) @ vals, dim=0)
+                Gamma_half = self.cov()
+                psi = Gamma_half.T @ psi
+                self.precomp_integral[S] = psi
+                if torch.sum(torch.isnan(psi)) > 0:
+                    print("Failed integrals? (0 is good):", torch.sum(torch.isnan(psi)))
+
+            else:
+                raise NotImplementedError("Higher dimension not implemented.")
+            return psi
+
+    def cov(self, inverse=False):
+
+        if self.precomp == False:
+
+            x = self.discretized_domain
+            vals = self.embed_internal(x)
+            indices = torch.argmax(
+                vals, dim=0
+            )  # the nodes are the maxima of the bump functions
+            t = x[indices]
+            print("nodes of functions", t.size())
+
+            self.Gamma = self.kernel(t, t)
+            Z = self.embed_internal(t)
+
+            M = torch.pinverse(Z.T @ Z + (self.s) * torch.eye(self.Gamma.size()[0]))
+            self.M = torch.tensor(np.real(scipy.linalg.sqrtm(M.cpu().numpy())))
+
+            self.Gamma_half = torch.tensor(
+                np.real(
+                    scipy.linalg.sqrtm(
+                        self.Gamma.cpu().numpy()
+                        + (self.s**2) * np.eye(self.Gamma.size()[0])
+                    )
+                )
+            )
+            self.Gamma_half = self.M @ self.Gamma_half
+            self.invGamma_half = torch.pinverse(self.Gamma_half)
+            self.precomp = True
+        else:
+            pass
+
+        if inverse == True:
+            return self.Gamma_half, self.invGamma_half
+        else:
+            return self.Gamma_half
+
+    def _sample_gaussian_prior(self, x: torch.Tensor):
+        n = self.num_samples
+        dim = len(x)
+        Cov = self.kernel_object.kernel(x, x) + 10e-7 * torch.eye(
+            dim, dtype=torch.float64
+        )
+        L = torch.linalg.cholesky(Cov)
+        if self.fast:
+            random_vector = torch.normal(
+                mean=torch.zeros(dim, n, dtype=torch.float64), std=1.0
+            )
+            y = torch.mm(L, random_vector) ** 2
+        else:
+            y = torch.tensor(
+                tmg(
+                    n,
+                    np.zeros([dim], dtype=np.float64),
+                    Cov.cpu().numpy(),
+                    np.ones([dim], dtype=np.float64),
+                    np.eye(dim, dtype=np.float64),
+                    np.zeros(dim, dtype=np.float64),
+                    verbose=True,
+                ),
+                dtype=torch.float64,
+            )
+        return y, L
+
+    def _sample_gaussian_conditional(self, x_old, L_old, y_old, x):
+        dim = len(x)  # number of new points (rows of x)
+        n = y_old.size(1)  # number of samples
+
+        K_new_new = self.kernel_object.kernel(x, x) + 1e-7 * torch.eye(
+            dim, dtype=torch.float64
+        )
+        K_new_old = self.kernel_object.kernel(x_old, x)
+
+        alpha = torch.linalg.solve_triangular(L_old, y_old, upper=False)
+        alpha = torch.linalg.solve_triangular(L_old.T, alpha, upper=True)
+
+        mu_star = K_new_old @ alpha  # shape (dim, n)
+        # TODO check if kernel is always symmetric
+        K_old_new = K_new_old.T  # shape (dim_old, dim)
+
+        tmp = torch.linalg.solve_triangular(L_old, K_old_new, upper=False)
+        tmp2 = torch.linalg.solve_triangular(L_old.T, tmp, upper=True)
+
+        Sigma_star = (
+            K_new_new - (K_new_old @ tmp2) + 1e-7 * torch.eye(dim, dtype=torch.float64)
+        )
+
+        L_star = torch.linalg.cholesky(Sigma_star)
+        if self.fast:
+            random_vector_new = torch.normal(
+                mean=torch.zeros(dim, n, dtype=torch.float64), std=1.0
+            )
+            y_new = (mu_star + L_star @ random_vector_new) ** 2
+        else:
+            y_new = torch.tensor(
+                tmg(
+                    n,
+                    mu_star.cpu().numpy(),
+                    Sigma_star.cpu().numpy(),
+                    np.ones([dim], dtype=np.float64),
+                    np.eye(dim, dtype=np.float64),
+                    np.zeros(dim, dtype=np.float64),
+                    verbose=True,
+                ),
+                dtype=torch.float64,
+            )
+
+        return y_new
+
+    def _subsample_if_necessary(self, x: torch.Tensor):
+        # Calculate number of clusters
+        n_clusters = (self.memory_limit * 1_000_000_000) / x.element_size()
+        # Since we want to calculate the cholesky decomp of the cov matrix of the data plus roi (expected to be 1% of data)
+        n_clusters = int(np.sqrt(n_clusters) * 0.99 / 2.0)
+
+        if len(x) > n_clusters:
+            if self.sample_algorithm == "grid":
+                centroids = voxel_grid(x, max_n_voxels=n_clusters)
+                print(
+                    f"Approximated data set with {len(centroids)} points for optimal"
+                    " basis."
+                )
+                return centroids
+            elif self.sample_algorithm == "kmeans":
+                # Calculate maximum size of mini batch
+                n_samples, n_features = x.shape
+                SAFETY_FACTOR = 1.5
+                max_batch_size = int(
+                    (
+                        self.memory_limit * 1_000_000_000
+                        - 0.8 * n_samples
+                        - 2 * n_clusters * n_features * x.element_size()
+                    )
+                    // (
+                        (
+                            n_features * n_clusters * x.element_size()
+                            + n_features * x.element_size()
+                        )
+                        * SAFETY_FACTOR
+                    )
+                )
+                if max_batch_size >= n_samples:
+                    max_batch_size = None
+
+                print(
+                    f"Approximating data set with {n_clusters} points from"
+                    f" {len(x)} points for optimal basis."
+                    + (
+                        f"Using batch size {max_batch_size}"
+                        if max_batch_size is not None
+                        else ""
+                    )
+                )
+                kmeans = KMeans(
+                    n_clusters=n_clusters,
+                    mode="euclidean",
+                    verbose=1,
+                    minibatch=max_batch_size,
+                )
+                kmeans.fit_predict(x)
+                centroids = kmeans.centroids
+
+                return centroids
+        else:
+            print("No subsampling necessary because data fits into memory")
+            return x
+
+    def _fit_data(self, data):
+        self.data_m = self.m
+        data = self._subsample_if_necessary(data)
+        self.F_data, self.L_data = self._sample_gaussian_prior(data)
+        self.F_data = self.F_data**2
+        self.W_data, self.H_data, err = run_nmf(
+            self.F_data,
+            n_components=self.m,
+            tol=1e-12,
+            use_gpu=self.F_data.is_cuda,
+            batch_max_iter=2000,
+            fp_precision=self.F_data.dtype,
+        )
+        self.W_data = torch.tensor(self.W_data)
+        self.H_data = torch.tensor(self.H_data)
+        self.W_data = self.W_data / torch.linalg.norm(self.W_data, dim=0)
+        self.data = data
+        W_norm = self.W_data
+        self._set_interpolators(data, W_norm)
+
+    def basis_fun(self, q: torch.Tensor, j: int):
+        if self.interpolators is None:
+            raise Exception("Fit on data before using")
+
+        return self.interpolators(j, q)
+
+    def _set_interpolators(self, x: torch.Tensor, phi: torch.Tensor):
+        assert x.dtype == phi.dtype
+        self.interpolators = InterpolatorArray(x, phi, self.m)
+
+    def fit(self, roi: torch.Tensor):
+        assert self.data is not None, "Data must be given first"
+        print("Refitting optimal basis")
+        self.precomp = False
+        x = torch.cat((self.data, roi), dim=0)
+        F, _ = self._sample_gaussian_prior(x)
+        F = F**2
+        # Note: using cpu based NMF here since run_nmf has no way to pass initialization
+        model = NMF(n_components=self.data_m, max_iter=200, tol=1e-8, init="custom")
+        phi_roi_init = torch.zeros([len(roi), self.data_m], dtype=torch.float64)
+        W_start = torch.cat((self.W_data, phi_roi_init), dim=0)
+        W = torch.tensor(
+            model.fit_transform(
+                F.cpu().numpy(),
+                W=W_start.cpu().numpy(),
+                H=self.H_data.cpu().numpy(),
+            )
+        )
+        self.Phi = W / torch.linalg.norm(W, dim=0)
+        self.m = self.data_m
+        self._set_interpolators(x, self.Phi)
+        self.precomp = False
+        self.precomp_integral = {}
+
+    def add_new_functions(self, roi: torch.Tensor, n: int):
+        x = torch.cat((self.data, roi), dim=0)
+        F_new = self._sample_gaussian_conditional(
+            self.data, self.L_data, self.F_data, roi
+        )
+        F = torch.cat([self.F_data, F_new])
+        Phi_old = (
+            torch.stack([self.basis_fun(x, j) for j in range(self.data_m)]).squeeze(2).T
+        )
+        Theta_old = self.H_data
+        # TODO: theoretically this is wrong; we would have to solve over both Phi_old and Phi_new.
+        # Also, capping the residual at 0 has no theoretical underpinning.
+        objective = torch.clamp(F - Phi_old @ Theta_old, min=0)
+        Phi_new, Theta_new, err = run_nmf(
+            objective,
+            n_components=n,
+            tol=1e-7,
+            use_gpu=True,
+            batch_max_iter=100,
+            fp_precision=objective.dtype,
+        )
+        Phi_new = torch.tensor(Phi_new)
+        self.Phi = Phi_new / torch.linalg.norm(Phi_new, dim=0)
+        self.m = self.data_m + n
+        self.interpolators.set(1, x, self.Phi, n)
+        self.precomp = False
+        self.precomp_integral = {}
 
 
 if __name__ == "__main__":
 
-	from stpy.continuous_processes.gauss_procc import GaussianProcess
-	from stpy.helpers.helper import interval
-	import matplotlib.pyplot as plt
-	from scipy.interpolate import griddata
-
-	d = 2
-	m = 64
-	n = 64
-	N = 20
-	sqrtbeta = 2
-	s = 0.01
-	b = 0
-	gamma = 0.5
-	k = KernelFunction(gamma=gamma, d=2)
-
-	Emb = OptimalPositiveBasis(d, m, offset=0.2, s=s, b=b, discretization_size=n, B=1000., kernel_object=k)
-
-	GP = GaussianProcess(d=d, s=s)
-	xtest = torch.from_numpy(interval(n, d))
-
-	x = torch.from_numpy(np.random.uniform(-1, 1, size=(N, d)))
-
-	F_true = lambda x: torch.sum(torch.sin(x) ** 2 - 0.1, dim=1).view(-1, 1)
-	F = lambda x: F_true(x) + s * torch.randn(x.size()[0]).view(-1, 1).double()
-	y = F(x)
-
-	# Try to plot the basis functions
-	msqrt = int(np.sqrt(m))
-	fig, axs = plt.subplots(msqrt, msqrt, figsize=(15, 7))
-	for i in range(m):
-		f_i = Emb.basis_fun(xtest, i)  ## basis function
-		xx = xtest[:, 0].numpy()
-		yy = xtest[:, 1].numpy()
-		ax = axs[int(i // msqrt), (i % msqrt)]
-		grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-		grid_z_f = griddata((xx, yy), f_i[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-		cs = ax.contourf(grid_x, grid_y, grid_z_f, levels=10)
-		ax.contour(cs, colors='k')
-		# cbar = fig.colorbar(cs)
-		# if self.x is not None:
-		#	ax.scatter(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), c='r', s=100, marker="o")
-		ax.grid(c='k', ls='-', alpha=0.1)
-
-	plt.savefig("positive.png")
-	plt.show()
-
-	Emb.fit(x, y)
-	GP.fit_gp(x, y)
-
-	mu, _ = Emb.mean_std(xtest)
-	mu_true, _ = GP.mean_std(xtest)
-
-	Emb.visualize_function(xtest, [F_true, lambda x: GP.mean_std(x)[0], lambda x: Emb.mean_std(x)[0]])
-	# Emb.visualize_function(xtest,GP.mean_std)
-	# Emb.visualize_function(xtest,Emb.mean_std)
-
-	# plt.plot(xtest,mu_true,'b--', label = 'GP')
-
-	# plt.plot(x,y,'ro')
-	# plt.plot(xtest, mu, 'g-', label = 'positive basis ')
-	# plt.legend()
-	plt.show()
+    from stpy.continuous_processes.gauss_procc import GaussianProcess
+    from stpy.helpers.helper import interval
+    import matplotlib.pyplot as plt
+    from scipy.interpolate import griddata
+
+    d = 2
+    m = 5
+    n = 64
+    s = 0.01
+    b = 0
+    gamma = 0.5
+    k = KernelFunction(gamma=gamma, d=2)
+
+    xtest = torch.tensor(interval(n, d))
+
+    xnew = xtest[:1000]
+
+    xtest = xtest[1000:]
+
+    Emb = OptimalPositiveBasis(
+        d,
+        m,
+        offset=0.2,
+        s=s,
+        b=b,
+        discretization_size=n,
+        B=1000.0,
+        kernel_object=k,
+        data=xtest,
+    )
+
+    y, L = Emb._sample_prior(xtest, 1)
+
+    fig, ax = plt.subplots(figsize=(10, 6))
+    xx = xtest[:, 0].cpu().numpy()
+    yy = xtest[:, 1].cpu().numpy()
+    sc = ax.scatter(xx, yy, c=y.detach().numpy().reshape(-1), cmap="viridis")
+    ax.grid(c="k", ls="-", alpha=0.1)
+    plt.colorbar(sc)
+    plt.title("Interpolated plot of y over xtest")
+    plt.xlabel("x1")
+    plt.ylabel("x2")
+    plt.show()
+
+    ynew = Emb._sample_conditional(xtest, L, y, xnew)
+
+    xtest = torch.cat([xtest, xnew])
+    y = torch.cat([y, ynew])
+
+    fig, ax = plt.subplots(figsize=(10, 6))
+    xx = xtest[:, 0].cpu().numpy()
+    yy = xtest[:, 1].cpu().numpy()
+    sc = ax.scatter(xx, yy, c=y.detach().numpy().reshape(-1), cmap="viridis")
+    ax.grid(c="k", ls="-", alpha=0.1)
+    plt.colorbar(sc)
+    plt.title("Interpolated plot of y over xtest")
+    plt.xlabel("x1")
+    plt.ylabel("x2")
+    plt.show()
+
+    print("hi")
diff --git a/stpy/embeddings/packing_embedding.py b/stpy/embeddings/packing_embedding.py
index a08d2a7..ce6c77a 100755
--- a/stpy/embeddings/packing_embedding.py
+++ b/stpy/embeddings/packing_embedding.py
@@ -10,111 +10,116 @@
 
 class PackingEmbedding(Embedding):
 
-	def __init__(self, d, m, kernel_object, interval=[-1, 1], n=100, method='svd'):
-		self.d = d
-		self.m = m
-		self.interval = interval
-		self.size = self.get_m()
-		self.kernel_object = kernel_object
-
-		self.kernel = kernel_object.kernel
-		self.n = n
-		self.method = method
-		self.construct()
-
-	def construct(self):
-		xtest = interval_torch(self.n, self.d, offset=[self.interval for _ in range(self.d)])
-		y = xtest[:, 0].view(-1, 1) * 0
-
-		self.new_kernel_object = KernelFunction(kernel_name=self.kernel_object.optkernel,
-												gamma=self.kernel_object.gamma, d=self.d)
-		self.GP = NystromFeatures(self.new_kernel_object, m=self.m, approx=self.method)
-		self.GP.fit_gp(xtest, y)
-
-	def basis_fun(self, x, j):
-		return self.GP.embed(x)[:, j].view(-1, 1)
-
-	def embed(self, x):
-		return self.GP.embed(x)
-
-	def _derivative_1(self, x):
-		dphi = batch_jacobian(self.embed, x).transpose(0, 1)
-		return dphi
-
-	def _derivative_2(self, x):
-		d2phi = batch_hessian(self.embed, x).transpose(0, 1).transpose(0, 2)
-		return d2phi
-
-	def derivative_1(self, x):
-		if self.kernel_object.optkernel == "squared_exponential":
-			xs = self.GP.xs
-			M = self.GP.M
-			derivative = self.kernel_object.derivative_1(xs, x)
-			res = torch.einsum('ij,kil->kjl', M, derivative)
-			return res
-		else:
-			dphi = self._derivative_1(x)
-		return dphi
-
-	def derivative_2(self, x):
-		if self.kernel_object.optkernel == "squared_exponential":
-			xs = self.GP.xs
-			M = self.GP.M
-			derivative = self.kernel_object.derivative_2(xs, x)
-			res = torch.einsum('ij,kilm->kjlm', M, derivative)
-			return res
-		else:
-			d2phi = self._derivative_2(x)
-		return d2phi
+    def __init__(self, d, m, kernel_object, interval=[-1, 1], n=100, method="svd"):
+        self.d = d
+        self.m = m
+        self.interval = interval
+        self.size = self.get_m()
+        self.kernel_object = kernel_object
+
+        self.kernel = kernel_object.kernel
+        self.n = n
+        self.method = method
+        self.construct()
+
+    def construct(self):
+        xtest = interval_torch(
+            self.n, self.d, offset=[self.interval for _ in range(self.d)]
+        )
+        y = xtest[:, 0].view(-1, 1) * 0
+
+        self.new_kernel_object = KernelFunction(
+            kernel_name=self.kernel_object.optkernel,
+            gamma=self.kernel_object.gamma,
+            d=self.d,
+        )
+        self.GP = NystromFeatures(self.new_kernel_object, m=self.m, approx=self.method)
+        self.GP.fit_gp(xtest, y)
+
+    def basis_fun(self, x, j):
+        return self.GP.embed(x)[:, j].view(-1, 1)
+
+    def embed(self, x):
+        return self.GP.embed(x)
+
+    def _derivative_1(self, x):
+        dphi = batch_jacobian(self.embed, x).transpose(0, 1)
+        return dphi
+
+    def _derivative_2(self, x):
+        d2phi = batch_hessian(self.embed, x).transpose(0, 1).transpose(0, 2)
+        return d2phi
+
+    def derivative_1(self, x):
+        if self.kernel_object.optkernel == "squared_exponential":
+            xs = self.GP.xs
+            M = self.GP.M
+            derivative = self.kernel_object.derivative_1(xs, x)
+            res = torch.einsum("ij,kil->kjl", M, derivative)
+            return res
+        else:
+            dphi = self._derivative_1(x)
+        return dphi
+
+    def derivative_2(self, x):
+        if self.kernel_object.optkernel == "squared_exponential":
+            xs = self.GP.xs
+            M = self.GP.M
+            derivative = self.kernel_object.derivative_2(xs, x)
+            res = torch.einsum("ij,kilm->kjlm", M, derivative)
+            return res
+        else:
+            d2phi = self._derivative_2(x)
+        return d2phi
 
 
 if __name__ == "__main__":
-	from stpy.continuous_processes.kernelized_features import KernelizedFeatures
+    from stpy.continuous_processes.kernelized_features import KernelizedFeatures
 
-	d = 1
-	m = 200
-	n = 128
-	N = 10
+    d = 1
+    m = 200
+    n = 128
+    N = 10
 
-	lam = 1.
+    lam = 1.0
 
-	s = 0.0001
-	gamma = 0.1
+    s = 0.0001
+    gamma = 0.1
 
-	xtest = torch.from_numpy(interval(n, d))
-	x = torch.from_numpy(interval(N, d))
+    xtest = torch.from_numpy(interval(n, d))
+    x = torch.from_numpy(interval(N, d))
 
-	kernel_object = KernelFunction(gamma=gamma)
-	Emb = PackingEmbedding(d, m, kernel_object=kernel_object, n=256, method='nothing')
-	print(Emb.GP.M.size())
-	GP = KernelizedFeatures(embedding=Emb, m=m, s=s, lam=lam, d=d)
-	y = GP.sample(x) * 0
-	y[5, 0] = 0.5
+    kernel_object = KernelFunction(gamma=gamma)
+    Emb = PackingEmbedding(d, m, kernel_object=kernel_object, n=256, method="nothing")
+    print(Emb.GP.M.size())
+    GP = KernelizedFeatures(embedding=Emb, m=m, s=s, lam=lam, d=d)
+    y = GP.sample(x) * 0
+    y[5, 0] = 0.5
 
-	GP.fit_gp(x, y)
-	mu, std = GP.mean_std(xtest)
+    GP.fit_gp(x, y)
+    mu, std = GP.mean_std(xtest)
 
-	der = Emb.derivative_1(xtest)[:, :, 0]
-	der_comp = Emb._derivative_1(xtest)[:, :, 0]
+    der = Emb.derivative_1(xtest)[:, :, 0]
+    der_comp = Emb._derivative_1(xtest)[:, :, 0]
 
-	print(torch.norm(der - der_comp))
+    print(torch.norm(der - der_comp))
 
-	der = der @ GP.theta_mean()
-	der_comp = der_comp @ GP.theta_mean()
+    der = der @ GP.theta_mean()
+    der_comp = der_comp @ GP.theta_mean()
 
-	der2 = Emb.derivative_2(xtest)[:, :, 0, 0]
-	der2_comp = Emb._derivative_2(xtest)[:, :, 0, 0]
+    der2 = Emb.derivative_2(xtest)[:, :, 0, 0]
+    der2_comp = Emb._derivative_2(xtest)[:, :, 0, 0]
 
-	print(torch.norm(der2 - der2_comp))
+    print(torch.norm(der2 - der2_comp))
 
-	der2 = der2 @ GP.theta_mean()
-	der2_comp = der2_comp @ GP.theta_mean()
+    der2 = der2 @ GP.theta_mean()
+    der2_comp = der2_comp @ GP.theta_mean()
 
-	plt.plot(xtest, mu)
-	plt.plot(xtest, der)
-	plt.plot(xtest, der_comp, '--')
-	plt.plot(xtest, der2)
-	plt.plot(xtest, der2_comp, '--')
-	plt.plot(x, y, 'bo')
-	plt.grid()
-	plt.show()
+    plt.plot(xtest, mu)
+    plt.plot(xtest, der)
+    plt.plot(xtest, der_comp, "--")
+    plt.plot(xtest, der2)
+    plt.plot(xtest, der2_comp, "--")
+    plt.plot(x, y, "bo")
+    plt.grid()
+    plt.show()
diff --git a/stpy/embeddings/polynomial_embedding.py b/stpy/embeddings/polynomial_embedding.py
index eba9a74..07be852 100755
--- a/stpy/embeddings/polynomial_embedding.py
+++ b/stpy/embeddings/polynomial_embedding.py
@@ -6,7 +6,7 @@
 __email__ = "mojmir.mutny@inf.ethz.ch"
 __status__ = "DEV"
 
-"""
+r"""
 This file implements a polynomial embedding 
 	k(x,y) = \Phi(x)^\top \Phi(y)
 	for kernels of the form (x^\top y + 1)^p
@@ -41,174 +41,189 @@
 from sklearn.preprocessing import PolynomialFeatures
 
 
-class CustomEmbedding():
-	def __init__(self, d, embedding_function, m, groups=None, quadrature="fixed"):
-		self.d = d
-		self.groups = groups
-		self.embedding_function = embedding_function
-		self.m = m
-		self.quadrature = quadrature
-
-	def embed(self, x):
-		return self.embedding_function(x)
-
-	def get_m(self):
-		return self.m
-
-	def integral(self, S):
-		varphi = torch.zeros(size=(self.m, 1)).double()
-
-		if self.quadrature == "fixed":
-			if S.d == 1:
-				weights, nodes = S.return_legendre_discretization(n=512)
-				Z = self.embed(nodes)
-				varphi = torch.einsum('i,ij->j', weights, Z)
-				return varphi.view(-1, 1)
-			elif S.d == 2:
-				weights, nodes = S.return_legendre_discretization(n=50)
-				Z = self.embed(nodes)
-				varphi = torch.einsum('i,ij->j', weights, Z)
-				return varphi.view(-1, 1)
-		else:
-			if S.d == 1:
-				for i in range(self.m):
-					Fi = lambda x: self.embed(torch.from_numpy(np.array(x)).view(1, -1)).view(-1).numpy()
-					val, status = integrate.quad(Fi, float(S.bounds[0, 0]), float(S.bounds[0, 1]))
-					varphi[i] = val
-			elif S.d == 2:
-				for i in range(self.m):
-					Fi = lambda x: self.embed(x).view(-1)[i]
-					integrand = lambda x, y: Fi(torch.Tensor([x, y]).view(1, 2).double()).numpy()
-					val, status = integrate.dblquad(integrand, float(S.bounds[0, 0]), float(S.bounds[0, 1]),
-													lambda x: float(S.bounds[1, 0]),
-													lambda x: float(S.bounds[1, 1]), epsabs=1.49e-03, epsrel=1.49e-03)
-					varphi[i] = val
-			return varphi
-
-
-class PolynomialEmbedding():
-
-	def __init__(self, d, p, kappa=1., groups=None, include_bias=True):
-		self.d = d
-		self.p = p
-		self.kappa = kappa
-		self.groups = groups
-		self.compute(include_bias=include_bias)
-		self.include_bias = include_bias
-
-	def compute(self, include_bias=True):
-		self.poly = PolynomialFeatures(degree=self.p, include_bias=include_bias)
-		if self.groups is None:
-			self.poly.fit_transform(np.random.randn(1, self.d))
-			self.degrees = torch.from_numpy(self.poly.powers_).double()
-			self.size = self.degrees.size()[0]
-		else:
-			self.degrees = []
-			self.size = 0
-			self.sizes = []
-			for group in self.groups:
-				self.poly.fit_transform(np.random.randn(1, len(group)))
-				z = torch.from_numpy(self.poly.powers_).double()
-				self.degrees.append(z)
-				self.sizes.append(z.size()[0])
-				self.size += z.size()[0]
-
-	def embed_group(self, x, j):
-		(n, d) = x.size()
-		x = x.view(n, -1)
-		Phi = torch.zeros(size=(n, self.sizes[j]), dtype=torch.float64)
-		group = self.groups[j]
-		for i in range(n):
-			y = x[i, :]
-			z = y.view(1, len(group))
-			Phi[i, :] = torch.prod(torch.pow(z, self.degrees[j]), dim=1).view(-1)
-		return Phi
-
-	def get_sub_indices(self, group):
-		ind = []
-		for index, elem in enumerate(self.degrees):
-			z = torch.sum(elem[0:group - 2]) + torch.sum(elem[group + 1:])
-			if (elem[group] >= 0.0) and (z <= 0.):
-				ind.append(index)
-		return ind
-
-	def embed(self, x):
-		(n, d) = x.size()
-		# zero = torch.pow(x[0,:] * 0, self.degrees)
-		Phi = torch.zeros(size=(n, self.size), dtype=torch.float64)
-
-		if self.groups is None:
-			for i in range(n):
-				y = x[i, :]
-				Phi[i, :] = torch.prod(torch.pow(y, self.degrees), dim=1)
-		else:
-			for i in range(n):
-				y = x[i, :]
-				for j, group in enumerate(self.groups):
-					z = y[group].view(1, len(group))
-					start = int(np.sum(self.sizes[0:j]))
-					end = np.sum(self.sizes[0:j + 1])
-					Phi[i, start:end] = torch.prod(torch.pow(z, self.degrees[j]), dim=1).view(-1)
-		return np.sqrt(self.kappa) * Phi
-
-	def derivative_1(self, x):
-		pass
-
-	def derivative_2(self, x):
-		pass
-
-
-class ChebyschevEmbedding():
-
-
-	def get_m(self):
-		return self.m
-
-	def __init__(self, d, p, groups=None, include_bias=True):
-		self.d = d
-		self.p = p
-		self.groups = groups
-		self.c = np.ones(self.p)
-		self.poly = cheb.Chebyshev(self.c)
-		self.size = self.p
-		self.m = self.p
-
-	def embed(self, x):
-		out = np.zeros(shape=(int(x.size()[0]), self.p))
-		z = None
-		for p in np.arange(1, self.p + 1, 1):
-			c = np.ones(p)
-			if p > 1:
-				zold = z
-				z = cheb.chebval(x.numpy(), c)
-				out[:, p - 1] = (z - zold).reshape(-1)
-			else:
-				z = cheb.chebval(x.numpy(), c)
-				out[:, p - 1] = z.reshape(-1)
-		return torch.from_numpy(out)
-
-	def derivative_1(self, x):
-		pass
-
-	def derivative_2(self, x):
-		pass
+class CustomEmbedding:
+    def __init__(self, d, embedding_function, m, groups=None, quadrature="fixed"):
+        self.d = d
+        self.groups = groups
+        self.embedding_function = embedding_function
+        self.m = m
+        self.quadrature = quadrature
+
+    def embed(self, x):
+        return self.embedding_function(x)
+
+    def get_m(self):
+        return self.m
+
+    def integral(self, S):  # integrate every feature of embed() over the set S
+        varphi = torch.zeros(size=(self.m, 1)).double()
+
+        if self.quadrature == "fixed":  # fixed Gauss-Legendre nodes supplied by S
+            if S.d == 1:
+                weights, nodes = S.return_legendre_discretization(n=512)
+                Z = self.embed(nodes)
+                varphi = torch.einsum("i,ij->j", weights, Z)
+                return varphi.view(-1, 1)
+            elif S.d == 2:
+                weights, nodes = S.return_legendre_discretization(n=50)
+                Z = self.embed(nodes)
+                varphi = torch.einsum("i,ij->j", weights, Z)
+                return varphi.view(-1, 1)
+        else:  # adaptive scipy quadrature, one scalar integral per feature i
+            if S.d == 1:
+                for i in range(self.m):
+                    Fi = (
+                        lambda x: self.embed(torch.from_numpy(np.array(x)).view(1, -1))
+                        .view(-1)[i]
+                        .numpy()
+                    )
+                    val, status = integrate.quad(
+                        Fi, float(S.bounds[0, 0]), float(S.bounds[0, 1])
+                    )
+                    varphi[i] = val
+            elif S.d == 2:
+                for i in range(self.m):
+                    Fi = lambda x: self.embed(x).view(-1)[i]
+                    integrand = lambda x, y: Fi(
+                        torch.tensor([x, y]).view(1, 2).double()
+                    ).numpy()
+                    val, status = integrate.dblquad(
+                        integrand,
+                        float(S.bounds[0, 0]),
+                        float(S.bounds[0, 1]),
+                        lambda x: float(S.bounds[1, 0]),
+                        lambda x: float(S.bounds[1, 1]),
+                        epsabs=1.49e-03,
+                        epsrel=1.49e-03,
+                    )
+                    varphi[i] = val
+            return varphi
+
+
+class PolynomialEmbedding:
+
+    def __init__(self, d, p, kappa=1.0, groups=None, include_bias=True):
+        self.d = d
+        self.p = p
+        self.kappa = kappa
+        self.groups = groups
+        self.compute(include_bias=include_bias)
+        self.include_bias = include_bias
+
+    def compute(self, include_bias=True):
+        self.poly = PolynomialFeatures(degree=self.p, include_bias=include_bias)
+        if self.groups is None:
+            self.poly.fit_transform(np.random.randn(1, self.d))
+            self.degrees = torch.from_numpy(self.poly.powers_).double()
+            self.size = self.degrees.size()[0]
+        else:
+            self.degrees = []
+            self.size = 0
+            self.sizes = []
+            for group in self.groups:
+                self.poly.fit_transform(np.random.randn(1, len(group)))
+                z = torch.from_numpy(self.poly.powers_).double()
+                self.degrees.append(z)
+                self.sizes.append(z.size()[0])
+                self.size += z.size()[0]
+
+    def embed_group(self, x, j):
+        (n, d) = x.size()
+        x = x.view(n, -1)
+        Phi = torch.zeros(size=(n, self.sizes[j]), dtype=torch.float64)
+        group = self.groups[j]
+        for i in range(n):
+            y = x[i, :]
+            z = y.view(1, len(group))
+            Phi[i, :] = torch.prod(torch.pow(z, self.degrees[j]), dim=1).view(-1)
+        return Phi
+
+    def get_sub_indices(self, group):  # indices of monomials involving only variable `group`
+        ind = []
+        for index, elem in enumerate(self.degrees):
+            z = torch.sum(elem[:group]) + torch.sum(elem[group + 1 :])  # total degree of all other variables
+            if (elem[group] >= 0.0) and (z <= 0.0):
+                ind.append(index)
+        return ind
+
+    def embed(self, x):
+        (n, d) = x.size()
+        # zero = torch.pow(x[0,:] * 0, self.degrees)
+        Phi = torch.zeros(size=(n, self.size), dtype=torch.float64)
+
+        if self.groups is None:
+            for i in range(n):
+                y = x[i, :]
+                Phi[i, :] = torch.prod(torch.pow(y, self.degrees), dim=1)
+        else:
+            for i in range(n):
+                y = x[i, :]
+                for j, group in enumerate(self.groups):
+                    z = y[group].view(1, len(group))
+                    start = int(np.sum(self.sizes[0:j]))
+                    end = np.sum(self.sizes[0 : j + 1])
+                    Phi[i, start:end] = torch.prod(
+                        torch.pow(z, self.degrees[j]), dim=1
+                    ).view(-1)
+        return np.sqrt(self.kappa) * Phi
+
+    def derivative_1(self, x):
+        pass
+
+    def derivative_2(self, x):
+        pass
+
+
+class ChebyschevEmbedding:
+
+    def get_m(self):
+        return self.m
+
+    def __init__(self, d, p, groups=None, include_bias=True):
+        self.d = d
+        self.p = p
+        self.groups = groups
+        self.c = np.ones(self.p)
+        self.poly = cheb.Chebyshev(self.c)
+        self.size = self.p
+        self.m = self.p
+
+    def embed(self, x):
+        out = np.zeros(shape=(int(x.size()[0]), self.p))
+        z = None
+        for p in np.arange(1, self.p + 1, 1):
+            c = np.ones(p)
+            if p > 1:
+                zold = z
+                z = cheb.chebval(x.numpy(), c)
+                out[:, p - 1] = (z - zold).reshape(-1)
+            else:
+                z = cheb.chebval(x.numpy(), c)
+                out[:, p - 1] = z.reshape(-1)
+        return torch.from_numpy(out)
+
+    def derivative_1(self, x):
+        pass
+
+    def derivative_2(self, x):
+        pass
 
 
 if __name__ == "__main__":
-	d = 2
-	p = 4
-	emb = PolynomialEmbedding(d, p, groups=[[0], [1]])
-	x1 = torch.randn(size=(1, d), dtype=torch.float64)
-	x2 = torch.randn(size=(1, d), dtype=torch.float64)
-	xc = torch.cat((x1, x2))
-
-	print(emb.embed(x1).size())
-	print(emb.embed(x2).size())
-	print(emb.embed(xc).size())
-
-	print("--------")
-	emb = PolynomialEmbedding(d, p)
-	print(emb.get_sub_indices(0))
+    d = 2
+    p = 4
+    emb = PolynomialEmbedding(d, p, groups=[[0], [1]])
+    x1 = torch.randn(size=(1, d), dtype=torch.float64)
+    x2 = torch.randn(size=(1, d), dtype=torch.float64)
+    xc = torch.cat((x1, x2))
+
+    print(emb.embed(x1).size())
+    print(emb.embed(x2).size())
+    print(emb.embed(xc).size())
+
+    print("--------")
+    emb = PolynomialEmbedding(d, p)
+    print(emb.get_sub_indices(0))
 # d = 1
 # emb = ChebyschevEmbedding(d,3)
 # x1 = torch.randn(size = (1,d), dtype = torch.float64)
diff --git a/stpy/embeddings/positive_embedding.py b/stpy/embeddings/positive_embedding.py
index 7899ad0..bef991b 100644
--- a/stpy/embeddings/positive_embedding.py
+++ b/stpy/embeddings/positive_embedding.py
@@ -1,7 +1,9 @@
+from typing import Optional
 import cvxpy as cp
 import mosek
 import numpy as np
 import scipy
+from stpy.kernels import KernelFunction
 import torch
 
 from stpy.borel_set import BorelSet
@@ -11,160 +13,246 @@
 
 class PositiveEmbedding(Embedding):
 
-	def __init__(self, d, m, kernel_object=None, interval=(-1, 1), B=1, b=0, s=0.001, offset=0.):
-		self.d = d
-		self.m = m
-		self.b = b
-		self.size = self.get_m()
-		self.interval = interval
-		if kernel_object is None:
-			#self.kernel_object = KernelFunction()
-			#self.kernel = lambda x, y: self.kernel_object.kernel(x, y)
-			self.kernel = None
-		else:
-			self.kernel_object = kernel_object
-			self.kernel = self.kernel_object.kernel
-		self.B = B
-		self.s = s
-		self.offset = offset
-
-		self.interval = (self.interval[0] - offset, self.interval[1] + offset)
-
-		self.borel_set = BorelSet(d=1, bounds=torch.Tensor([[self.interval[0], self.interval[1]]]).double())
-		self.mu = None
-		self.precomp = False
-		self.procomp_integrals = {}
-
-	def get_size(self):
-		return self.m ** self.d
-
-	def integral(self, S):
-		pass
-
-	def basis_fun(self, x, j):
-		pass
-
-	def get_constraints(self):
-		s = self.m ** self.d
-		l = torch.from_numpy(np.full(s, self.b))
-		u = torch.from_numpy(np.full(s, self.B))
-		Lambda = torch.from_numpy(np.identity(s))
-		return (l, Lambda, u)
-
-	def cov(self, inverse=False):
-		if self.precomp == False:
-			dm = (self.interval[1] - self.interval[0]) / (self.m - 1)  # delta m
-			t = self.interval[0] + torch.linspace(0, self.m - 1, self.m) * dm
-
-			if self.d == 1:
-				t = t.view(-1, 1).double()
-			elif self.d == 2:
-				t = torch.from_numpy(cartesian([t.numpy(), t.numpy()])).double()
-			elif self.d == 3:
-				t = torch.from_numpy(cartesian([t.numpy(), t.numpy(), t.numpy()])).double()
-			if self.kernel is not None:
-				self.Gamma = self.kernel(t, t)
-				Z = self.embed_internal(t)
-				M = torch.pinverse(Z.T @ Z + (self.s) * torch.eye(self.Gamma.size()[0]))
-				self.M = torch.from_numpy(np.real(scipy.linalg.sqrtm(M.numpy())))
-				self.Gamma_half = torch.from_numpy(
-					np.real(scipy.linalg.sqrtm(self.Gamma.numpy() + 1e-5 * (self.s ** 2) * np.eye(self.Gamma.size()[0]))))
-				self.Gamma_half = self.M @ self.Gamma_half
-				self.invGamma_half = torch.pinverse(self.Gamma_half)
-			else:
-				self.Gamma_half = torch.eye(self.m).double()
-			self.precomp = True
-		else:
-			pass
-
-		if inverse == True:
-			return self.Gamma_half, self.invGamma_half
-		else:
-			return self.Gamma_half
-
-	def embed_internal(self, x):
-		if self.d == 1:
-			out = torch.zeros(size=(x.size()[0], self.m), dtype=torch.float64)
-			for j in range(self.m):
-				out[:, j] = self.basis_fun(x, j).view(-1)
-			return out
-
-		elif self.d == 2:
-			phi_1 = torch.cat([self.basis_fun(x[:, 0].view(-1, 1), j) for j in range(0, self.m)], dim=1)
-			phi_2 = torch.cat([self.basis_fun(x[:, 1].view(-1, 1), j) for j in range(0, self.m)], dim=1)
-			n = x.size()[0]
-			out = []
-			for i in range(n):
-				out.append(torch.from_numpy(np.kron(phi_1[i, :].numpy(), phi_2[i, :].numpy())).view(1, -1))
-			out = torch.cat(out, dim=0)
-			return out
-		elif self.d == 3:
-			phi_1 = torch.cat([self.basis_fun(x[:, 0].view(-1, 1), j) for j in range(0, self.m)], dim=1)
-			phi_2 = torch.cat([self.basis_fun(x[:, 1].view(-1, 1), j) for j in range(0, self.m)], dim=1)
-			phi_3 = torch.cat([self.basis_fun(x[:, 2].view(-1, 1), j) for j in range(0, self.m)], dim=1)
-
-			n = x.size()[0]
-			out = []
-			for i in range(n):
-				out.append(
-					torch.from_numpy(np.kron(phi_3[i, :], np.kron(phi_1[i, :].numpy(), phi_2[i, :].numpy()))).view(1,
-																												   -1))
-			out = torch.cat(out, dim=0)
-			return out
-
-	def fit(self, x, y, already_embeded=False):
-		m = self.get_m()
-
-		l, Lambda, u = self.get_constraints()
-		Gamma_half = self.cov()
-
-		if already_embeded == False:
-			Phi = self.embed(x).numpy()
-		else:
-			Phi = x.numpy()
-
-		xi = cp.Variable(m)
-		obj = cp.Minimize(self.s ** 2 * cp.norm2(xi) + cp.sum_squares(Phi @ xi - y.numpy().reshape(-1)))
-
-		constraints = []
-		Lambda = Lambda @ Gamma_half.numpy()
-		if not np.all(l == -np.inf):
-			constraints.append(Lambda[l != -np.inf] @ xi >= l[l != -np.inf])
-		if not np.all(u == np.inf):
-			constraints.append(Lambda[u != np.inf] @ xi <= u[u != np.inf])
-
-		prob = cp.Problem(obj, constraints)
-		prob.solve(solver=cp.MOSEK, warm_start=False,
-				   verbose=False, mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.dual})
-
-		if prob.status != "optimal":
-			raise ValueError('cannot compute the mode')
-
-		mode = xi.value
-		self.mode = torch.from_numpy(mode).view(-1, 1)
-		self.mu = self.mode
-		return mode
-
-	def embed(self, x):
-		Gamma_half = self.cov()
-		return self.embed_internal(x) @ Gamma_half
-
-	def mean(self, xtest):
-		embeding = self.embed(xtest)
-		mean = embeding @ self.mu
-		return mean
-
-	def mean_std(self, xtest):
-		embeding = self.embed(xtest)
-		mean = embeding @ self.mu
-		return mean, None
-
-	def sample_theta(self):
-		self.mu = torch.randn(size=(self.get_m(), 1))
-		return self.mu
-
-	def sample(self, xtest, size=1):
-		return self.embed(xtest) @ self.sample_theta()
-
-	def get_m(self):
-		return self.m ** self.d
+    def __init__(
+        self,
+        d,
+        m,
+        kernel_object: Optional[KernelFunction] = None,
+        interval=(-1, 1),
+        B=1.0,
+        b=0.0,
+        s=0.001,
+        offset=0.0,
+    ):
+        """
+
+        Parameters
+        ----------
+        d
+                Dimension of the embedding
+        m
+                Number of basis functions
+        b, optional
+                Minimal value of the rate function, by default 0
+        B, optional
+                Maximal value of the rate function, by default 1
+        """
+        self.d = d
+        """ Dimension of the embedding """
+        self.m = m
+        """ Number of basis functions """
+        self.b = b
+        """ Minimal value of the rate function """
+        self.size = self.get_m()
+        """ Total number of basis functions, m**d (the value of get_m()) """
+        self.interval = interval
+        if kernel_object is None:
+            # self.kernel_object = KernelFunction()
+            # self.kernel = lambda x, y: self.kernel_object.kernel(x, y)
+            self.kernel = None
+        else:
+            self.kernel_object = kernel_object
+            self.kernel = self.kernel_object.kernel
+        self.B = B
+        self.s = s
+        self.offset = offset
+
+        self.interval = (self.interval[0] - offset, self.interval[1] + offset)
+
+        self.borel_set = BorelSet(
+            d=1, bounds=torch.tensor([[self.interval[0], self.interval[1]]]).double()
+        )
+        self.mu = None
+        self.precomp = False
+        self.precomp_integral = {}
+
+    def get_size(self):
+        return self.m**self.d
+
+    def integral(self, S) -> torch.Tensor:
+        raise NotImplementedError("Subclasses should implement this method.")
+
+    def basis_fun(self, x, j):
+        r"""
+        Return the value of basis function \phi_j(x)
+
+        :param x: double, need to be in the interval
+        :param j: integer, index of hat functions, 0 <= j <= m-1
+        :return: \phi_j(x)
+        """
+        pass
+
+    def get_constraints(self):
+        s = self.m**self.d
+        l = torch.tensor(np.full(s, self.b))
+        u = torch.tensor(np.full(s, self.B))
+        Lambda = torch.tensor(np.identity(s))
+        return (l, Lambda, u)
+
+    def cov(self, inverse=False):
+        r"""Should return $\Gamma^T = \sqrt{V^{-1} K V^{-1}}^T$
+
+        $\sqrt{(V^TV)^* \cdot K}$ where $V_{ij} = \phi_i(t_j)$ and
+        $K_{ij} = k(t_i, t_j)$ and the $t_i$ are equally spaced grid points
+        in the cartesian product set $i^d$ where i is `self.interval`
+
+        """
+        if self.precomp == False:
+            dm = (self.interval[1] - self.interval[0]) / (self.m - 1)  # delta m
+            t = self.interval[0] + torch.linspace(0, self.m - 1, self.m) * dm
+
+            if self.d == 1:
+                t = t.view(-1, 1).double()
+            elif self.d == 2:
+                t = torch.tensor(cartesian([t.cpu().numpy(), t.cpu().numpy()])).double()
+            elif self.d == 3:
+                t = torch.tensor(
+                    cartesian([t.cpu().numpy(), t.cpu().numpy(), t.cpu().numpy()])
+                ).double()
+            if self.kernel is not None:
+                self.Gamma = self.kernel(t, t)
+                Z = self.embed_internal(t)
+                M = torch.pinverse(Z.T @ Z + (self.s) * torch.eye(self.Gamma.size()[0]))
+                self.M = torch.tensor(np.real(scipy.linalg.sqrtm(M.cpu().numpy())))
+                self.Gamma_half = torch.tensor(
+                    np.real(
+                        scipy.linalg.sqrtm(
+                            self.Gamma.cpu().numpy()
+                            + 1e-5 * (self.s**2) * np.eye(self.Gamma.size()[0])
+                        )
+                    )
+                )
+                self.Gamma_half = self.M @ self.Gamma_half
+                self.invGamma_half = torch.pinverse(self.Gamma_half)
+            else:
+                self.Gamma_half = torch.eye(self.m).double()
+            self.precomp = True
+        else:
+            pass
+
+        if inverse == True:
+            return self.Gamma_half, self.invGamma_half
+        else:
+            return self.Gamma_half
+
+    def embed_internal(self, x):
+        r"""Returns a tensor $T$ where $T_{i,j} = \phi_j(x_i)$."""
+        if self.d == 1:
+            out = torch.zeros(size=(x.size()[0], self.m), dtype=torch.float64)
+            for j in range(self.m):
+                out[:, j] = self.basis_fun(x, j).view(-1)
+            return out
+
+        elif self.d == 2:
+            phi_1 = torch.cat(
+                [self.basis_fun(x[:, 0].view(-1, 1), j) for j in range(0, self.m)],
+                dim=1,
+            )
+            phi_2 = torch.cat(
+                [self.basis_fun(x[:, 1].view(-1, 1), j) for j in range(0, self.m)],
+                dim=1,
+            )
+            n = x.size()[0]
+            out = []
+            for i in range(n):
+                out.append(
+                    torch.tensor(
+                        np.kron(phi_1[i, :].cpu().numpy(), phi_2[i, :].cpu().numpy()),
+                    ).view(1, -1)
+                )
+            out = torch.cat(out, dim=0)
+            return out
+        elif self.d == 3:
+            phi_1 = torch.cat(
+                [self.basis_fun(x[:, 0].view(-1, 1), j) for j in range(0, self.m)],
+                dim=1,
+            )
+            phi_2 = torch.cat(
+                [self.basis_fun(x[:, 1].view(-1, 1), j) for j in range(0, self.m)],
+                dim=1,
+            )
+            phi_3 = torch.cat(
+                [self.basis_fun(x[:, 2].view(-1, 1), j) for j in range(0, self.m)],
+                dim=1,
+            )
+
+            n = x.size()[0]
+            out = []
+            for i in range(n):
+                out.append(
+                    torch.tensor(
+                        np.kron(
+                            phi_3[i, :].cpu().numpy(),
+                            np.kron(
+                                phi_1[i, :].cpu().numpy(), phi_2[i, :].cpu().numpy()
+                            ),
+                        )
+                    ).view(1, -1)
+                )
+            out = torch.cat(out, dim=0)
+            return out
+
+    def fit(self, x, y, already_embeded=False):
+        m = self.get_m()
+
+        l, Lambda, u = self.get_constraints()
+        Gamma_half = self.cov()
+
+        if already_embeded == False:
+            Phi = self.embed(x).numpy()
+        else:
+            Phi = x.cpu().numpy()
+
+        xi = cp.Variable(m)
+        obj = cp.Minimize(
+            self.s**2 * cp.norm2(xi)
+            + cp.sum_squares(Phi @ xi - y.cpu().numpy().reshape(-1))
+        )
+
+        constraints = []
+        Lambda = Lambda @ Gamma_half.cpu().numpy()
+        if not np.all(l == -np.inf):
+            constraints.append(Lambda[l != -np.inf] @ xi >= l[l != -np.inf])
+        if not np.all(u == np.inf):
+            constraints.append(Lambda[u != np.inf] @ xi <= u[u != np.inf])
+
+        prob = cp.Problem(obj, constraints)
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.dual},
+        )
+
+        if prob.status != "optimal":
+            raise ValueError("cannot compute the mode")
+
+        mode = xi.value
+        self.mode = torch.tensor(mode).view(-1, 1)
+        self.mu = self.mode
+        return mode
+
+    def embed(self, x):
+        r"""Calculates $\Phi(x)^T = \phi(x)^T \Gamma^T$"""
+        Gamma_half = self.cov()
+        return self.embed_internal(x) @ Gamma_half
+
+    def mean(self, xtest):
+        embeding = self.embed(xtest)
+        mean = embeding @ self.mu
+        return mean
+
+    def mean_std(self, xtest):
+        embeding = self.embed(xtest)
+        mean = embeding @ self.mu
+        return mean, None
+
+    def sample_theta(self):  # draw a standard-normal weight vector and store it as self.mu
+        self.mu = torch.randn(size=(self.get_m(), 1), dtype=torch.float64)  # float64 so embed(x) @ mu has matching dtypes
+        return self.mu
+
+    def sample(self, xtest, size=1):
+        return self.embed(xtest) @ self.sample_theta()
+
+    def get_m(self):
+        return self.m**self.d
diff --git a/stpy/embeddings/random_nn.py b/stpy/embeddings/random_nn.py
index bf5c57a..b94ae50 100755
--- a/stpy/embeddings/random_nn.py
+++ b/stpy/embeddings/random_nn.py
@@ -5,184 +5,216 @@
 
 class RandomMap(nn.Module):
 
-	def __init__(self, d, m, fun, output=2):
-		super(RandomMap, self).__init__()
-		self.W = torch.normal(mean=torch.zeros(m, d, dtype=torch.float64), std=1. / np.sqrt(d * m) ** 2)
-		self.W.requires_grad_(True)
-		self.w = torch.normal(mean=torch.zeros(m, output, dtype=torch.float64), std=1. / np.sqrt(d * m) ** 2)
-		self.w.requires_grad_(True)
-		self.b = torch.normal(mean=torch.zeros(output, dtype=torch.float64), std=1. / np.sqrt(d * m) ** 2)
-		self.b.requires_grad_(True)
-		self.fun = fun
-		self.output = output
-
-	def map(self, x):
-		y = self.fun(torch.mm(self.W, torch.t(x)))
-		return y
-
-	def forward(self, x):
-		z = self.map(x)
-		z = torch.mm(torch.t(z), self.w)
-		return z
-
-	def get_params(self):
-		return [self.W, self.w]
-
-	def get_params_last(self):
-		return [self.w]
-
-	def fit_map(self, x, y, epochs=1000, verbose=False, reg=0.1, lr=0.1):
-		criterion = nn.MSELoss()
-
-		import torch.optim as optim
-		optimizer = optim.SGD([self.W, self.w], lr=lr)
-
-		batch_size = 100
-
-		for i in range(epochs):
-			for j in range(x.size()[0] // batch_size):
-				optimizer.zero_grad()  # zero the gradient buffers
-				output = self.forward(x[j * batch_size:(j + 1) * batch_size])
-				loss = criterion(output, y[j * batch_size:(j + 1) * batch_size])
-				loss.backward(retain_graph=True)
-				optimizer.step()  # Does the update
-
-			if verbose == True or i % verbose == 0:
-				output = self.forward(x)
-				loss_full = criterion(output, y)
-				print(i, loss_full)
-				optimizer.step()  # Does the update
-
-	def fit_map_lasso(self, x, y, epochs=1000, verbose=False, reg=0.1, lr=0.1, l1=0.1):
-		criterion = nn.MSELoss()
-
-		import torch.optim as optim
-		optimizer = optim.SGD([self.W, self.w], lr=lr)
-
-		batch_size = 100
-
-		for i in range(epochs):
-			for j in range(x.size()[0] // batch_size):
-				optimizer.zero_grad()  # zero the gradient buffers
-				output = self.forward(x[j * batch_size:(j + 1) * batch_size])
-				loss = criterion(output, y[j * batch_size:(j + 1) * batch_size]) + l1 * torch.norm(self.W, 2)
-				loss.backward(retain_graph=True)
-				optimizer.step()  # Does the update
-
-			if verbose == True or i % verbose == 0:
-				output = self.forward(x)
-				loss_full = criterion(output, y)
-				print(i, loss_full)
-				optimizer.step()  # Does the update
-
-	def loss(self, x, y):
-		criterion = nn.MSELoss()
-		output = self.forward(x)
-		loss = criterion(output, y)
-
-		return loss
-
-	def fit_last_layer(self):
-		# same as before but different parameters
-		pass
+    def __init__(self, d, m, fun, output=2):
+        """Draw zero-mean float64 Gaussian weights W (m, d), w (m, output) and b (output,).
+
+        Every entry has std ``1 / sqrt(d * m) ** 2``; ``fun`` is the activation used by ``map``.
+        """
+        super(RandomMap, self).__init__()
+        std = 1.0 / np.sqrt(d * m) ** 2
+
+        def draw(*shape):
+            zeros = torch.zeros(*shape, dtype=torch.float64)
+            return torch.normal(mean=zeros, std=std).requires_grad_(True)
+
+        self.W = draw(m, d)
+        self.w = draw(m, output)
+        self.b = draw(output)
+        self.fun = fun
+        self.output = output
+
+    def map(self, x):
+        """Return ``fun(W @ x^T)``; for ``x`` of shape (N, d) the result is (m, N)."""
+        return self.fun(torch.mm(self.W, torch.t(x)))
+
+    def forward(self, x):
+        """Return the network output ``map(x)^T @ w``."""
+        hidden = torch.t(self.map(x))
+        return torch.mm(hidden, self.w)
+
+    def get_params(self):  # both weight tensors: first-layer W and output-layer w (b is not included)
+        return [self.W, self.w]
+
+    def get_params_last(self):  # only the output-layer weights w
+        return [self.w]
+
+    def fit_map(self, x, y, epochs=1000, verbose=False, reg=0.1, lr=0.1):
+        """Fit W and w by mini-batch SGD on the MSE between ``forward(x)`` and ``y``.
+
+        Only full batches of 100 rows are used. ``verbose``: falsy is silent, True prints
+        the full-data loss every epoch, an integer k every k-th epoch. ``reg`` is unused.
+        """
+        import torch.optim as optim
+
+        criterion = nn.MSELoss()
+        optimizer = optim.SGD([self.W, self.w], lr=lr)
+        batch_size = 100
+        for i in range(epochs):
+            for j in range(x.size()[0] // batch_size):
+                batch = slice(j * batch_size, (j + 1) * batch_size)
+                optimizer.zero_grad()  # zero the gradient buffers
+                loss = criterion(self.forward(x[batch]), y[batch])
+                loss.backward()  # each batch builds a fresh graph, nothing to retain
+                optimizer.step()  # Does the update
+            # `i % verbose` with verbose=False is a division by zero, hence the guard
+            if verbose and i % verbose == 0:
+                loss_full = criterion(self.forward(x), y)  # reporting only, no parameter update
+                print(i, loss_full)
+
+    def fit_map_lasso(self, x, y, epochs=1000, verbose=False, reg=0.1, lr=0.1, l1=0.1):
+        """Fit W and w by mini-batch SGD on MSE plus ``l1 * torch.norm(self.W, 2)``.
+
+        Only full batches of 100 rows are used. ``verbose``: falsy is silent, True prints
+        the unpenalised full-data MSE every epoch, an integer k every k-th epoch.
+        ``reg`` is unused. NOTE(review): despite the name the penalty is a 2-norm of W.
+        """
+        import torch.optim as optim
+
+        criterion = nn.MSELoss()
+        optimizer = optim.SGD([self.W, self.w], lr=lr)
+        batch_size = 100
+        for i in range(epochs):
+            for j in range(x.size()[0] // batch_size):
+                batch = slice(j * batch_size, (j + 1) * batch_size)
+                optimizer.zero_grad()  # zero the gradient buffers
+                penalty = l1 * torch.norm(self.W, 2)
+                loss = criterion(self.forward(x[batch]), y[batch]) + penalty
+                loss.backward()  # each batch builds a fresh graph, nothing to retain
+                optimizer.step()  # Does the update
+            # `i % verbose` with verbose=False is a division by zero, hence the guard
+            if verbose and i % verbose == 0:
+                loss_full = criterion(self.forward(x), y)  # reporting only, no parameter update
+                print(i, loss_full)
+
+    def loss(self, x, y):
+        """Return the mean squared error between ``forward(x)`` and ``y``.
+
+        A fresh ``nn.MSELoss`` is built on each call.
+        """
+        return nn.MSELoss()(self.forward(x), y)
+
+    def fit_last_layer(self):
+        # Placeholder, not implemented: meant to mirror the fitting above with a different parameter set.
+        pass
 
 
 class SpecificMap(RandomMap):
 
-	def __init__(self, d, m, fun, map, output=2):
-		super(SpecificMap, self).__init__(d, m, fun, output=2)
-		self.map = map
+    def __init__(self, d, m, fun, map, output=2):
+        super(SpecificMap, self).__init__(d, m, fun, output=output)  # forward `output` so w is (m, output)
+        self.map = map  # caller-supplied feature map; forward() calls it in place of RandomMap.map
 
-	def forward(self, x):
-		z = self.map(x)
-		z = torch.mm(torch.t(z), self.w)
-		return z
+    def forward(self, x):
+        """Return ``self.map(x)^T @ w`` using the feature map given at construction."""
+        hidden = torch.t(self.map(x))
+        return torch.mm(hidden, self.w)
 
-	def get_params(self):
-		return [self.w]
+    def get_params(self):  # only the output weights w; the feature map itself is not listed
+        return [self.w]
 
 
 def RandomMapStacked(RandomMap):
-	def __init__(self, d, m, fun, output=2):
-		super(RandomMap, self).__init__()
-		self.W = torch.normal(mean=torch.zeros(m, d, dtype=torch.float64), std=1. / np.sqrt(d * m) ** 2)
-		self.W.requires_grad_(True)
-		self.w = torch.normal(mean=torch.zeros(m, output, dtype=torch.float64), std=1. / np.sqrt(d * m) ** 2)
-		self.w.requires_grad_(True)
-		self.b = torch.normal(mean=torch.zeros(m, 1, dtype=torch.float64), std=1. / np.sqrt(d * m) ** 2)
-		self.b.requires_grad_(True)
-		self.fun = fun
-		self.output = output
-
-	def map(self, x):
-		y = self.fun(torch.mm(self.W, torch.t(x)) + self.b)
-		return y
-
-	def fit_map(self, x, y):
-		pass
+    def __init__(self, d, m, fun, output=2):
+        """Draw zero-mean float64 Gaussian weights W (m, d), w (m, output) and bias b (m, 1).
+
+        Every entry has std ``1 / sqrt(d * m) ** 2``; ``fun`` is the activation used by ``map``.
+        """
+        super(RandomMap, self).__init__()
+        std = 1.0 / np.sqrt(d * m) ** 2
+
+        def draw(*shape):
+            zeros = torch.zeros(*shape, dtype=torch.float64)
+            return torch.normal(mean=zeros, std=std).requires_grad_(True)
+
+        self.W = draw(m, d)
+        self.w = draw(m, output)
+        self.b = draw(m, 1)
+        self.fun = fun
+        self.output = output
+
+    def map(self, x):
+        """Return ``fun(W @ x^T + b)``; the (m, 1) bias is broadcast over the columns."""
+        return self.fun(torch.mm(self.W, torch.t(x)) + self.b)
+
+    def fit_map(self, x, y):  # no-op stub: performs no fitting
+        pass
 
 
 class RandomOrthogonalMap(RandomMap):
 
-	def __init__(self, d, m, fun, output=1):
-		super(RandomMap, self).__init__()
-		self.m = m
+    def __init__(self, d, m, fun, output=1):
+        super(RandomMap, self).__init__()
+        self.m = m
 
-		self.R = torch.normal(mean=torch.zeros(m, d, dtype=torch.float64), std=1. / np.sqrt(d * m) ** 2)
-		self.R = nn.init.orthogonal_(self.R)
-		self.R.requires_grad_(True)
+        self.R = torch.normal(
+            mean=torch.zeros(m, d, dtype=torch.float64), std=1.0 / np.sqrt(d * m) ** 2
+        )
+        self.R = nn.init.orthogonal_(self.R)
+        self.R.requires_grad_(True)
 
-		self.w = torch.normal(mean=torch.zeros(m, output, dtype=torch.float64), std=1. / np.sqrt(d * m) ** 2)
-		self.w.requires_grad_(True)
+        self.w = torch.normal(
+            mean=torch.zeros(m, output, dtype=torch.float64),
+            std=1.0 / np.sqrt(d * m) ** 2,
+        )
+        self.w.requires_grad_(True)
 
-		self.fun = fun
-		self.output = output
+        self.fun = fun
+        self.output = output
 
-	def map(self, x):
-		y = self.fun(torch.mm(self.R, torch.t(x)))
-		return y
+    def map(self, x):
+        """Return ``fun(R @ x^T)`` for the (m, d) matrix ``R``."""
+        return self.fun(torch.mm(self.R, torch.t(x)))
 
-	def fit_map(self, x, y, epochs=1000, verbose=False, reg=0.1, lr=0.1):
-		criterion = nn.MSELoss()
+    def fit_map(self, x, y, epochs=1000, verbose=False, reg=0.1, lr=0.1):
+        criterion = nn.MSELoss()
 
-		import torch.optim as optim
+        import torch.optim as optim
 
-		optimizer = optim.SGD([self.R, self.w], lr=lr)
-		orth_loss = torch.norm(torch.mm(self.R, torch.t(self.R)) - torch.eye(self.m, self.m, dtype=torch.float64)) ** 2
+        optimizer = optim.SGD([self.R, self.w], lr=lr)
+        orth_loss = (
+            torch.norm(
+                torch.mm(self.R, torch.t(self.R))
+                - torch.eye(self.m, self.m, dtype=torch.float64)
+            )
+            ** 2
+        )
 
-		batch_size = 100
+        batch_size = 100
 
-		for i in range(epochs):
-			for j in range(x.size()[0] // batch_size):
-				optimizer.zero_grad()  # zero the gradient buffers
-				output = self.forward(x[j * batch_size:(j + 1) * batch_size])
-				loss = criterion(output, y[j * batch_size:(j + 1) * batch_size]) + reg * orth_loss
-				loss.backward(retain_graph=True)
-				optimizer.step()  # Does the update
+        for i in range(epochs):
+            for j in range(x.size()[0] // batch_size):
+                optimizer.zero_grad()  # zero the gradient buffers
+                batch = slice(j * batch_size, (j + 1) * batch_size)
+                # step() changes R in place, so rebuild the penalty (and its graph) from the current R
+                gram = torch.mm(self.R, torch.t(self.R))
+                orth_loss = torch.norm(gram - torch.eye(self.m, dtype=torch.float64)) ** 2
+                loss = criterion(self.forward(x[batch]), y[batch]) + reg * orth_loss
+                loss.backward()
+                optimizer.step()  # Does the update
 
-			if verbose == True or i % verbose == 0:
-				output = self.forward(x)
-				loss_full = criterion(output, y) + reg * orth_loss
-				print(i, loss_full)
+            if verbose and i % verbose == 0:  # guard: `i % False` raises ZeroDivisionError
+                output = self.forward(x)
+                loss_full = criterion(output, y) + reg * orth_loss
+                print(i, loss_full)
 
 
-class RandomNestedMap():
+class RandomNestedMap:
 
-	def __init__(self):
-		pass
+    def __init__(self):
+        pass
 
 
 if __name__ == "__main__":
-	ridge = lambda x: torch.tanh(x)
+    ridge = lambda x: torch.tanh(x)
 
-	N = 1000
-	d = 10
-	m = 2
+    N = 1000
+    d = 10
+    m = 2
 
-	NetOriginal = RandomMap(d, m, ridge)
+    NetOriginal = RandomMap(d, m, ridge)
 
-	x = 10 * torch.normal(mean=torch.zeros(N, d, dtype=torch.float64) + 2, std=100.)
-	y = NetOriginal.forward(x)
+    x = 10 * torch.normal(mean=torch.zeros(N, d, dtype=torch.float64) + 2, std=100.0)
+    y = NetOriginal.forward(x)
 
-	Net = RandomMap(d, m, ridge)
-	Net.fit_map(x, y)
+    Net = RandomMap(d, m, ridge)
+    Net.fit_map(x, y)
diff --git a/stpy/embeddings/transformations.py b/stpy/embeddings/transformations.py
index 84f58a0..75cc74b 100755
--- a/stpy/embeddings/transformations.py
+++ b/stpy/embeddings/transformations.py
@@ -9,66 +9,68 @@
 
 class Transformation(Embedding):
 
-	def __init__(self):
-		pass
+    def __init__(self):  # sets up no state; m, d and W are assigned later by the create_* methods
+        pass
 
-	def embed(self, x):
-		pass
+    def embed(self, x):  # placeholder; a later `embed` definition in this class replaces it
+        pass
 
-	def linear_embedding(self):
-		embed = lambda x: x
-		return embed
+    def linear_embedding(self):
+        """Return the identity map, i.e. the features of ``x`` are ``x`` itself."""
+        return lambda x: x
 
-	def create_polynomial_embeding(self, degree, d, kappa=1., bias=False):
-		"""
-		create polynomial embeding
+    def create_polynomial_embeding(self, degree, d, kappa=1.0, bias=False):
+        """
+        create polynomial embeding
 
-		:param degree:
-		:param d:
-		:return:
-		"""
-		m = int(comb(degree + d - 1, degree - 1)) + int(bias)
-		poly = PolynomialFeatures(degree, include_bias=bias)
-		embed = lambda x: kappa * torch.from_numpy(poly.fit_transform(x.numpy()))
-		return embed, m
-		return (nodes, weights)
+        :param degree: maximal total degree of the monomials
+        :param d: input dimension
+        :return: ``(embed, m)`` with ``m`` the number of generated features
+        """
+        # monomials of total degree <= `degree` in d variables: C(d + degree, degree) incl. the constant
+        m = int(comb(degree + d, degree)) - 1 + int(bias)
+        poly = PolynomialFeatures(degree, include_bias=bias)
+        embed = lambda x: kappa * torch.from_numpy(poly.fit_transform(x.numpy()))
+        return embed, m
 
-	def embed(self, x):
-		(times, d) = tuple(x.size())
-		# z = torch.from_numpy(np.zeros(shape=(self.m, times),dtype=x.dtype))
-		z = torch.zeros(self.m, times, dtype=x.dtype)
-		q = torch.mm(self.W[:, 0:d], torch.t(x))
-		z[0:int(self.m / 2), :] = torch.cos(q)
-		z[int(self.m / 2):self.m, :] = torch.sin(q)
-		return torch.t(z)
+    def embed(self, x):
+        """Return the (times, m) features: cos(W x^T)^T in the first m/2 columns, sin(W x^T)^T in the rest."""
+        times, d = x.size()
+        half = int(self.m / 2)
+        phases = torch.mm(self.W[:, 0:d], torch.t(x))
+        features = torch.zeros(self.m, times, dtype=x.dtype)
+        features[:half, :], features[half : self.m, :] = torch.cos(phases), torch.sin(phases)
+        return torch.t(features)
 
-	def create_fourier_embeding(self, cutoff, d, domain, bias=False):
-		self.m = 2 * cutoff - 2 * int(bias)
-		self.d = d
-		omegas = np.arange(int(bias), cutoff, 1) * 2. * np.pi / (2 * domain)
-		print(omegas)
-		v = [omegas for omega in range(self.d)]
-		self.W = torch.from_numpy(helper.cartesian(v))
-		embed = lambda x: self.embed(x)
-		return embed, self.m
+    def create_fourier_embeding(self, cutoff, d, domain, bias=False):
+        """Set up the cos/sin embedding on a frequency grid and return ``(embed, m)``."""
+        self.d = d
+        omegas = np.arange(int(bias), cutoff, 1) * 2.0 * np.pi / (2 * domain)  # bias=True skips frequency 0
+        print(omegas)
+        self.W = torch.from_numpy(helper.cartesian([omegas for _ in range(self.d)]))
+        # embed() writes cos and sin of W @ x^T into m/2 rows each, so m must be twice the rows of W
+        self.m = 2 * self.W.size()[0]
+        return (lambda x: self.embed(x)), self.m
 
-	def create_cosine_embeding(self, cutoff, d, domain):
-		self.m = cutoff
-		self.d = d
-		omegas = np.arange(0, cutoff, 1) * 2. * np.pi / (2 * domain)
-		print(omegas)
-		v = [omegas for omega in range(self.d)]
-		self.W = torch.from_numpy(helper.cartesian(v))
-		embed = lambda x: torch.t(torch.cos(torch.mm(self.W[:, 0:d], torch.t(x))))
-		return embed, self.m
+    def create_cosine_embeding(self, cutoff, d, domain):
+        """Set up cos(W x^T)^T features on a frequency grid and return ``(embed, m)``."""
+        self.d = d
+        omegas = np.arange(0, cutoff, 1) * 2.0 * np.pi / (2 * domain)
+        print(omegas)
+        self.W = torch.from_numpy(helper.cartesian([omegas for _ in range(self.d)]))
+        self.m = self.W.size()[0]  # one feature per row of W; equals `cutoff` when W has one row per frequency
+        embed = lambda x: torch.t(torch.cos(torch.mm(self.W[:, 0:d], torch.t(x))))
+        return embed, self.m
 
-	def create_cosine_power_embeding(self, cutoff, d, domain):
-		self.m = cutoff + 1
-		self.d = d
-		print(np.logspace(0, cutoff, num=cutoff + 1, base=2))
-		omegas = np.logspace(0, cutoff, num=cutoff + 1, base=2) * 2. * np.pi / (2 * domain)
-		print(omegas)
-		v = [omegas for omega in range(self.d)]
-		self.W = torch.from_numpy(helper.cartesian(v))
-		embed = lambda x: torch.t(torch.cos(torch.mm(self.W[:, 0:d], torch.t(x))))
-		return embed, self.m
+    def create_cosine_power_embeding(self, cutoff, d, domain):
+        """Set up cos(W x^T)^T features on frequencies ``2**k * pi / domain``, k = 0..cutoff; return ``(embed, m)``."""
+        self.d = d
+        powers = np.logspace(0, cutoff, num=cutoff + 1, base=2)
+        print(powers)
+        omegas = powers * 2.0 * np.pi / (2 * domain)
+        print(omegas)
+        self.W = torch.from_numpy(helper.cartesian([omegas for _ in range(self.d)]))
+        # one feature per row of W; equals `cutoff + 1` when W has one row per frequency
+        self.m = self.W.size()[0]
+        embed = lambda x: torch.t(torch.cos(torch.mm(self.W[:, 0:d], torch.t(x))))
+        return embed, self.m
diff --git a/stpy/embeddings/triangle_base.py b/stpy/embeddings/triangle_base.py
new file mode 100644
index 0000000..4b040ad
--- /dev/null
+++ b/stpy/embeddings/triangle_base.py
@@ -0,0 +1,85 @@
+import numpy as np
+import scipy
+import torch
+
+from stpy.borel_set import BorelSet
+from stpy.continuous_processes.nystrom_fea import NystromFeatures
+from stpy.embeddings.positive_embedding import PositiveEmbedding
+from stpy.kernels import KernelFunction
+
+
+class EfficientTriangleEmbedding(PositiveEmbedding):
+    """Hat-function (triangle) embedding on an equispaced grid, with closed-form integrals over boxes."""
+    def __init__(self, *args, **kwargs):
+        # All arguments are forwarded to PositiveEmbedding; interval, m, d are read from self afterwards.
+        super().__init__(*args, **kwargs)
+
+        self._t = torch.linspace(
+            self.interval[0], self.interval[1], steps=self.m, dtype=torch.float64
+        )  # m equispaced triangle centres covering the interval
+        self._dm = (self.interval[1] - self.interval[0]) / (self.m - 1)  # centre spacing = triangle half-width
+
+    def basis_fun(self, x: torch.Tensor, j: int):
+        r"""
+        Return the value of the 1d hat function $\phi_{j}$,
+        $\max(0, 1 - |x - t_j| / dm)$, evaluated elementwise on x
+
+        :param x: double, need to be in the interval
+        :param j: integer, index of hat functions, 0 <= j <= m-1
+        :return: $\{\phi_j(x_1), \ldots, \phi_j(x_n)\}$
+        """
+        res = torch.clamp(1 - torch.abs((x - self._t[j]) / self._dm), min=0)
+        return res
+
+    def integrate_1d(self, a: torch.Tensor, b: torch.Tensor, t: torch.Tensor):
+        """
+        :param a: lower integration limit
+        :param b: upper integration limit
+        :param t: tensor of triangle centers
+        :return: 1d integral over [a, b] of each triangle basis function given by centers t and self._dm
+        """
+
+        def rising_integral(x):  # antiderivative of the rising edge (x - t + dm) / dm
+            return (x - t + self._dm) ** 2 / (self._dm * 2.0)
+
+        def falling_integral(x):  # antiderivative of the falling edge (t + dm - x) / dm
+            return -((x - t - self._dm) ** 2) / (self._dm * 2.0)
+
+        i = rising_integral(torch.clamp(b, t - self._dm, t)) - rising_integral(
+            torch.clamp(a, t - self._dm, t)
+        )  # limits are clamped to [t - dm, t], the support of the rising edge
+        i += falling_integral(torch.clamp(b, t, t + self._dm)) - falling_integral(
+            torch.clamp(a, t, t + self._dm)
+        )  # limits are clamped to [t, t + dm], the support of the falling edge
+
+        return i
+
+    def integral(self, S):
+        r"""
+        Integrate Phi(x) over S; results are cached per set in self.precomp_integral
+        :param S: borel set of type "box" (closed form) or "round" (30-node Legendre discretization)
+        :return: $\int_S \Phi(x) dx$
+        """
+        if S in self.precomp_integral.keys():
+            return self.precomp_integral[S]
+
+        else:
+            assert S.d == self.d
+            psi = torch.ones(self.m).double()  # NOTE(review): only survives when S.type is neither "box" nor "round"
+            if S.type == "box":
+                psi = torch.tensor([1.0]).double()
+                for i in range(self.d):
+                    a, b = S.bounds[i, 0].double(), S.bounds[i, 1].double()
+                    p = self.integrate_1d(a, b, self._t)
+                    # multiply each with each element and flatten
+                    psi = torch.outer(psi, p).flatten()
+
+            elif S.type == "round":
+                weights, nodes = S.return_legendre_discretization(30)
+                vals = self.embed_internal(nodes)
+                psi = weights.view(1, -1) @ vals  # weighted sum of node features; has a leading axis of size 1
+
+            Gamma_half = self.cov()
+            emb = psi @ Gamma_half
+            self.precomp_integral[S] = emb
+            return emb
diff --git a/stpy/embeddings/weighted_embedding.py b/stpy/embeddings/weighted_embedding.py
index 0c03c99..6ff5a85 100644
--- a/stpy/embeddings/weighted_embedding.py
+++ b/stpy/embeddings/weighted_embedding.py
@@ -5,11 +5,7 @@
 
 class WeightedEmbedding(Embedding):
 
-    def __init__(self,
-                 embedding: Embedding,
-                 weights = None,
-                 weight_function = None
-                 ):
+    def __init__(self, embedding: Embedding, weights=None, weight_function=None):
         self.base_embedding = embedding
         self.m = self.base_embedding.get_m()
         self.weights = weights
@@ -29,8 +25,3 @@ def embed(self, xtest):
             return Phi @ np.diag(self.weights)
         else:
             return Phi @ np.diag(self.weight_function(self.base_embedding))
-
-
-
-
-
diff --git a/stpy/estimator.py b/stpy/estimator.py
index ec00812..d107bf4 100755
--- a/stpy/estimator.py
+++ b/stpy/estimator.py
@@ -12,632 +12,902 @@
 from stpy.helpers import helper
 from stpy.optim.custom_optimizers import bisection
 
+
 class Estimator(ABC):
 
-	def fit(self):
-		pass
-
-	@abstractmethod
-	def ucb(self, x):
-		pass
-
-	@abstractmethod
-	def lcb(self, x):
-		pass
-
-	def load_data(self,d):
-		self.x = d[0]
-		self.y = d[1]
-
-	def log_marginal(self, kernel, X, weight):
-		func = kernel.get_kernel()
-		K = func(self.x, self.x, **X) + torch.eye(self.n, dtype=torch.float64) * self.s * self.s
-		L = torch.linalg.cholesky(K)
-		logdet = -0.5 * 2 * torch.sum(torch.log(torch.diag(L))) * weight
-		alpha = torch.cholesky_solve(self.y, L)
-		logprob = -0.5 * torch.mm(torch.t(self.y), alpha) + logdet
-		logprob = -logprob
-		return logprob
-
-	def optimize_params_general(self, params={}, restarts=2,
-								optimizer="pymanopt", maxiter=1000,
-								mingradnorm=1e-4, regularizer_func=None,
-								verbose=False, scale=1., weight=1., save = False,
-								save_name = 'model.np', parallel = False, cores = None):
-		"""
-
-		:param params:
-		:param restarts:
-		:param optimizer:
-		:param maxiter:
-		:param mingradnorm:
-		:param regularizer_func:
-		:param verbose:
-		:return:
-		"""
-		manifolds = []
-		bounds = []
-		init_values = []
-
-		for key, dict_params in params.items():
-			for var_name, value in dict_params.items():
-				init_value, manifold, bound = value
-				manifolds.append(manifold)
-				bounds.append(bound)
-				init_values.append(init_value)
-
-		if optimizer == "pymanopt":
-
-			manifold = Product(tuple(manifolds))
-
-			@pymanopt.function.pytorch(manifold)
-			def cost(*args):
-				# print (args)
-				input_dict = {}
-				i = 0
-				for key, dict_params in params.items():
-					small_param = {}
-					for var_name, value in dict_params.items():
-						small_param[var_name] = args[i]
-						i = i + 1
-					input_dict[key] = small_param
-
-				if regularizer_func is not None:
-					f = self.log_marginal(self.kernel_object, input_dict, weight) + regularizer_func(args)
-				else:
-					f = self.log_marginal(self.kernel_object, input_dict, weight)
-				return f
-
-			problem = pymanopt.Problem(manifold, cost=cost)
-			solver = SteepestDescent(verbosity = verbose , max_iterations=maxiter, min_gradient_norm=mingradnorm)
-
-			# get initial point
-			objective_values = []
-			objective_params = []
-
-			for rep in range(restarts):
-				x_init = []
-				for index, man in enumerate(manifolds):
-					if init_values[index] is None:
-						x_sub = man.random_point() * scale
-					else:
-						x_sub = np.array([init_values[index]])
-					x_init.append(x_sub)
-				# try:
-				res = solver.run(problem, initial_point=x_init)
-
-				objective_params.append(res.point)
-				objective_values.append(res.cost)#log['final_values']['f(x)'])
-			# except Exception as e:
-			#	print (e)
-			#	print ("Optimization restart failed:", x_init)
-			# pick the smallest objective
-			best_index = np.argmin(objective_values)
-			x_opt = [torch.from_numpy(j) for j in objective_params[best_index]]
-
-		elif optimizer == "scipy":
-			cost_numpy = lambda x: cost(x).detach.numpy()
-			egrad_numpy = lambda x: egrad(x).detach().numpy()
-
-		elif optimizer == "bisection":
-
-			def cost(x):
-				input_dict = self.kernel_object.params_dict
-				counter = 0
-				for key, dict_params in params.items():
-					for var_name, value in dict_params.items():
-						input_dict[key][var_name] = x
-						counter += 1
-
-				if regularizer_func is not None:
-					f = self.log_marginal(self.kernel_object, input_dict, weight) + regularizer_func(x)
-				else:
-					f = self.log_marginal(self.kernel_object, input_dict, weight)
-				return f
-
-			a,b = bounds[0]
-			x_opt = [bisection(cost,a,b,100)]
-
-		elif optimizer == "pytorch-minimize":
-			var_names = []
-			dims = [0,]
-			for key, dict_params in params.items():
-				for var_name, value in dict_params.items():
-					init_value, manifold, bound = value
-
-					manifolds.append(manifold)
-					bounds.append(bound)
-					init_values.append(init_value)
-					var_names.append(var_name)
-					dims.append(manifold.dim)
-
-			dims = np.cumsum(dims).astype(int)
-
-			def cost(x):
-				input_dict = self.kernel_object.params_dict
-				counter = 0
-				for key, dict_params in params.items():
-					for var_name, value in dict_params.items():
-						if key != "likelihood":
-							input_dict[key][var_name] = x[dims[counter]:dims[counter+1]]
-						else:
-							self.s = x[dims[counter]:dims[counter+1]]
-							counter += 1
-
-				if regularizer_func is not None:
-					f = self.log_marginal(self.kernel_object, input_dict, weight) + regularizer_func(x)
-				else:
-					f = self.log_marginal(self.kernel_object, input_dict, weight)
-				return f
-
-			objective_values = []
-			objective_params = []
-			x_opt = []
-
-			dim = dims[-1]
-			self.prepared_log_marginal = False
-			for rep in range(restarts):
-				#try:
-				if init_values[0] is None:
-					x_init = torch.randn(size=(dim, 1)).double().view(-1)**2 * scale
-				else:
-					x_init = init_values[0](dim)
-
-				if bounds[0] is None:
-					res = minimize_torch(cost, x_init, method='l-bfgs', tol=1e-10, disp=verbose + 1,
-										 options={'max_iter': maxiter, 'gtol':mingradnorm})
-					objective_params.append(res.x)
-					objective_values.append(res.fun)
-				else:
-					print ("Constrained optimization with bounds", bounds[0])
-					res = minimize(cost, x_init.numpy(), backend='torch', method='L-BFGS-B',
-								   bounds=bounds[0], precision='float64', tol=1e-8,
-								   options={'ftol': 1e-10,
-											'gtol': mingradnorm, 'eps': 1e-08,
-											'maxfun': 15000, 'maxiter': maxiter,
-											'maxls': 20, 'disp' : verbose + 1})
-
-					objective_params.append(torch.from_numpy(res.x))
-					objective_values.append(torch.from_numpy(res.fun))
-				#except Exception as e:
-				#	print(e)
-			# save models
-
-			if save:
-				vals = {'params': objective_params,
-						'evidence':objective_values,
-						'repeats':restarts,
-						'dim':dims,
-						'param_names':params}
-
-				with open(save_name, 'wb') as f:
-					pickle.dump(vals, f)
-
-
-			best_index = np.argmin(objective_values)
-
-			counter = 0
-			for key, dict_params in params.items():
-				for var_name, value in dict_params.items():
-					x_opt.append(objective_params[best_index][dims[counter]:dims[counter+1]])
-					counter += 1
-
-		elif optimizer == "discrete":
-			values = []
-			configurations = manifolds[0]
-			for config in manifolds[0]:
-				values.append(cost(config))
-
-			best_index = np.argmin(values)
-			x_opt = [configurations[best_index]]
-		else:
-			raise AssertionError("Optimizer not implemented.")
-
-		# put back into default dic
-		i = 0
-		for key, dict_params in params.items():
-			for var_name, value in dict_params.items():
-				if key == "likelihood":
-					self.s = x_opt[i]
-
-				else:
-					self.kernel_object.params_dict[key][var_name] = x_opt[i]
-				i = i + 1
-
-		# print ("--------- Finished. ------------")
-		# print (self.kernel_object.params_dict)
-
-		# disable back_prop
-		self.back_prop = False
-
-		# refit the model
-		self.fitted = False
-		print(self.description())
-		self.fit_gp(self.x, self.y)
-		return True
-
-	def load_params(self, objective_params, params, dims):
-		self.fig = False
-		self.back_prop = False
-		x_opt = []
-		counter = 0
-		for key, dict_params in params.items():
-			for var_name, value in dict_params.items():
-				x_opt.append(objective_params[dims[counter]:dims[counter + 1]])
-				counter += 1
-
-		counter = 0
-		for key, dict_params in params.items():
-			for var_name, value in dict_params.items():
-				self.kernel_object.params_dict[key][var_name] = x_opt[counter]
-				counter += 1
-
-		print(self.description())
-
-
-
-	def visualize_function(self, xtest, f_trues, filename=None, colors=None, figsize = (15, 7)):
-		d = xtest.size()[1]
-		if d == 1:
-			if isinstance(f_trues, list):
-				for f_true in f_trues:
-					plt.plot(xtest, f_true(xtest))
-			else:
-				plt.plot(xtest, f_trues(xtest))
-		elif d == 2:
-			from scipy.interpolate import griddata
-			plt.figure(figsize=figsize)
-			plt.clf()
-			ax = plt.axes(projection='3d')
-			xx = xtest[:, 0].numpy()
-			yy = xtest[:, 1].numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-
-
-			if isinstance(f_trues, list):
-				for index, f_true in enumerate(f_trues):
-					grid_z = griddata((xx, yy), f_true(xtest)[:, 0].numpy(), (grid_x, grid_y), method='linear')
-					if colors is not None:
-						color = colors[index]
-					ax.plot_surface(grid_x, grid_y, grid_z, alpha=0.4, color=color)
-			else:
-				grid_z = griddata((xx, yy), f_trues(xtest)[:, 0].numpy(), (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z, alpha=0.4)
-
-			if filename is not None:
-				plt.xticks(fontsize=20, rotation=0)
-				plt.yticks(fontsize=20, rotation=0)
-				plt.savefig(filename, dpi=300)
-
-	def visualize_function_contour(self, xtest, f_true,
-								   filename=None, levels=10, figsize=(15, 7),
-								   alpha = 1., colorbar = True, cmap = 'hot',
-								   mean_point = None, point_color = 'tab:red', ax = None,
-								   fig = None):
-		d = xtest.size()[1]
-		if d == 1:
-			pass
-		elif d == 2:
-			from scipy.interpolate import griddata
-			xx = xtest[:, 0].numpy()
-			yy = xtest[:, 1].numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			f = f_true(xtest)
-			grid_z_f = griddata((xx, yy), f[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-			if ax is None:
-				fig, ax = plt.subplots(figsize=figsize)
-
-			cs = ax.contourf(grid_x, grid_y, grid_z_f, alpha = 0.5, cmap = cmap, linewidths=1, levels = [0,1])
-			ax.contour(cs, colors='k', levels = [0.5], alpha = 0.5)
-			if colorbar:
-				cbar = fig.colorbar(cs)
-			# if self.x is not None:
-			#	ax.scatter(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), c='r', s=100, marker="o")
-			ax.grid(c='k', ls='-', alpha=0.1)
-			if mean_point is not None:
-				plt.plot(mean_point[0],mean_point[1], 'o', ms = 10, color = point_color)
-
-			if filename is not None:
-				plt.xticks(fontsize=24, rotation=0)
-				plt.yticks(fontsize=24, rotation=0)
-				plt.savefig(filename, dpi=300)
-			return fig, ax
-	# plt.show()
-
-	def visualize(self, xtest,bounds = False, f_true=None, points=True, show=True, size=2,
-				  norm=1, fig=True, sqrtbeta=2, constrained=None, d=None,
-				  matheron_kernel=None, color = None, label = "", visualize_point = None):
-
-		if not bounds:
-			[mu, std] = self.mean_std(xtest)
-			lcb = mu - sqrtbeta *std
-			ucb = mu + sqrtbeta *std
-		else:
-			print ("using bounds")
-			lcb = self.lcb(xtest)
-			ucb = self.ucb(xtest)
-			mu = self.mean(xtest)
-
-		if d is None:
-			d = self.d
-
-
-
-		if d == 1:
-			if fig == True:
-				plt.figure(figsize=(15, 7))
-				plt.clf()
-			if self.x is not None:
-				plt.plot(self.x.detach().numpy(), self.y.detach().numpy(), 'ro', ms=10)
-
-			if visualize_point is not None:
-				[x, y] = visualize_point
-				plt.plot(x, y, 'go', ms = 10)
-
-			if size > 0:
-
-				if matheron_kernel is not None:
-					z = self.sample_matheron(xtest, matheron_kernel, size=size).numpy().T
-				else:
-					z = self.sample(xtest, size=size).numpy().T
-
-				for z_arr, label in zip(z, ['sample'] + [None for _ in range(size - 1)]):
-					plt.plot(xtest.view(-1).numpy(), z_arr, 'k--', lw=2, label=label)
-
-			plt.fill_between(xtest.view(-1).numpy(), lcb.view(-1).numpy(), ucb.view(-1).numpy(),
-							 color="#dddddd")
-
-			if f_true is not None:
-				plt.plot(xtest.numpy(), f_true(xtest).numpy(), 'b-', lw=2, label="truth")
-
-			if color is None:
-				plt.plot(xtest.numpy(), mu.numpy(), 'r-', lw=2, label="posterior mean")
-			else:
-				plt.plot(xtest.numpy(), mu.numpy(), linestyle = '-', lw=2, label="posterior mean"+label, color = color)
-
-			plt.legend()
-			if show == True:
-				plt.show()
-
-		elif d == 2:
-			from scipy.interpolate import griddata
-			plt.figure(figsize=(15, 7))
-			plt.clf()
-			ax = plt.axes(projection='3d')
-			xx = xtest[:, 0].numpy()
-			yy = xtest[:, 1].numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-			ax.plot_surface(grid_x, grid_y, grid_z_mu, color='r', alpha=0.4, label="mu")
-
-			if f_true is not None:
-				grid_z = griddata((xx, yy), f_true(xtest)[:, 0].numpy(), (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z, color='b', alpha=0.4, label="truth")
-
-			if points == True and self.fitted == True:
-				ax.scatter(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), self.y[:, 0].detach().numpy(),
-						   c='r', s=100, marker="o", depthshade=False)
-
-			if hasattr(self,"beta"):
-				if self.beta is not None:
-					beta = self.beta(norm=norm)
-					grid_z2 = griddata((xx, yy), (mu.detach() + beta * std.detach())[:, 0].detach().numpy(),
-									   (grid_x, grid_y), method='linear')
-					ax.plot_surface(grid_x, grid_y, grid_z2, color='gray', alpha=0.2)
-					grid_z3 = griddata((xx, yy), (mu.detach() - beta * std.detach())[:, 0].detach().numpy(),
-									   (grid_x, grid_y), method='linear')
-					ax.plot_surface(grid_x, grid_y, grid_z3, color='gray', alpha=0.2)
-
-				ax.plot_surface(grid_x, grid_y, grid_z_mu, color='r', alpha=0.4)
-				# plt.title('Posterior mean prediction plus 2 st.deviation')
-			plt.show()
-
-		else:
-			print("Visualization not implemented")
-
-	def visualize_subopt(self, xtest, f_true=None, points=True, show=True, size=2, norm=1, fig=True, beta=2):
-		[mu, std] = self.mean_std(xtest)
-
-		print("Visualizing in: ", self.d, "dimensions...")
-
-		if self.d == 1:
-			if fig == True:
-				plt.figure(figsize=(15, 7))
-				plt.clf()
-			if self.x is not None:
-				plt.plot(self.x.detach().numpy(), self.y.detach().numpy(), 'r+', ms=10, marker="o")
-			plt.plot(xtest.numpy(), self.sample(xtest, size=size).numpy(), 'k--', lw=2, label="sample")
-			plt.fill_between(xtest.numpy().flat, (mu - 2 * std).numpy().flat, (mu + 2 * std).numpy().flat,
-							 color="#dddddd")
-			if f_true is not None:
-				plt.plot(xtest.numpy(), f_true(xtest).numpy(), 'b-', lw=2, label="truth")
-			plt.plot(xtest.numpy(), mu.numpy(), 'r-', lw=2, label="posterior mean")
-
-			min = torch.max(mu - beta * std)
-			mask = (mu + beta * std < min)
-			v = torch.min(mu - beta * std).numpy() - 1
-			plt.plot(xtest.numpy()[mask], 0 * xtest.numpy()[mask] + v, 'ko', lw=6, label="Discarted Region")
-
-			plt.title('Posterior mean prediction plus 2 st.deviation')
-			plt.legend()
-
-			if show == True:
-				plt.show()
-
-	def visualize_slice(self, xtest, slice, show=True, eps=None, size=1, beta=2):
-		append = torch.ones(size=(xtest.size()[0], 1), dtype=torch.float64) * slice
-		xtest2 = torch.cat((xtest, append), dim=1)
-
-		[mu, std] = self.mean_std(xtest2)
-
-		plt.figure(figsize=(15, 7))
-		plt.clf()
-		plt.plot(xtest.numpy(), self.sample(xtest, size=size).numpy(), 'k--', lw=2, label="sample")
-		print(std.size(), mu.size())
-		if self.x is not None:
-			plt.plot(self.x[:, 0].detach().numpy(), self.y.detach().numpy(), 'r+', ms=10, marker="o")
-		plt.fill_between(xtest.numpy().flat, (mu - 2 * std).numpy().flat, (mu + 2 * std).numpy().flat, color="#dddddd")
-		plt.fill_between(xtest.numpy().flat, (mu + 2 * std).numpy().flat, (mu + 2 * std + 2 * self.s).numpy().flat,
-						 color="#bbdefb")
-		plt.fill_between(xtest.numpy().flat, (mu - 2 * std - 2 * self.s).numpy().flat, (mu - 2 * std).numpy().flat,
-						 color="#bbdefb")
-
-		if eps is not None:
-			mask = (beta * std < eps)
-			v = torch.min(mu - beta * std - 2 * self.s).numpy()
-			plt.plot(xtest.numpy()[mask], 0 * xtest.numpy()[mask] + v, 'k', lw=6,
-					 label="$\\mathcal{D}_E$ - $\\epsilon$ accurate domain in a subspace")
-
-		plt.plot(xtest.numpy(), mu.numpy(), 'r-', lw=2, label="posterior mean")
-		plt.title('Posterior mean prediction plus 2 st.deviation')
-		plt.legend()
-		if show == True:
-			plt.show()
-
-	def visualize_contour_with_gap(self, xtest, f_true=None, gap=None, show=False):
-		[mu, _] = self.mean_std(xtest)
-
-		if self.d == 2:
-			from scipy.interpolate import griddata
-			xx = xtest[:, 0].detach().numpy()
-			yy = xtest[:, 1].detach().numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-
-			fig, ax = plt.subplots(figsize=(15, 7))
-			cs = ax.contourf(grid_x, grid_y, grid_z_mu)
-			ax.contour(cs, colors='k')
-
-			ax.plot(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), 'ro', ms=10)
-			cbar = fig.colorbar(cs)
-
-			ax.grid(c='k', ls='-', alpha=0.1)
-
-			if f_true is not None:
-				f = f_true(xtest)
-				grid_z_f = griddata((xx, yy), f[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-				fig, ax = plt.subplots(figsize=(15, 7))
-				cs = ax.contourf(grid_x, grid_y, grid_z_f)
-				ax.contour(cs, colors='k')
-				cbar = fig.colorbar(cs)
-				ax.grid(c='k', ls='-', alpha=0.1)
-			if show == True:
-				plt.show()
-
-	def visualize_contour(self, xtest, f_true=None, show=True, points=True, ms=5, levels=20):
-		[mu, _] = self.mean_std(xtest)
-
-		if self.d == 2:
-			from scipy.interpolate import griddata
-			xx = xtest[:, 0].detach().numpy()
-			yy = xtest[:, 1].detach().numpy()
-
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-
-			fig, ax = plt.subplots(figsize=(15, 7))
-			cs = ax.contourf(grid_x, grid_y, grid_z_mu)
-			ax.contour(cs, colors='k')
-
-			if points == True:
-				ax.plot(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), 'wo', ms=ms, alpha=0.5)
-			cbar = fig.colorbar(cs)
-			ax.grid(c='k', ls='-', alpha=0.1)
-
-			if f_true is not None:
-				f = f_true(xtest)
-				grid_z_f = griddata((xx, yy), f[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-				fig, ax = plt.subplots(figsize=(15, 7))
-				cs = ax.contourf(grid_x, grid_y, grid_z_f, levels=levels)
-				ax.contour(cs, colors='k')
-				cbar = fig.colorbar(cs)
-				ax.grid(c='k', ls='-', alpha=0.1)
-			if show == True:
-				plt.show()
-			return ax
-
-	def visualize_quiver(self, xtest, size=2, norm=1):
-		[mu, std] = self.mean_std(xtest)
-		if self.d == 2:
-			from scipy.interpolate import griddata
-			plt.figure(figsize=(15, 7))
-			plt.clf()
-			ax = plt.axes(projection='3d')
-			xx = xtest[:, 0].detach().numpy()
-			yy = xtest[:, 1].detach().numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-			#
-
-			ax.scatter(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), self.y[:, 0].detach().numpy(),
-					   c='r', s=100, marker="o", depthshade=False)
-
-			if self.beta is not None:
-				beta = self.beta(norm=norm)
-				grid_z2 = griddata((xx, yy), (mu.detach() + beta * std.detach())[:, 0].detach().numpy(),
-								   (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z2, color='gray', alpha=0.2)
-				grid_z3 = griddata((xx, yy), (mu.detach() - beta * std.detach())[:, 0].detach().numpy(),
-								   (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z3, color='gray', alpha=0.2)
-
-			ax.plot_surface(grid_x, grid_y, grid_z_mu, color='r', alpha=0.4)
-			plt.title('Posterior mean prediction plus 2 st.deviation')
-
-			derivatives = torch.zeros(xtest.size()[0], 2)
-			for index, point in enumerate(xtest):
-				derivatives[index, :] = self.mean_gradient_hessian(point.view(-1, 2))
-				print(derivatives[index, :])
-
-			print(derivatives.size())
-
-			grid_der_x_mu = griddata((xx, yy), derivatives[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-			grid_der_y_mu = griddata((xx, yy), derivatives[:, 1].detach().numpy(), (grid_x, grid_y), method='linear')
-
-			fig, ax = plt.subplots(figsize=(15, 7))
-			cs = ax.contourf(grid_x, grid_y, grid_z_mu)
-
-			ax.contour(cs, colors='k')
-
-			# Plot grid.
-			ax.grid(c='k', ls='-', alpha=0.1)
-			ax.quiver(grid_x, grid_y, grid_der_x_mu, grid_der_y_mu)
-
-			plt.show()
-
-		else:
-			print("Visualization not implemented")
+    def fit(self):
+        pass
+
+    @abstractmethod
+    def ucb(self, x):  # interface hook; visualize(bounds=True) uses its value as the upper band edge
+        pass  # declared abstract: no default implementation is provided here
+
+    @abstractmethod
+    def lcb(self, x):  # interface hook; visualize(bounds=True) uses its value as the lower band edge
+        pass  # declared abstract: no default implementation is provided here
+
+    def load_data(self, d):  # d is an indexable pair; only d[0] and d[1] are read
+        self.x = d[0]  # stored by reference, no copy or validation
+        self.y = d[1]  # NOTE: self.n and self.fitted are not touched by this method
+
+    def log_marginal(self, kernel, X, weight):  # returns the NEGATED objective, i.e. a loss to minimise
+        func = kernel.get_kernel()
+        K = (
+            func(self.x, self.x, **X)  # X is a mapping, unpacked as keyword arguments of the kernel
+            + torch.eye(self.n, dtype=torch.float64) * self.s * self.s  # adds s^2 on the diagonal
+        )
+        L = torch.linalg.cholesky(K)  # Cholesky factor of K
+        logdet = -0.5 * 2 * torch.sum(torch.log(torch.diag(L))) * weight  # -0.5*log|K|, times weight
+        alpha = torch.cholesky_solve(self.y, L)  # alpha = K^{-1} y via the factor
+        logprob = -0.5 * torch.mm(torch.t(self.y), alpha) + logdet  # no additive constant is included
+        logprob = -logprob  # flip the sign so that smaller is better
+        return logprob
+
+    def optimize_params_general(
+        self,
+        params={},
+        restarts=2,
+        optimizer="pymanopt",
+        maxiter=1000,
+        mingradnorm=1e-4,
+        regularizer_func=None,
+        verbose=False,
+        scale=1.0,  # multiplies the random initial points
+        weight=1.0,  # forwarded unchanged to log_marginal
+        save=False,  # "pytorch-minimize" only: pickle the result of every restart
+        save_name="model.np",  # file written when save is true
+        parallel=False,  # not used by this method
+        cores=None,  # not used by this method
+    ):
+        """
+        Tune kernel (and optionally noise) parameters by minimising ``log_marginal``, then refit.
+        :param params: ``{key: {var_name: (init_value, manifold, bound)}}``; key "likelihood" sets ``self.s``
+        :param restarts: number of optimisation runs; the run with the smallest objective is kept
+        :param optimizer: "pymanopt", "scipy", "bisection", "pytorch-minimize" or "discrete"
+        :param maxiter: iteration cap handed to the solver
+        :param mingradnorm: gradient-norm stopping tolerance handed to the solver
+        :param regularizer_func: optional callable whose value is added to the objective
+        :param verbose: solver verbosity (pymanopt ``verbosity``; ``disp = verbose + 1`` otherwise)
+        :return: True; raises AssertionError for an unknown optimizer name
+        """
+        manifolds = []
+        bounds = []
+        init_values = []
+
+        for key, dict_params in params.items():
+            for var_name, value in dict_params.items():
+                init_value, manifold, bound = value
+                manifolds.append(manifold)
+                bounds.append(bound)
+                init_values.append(init_value)
+
+        if optimizer == "pymanopt":
+
+            manifold = Product(tuple(manifolds))
+
+            @pymanopt.function.pytorch(manifold)
+            def cost(*args):
+                # print (args)
+                input_dict = {}
+                i = 0
+                for key, dict_params in params.items():
+                    small_param = {}
+                    for var_name, value in dict_params.items():
+                        small_param[var_name] = args[i]
+                        i = i + 1
+                    input_dict[key] = small_param
+
+                if regularizer_func is not None:
+                    f = self.log_marginal(
+                        self.kernel_object, input_dict, weight
+                    ) + regularizer_func(args)
+                else:
+                    f = self.log_marginal(self.kernel_object, input_dict, weight)
+                return f
+
+            problem = pymanopt.Problem(manifold, cost=cost)
+            solver = SteepestDescent(
+                verbosity=verbose, max_iterations=maxiter, min_gradient_norm=mingradnorm
+            )
+
+            # get initial point
+            objective_values = []
+            objective_params = []
+
+            for rep in range(restarts):
+                x_init = []
+                for index, man in enumerate(manifolds):
+                    if init_values[index] is None:
+                        x_sub = man.random_point() * scale
+                    else:
+                        x_sub = np.array([init_values[index]])
+                    x_init.append(x_sub)
+                # try:
+                res = solver.run(problem, initial_point=x_init)
+
+                objective_params.append(res.point)
+                objective_values.append(res.cost)  # log['final_values']['f(x)'])
+            # except Exception as e:
+            # 	print (e)
+            # 	print ("Optimization restart failed:", x_init)
+            # pick the smallest objective
+            best_index = np.argmin(objective_values)
+            x_opt = [torch.from_numpy(j) for j in objective_params[best_index]]
+
+        elif optimizer == "scipy":  # NOTE(review): stub - cost/egrad are undefined here, x_opt is never set
+            cost_numpy = lambda x: cost(x).detach().numpy()
+            egrad_numpy = lambda x: egrad(x).detach().numpy()
+
+        elif optimizer == "bisection":
+
+            def cost(x):
+                input_dict = self.kernel_object.params_dict
+                counter = 0
+                for key, dict_params in params.items():
+                    for var_name, value in dict_params.items():
+                        input_dict[key][var_name] = x
+                        counter += 1
+
+                if regularizer_func is not None:
+                    f = self.log_marginal(
+                        self.kernel_object, input_dict, weight
+                    ) + regularizer_func(x)
+                else:
+                    f = self.log_marginal(self.kernel_object, input_dict, weight)
+                return f
+
+            a, b = bounds[0]
+            x_opt = [bisection(cost, a, b, 100)]
+
+        elif optimizer == "pytorch-minimize":
+            var_names = []
+            dims = [
+                0,
+            ]
+            for key, dict_params in params.items():
+                for var_name, value in dict_params.items():
+                    init_value, manifold, bound = value
+
+                    manifolds.append(manifold)
+                    bounds.append(bound)
+                    init_values.append(init_value)
+                    var_names.append(var_name)
+                    dims.append(manifold.dim)
+
+            dims = np.cumsum(dims).astype(int)  # dims[k]:dims[k + 1] is the slice of variable k
+
+            def cost(x):
+                input_dict = self.kernel_object.params_dict  # mutated in place on every evaluation
+                counter = 0
+                for key, dict_params in params.items():
+                    for var_name, value in dict_params.items():
+                        if key != "likelihood":
+                            input_dict[key][var_name] = x[
+                                dims[counter] : dims[counter + 1]
+                            ]
+                        else:
+                            self.s = x[dims[counter] : dims[counter + 1]]
+                        counter += 1  # advance for EVERY variable, matching the write-back loop below
+
+                if regularizer_func is not None:
+                    f = self.log_marginal(
+                        self.kernel_object, input_dict, weight
+                    ) + regularizer_func(x)
+                else:
+                    f = self.log_marginal(self.kernel_object, input_dict, weight)
+                return f
+
+            objective_values = []
+            objective_params = []
+            x_opt = []
+
+            dim = dims[-1]
+            self.prepared_log_marginal = False
+            for rep in range(restarts):
+                # try:
+                if init_values[0] is None:
+                    x_init = torch.randn(size=(dim, 1)).double().view(-1) ** 2 * scale
+                else:
+                    x_init = init_values[0](dim)
+
+                if bounds[0] is None:
+                    res = minimize_torch(
+                        cost,
+                        x_init,
+                        method="l-bfgs",
+                        tol=1e-10,
+                        disp=verbose + 1,
+                        options={"max_iter": maxiter, "gtol": mingradnorm},
+                    )
+                    objective_params.append(res.x)
+                    objective_values.append(res.fun)
+                else:
+                    print("Constrained optimization with bounds", bounds[0])
+                    res = minimize(
+                        cost,
+                        x_init.numpy(),
+                        backend="torch",
+                        method="L-BFGS-B",
+                        bounds=bounds[0],
+                        precision="float64",
+                        tol=1e-8,
+                        options={
+                            "ftol": 1e-10,
+                            "gtol": mingradnorm,
+                            "eps": 1e-08,
+                            "maxfun": 15000,
+                            "maxiter": maxiter,
+                            "maxls": 20,
+                            "disp": verbose + 1,
+                        },
+                    )
+
+                    objective_params.append(torch.from_numpy(res.x))
+                    objective_values.append(torch.from_numpy(res.fun))
+                # except Exception as e:
+                # 	print(e)
+            # save models
+
+            if save:
+                vals = {
+                    "params": objective_params,
+                    "evidence": objective_values,
+                    "repeats": restarts,
+                    "dim": dims,
+                    "param_names": params,
+                }
+
+                with open(save_name, "wb") as f:
+                    pickle.dump(vals, f)
+
+            best_index = np.argmin(objective_values)
+
+            counter = 0
+            for key, dict_params in params.items():
+                for var_name, value in dict_params.items():
+                    x_opt.append(
+                        objective_params[best_index][dims[counter] : dims[counter + 1]]
+                    )
+                    counter += 1
+
+        elif optimizer == "discrete":  # NOTE(review): `cost` is not defined on this path - confirm intent
+            values = []
+            configurations = manifolds[0]
+            for config in manifolds[0]:
+                values.append(cost(config))
+
+            best_index = np.argmin(values)
+            x_opt = [configurations[best_index]]
+        else:
+            raise AssertionError("Optimizer not implemented.")
+
+        # put back into default dic
+        i = 0
+        for key, dict_params in params.items():
+            for var_name, value in dict_params.items():
+                if key == "likelihood":
+                    self.s = x_opt[i]
+
+                else:
+                    self.kernel_object.params_dict[key][var_name] = x_opt[i]
+                i = i + 1
+
+        # print ("--------- Finished. ------------")
+        # print (self.kernel_object.params_dict)
+
+        # disable back_prop
+        self.back_prop = False
+
+        # refit the model
+        self.fitted = False
+        print(self.description())
+        self.fit_gp(self.x, self.y)
+        return True
+
+    def load_params(self, objective_params, params, dims):  # write a flat vector back into the kernel
+        self.fig = False
+        self.back_prop = False
+        x_opt = []
+        counter = 0
+        for key, dict_params in params.items():
+            for var_name, value in dict_params.items():
+                x_opt.append(objective_params[dims[counter] : dims[counter + 1]])  # slice of variable
+                counter += 1
+
+        counter = 0  # second pass walks params in the same order and assigns the slices
+        for key, dict_params in params.items():
+            for var_name, value in dict_params.items():
+                self.kernel_object.params_dict[key][var_name] = x_opt[counter]  # no "likelihood" case
+                counter += 1
+
+        print(self.description())  # NOTE: the model is not refitted here
+
+    def visualize_function(
+        self, xtest, f_trues, filename=None, colors=None, figsize=(15, 7)
+    ):  # plot one callable, or a list of callables, evaluated on xtest (curves in 1-D, surfaces in 2-D)
+        d = xtest.size()[1]
+        if d == 1:
+            if isinstance(f_trues, list):
+                for f_true in f_trues:
+                    plt.plot(xtest, f_true(xtest))
+            else:
+                plt.plot(xtest, f_trues(xtest))
+        elif d == 2:
+            from scipy.interpolate import griddata
+
+            plt.figure(figsize=figsize)
+            plt.clf()
+            ax = plt.axes(projection="3d")
+            xx = xtest[:, 0].numpy()
+            yy = xtest[:, 1].numpy()
+            grid_x, grid_y = np.mgrid[
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]  # 100 x 100 regular grid spanning the test points
+
+            if isinstance(f_trues, list):
+                for index, f_true in enumerate(f_trues):
+                    grid_z = griddata(
+                        (xx, yy),
+                        f_true(xtest)[:, 0].numpy(),
+                        (grid_x, grid_y),
+                        method="linear",
+                    )
+                    # without a palette, None lets matplotlib pick (was an unbound local before)
+                    color = colors[index] if colors is not None else None
+                    ax.plot_surface(grid_x, grid_y, grid_z, alpha=0.4, color=color)
+            else:
+                grid_z = griddata(
+                    (xx, yy),
+                    f_trues(xtest)[:, 0].numpy(),
+                    (grid_x, grid_y),
+                    method="linear",
+                )
+                ax.plot_surface(grid_x, grid_y, grid_z, alpha=0.4)
+
+            if filename is not None:  # saving happens only on the 2-D path
+                plt.xticks(fontsize=20, rotation=0)
+                plt.yticks(fontsize=20, rotation=0)
+                plt.savefig(filename, dpi=300)
+
+    def visualize_function_contour(
+        self,
+        xtest,
+        f_true,
+        filename=None,
+        levels=10,  # accepted but not consulted; see the contourf call
+        figsize=(15, 7),
+        alpha=1.0,  # accepted but not consulted; see the contourf call
+        colorbar=True,
+        cmap="hot",
+        mean_point=None,
+        point_color="tab:red",
+        ax=None,
+        fig=None,
+    ):  # filled contour of f_true over 2-D xtest; returns (fig, ax), or None when xtest is 1-D
+        d = xtest.size()[1]
+        if d == 1:
+            pass
+        elif d == 2:
+            from scipy.interpolate import griddata
+
+            xx = xtest[:, 0].numpy()
+            yy = xtest[:, 1].numpy()
+            grid_x, grid_y = np.mgrid[
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+            f = f_true(xtest)
+            grid_z_f = griddata(
+                (xx, yy), f[:, 0].detach().numpy(), (grid_x, grid_y), method="linear"
+            )
+            if ax is None:
+                fig, ax = plt.subplots(figsize=figsize)
+            fig = ax.figure if fig is None else fig  # caller passed ax alone: use its parent figure
+            cs = ax.contourf(
+                grid_x,
+                grid_y,
+                grid_z_f,
+                alpha=0.5,  # fixed value
+                cmap=cmap,
+                linewidths=1,
+                levels=[0, 1],  # fixed: a single band between 0 and 1
+            )
+            ax.contour(cs, colors="k", levels=[0.5], alpha=0.5)
+            if colorbar:
+                cbar = fig.colorbar(cs)
+            # if self.x is not None:
+            # 	ax.scatter(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), c='r', s=100, marker="o")
+            ax.grid(c="k", ls="-", alpha=0.1)
+            if mean_point is not None:  # draw on the target axes, not on whatever axes is current
+                ax.plot(mean_point[0], mean_point[1], "o", ms=10, color=point_color)
+
+            if filename is not None:
+                plt.xticks(fontsize=24, rotation=0)
+                plt.yticks(fontsize=24, rotation=0)
+                plt.savefig(filename, dpi=300)
+            return fig, ax
+
+    # plt.show()
+
+    def visualize(
+        self,
+        xtest,
+        bounds=False,  # True: band from self.lcb / self.ucb instead of mu -/+ sqrtbeta * std
+        f_true=None,
+        points=True,  # read on the 2-D path only
+        show=True,  # read on the 1-D path only; the 2-D path always calls plt.show()
+        size=2,  # number of sample paths drawn on the 1-D path
+        norm=1,
+        fig=True,  # 1-D path: open a fresh figure first
+        sqrtbeta=2,
+        constrained=None,  # not used by this method
+        d=None,  # plotting dimension; defaults to self.d
+        matheron_kernel=None,
+        color=None,
+        label="",  # suffix appended to the "posterior mean" legend entry when color is given
+        visualize_point=None,
+    ):
+        # Plot posterior mean, confidence band, samples and data for 1-D or 2-D inputs.
+        if not bounds:
+            [mu, std] = self.mean_std(xtest)
+            lcb = mu - sqrtbeta * std
+            ucb = mu + sqrtbeta * std
+        else:
+            print("using bounds")
+            lcb = self.lcb(xtest)
+            ucb = self.ucb(xtest)
+            mu = self.mean(xtest)  # NOTE(review): std stays unset here but the 2-D beta block reads it
+
+        if d is None:
+            d = self.d
+
+        if d == 1:
+            if fig == True:
+                plt.figure(figsize=(15, 7))
+                plt.clf()
+            if self.x is not None:
+                plt.plot(self.x.detach().numpy(), self.y.detach().numpy(), "ro", ms=10)
+
+            if visualize_point is not None:
+                [x, y] = visualize_point
+                plt.plot(x, y, "go", ms=10)
+
+            if size > 0:
+
+                if matheron_kernel is not None:
+                    z = (
+                        self.sample_matheron(xtest, matheron_kernel, size=size)
+                        .numpy()
+                        .T
+                    )
+                else:
+                    z = self.sample(xtest, size=size).numpy().T
+
+                for z_arr, tag in zip(  # own loop name, so the `label` argument survives the loop
+                    z, ["sample"] + [None for _ in range(size - 1)]
+                ):
+                    plt.plot(xtest.view(-1).numpy(), z_arr, "k--", lw=2, label=tag)
+
+            plt.fill_between(
+                xtest.view(-1).numpy(),
+                lcb.view(-1).numpy(),
+                ucb.view(-1).numpy(),
+                color="#dddddd",
+            )
+
+            if f_true is not None:
+                plt.plot(
+                    xtest.numpy(), f_true(xtest).numpy(), "b-", lw=2, label="truth"
+                )
+
+            if color is None:
+                plt.plot(xtest.numpy(), mu.numpy(), "r-", lw=2, label="posterior mean")
+            else:
+                plt.plot(
+                    xtest.numpy(),
+                    mu.numpy(),
+                    linestyle="-",
+                    lw=2,
+                    label="posterior mean" + label,
+                    color=color,
+                )
+
+            plt.legend()
+            if show == True:
+                plt.show()
+
+        elif d == 2:
+            from scipy.interpolate import griddata
+
+            plt.figure(figsize=(15, 7))
+            plt.clf()
+            ax = plt.axes(projection="3d")
+            xx = xtest[:, 0].numpy()
+            yy = xtest[:, 1].numpy()
+            grid_x, grid_y = np.mgrid[
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+            grid_z_mu = griddata(
+                (xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method="linear"
+            )
+            ax.plot_surface(grid_x, grid_y, grid_z_mu, color="r", alpha=0.4, label="mu")
+
+            if f_true is not None:
+                grid_z = griddata(
+                    (xx, yy),
+                    f_true(xtest)[:, 0].numpy(),
+                    (grid_x, grid_y),
+                    method="linear",
+                )
+                ax.plot_surface(
+                    grid_x, grid_y, grid_z, color="b", alpha=0.4, label="truth"
+                )
+
+            if points == True and self.fitted == True:
+                ax.scatter(
+                    self.x[:, 0].detach().numpy(),
+                    self.x[:, 1].detach().numpy(),
+                    self.y[:, 0].detach().numpy(),
+                    c="r",
+                    s=100,
+                    marker="o",
+                    depthshade=False,
+                )
+
+            if hasattr(self, "beta"):
+                if self.beta is not None:
+                    beta = self.beta(norm=norm)
+                    grid_z2 = griddata(
+                        (xx, yy),
+                        (mu.detach() + beta * std.detach())[:, 0].detach().numpy(),
+                        (grid_x, grid_y),
+                        method="linear",
+                    )
+                    ax.plot_surface(grid_x, grid_y, grid_z2, color="gray", alpha=0.2)
+                    grid_z3 = griddata(
+                        (xx, yy),
+                        (mu.detach() - beta * std.detach())[:, 0].detach().numpy(),
+                        (grid_x, grid_y),
+                        method="linear",
+                    )
+                    ax.plot_surface(grid_x, grid_y, grid_z3, color="gray", alpha=0.2)
+
+                ax.plot_surface(grid_x, grid_y, grid_z_mu, color="r", alpha=0.4)
+                # plt.title('Posterior mean prediction plus 2 st.deviation')
+            plt.show()
+
+        else:
+            print("Visualization not implemented")
+
+    def visualize_subopt(
+        self,
+        xtest,
+        f_true=None,
+        points=True,  # not used by this method
+        show=True,
+        size=2,
+        norm=1,  # not used by this method
+        fig=True,
+        beta=2,  # width multiplier used for the discarded-region test only; the band uses 2
+    ):  # 1-D posterior plot that also marks points whose upper bound is below the best lower bound
+        [mu, std] = self.mean_std(xtest)
+
+        print("Visualizing in: ", self.d, "dimensions...")
+
+        if self.d == 1:
+            if fig == True:
+                plt.figure(figsize=(15, 7))
+                plt.clf()
+            if self.x is not None:
+                plt.plot(
+                    self.x.detach().numpy(),
+                    self.y.detach().numpy(),
+                    "r+",
+                    ms=10,
+                    marker="o",
+                )
+            plt.plot(
+                xtest.numpy(),
+                self.sample(xtest, size=size).numpy(),
+                "k--",
+                lw=2,
+                label="sample",
+            )
+            plt.fill_between(
+                xtest.numpy().flat,
+                (mu - 2 * std).numpy().flat,
+                (mu + 2 * std).numpy().flat,
+                color="#dddddd",
+            )
+            if f_true is not None:
+                plt.plot(
+                    xtest.numpy(), f_true(xtest).numpy(), "b-", lw=2, label="truth"
+                )
+            plt.plot(xtest.numpy(), mu.numpy(), "r-", lw=2, label="posterior mean")
+
+            max_lcb = torch.max(mu - beta * std)  # largest lower bound (name no longer hides builtin min)
+            mask = mu + beta * std < max_lcb  # upper bound falls below the best lower bound
+            v = torch.min(mu - beta * std).numpy() - 1  # height just under the band for the markers
+            plt.plot(
+                xtest.numpy()[mask],
+                0 * xtest.numpy()[mask] + v,
+                "ko",
+                lw=6,
+                label="Discarted Region",
+            )
+
+            plt.title("Posterior mean prediction plus 2 st.deviation")
+            plt.legend()
+
+            if show == True:
+                plt.show()
+
+    def visualize_slice(self, xtest, slice, show=True, eps=None, size=1, beta=2):  # 1-D cut of the model
+        append = torch.ones(size=(xtest.size()[0], 1), dtype=torch.float64) * slice  # constant column
+        xtest2 = torch.cat((xtest, append), dim=1)  # xtest with the constant appended as last column
+
+        [mu, std] = self.mean_std(xtest2)
+
+        plt.figure(figsize=(15, 7))
+        plt.clf()
+        plt.plot(
+            xtest.numpy(),
+            self.sample(xtest, size=size).numpy(),  # NOTE(review): samples at xtest, not xtest2 - confirm
+            "k--",
+            lw=2,
+            label="sample",
+        )
+        print(std.size(), mu.size())
+        if self.x is not None:
+            plt.plot(
+                self.x[:, 0].detach().numpy(),  # only the first input column of the data is shown
+                self.y.detach().numpy(),
+                "r+",
+                ms=10,
+                marker="o",
+            )
+        plt.fill_between(  # inner band: mu -/+ 2 * std
+            xtest.numpy().flat,
+            (mu - 2 * std).numpy().flat,
+            (mu + 2 * std).numpy().flat,
+            color="#dddddd",
+        )
+        plt.fill_between(  # upper outer strip of height 2 * self.s
+            xtest.numpy().flat,
+            (mu + 2 * std).numpy().flat,
+            (mu + 2 * std + 2 * self.s).numpy().flat,
+            color="#bbdefb",
+        )
+        plt.fill_between(  # lower outer strip of height 2 * self.s
+            xtest.numpy().flat,
+            (mu - 2 * std - 2 * self.s).numpy().flat,
+            (mu - 2 * std).numpy().flat,
+            color="#bbdefb",
+        )
+
+        if eps is not None:
+            mask = beta * std < eps  # points whose scaled std is below eps
+            v = torch.min(mu - beta * std - 2 * self.s).numpy()  # draw the marker line at the plot floor
+            plt.plot(
+                xtest.numpy()[mask],
+                0 * xtest.numpy()[mask] + v,
+                "k",
+                lw=6,
+                label="$\\mathcal{D}_E$ - $\\epsilon$ accurate domain in a subspace",
+            )
+
+        plt.plot(xtest.numpy(), mu.numpy(), "r-", lw=2, label="posterior mean")
+        plt.title("Posterior mean prediction plus 2 st.deviation")
+        plt.legend()
+        if show == True:
+            plt.show()
+
+    def visualize_contour_with_gap(self, xtest, f_true=None, gap=None, show=False):
+        [mu, _] = self.mean_std(xtest)
+
+        if self.d == 2:
+            from scipy.interpolate import griddata
+
+            xx = xtest[:, 0].detach().numpy()
+            yy = xtest[:, 1].detach().numpy()
+            grid_x, grid_y = np.mgrid[
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+            grid_z_mu = griddata(
+                (xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method="linear"
+            )
+
+            fig, ax = plt.subplots(figsize=(15, 7))
+            cs = ax.contourf(grid_x, grid_y, grid_z_mu)
+            ax.contour(cs, colors="k")
+
+            ax.plot(
+                self.x[:, 0].detach().numpy(),
+                self.x[:, 1].detach().numpy(),
+                "ro",
+                ms=10,
+            )
+            cbar = fig.colorbar(cs)
+
+            ax.grid(c="k", ls="-", alpha=0.1)
+
+            if f_true is not None:
+                f = f_true(xtest)
+                grid_z_f = griddata(
+                    (xx, yy),
+                    f[:, 0].detach().numpy(),
+                    (grid_x, grid_y),
+                    method="linear",
+                )
+                fig, ax = plt.subplots(figsize=(15, 7))
+                cs = ax.contourf(grid_x, grid_y, grid_z_f)
+                ax.contour(cs, colors="k")
+                cbar = fig.colorbar(cs)
+                ax.grid(c="k", ls="-", alpha=0.1)
+            if show == True:
+                plt.show()
+
+    def visualize_contour(
+        self, xtest, f_true=None, show=True, points=True, ms=5, levels=20
+    ):
+        [mu, _] = self.mean_std(xtest)
+
+        if self.d == 2:
+            from scipy.interpolate import griddata
+
+            xx = xtest[:, 0].detach().numpy()
+            yy = xtest[:, 1].detach().numpy()
+
+            grid_x, grid_y = np.mgrid[
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+            grid_z_mu = griddata(
+                (xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method="linear"
+            )
+
+            fig, ax = plt.subplots(figsize=(15, 7))
+            cs = ax.contourf(grid_x, grid_y, grid_z_mu)
+            ax.contour(cs, colors="k")
+
+            if points == True:
+                ax.plot(
+                    self.x[:, 0].detach().numpy(),
+                    self.x[:, 1].detach().numpy(),
+                    "wo",
+                    ms=ms,
+                    alpha=0.5,
+                )
+            cbar = fig.colorbar(cs)
+            ax.grid(c="k", ls="-", alpha=0.1)
+
+            if f_true is not None:
+                f = f_true(xtest)
+                grid_z_f = griddata(
+                    (xx, yy),
+                    f[:, 0].detach().numpy(),
+                    (grid_x, grid_y),
+                    method="linear",
+                )
+                fig, ax = plt.subplots(figsize=(15, 7))
+                cs = ax.contourf(grid_x, grid_y, grid_z_f, levels=levels)
+                ax.contour(cs, colors="k")
+                cbar = fig.colorbar(cs)
+                ax.grid(c="k", ls="-", alpha=0.1)
+            if show == True:
+                plt.show()
+            return ax
+
+    def visualize_quiver(self, xtest, size=2, norm=1):
+        [mu, std] = self.mean_std(xtest)
+        if self.d == 2:
+            from scipy.interpolate import griddata
+
+            plt.figure(figsize=(15, 7))
+            plt.clf()
+            ax = plt.axes(projection="3d")
+            xx = xtest[:, 0].detach().numpy()
+            yy = xtest[:, 1].detach().numpy()
+            grid_x, grid_y = np.mgrid[
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+            grid_z_mu = griddata(
+                (xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method="linear"
+            )
+            #
+
+            ax.scatter(
+                self.x[:, 0].detach().numpy(),
+                self.x[:, 1].detach().numpy(),
+                self.y[:, 0].detach().numpy(),
+                c="r",
+                s=100,
+                marker="o",
+                depthshade=False,
+            )
+
+            if self.beta is not None:
+                beta = self.beta(norm=norm)
+                grid_z2 = griddata(
+                    (xx, yy),
+                    (mu.detach() + beta * std.detach())[:, 0].detach().numpy(),
+                    (grid_x, grid_y),
+                    method="linear",
+                )
+                ax.plot_surface(grid_x, grid_y, grid_z2, color="gray", alpha=0.2)
+                grid_z3 = griddata(
+                    (xx, yy),
+                    (mu.detach() - beta * std.detach())[:, 0].detach().numpy(),
+                    (grid_x, grid_y),
+                    method="linear",
+                )
+                ax.plot_surface(grid_x, grid_y, grid_z3, color="gray", alpha=0.2)
+
+            ax.plot_surface(grid_x, grid_y, grid_z_mu, color="r", alpha=0.4)
+            plt.title("Posterior mean prediction plus 2 st.deviation")
+
+            derivatives = torch.zeros(xtest.size()[0], 2)
+            for index, point in enumerate(xtest):
+                derivatives[index, :] = self.mean_gradient_hessian(point.view(-1, 2))
+                print(derivatives[index, :])
+
+            print(derivatives.size())
+
+            grid_der_x_mu = griddata(
+                (xx, yy),
+                derivatives[:, 0].detach().numpy(),
+                (grid_x, grid_y),
+                method="linear",
+            )
+            grid_der_y_mu = griddata(
+                (xx, yy),
+                derivatives[:, 1].detach().numpy(),
+                (grid_x, grid_y),
+                method="linear",
+            )
+
+            fig, ax = plt.subplots(figsize=(15, 7))
+            cs = ax.contourf(grid_x, grid_y, grid_z_mu)
+
+            ax.contour(cs, colors="k")
+
+            # Plot grid.
+            ax.grid(c="k", ls="-", alpha=0.1)
+            ax.quiver(grid_x, grid_y, grid_der_x_mu, grid_der_y_mu)
+
+            plt.show()
+
+        else:
+            print("Visualization not implemented")
 
 
 if __name__ == "__main__":
-	from stpy.continuous_processes.kernelized_features import KernelizedFeatures
-	from stpy.kernels import KernelFunction
-	from stpy.embeddings.embedding import HermiteEmbedding
-	import stpy
-	import torch
-	import matplotlib.pyplot as plt
-	import numpy as np
-
-	n = 1024
-	N = 256
-	gamma = 0.09
-	s = 0.1
-	# benchmark = stpy.test_functions.benchmarks.GaussianProcessSample(d =1, gamma = gamma, sigma = s, n = n)
-	benchmark = stpy.test_functions.benchmarks.Simple1DFunction(d=1, sigma=s)
-
-	x = benchmark.initial_guess(N, adv_inv=True)
-	y = benchmark.eval(x)
-	xtest = benchmark.interval(1024)
-
-	# GP = GaussianProcess(gamma=gamma, s=s)
-	# GP.fit_gp(x, y)
-	# GP.visualize(xtest, show=False, size=5)
-	# plt.show()
-
-	m = 64
-	kernel = KernelFunction(gamma=gamma)
-	embedding = HermiteEmbedding(gamma=gamma, m=m)
-	RFF = KernelizedFeatures(embedding=embedding, s=s, m=m)
-	RFF.fit_gp(x, y)
-	RFF.visualize(xtest, fig=False, show=False, size=5, matheron_kernel=kernel)
-	plt.show()
+    from stpy.continuous_processes.kernelized_features import KernelizedFeatures
+    from stpy.kernels import KernelFunction
+    from stpy.embeddings.embedding import HermiteEmbedding
+    import stpy
+    import torch
+    import matplotlib.pyplot as plt
+    import numpy as np
+
+    n = 1024
+    N = 256
+    gamma = 0.09
+    s = 0.1
+    # benchmark = stpy.test_functions.benchmarks.GaussianProcessSample(d =1, gamma = gamma, sigma = s, n = n)
+    benchmark = stpy.test_functions.benchmarks.Simple1DFunction(d=1, sigma=s)
+
+    x = benchmark.initial_guess(N, adv_inv=True)
+    y = benchmark.eval(x)
+    xtest = benchmark.interval(1024)
+
+    # GP = GaussianProcess(gamma=gamma, s=s)
+    # GP.fit_gp(x, y)
+    # GP.visualize(xtest, show=False, size=5)
+    # plt.show()
+
+    m = 64
+    kernel = KernelFunction(gamma=gamma)
+    embedding = HermiteEmbedding(gamma=gamma, m=m)
+    RFF = KernelizedFeatures(embedding=embedding, s=s, m=m)
+    RFF.fit_gp(x, y)
+    RFF.visualize(xtest, fig=False, show=False, size=5, matheron_kernel=kernel)
+    plt.show()
diff --git a/stpy/feature_importance/feature_ranker.py b/stpy/feature_importance/feature_ranker.py
index 17ccbe9..42c131a 100644
--- a/stpy/feature_importance/feature_ranker.py
+++ b/stpy/feature_importance/feature_ranker.py
@@ -4,46 +4,44 @@
 from stpy.estimator import Estimator
 import copy
 
-class FeatureRanker():
-
-	def __init__(self,
-				 model: Estimator,
-				 mode: str  = 'explained variance'
-				 ):
-		self.model = model
-		self.mode = mode
-
-		if not hasattr(self.model, "kernel_object"):
-			print ("Invalid estimator structure to run feature importance analysis")
-
-	def importance(self):
-
-		if self.mode == 'explained variance':
-			return self.one_off_importance()
-		elif self.mode == 'cross_validation':
-			raise NotImplementedError("This is not implemented.")
-
-	def one_off_importance(self):
-		n,d = self.model.x.size()
-		x = self.model.x
-		y = self.model.y
-		# iterate over features and
-		importance = torch.zeros(size=(d,1)).double().view(-1)
-		res_total = torch.sum(self.model.residuals(x, y) ** 2)
-
-		for i in range(d):
-			# define new data
-			xnew = x.clone()
-			xnew[:,i] = 0.
-
-			# define new model
-			GP = copy.deepcopy(self.model)
-			GP.fit_gp(xnew,y)
-
-			# evaluate residuals
-			res = torch.sum(GP.residuals(xnew,y)**2)
-
-			# store
-			importance[i] = res_total/res
-			print(i + 1, "/", d,':', res_total/res)
-		return importance
\ No newline at end of file
+
+class FeatureRanker:
+
+    def __init__(self, model: Estimator, mode: str = "explained variance"):
+        self.model = model
+        self.mode = mode
+
+        if not hasattr(self.model, "kernel_object"):
+            print("Invalid estimator structure to run feature importance analysis")
+
+    def importance(self):
+
+        if self.mode == "explained variance":
+            return self.one_off_importance()
+        elif self.mode == "cross_validation":
+            raise NotImplementedError("This is not implemented.")
+
+    def one_off_importance(self):
+        n, d = self.model.x.size()
+        x = self.model.x
+        y = self.model.y
+        # iterate over features and
+        importance = torch.zeros(size=(d, 1)).double().view(-1)
+        res_total = torch.sum(self.model.residuals(x, y) ** 2)
+
+        for i in range(d):
+            # define new data
+            xnew = x.clone()
+            xnew[:, i] = 0.0
+
+            # define new model
+            GP = copy.deepcopy(self.model)
+            GP.fit_gp(xnew, y)
+
+            # evaluate residuals
+            res = torch.sum(GP.residuals(xnew, y) ** 2)
+
+            # store
+            importance[i] = res_total / res
+            print(i + 1, "/", d, ":", res_total / res)
+        return importance
diff --git a/stpy/generative_models/conditional_generative_model.py b/stpy/generative_models/conditional_generative_model.py
index 63b7e75..3d3f3a5 100644
--- a/stpy/generative_models/conditional_generative_model.py
+++ b/stpy/generative_models/conditional_generative_model.py
@@ -1,5 +1,7 @@
-class GenerativeModel():
+class GenerativeModel:
     pass
+
+
 class ConditionalGenerativeModel(GenerativeModel):
 
     x = np.random.randn(10)
@@ -9,6 +11,5 @@ class ConditionalGenerativeModel(GenerativeModel):
 
     # find the largest element
     np.max(x)
-    
 
-    pass
\ No newline at end of file
+    pass
diff --git a/stpy/generative_models/cvae.py b/stpy/generative_models/cvae.py
index d9f38a5..56918cf 100644
--- a/stpy/generative_models/cvae.py
+++ b/stpy/generative_models/cvae.py
@@ -7,7 +7,7 @@
 
 # cuda setup
 device = torch.device("cpu")
-kwargs = {'num_workers': 1, 'pin_memory': True} 
+kwargs = {"num_workers": 1, "pin_memory": True}
 
 # hyper params
 batch_size = 64
@@ -16,7 +16,6 @@
 epochs = 10
 
 
-
 def one_hot(labels, class_size):
     targets = torch.zeros(labels.size(0), class_size)
     for i, label in enumerate(labels):
@@ -25,13 +24,13 @@ def one_hot(labels, class_size):
 
 
 class CVAE(nn.Module):
-    def __init__(self, feature_size, latent_size, ouput_size, midsize = 400):
+    def __init__(self, feature_size, latent_size, ouput_size, midsize=400):
         super(CVAE, self).__init__()
         self.feature_size = feature_size
         self.class_size = ouput_size
 
         # encode
-        self.fc1  = nn.Linear(feature_size + ouput_size, midsize)
+        self.fc1 = nn.Linear(feature_size + ouput_size, midsize)
         self.fc21 = nn.Linear(midsize, latent_size)
         self.fc22 = nn.Linear(midsize, latent_size)
 
@@ -42,28 +41,28 @@ def __init__(self, feature_size, latent_size, ouput_size, midsize = 400):
         self.elu = nn.ELU()
         self.sigmoid = nn.Sigmoid()
 
-    def encode(self, x, y): # Q(z|x, c)
-        '''
+    def encode(self, x, y):  # Q(z|x, c)
+        """
         x: (bs, feature_size)
         y: (bs, class_size)
-        '''
-        inputs = torch.cat([x, y], 1) # (bs, feature_size+class_size)
+        """
+        inputs = torch.cat([x, y], 1)  # (bs, feature_size+class_size)
         h1 = self.elu(self.fc1(inputs))
         z_mu = self.fc21(h1)
         z_var = self.fc22(h1)
         return z_mu, z_var
 
     def reparameterize(self, mu, logvar):
-        std = torch.exp(0.5*logvar)
+        std = torch.exp(0.5 * logvar)
         eps = torch.randn_like(std)
-        return mu + eps*std
+        return mu + eps * std
 
-    def decode(self, z, y): # P(x|z, c)
-        '''
+    def decode(self, z, y):  # P(x|z, c)
+        """
         z: (bs, latent_size)
         c: (bs, class_size)
-        '''
-        inputs = torch.cat([z, y], 1) # (bs, latent_size+class_size)
+        """
+        inputs = torch.cat([z, y], 1)  # (bs, latent_size+class_size)
         h3 = self.elu(self.fc3(inputs))
         return self.sigmoid(self.fc4(h3))
 
@@ -72,13 +71,15 @@ def forward(self, x, y):
         z = self.reparameterize(mu, logvar)
         return self.decode(z, y), mu, logvar
 
+
 # create a CVAE model
 model = CVAE(1, 20, 1).to(device)
 optimizer = optim.Adam(model.parameters(), lr=1e-3)
 
+
 # Reconstruction + KL divergence losses summed over all elements and batch
 def loss_function(recon_x, x, mu, logvar):
-    BCE = F.binary_cross_entropy(recon_x, x, reduction='sum')
+    BCE = F.binary_cross_entropy(recon_x, x, reduction="sum")
     KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
     return BCE + KLD
 
@@ -96,13 +97,21 @@ def train(epoch):
         train_loss += loss.detach().cpu().numpy()
         optimizer.step()
         if batch_idx % 20 == 0:
-            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
-                epoch, batch_idx * len(data), len(train_loader.dataset),
-                100. * batch_idx / len(train_loader),
-                loss.item() / len(data)))
-
-    print('====> Epoch: {} Average loss: {:.4f}'.format(
-          epoch, train_loss / len(train_loader.dataset)))
+            print(
+                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
+                    epoch,
+                    batch_idx * len(data),
+                    len(train_loader.dataset),
+                    100.0 * batch_idx / len(train_loader),
+                    loss.item() / len(data),
+                )
+            )
+
+    print(
+        "====> Epoch: {} Average loss: {:.4f}".format(
+            epoch, train_loss / len(train_loader.dataset)
+        )
+    )
 
 
 def test(epoch):
@@ -113,26 +122,27 @@ def test(epoch):
             data, labels = data.to(device), labels.to(device)
             labels = one_hot(labels, 10)
             recon_batch, mu, logvar = model(data, labels)
-            test_loss += loss_function(recon_batch, data, mu, logvar).detach().cpu().numpy()
+            test_loss += (
+                loss_function(recon_batch, data, mu, logvar).detach().cpu().numpy()
+            )
             if i == 0:
                 n = min(data.size(0), 5)
-                comparison = torch.cat([data[:n],
-                                      recon_batch.view(-1, 1, 28, 28)[:n]])
-                save_image(comparison.cpu(),
-                         'reconstruction_' + str(epoch) + '.png', nrow=n)
+                comparison = torch.cat([data[:n], recon_batch.view(-1, 1, 28, 28)[:n]])
+                save_image(
+                    comparison.cpu(), "reconstruction_" + str(epoch) + ".png", nrow=n
+                )
 
     test_loss /= len(test_loader.dataset)
-    print('====> Test set loss: {:.4f}'.format(test_loss))
+    print("====> Test set loss: {:.4f}".format(test_loss))
 
 
 for epoch in range(1, epochs + 1):
-        train(epoch)
+    train(epoch)
 
-        test(epoch)
+    test(epoch)
 
-        with torch.no_grad():
-            c = torch.eye(10, 10).cuda()
-            sample = torch.randn(10, 20).to(device)
-            sample = model.decode(sample, c).cpu()
-            save_image(sample.view(10, 1, 28, 28),
-                       'sample_' + str(epoch) + '.png')
\ No newline at end of file
+    with torch.no_grad():
+        c = torch.eye(10, 10).cuda()
+        sample = torch.randn(10, 20).to(device)
+        sample = model.decode(sample, c).cpu()
+        save_image(sample.view(10, 1, 28, 28), "sample_" + str(epoch) + ".png")
diff --git a/stpy/generative_models/generative_sampler.py b/stpy/generative_models/generative_sampler.py
index 3ed89b1..919c5f3 100644
--- a/stpy/generative_models/generative_sampler.py
+++ b/stpy/generative_models/generative_sampler.py
@@ -1,6 +1,7 @@
 import torch
 
-class GenerativeSampler():
+
+class GenerativeSampler:
 
     def __init__(self):
-        pass
\ No newline at end of file
+        pass
diff --git a/stpy/helpers/ColorDB.py b/stpy/helpers/ColorDB.py
index d964fce..fd16997 100644
--- a/stpy/helpers/ColorDB.py
+++ b/stpy/helpers/ColorDB.py
@@ -26,140 +26,139 @@
 
 
 class BadColor(Exception):
-	pass
+    pass
 
 
 DEFAULT_DB = None
-SPACE = ' '
-COMMASPACE = ', '
+SPACE = " "
+COMMASPACE = ", "
 
 
 # generic class
 class ColorDB:
-	def __init__(self, fp):
-		lineno = 2
-		self.__name = fp.name
-		# Maintain several dictionaries for indexing into the color database.
-		# Note that while Tk supports RGB intensities of 4, 8, 12, or 16 bits,
-		# for now we only support 8 bit intensities.  At least on OpenWindows,
-		# all intensities in the /usr/openwin/lib/rgb.txt file are 8-bit
-		#
-		# key is (red, green, blue) tuple, value is (name, [aliases])
-		self.__byrgb = {}
-		# key is name, value is (red, green, blue)
-		self.__byname = {}
-		# all unique names (non-aliases).  built-on demand
-		self.__allnames = None
-		for line in fp:
-			# get this compiled regular expression from derived class
-			mo = self._re.match(line)
-			if not mo:
-				print('Error in', fp.name, ' line', lineno, file=sys.stderr)
-				lineno += 1
-				continue
-			# extract the red, green, blue, and name
-			red, green, blue = self._extractrgb(mo)
-			name = self._extractname(mo)
-			keyname = name.lower()
-			# BAW: for now the `name' is just the first named color with the
-			# rgb values we find.  Later, we might want to make the two word
-			# version the `name', or the CapitalizedVersion, etc.
-			key = (red, green, blue)
-			foundname, aliases = self.__byrgb.get(key, (name, []))
-			if foundname != name and foundname not in aliases:
-				aliases.append(name)
-			self.__byrgb[key] = (foundname, aliases)
-			# add to byname lookup
-			self.__byname[keyname] = key
-			lineno = lineno + 1
-
-	# override in derived classes
-	def _extractrgb(self, mo):
-		return [int(x) for x in mo.group('red', 'green', 'blue')]
-
-	def _extractname(self, mo):
-		return mo.group('name')
-
-	def filename(self):
-		return self.__name
-
-	def find_byrgb(self, rgbtuple):
-		"""Return name for rgbtuple"""
-		try:
-			return self.__byrgb[rgbtuple]
-		except KeyError:
-			raise BadColor(rgbtuple) from None
-
-	def find_byname(self, name):
-		"""Return (red, green, blue) for name"""
-		name = name.lower()
-		try:
-			return self.__byname[name]
-		except KeyError:
-			raise BadColor(name) from None
-
-	def nearest(self, red, green, blue):
-		"""Return the name of color nearest (red, green, blue)"""
-		# BAW: should we use Voronoi diagrams, Delaunay triangulation, or
-		# octree for speeding up the locating of nearest point?  Exhaustive
-		# search is inefficient, but seems fast enough.
-		nearest = -1
-		nearest_name = ''
-		for name, aliases in self.__byrgb.values():
-			r, g, b = self.__byname[name.lower()]
-			rdelta = red - r
-			gdelta = green - g
-			bdelta = blue - b
-			distance = rdelta * rdelta + gdelta * gdelta + bdelta * bdelta
-			if nearest == -1 or distance < nearest:
-				nearest = distance
-				nearest_name = name
-		return nearest_name
-
-	def unique_names(self):
-		# sorted
-		if not self.__allnames:
-			self.__allnames = []
-			for name, aliases in self.__byrgb.values():
-				self.__allnames.append(name)
-			self.__allnames.sort(key=str.lower)
-		return self.__allnames
-
-	def aliases_of(self, red, green, blue):
-		try:
-			name, aliases = self.__byrgb[(red, green, blue)]
-		except KeyError:
-			raise BadColor((red, green, blue)) from None
-		return [name] + aliases
+    def __init__(self, fp):
+        lineno = 2
+        self.__name = fp.name
+        # Maintain several dictionaries for indexing into the color database.
+        # Note that while Tk supports RGB intensities of 4, 8, 12, or 16 bits,
+        # for now we only support 8 bit intensities.  At least on OpenWindows,
+        # all intensities in the /usr/openwin/lib/rgb.txt file are 8-bit
+        #
+        # key is (red, green, blue) tuple, value is (name, [aliases])
+        self.__byrgb = {}
+        # key is name, value is (red, green, blue)
+        self.__byname = {}
+        # all unique names (non-aliases).  built-on demand
+        self.__allnames = None
+        for line in fp:
+            # get this compiled regular expression from derived class
+            mo = self._re.match(line)
+            if not mo:
+                print("Error in", fp.name, " line", lineno, file=sys.stderr)
+                lineno += 1
+                continue
+            # extract the red, green, blue, and name
+            red, green, blue = self._extractrgb(mo)
+            name = self._extractname(mo)
+            keyname = name.lower()
+            # BAW: for now the `name' is just the first named color with the
+            # rgb values we find.  Later, we might want to make the two word
+            # version the `name', or the CapitalizedVersion, etc.
+            key = (red, green, blue)
+            foundname, aliases = self.__byrgb.get(key, (name, []))
+            if foundname != name and foundname not in aliases:
+                aliases.append(name)
+            self.__byrgb[key] = (foundname, aliases)
+            # add to byname lookup
+            self.__byname[keyname] = key
+            lineno = lineno + 1
+
+    # override in derived classes
+    def _extractrgb(self, mo):
+        return [int(x) for x in mo.group("red", "green", "blue")]
+
+    def _extractname(self, mo):
+        return mo.group("name")
+
+    def filename(self):
+        return self.__name
+
+    def find_byrgb(self, rgbtuple):
+        """Return name for rgbtuple"""
+        try:
+            return self.__byrgb[rgbtuple]
+        except KeyError:
+            raise BadColor(rgbtuple) from None
+
+    def find_byname(self, name):
+        """Return (red, green, blue) for name"""
+        name = name.lower()
+        try:
+            return self.__byname[name]
+        except KeyError:
+            raise BadColor(name) from None
+
+    def nearest(self, red, green, blue):
+        """Return the name of color nearest (red, green, blue)"""
+        # BAW: should we use Voronoi diagrams, Delaunay triangulation, or
+        # octree for speeding up the locating of nearest point?  Exhaustive
+        # search is inefficient, but seems fast enough.
+        nearest = -1
+        nearest_name = ""
+        for name, aliases in self.__byrgb.values():
+            r, g, b = self.__byname[name.lower()]
+            rdelta = red - r
+            gdelta = green - g
+            bdelta = blue - b
+            distance = rdelta * rdelta + gdelta * gdelta + bdelta * bdelta
+            if nearest == -1 or distance < nearest:
+                nearest = distance
+                nearest_name = name
+        return nearest_name
+
+    def unique_names(self):
+        # sorted
+        if not self.__allnames:
+            self.__allnames = []
+            for name, aliases in self.__byrgb.values():
+                self.__allnames.append(name)
+            self.__allnames.sort(key=str.lower)
+        return self.__allnames
+
+    def aliases_of(self, red, green, blue):
+        try:
+            name, aliases = self.__byrgb[(red, green, blue)]
+        except KeyError:
+            raise BadColor((red, green, blue)) from None
+        return [name] + aliases
 
 
 class RGBColorDB(ColorDB):
-	_re = re.compile(
-		r'\s*(?P<red>\d+)\s+(?P<green>\d+)\s+(?P<blue>\d+)\s+(?P<name>.*)')
+    _re = re.compile(r"\s*(?P<red>\d+)\s+(?P<green>\d+)\s+(?P<blue>\d+)\s+(?P<name>.*)")
 
 
 class HTML40DB(ColorDB):
-	_re = re.compile(r'(?P<name>\S+)\s+(?P<hexrgb>#[0-9a-fA-F]{6})')
+    _re = re.compile(r"(?P<name>\S+)\s+(?P<hexrgb>#[0-9a-fA-F]{6})")
 
-	def _extractrgb(self, mo):
-		return rrggbb_to_triplet(mo.group('hexrgb'))
+    def _extractrgb(self, mo):
+        return rrggbb_to_triplet(mo.group("hexrgb"))
 
 
 class LightlinkDB(HTML40DB):
-	_re = re.compile(r'(?P<name>(.+))\s+(?P<hexrgb>#[0-9a-fA-F]{6})')
+    _re = re.compile(r"(?P<name>(.+))\s+(?P<hexrgb>#[0-9a-fA-F]{6})")
 
-	def _extractname(self, mo):
-		return mo.group('name').strip()
+    def _extractname(self, mo):
+        return mo.group("name").strip()
 
 
 class WebsafeDB(ColorDB):
-	_re = re.compile('(?P<hexrgb>#[0-9a-fA-F]{6})')
+    _re = re.compile("(?P<hexrgb>#[0-9a-fA-F]{6})")
 
-	def _extractrgb(self, mo):
-		return rrggbb_to_triplet(mo.group('hexrgb'))
+    def _extractrgb(self, mo):
+        return rrggbb_to_triplet(mo.group("hexrgb"))
 
-	def _extractname(self, mo):
-		return mo.group('hexrgb').upper()
+    def _extractname(self, mo):
+        return mo.group("hexrgb").upper()
 
 
 # format is a tuple (RE, SCANLINES, CLASS) where RE is a compiled regular
@@ -167,112 +166,111 @@ def _extractname(self, mo):
 # the class to instantiate if a match is found
 
 FILETYPES = [
-	(re.compile('Xorg'), RGBColorDB),
-	(re.compile('XConsortium'), RGBColorDB),
-	(re.compile('HTML'), HTML40DB),
-	(re.compile('lightlink'), LightlinkDB),
-	(re.compile('Websafe'), WebsafeDB),
+    (re.compile("Xorg"), RGBColorDB),
+    (re.compile("XConsortium"), RGBColorDB),
+    (re.compile("HTML"), HTML40DB),
+    (re.compile("lightlink"), LightlinkDB),
+    (re.compile("Websafe"), WebsafeDB),
 ]
 
 
 def get_colordb(file, filetype=None):
-	colordb = None
-	fp = open(file)
-	try:
-		line = fp.readline()
-		if not line:
-			return None
-		# try to determine the type of RGB file it is
-		if filetype is None:
-			filetypes = FILETYPES
-		else:
-			filetypes = [filetype]
-		for typere, class_ in filetypes:
-			mo = typere.search(line)
-			if mo:
-				break
-		else:
-			# no matching type
-			return None
-		# we know the type and the class to grok the type, so suck it in
-		colordb = class_(fp)
-	finally:
-		fp.close()
-	# save a global copy
-	global DEFAULT_DB
-	DEFAULT_DB = colordb
-	return colordb
+    colordb = None
+    fp = open(file)
+    try:
+        line = fp.readline()
+        if not line:
+            return None
+        # try to determine the type of RGB file it is
+        if filetype is None:
+            filetypes = FILETYPES
+        else:
+            filetypes = [filetype]
+        for typere, class_ in filetypes:
+            mo = typere.search(line)
+            if mo:
+                break
+        else:
+            # no matching type
+            return None
+        # we know the type and the class to grok the type, so suck it in
+        colordb = class_(fp)
+    finally:
+        fp.close()
+    # save a global copy
+    global DEFAULT_DB
+    DEFAULT_DB = colordb
+    return colordb
 
 
 _namedict = {}
 
 
 def rrggbb_to_triplet(color):
-	"""Converts a #rrggbb color to the tuple (red, green, blue)."""
-	rgbtuple = _namedict.get(color)
-	if rgbtuple is None:
-		if color[0] != '#':
-			raise BadColor(color)
-		red = color[1:3]
-		green = color[3:5]
-		blue = color[5:7]
-		rgbtuple = int(red, 16), int(green, 16), int(blue, 16)
-		_namedict[color] = rgbtuple
-	return rgbtuple
+    """Converts a #rrggbb color to the tuple (red, green, blue)."""
+    rgbtuple = _namedict.get(color)
+    if rgbtuple is None:
+        if color[0] != "#":
+            raise BadColor(color)
+        red = color[1:3]
+        green = color[3:5]
+        blue = color[5:7]
+        rgbtuple = int(red, 16), int(green, 16), int(blue, 16)
+        _namedict[color] = rgbtuple
+    return rgbtuple
 
 
 _tripdict = {}
 
 
 def triplet_to_rrggbb(rgbtuple):
-	"""Converts a (red, green, blue) tuple to #rrggbb."""
-	global _tripdict
-	hexname = _tripdict.get(rgbtuple)
-	if hexname is None:
-		hexname = '#%02x%02x%02x' % rgbtuple
-		_tripdict[rgbtuple] = hexname
-	return hexname
+    """Converts a (red, green, blue) tuple to #rrggbb."""
+    global _tripdict
+    hexname = _tripdict.get(rgbtuple)
+    if hexname is None:
+        hexname = "#%02x%02x%02x" % rgbtuple
+        _tripdict[rgbtuple] = hexname
+    return hexname
 
 
 def triplet_to_fractional_rgb(rgbtuple):
-	return [x / 256 for x in rgbtuple]
+    return [x / 256 for x in rgbtuple]
 
 
 def triplet_to_brightness(rgbtuple):
-	# return the brightness (grey level) along the scale 0.0==black to
-	# 1.0==white
-	r = 0.299
-	g = 0.587
-	b = 0.114
-	return r * rgbtuple[0] + g * rgbtuple[1] + b * rgbtuple[2]
-
-
-if __name__ == '__main__':
-	colordb = get_colordb('colors.txt')
-	if not colordb:
-		print('No parseable color database found')
-		sys.exit(1)
-	# on my system, this color matches exactly
-	target = 'navy'
-	red, green, blue = rgbtuple = colordb.find_byname(target)
-	print(target, ':', red, green, blue, triplet_to_rrggbb(rgbtuple))
-	print ("-----")
-	print (rgbtuple)
-	name, aliases = colordb.find_byrgb(rgbtuple)
-	print('name:', name, 'aliases:', COMMASPACE.join(aliases))
-	r, g, b = (1, 1, 128)  # nearest to navy
-	r, g, b = (145, 238, 144)  # nearest to lightgreen
-	r, g, b = (255, 251, 250)  # snow
-	print('finding nearest to', target, '...')
-	import time
-
-	t0 = time.time()
-	nearest = colordb.nearest(r, g, b)
-	t1 = time.time()
-	print('found nearest color', nearest, 'in', t1 - t0, 'seconds')
-	# dump the database
-	for n in colordb.unique_names():
-		r, g, b = colordb.find_byname(n)
-		aliases = colordb.aliases_of(r, g, b)
-		print('%20s: (%3d/%3d/%3d) == %s' % (n, r, g, b,
-											 SPACE.join(aliases[1:])))
\ No newline at end of file
+    # return the brightness (grey level) along the scale 0.0==black to
+    # 1.0==white
+    r = 0.299
+    g = 0.587
+    b = 0.114
+    return r * rgbtuple[0] + g * rgbtuple[1] + b * rgbtuple[2]
+
+
+if __name__ == "__main__":
+    colordb = get_colordb("colors.txt")
+    if not colordb:
+        print("No parseable color database found")
+        sys.exit(1)
+    # on my system, this color matches exactly
+    target = "navy"
+    red, green, blue = rgbtuple = colordb.find_byname(target)
+    print(target, ":", red, green, blue, triplet_to_rrggbb(rgbtuple))
+    print("-----")
+    print(rgbtuple)
+    name, aliases = colordb.find_byrgb(rgbtuple)
+    print("name:", name, "aliases:", COMMASPACE.join(aliases))
+    r, g, b = (1, 1, 128)  # nearest to navy
+    r, g, b = (145, 238, 144)  # nearest to lightgreen
+    r, g, b = (255, 251, 250)  # snow
+    print("finding nearest to", target, "...")
+    import time
+
+    t0 = time.time()
+    nearest = colordb.nearest(r, g, b)
+    t1 = time.time()
+    print("found nearest color", nearest, "in", t1 - t0, "seconds")
+    # dump the database
+    for n in colordb.unique_names():
+        r, g, b = colordb.find_byname(n)
+        aliases = colordb.aliases_of(r, g, b)
+        print("%20s: (%3d/%3d/%3d) == %s" % (n, r, g, b, SPACE.join(aliases[1:])))
diff --git a/stpy/helpers/abitrary_sampling.py b/stpy/helpers/abitrary_sampling.py
index 428c03c..e14887d 100644
--- a/stpy/helpers/abitrary_sampling.py
+++ b/stpy/helpers/abitrary_sampling.py
@@ -6,207 +6,217 @@
 
 
 def sample_uniform_sphere(n, d, radius=1):
-	X = np.random.randn(n, d)
-	X_n = np.random.randn(n, d)
-	for i in range(n):
-		X_n[i, :] = (X[i, :] / np.linalg.norm(X[i, :])) * radius
-	return X_n
+    X = np.random.randn(n, d)
+    X_n = np.random.randn(n, d)
+    for i in range(n):
+        X_n[i, :] = (X[i, :] / np.linalg.norm(X[i, :])) * radius
+    return X_n
 
 
 def rejection_sampling(pdf, size=(1, 1)):
-	"""
-	Implements rejection sampling
-
-	:param pdf:
-	:param size:
-	:return:
-	"""
-	n = size[0]
-	d = size[1]
-	output = np.zeros(shape=size)
-	i = 0
-	while i < n:
-		Z = np.random.normal(size=(1, d))
-		u = np.random.uniform()
-		if pdf(Z) < u:
-			output[i, :] = Z
-			i = i + 1
-
-	return output
+    """
+    Implements rejection sampling
+
+    :param pdf:
+    :param size:
+    :return:
+    """
+    n = size[0]
+    d = size[1]
+    output = np.zeros(shape=size)
+    i = 0
+    while i < n:
+        Z = np.random.normal(size=(1, d))
+        u = np.random.uniform()
+        if pdf(Z) < u:
+            output[i, :] = Z
+            i = i + 1
+
+    return output
 
 
 def next_prime():
-	def is_prime(num):
-		"Checks if num is a prime value"
-		for i in range(2, int(num ** 0.5) + 1):
-			if (num % i) == 0: return False
-		return True
+    def is_prime(num):
+        "Checks if num is a prime value"
+        for i in range(2, int(num**0.5) + 1):
+            if (num % i) == 0:
+                return False
+        return True
 
-	prime = 3
-	while (1):
-		if is_prime(prime):
-			yield prime
-		prime += 2
+    prime = 3
+    while 1:
+        if is_prime(prime):
+            yield prime
+        prime += 2
 
 
 def vdc(n, base=2):
-	vdc, denom = 0, 1
-	while n:
-		denom *= base
-		n, remainder = divmod(n, base)
-		vdc += remainder / float(denom)
-	return vdc
+    vdc, denom = 0, 1
+    while n:
+        denom *= base
+        n, remainder = divmod(n, base)
+        vdc += remainder / float(denom)
+    return vdc
 
 
 def halton_sequence(size, dim):
-	seq = []
-	primeGen = next_prime()
-	next(primeGen)
-	for d in range(dim):
-		base = next(primeGen)
-		seq.append([vdc(i, base) for i in range(size)])
-	return seq
+    seq = []
+    primeGen = next_prime()
+    next(primeGen)
+    for d in range(dim):
+        base = next(primeGen)
+        seq.append([vdc(i, base) for i in range(size)])
+    return seq
 
 
 def sample_qmc_halton_normal(size=(1, 1)):
-	Z = np.array(halton_sequence(size[0], size[1])).T
-	Z[0, :] += 10e-5
-	from scipy.stats import norm
-	Z = norm.ppf(Z)
-	return Z
+    Z = np.array(halton_sequence(size[0], size[1])).T
+    Z[0, :] += 10e-5
+    from scipy.stats import norm
+
+    Z = norm.ppf(Z)
+    return Z
 
 
 def sample_qmc_halton(sampler, size=(1, 1)):
-	Z = np.array(halton_sequence(size[0], size[1]), dtype=np.float64).T
-	Z[0, :] += 10e-5
-	Z = sampler(Z)
-	return Z
+    Z = np.array(halton_sequence(size[0], size[1]), dtype=np.float64).T
+    Z[0, :] += 10e-5
+    Z = sampler(Z)
+    return Z
 
 
 def sample_bounded(bounds):
-	d = len(bounds)
-	x = np.zeros(shape=(d))
-	for i in range(d):
-		x[i] = np.uniform(bounds[i][0], bounds[i][1])
-	return x
-
-
-def randomly_split_set_without_duplicates_balanced(x: torch.Tensor,
-											y: torch.Tensor,
-											max_bins: int = 2,
-										  	alpha: float = 0.2,
-										  	size: Union[int, float, None] = None):
-	# sort tensor
-	N = x.size()[0]
-
-	out, indices = torch.unique(x, dim=0, return_inverse=True)
-	n, d = out.size()
-	if size is None:
-		ntest = int(alpha * n)
-	else:
-		ntest = size
-	y_out = y[np.unique(indices)]
-
-	# bin the data
-	samples_per_bin, bins, = np.histogram(y_out, bins=max_bins)  # Doane's method worked best for me
-	classes = np.digitize(y_out, bins)
-	classes[classes == max_bins+1] = max_bins
-
-	# randomly split
-	s = StratifiedShuffleSplit(n_splits=1, test_size=ntest)
-
-	for _, n_test_indices in s.split(out,classes):
-		mask_test = torch.zeros(N).bool()
-		for index in n_test_indices:
-			mask_test = torch.logical_or(mask_test, indices == index)
-
-		return mask_test, ~mask_test
-
-
-def randomly_split_set_without_duplicates(x: torch.Tensor,
-										  alpha: float = 0.2,
-										  size: Union[int, float, None] = None):
-	"""
-	Randomly splits the dataset and returns the mask of the
-	:param x:
-	:param alpha:
-	:return:
-	"""
-
-	# sort tensor
-	N = x.size()[0]
-
-	out, indices = torch.unique(x, dim=0, return_inverse=True)
-
-	n, d = out.size()
-	if size is None:
-		ntest = int(alpha * n)
-	else:
-		ntest = size
-
-	# randomly split
-	n_test_indices = np.random.choice(np.arange(0, n, 1), size=ntest, replace=False)
-	mask_test = torch.zeros(N).bool()
-
-	for index in n_test_indices:
-		mask_test = torch.logical_or(mask_test, indices == index)
-
-	return mask_test, ~mask_test
-
-
-def randomly_split_set_without_duplicates_general(x: torch.Tensor,
-										  sizes: List = [None]):
-	"""
-	Randomly splits the dataset and returns the mask of the
-	:param x:
-	:param alpha:
-	:return:
-	"""
-
-	# sort tensor
-	N = x.size()[0]
-
-	out, indices = torch.unique(x, dim=0, return_inverse=True)
-	# is number of unique elements
-	n, d = out.size()
-
-	# randomly permute indices
-	inde = torch.from_numpy(np.random.permutation(np.arange(0, n, 1)))
-	cumsum_indices = torch.cumsum(torch.Tensor(sizes),0).int()
-	cumsum_indices = torch.cat((torch.Tensor([0]),cumsum_indices)).int()
-
-	masks = [torch.zeros(N).bool() for _ in sizes]
-	for j in range(len(sizes)):
-		n_test_indices = inde[cumsum_indices[j]:min(n,cumsum_indices[j+1])]
-		for index in n_test_indices:
-			masks[j] = torch.logical_or(masks[j], indices == index)
-
-	return masks
+    d = len(bounds)  # one (low, high) pair per coordinate
+    x = np.zeros(shape=(d))
+    for i in range(d):
+        x[i] = np.random.uniform(bounds[i][0], bounds[i][1])  # uniform lives in np.random
+    return x
+
+
+def randomly_split_set_without_duplicates_balanced(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    max_bins: int = 2,
+    alpha: float = 0.2,
+    size: Union[int, float, None] = None,
+):
+    # number of rows of x, duplicates included
+    N = x.size()[0]
+
+    out, indices = torch.unique(x, dim=0, return_inverse=True)
+    n, d = out.size()
+    if size is None:
+        ntest = int(alpha * n)
+    else:
+        ntest = size
+    y_out = y[np.unique(indices)]
+
+    # bin the data
+    (
+        samples_per_bin,
+        bins,
+    ) = np.histogram(
+        y_out, bins=max_bins
+    )  # Doane's method worked best for me
+    classes = np.digitize(y_out, bins)
+    classes[classes == max_bins + 1] = max_bins
+
+    # randomly split
+    s = StratifiedShuffleSplit(n_splits=1, test_size=ntest)
+
+    for _, n_test_indices in s.split(out, classes):
+        mask_test = torch.zeros(N).bool()
+        for index in n_test_indices:
+            mask_test = torch.logical_or(mask_test, indices == index)
+
+        return mask_test, ~mask_test
+
+
+def randomly_split_set_without_duplicates(
+    x: torch.Tensor, alpha: float = 0.2, size: Union[int, float, None] = None
+):
+    """
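+    Randomly splits the unique rows of x and returns boolean masks (test, train) over all rows
+    :param x: 2-D tensor; duplicate rows are grouped so they land in the same split
+    :param alpha: fraction of unique rows placed in the test split when size is None
+    :return: tuple (mask_test, ~mask_test)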
+    """
+
+    # number of rows of x, duplicates included
+    N = x.size()[0]
+
+    out, indices = torch.unique(x, dim=0, return_inverse=True)
+
+    n, d = out.size()
+    if size is None:
+        ntest = int(alpha * n)
+    else:
+        ntest = size
+
+    # randomly split
+    n_test_indices = np.random.choice(np.arange(0, n, 1), size=ntest, replace=False)
+    mask_test = torch.zeros(N).bool()
+
+    for index in n_test_indices:
+        mask_test = torch.logical_or(mask_test, indices == index)
+
+    return mask_test, ~mask_test
+
+
+def randomly_split_set_without_duplicates_general(
+    x: torch.Tensor, sizes: List = [None]
+):
+    """
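+    Randomly splits the unique rows of x into consecutive groups and returns one boolean mask per group
+    :param x: 2-D tensor; duplicate rows are grouped so they land in the same split
+    :param sizes: number of unique rows assigned to each group, in order
+    :return: list of boolean masks over the rows of x, one per entry of sizes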
+    """
+
+    # number of rows of x, duplicates included
+    N = x.size()[0]
+
+    out, indices = torch.unique(x, dim=0, return_inverse=True)
+    # n is the number of unique rows, d the row length
+    n, d = out.size()
+
+    # randomly permute indices
+    inde = torch.from_numpy(np.random.permutation(np.arange(0, n, 1)))
+    cumsum_indices = torch.cumsum(torch.tensor(sizes), 0).int()
+    cumsum_indices = torch.cat((torch.tensor([0]), cumsum_indices)).int()
+
+    masks = [torch.zeros(N).bool() for _ in sizes]
+    for j in range(len(sizes)):
+        n_test_indices = inde[cumsum_indices[j] : min(n, cumsum_indices[j + 1])]
+        for index in n_test_indices:
+            masks[j] = torch.logical_or(masks[j], indices == index)
+
+    return masks
 
 
 #
 
 
 if __name__ == "__main__":
-	# x = torch.Tensor([[2, 1, 1], [2, 1, 1], [2, 2, 2],
-	# 				  [3, 2, 2], [2, 1, 1], [4, 2, 1],
-	# 				  [4, 2, 4], [4,4,4], [1,2,2]]).double()
-	#
-	x = torch.randint(0, 10, size = (2000,3))
-	y = torch.randn(size = (x.size()[0],1))*10
-
-	# masks = randomly_split_set_without_duplicates_general(x, sizes=[1,2,3])
-	#
-	# for mask in masks:
-	# 	print (mask)
-
-	masks = randomly_split_set_without_duplicates_balanced(x,y, size = 100, max_bins = 10)
-	masks2 = randomly_split_set_without_duplicates(x, size = 100)
-	import matplotlib.pyplot as plt
-	labels = ['test', 'train']
-	for index,(mask,mask2) in enumerate(zip(masks,masks2)):
-		plt.hist(y[mask].T, alpha = 0.2, density= True, label = labels[index])
-		plt.hist(y[mask2].T, alpha=0.2, density=True, label=labels[index]+"_random")
-	plt.legend()
-	plt.show()
-
+    # x = torch.tensor([[2, 1, 1], [2, 1, 1], [2, 2, 2],
+    # 				  [3, 2, 2], [2, 1, 1], [4, 2, 1],
+    # 				  [4, 2, 4], [4,4,4], [1,2,2]]).double()
+    #
+    x = torch.randint(0, 10, size=(2000, 3))
+    y = torch.randn(size=(x.size()[0], 1)) * 10
+
+    # masks = randomly_split_set_without_duplicates_general(x, sizes=[1,2,3])
+    #
+    # for mask in masks:
+    # 	print (mask)
+
+    masks = randomly_split_set_without_duplicates_balanced(x, y, size=100, max_bins=10)
+    masks2 = randomly_split_set_without_duplicates(x, size=100)
+    import matplotlib.pyplot as plt
+
+    labels = ["test", "train"]
+    for index, (mask, mask2) in enumerate(zip(masks, masks2)):
+        plt.hist(y[mask].T, alpha=0.2, density=True, label=labels[index])
+        plt.hist(y[mask2].T, alpha=0.2, density=True, label=labels[index] + "_random")
+    plt.legend()
+    plt.show()
diff --git a/stpy/helpers/coreset_helper.py b/stpy/helpers/coreset_helper.py
index 84aaccf..85eae1f 100644
--- a/stpy/helpers/coreset_helper.py
+++ b/stpy/helpers/coreset_helper.py
@@ -3,26 +3,28 @@
 
 
 def epsilon_net(borel_set, k):
-	pass
+    pass
 
 
 def coreset(borel_set, k):
-	pass
+    pass
 
 
 def coreset_leverage_score_greedy(borel_set, kernel, n, tol=10e-4):
-	xtest = borel_set.return_discretization(n)
-	k = kernel.kernel
-	N = xtest.size()[0]
-	score = 1
-	K = k(xtest, xtest)
-	x = xtest[torch.randint(0, N, (1,)), :].view(1, -1)
-	c = 1
-	while score > tol:
-		I = torch.eye(c).double()
-		scores = np.diag(K - k(xtest, x).T @ torch.pinverse(k(x, x) + tol * I) @ k(x, xtest).T)
-		index = np.argmax(scores)
-		x = torch.cat((x, xtest[index, :].view(1, -1)))
-		score = scores[index]
-		c = c + 1
-	return x
+    xtest = borel_set.return_discretization(n)
+    k = kernel.kernel
+    N = xtest.size()[0]
+    score = 1
+    K = k(xtest, xtest)
+    x = xtest[torch.randint(0, N, (1,)), :].view(1, -1)
+    c = 1
+    while score > tol:
+        I = torch.eye(c).double()
+        scores = np.diag(
+            K - k(xtest, x).T @ torch.pinverse(k(x, x) + tol * I) @ k(x, xtest).T
+        )
+        index = np.argmax(scores)
+        x = torch.cat((x, xtest[index, :].view(1, -1)))
+        score = scores[index]
+        c = c + 1
+    return x
diff --git a/stpy/helpers/ellipsoid_algorithms.py b/stpy/helpers/ellipsoid_algorithms.py
index 112d077..2514883 100644
--- a/stpy/helpers/ellipsoid_algorithms.py
+++ b/stpy/helpers/ellipsoid_algorithms.py
@@ -7,424 +7,487 @@
 
 
 def maximum_volume_ellipsoid_l1_polytope_ellipse(ellipse, l1_polytope, verbose=False):
-	"""
-	ellipse is
-	xA_ix + 2b_i x + c_i \leq 0
-
-	\sum q_i | x^\top a_i - b_i |
-
-	:param ellipse:
-	:param polytope:
-	:param verbose:
-	:return:
-	"""
-
-	p = ellipse[0].shape[0]
-
-	B = cp.Variable((p, p), PSD=True)
-	d = cp.Variable((p, 1))
-	lam = cp.Variable((1, 1))
-	obj_max = cp.Maximize(cp.log_det(B))
-
-	constraints = []
-	A, b, c = ellipse
-
-	eye = np.eye(p)
-	zeros = np.zeros(shape=(1, p))
-	invA = np.linalg.inv(A)
-
-	constraints.append(
-		cp.bmat([
-			[-lam - c + b.T @ invA @ b, zeros, d.T + b.T @ invA.T],
-			[zeros.T, lam * eye, B],
-			[d + invA @ b, B, invA]]) >> 0)
-
-	q, X, y, eps = l1_polytope
-	m = X.shape[0]
-	t = cp.Variable((m, 1))
-	constraints.append(q.T @ t <= eps)
-	constraints.append(t >= 0.)
-	for i in range(m):
-		ai = X[i, :]
-		bi = y[i]
-		constraints.append(cp.norm2(B @ ai) + ai.T @ d - bi <= t[i])
-		constraints.append(cp.norm2(B @ ai) - ai.T @ d + bi <= t[i])
-
-	prob = cp.Problem(obj_max, constraints)
-	prob.solve(solver=cp.MOSEK, verbose=verbose)
-
-	print(prob.status)
-	if B.value is not None:
-		return np.linalg.inv(B.value).T @ np.linalg.inv(B.value), d.value
-	else:
-		return None, None
-
-
-def maximum_volume_ellipsoid_relu_polytope_ellipse(ellipse, relu_polytope, verbose=False):
-	"""
-	ellipse is
-	xA_ix + 2b_i x + c_i \leq 0
-
-
-	(eta_i + x^x_i) \leq eps_i
-
-	:param ellipse:
-	:param polytope:
-	:param verbose:
-	:return:
-	"""
-
-	p = ellipse[0].shape[0]
-
-	B = cp.Variable((p, p), PSD=True)
-	d = cp.Variable((p, 1))
-	lam = cp.Variable((1, 1))
-	obj_max = cp.Maximize(cp.log_det(B))
-
-	constraints = []
-	A, b, c = ellipse
-
-	eye = np.eye(p)
-	zeros = np.zeros(shape=(1, p))
-	invA = np.linalg.inv(A)
-
-	constraints.append(
-		cp.bmat([
-			[-lam - c + b.T @ invA @ b, zeros, d.T + b.T @ invA.T],
-			[zeros.T, lam * eye, B],
-			[d + invA @ b, B, invA]]) >> 0)
-
-	q, X, y, eps = relu_polytope
-	m = X.shape[0]
-	t = cp.Variable((m, 1))
-	constraints.append(q.T @ t <= eps)
-	constraints.append(t >= 0.)
-	for i in range(m):
-		ai = X[i, :]
-		bi = y[i]
-		constraints.append(cp.pos(cp.norm2(B @ ai) + ai.T @ d - bi) <= t[i])
-
-	prob = cp.Problem(obj_max, constraints)
-	prob.solve(solver=cp.MOSEK, verbose=verbose)
-
-	print(prob.status)
-	if B.value is not None:
-		return np.linalg.inv(B.value).T @ np.linalg.inv(B.value), d.value
-	else:
-		return None, None
-
-
-def maximum_volume_ellipsoid_intersection_ellipsoids(ellipses, planes=None, verbose=False):
-	"""
-	Each ellipse is
-	xA_ix + 2b_i x + c_i \leq 0
-
-	:param elipses: list of [A,b,c]
-
-	:return:elipse  ||x-v||_B^2 < 1
-	"""
-
-	p = ellipses[0][0].shape[0]
-	m = len(ellipses)
-
-	B = cp.Variable((p, p), PSD=True)
-	d = cp.Variable((p, 1))
-	lam = cp.Variable((m, 1))
-
-	obj_max = cp.Maximize(cp.log_det(B))
-
-	constraints = []
-	for index, ellipse in enumerate(ellipses):
-		A, b, c = ellipse
-
-		eye = np.eye(p)
-		zeros = np.zeros(shape=(1, p))
-		invA = np.linalg.inv(A)
-
-		constraints.append(
-			cp.bmat([
-				[-lam[index, 0] - c + b.T @ invA @ b, zeros, d.T + b.T @ invA.T],
-				[zeros.T, lam[index, 0] * eye, B],
-				[d + invA @ b, B, invA]]) >> 0)
-
-	if planes is not None:
-		for index, plane in enumerate(planes):
-			a, b = plane
-			constraints.append(cp.norm2(B @ a) + a.T @ d <= b)
-
-	prob = cp.Problem(obj_max, constraints)
-	prob.solve(solver=cp.MOSEK, verbose=verbose)
-
-	print(prob.status)
-	if B.value is not None:
-		return np.linalg.inv(B.value).T @ np.linalg.inv(B.value), d.value
-	else:
-		return None, None
+    r"""
+    ellipse is
+    xA_ix + 2b_i x + c_i \leq 0
+
+    \sum q_i | x^\top a_i - b_i |
+
+    :param ellipse:
+    :param polytope:
+    :param verbose:
+    :return:
+    """
+
+    p = ellipse[0].shape[0]
+
+    B = cp.Variable((p, p), PSD=True)
+    d = cp.Variable((p, 1))
+    lam = cp.Variable((1, 1))
+    obj_max = cp.Maximize(cp.log_det(B))
+
+    constraints = []
+    A, b, c = ellipse
+
+    eye = np.eye(p)
+    zeros = np.zeros(shape=(1, p))
+    invA = np.linalg.inv(A)
+
+    constraints.append(
+        cp.bmat(
+            [
+                [-lam - c + b.T @ invA @ b, zeros, d.T + b.T @ invA.T],
+                [zeros.T, lam * eye, B],
+                [d + invA @ b, B, invA],
+            ]
+        )
+        >> 0
+    )
+
+    q, X, y, eps = l1_polytope
+    m = X.shape[0]
+    t = cp.Variable((m, 1))
+    constraints.append(q.T @ t <= eps)
+    constraints.append(t >= 0.0)
+    for i in range(m):
+        ai = X[i, :]
+        bi = y[i]
+        constraints.append(cp.norm2(B @ ai) + ai.T @ d - bi <= t[i])
+        constraints.append(cp.norm2(B @ ai) - ai.T @ d + bi <= t[i])
+
+    prob = cp.Problem(obj_max, constraints)
+    prob.solve(solver=cp.MOSEK, verbose=verbose)
+
+    print(prob.status)
+    if B.value is not None:
+        return np.linalg.inv(B.value).T @ np.linalg.inv(B.value), d.value
+    else:
+        return None, None
+
+
+def maximum_volume_ellipsoid_relu_polytope_ellipse(
+    ellipse, relu_polytope, verbose=False
+):
+    r"""
+    ellipse is
+    xA_ix + 2b_i x + c_i \leq 0
+
+
+    (eta_i + x^x_i) \leq eps_i
+
+    :param ellipse: triple (A, b, c) of the quadratic form above
+    :param relu_polytope: tuple (q, X, y, eps)
+    :param verbose: forwarded to the solver
+    :return: (inv(B).T @ inv(B), d), or (None, None) if the solver returns no value
+    """
+
+    p = ellipse[0].shape[0]
+
+    B = cp.Variable((p, p), PSD=True)
+    d = cp.Variable((p, 1))
+    lam = cp.Variable((1, 1))
+    obj_max = cp.Maximize(cp.log_det(B))
+
+    constraints = []
+    A, b, c = ellipse
+
+    eye = np.eye(p)
+    zeros = np.zeros(shape=(1, p))
+    invA = np.linalg.inv(A)
+
+    constraints.append(
+        cp.bmat(
+            [
+                [-lam - c + b.T @ invA @ b, zeros, d.T + b.T @ invA.T],
+                [zeros.T, lam * eye, B],
+                [d + invA @ b, B, invA],
+            ]
+        )
+        >> 0
+    )
+
+    q, X, y, eps = relu_polytope
+    m = X.shape[0]
+    t = cp.Variable((m, 1))
+    constraints.append(q.T @ t <= eps)
+    constraints.append(t >= 0.0)
+    for i in range(m):
+        ai = X[i, :]
+        bi = y[i]
+        constraints.append(cp.pos(cp.norm2(B @ ai) + ai.T @ d - bi) <= t[i])
+
+    prob = cp.Problem(obj_max, constraints)
+    prob.solve(solver=cp.MOSEK, verbose=verbose)
+
+    print(prob.status)
+    if B.value is not None:
+        return np.linalg.inv(B.value).T @ np.linalg.inv(B.value), d.value
+    else:
+        return None, None
+
+
+def maximum_volume_ellipsoid_intersection_ellipsoids(
+    ellipses, planes=None, verbose=False
+):
+    r"""
+    Each ellipse is
+    xA_ix + 2b_i x + c_i \leq 0
+
+    :param ellipses: list of [A,b,c]
+
+    :return: ellipse  ||x-v||_B^2 < 1
+    """
+
+    p = ellipses[0][0].shape[0]
+    m = len(ellipses)
+
+    B = cp.Variable((p, p), PSD=True)
+    d = cp.Variable((p, 1))
+    lam = cp.Variable((m, 1))
+
+    obj_max = cp.Maximize(cp.log_det(B))
+
+    constraints = []
+    for index, ellipse in enumerate(ellipses):
+        A, b, c = ellipse
+
+        eye = np.eye(p)
+        zeros = np.zeros(shape=(1, p))
+        invA = np.linalg.inv(A)
+
+        constraints.append(
+            cp.bmat(
+                [
+                    [-lam[index, 0] - c + b.T @ invA @ b, zeros, d.T + b.T @ invA.T],
+                    [zeros.T, lam[index, 0] * eye, B],
+                    [d + invA @ b, B, invA],
+                ]
+            )
+            >> 0
+        )
+
+    if planes is not None:
+        for index, plane in enumerate(planes):
+            a, b = plane
+            constraints.append(cp.norm2(B @ a) + a.T @ d <= b)
+
+    prob = cp.Problem(obj_max, constraints)
+    prob.solve(solver=cp.MOSEK, verbose=verbose)
+
+    print(prob.status)
+    if B.value is not None:
+        return np.linalg.inv(B.value).T @ np.linalg.inv(B.value), d.value
+    else:
+        return None, None
 
 
 # return B.value, -d.value
 
+
 def ellipsoid_cut(c, B, a, beta):
-	"""
-	:param c: elipsoid center
-	:param B: elipsoid covariance
-	:param a: a
-	:param beta:
-
-	(x-c)^\top B^{-1} (x-c) \leq 1
-	a^x \leq \beta
-
-	:return:
-	"""
-	N = a.T @ B @ a
-	print(N)
-	alpha = (a.T @ c - beta) / np.sqrt(N)
-	if alpha > 0:
-		d = c.shape[0]
-		tau = (1 + d * alpha) / (d + 1)
-		delta = ((d ** 2) / (d ** 2 - 1)) * (1 - alpha ** 2)
-		sigma = (2. * (1 + d * alpha)) / ((d + 1) * (1 + alpha))
-
-		s = B @ a
-		c = c + tau * (s / np.sqrt(N))
-		B = delta * (B - sigma * (s @ s.T) / (N))
-	return (c, B)
+    r"""
+    :param c: ellipsoid center
+    :param B: ellipsoid covariance
+    :param a: a
+    :param beta:
+
+    (x-c)^\top B^{-1} (x-c) \leq 1
+    a^\top x \leq \beta
+
+    :return:
+    """
+    N = a.T @ B @ a
+    print(N)
+    alpha = (a.T @ c - beta) / np.sqrt(N)
+    if alpha > 0:
+        d = c.shape[0]
+        tau = (1 + d * alpha) / (d + 1)
+        delta = ((d**2) / (d**2 - 1)) * (1 - alpha**2)
+        sigma = (2.0 * (1 + d * alpha)) / ((d + 1) * (1 + alpha))
+
+        s = B @ a
+        c = c + tau * (s / np.sqrt(N))
+        B = delta * (B - sigma * (s @ s.T) / (N))
+    return (c, B)
 
 
 def maximize_on_elliptical_slice(x, Sigma, mu, c, l, Lambda, u):
-	"""
-	solves the problem
-		min x^\top \theta
-		s.t. (\theta - \mu)Sigma(\theta - \mu) \leq c
-		l \leq Lambda \theta \leq u
-	"""
-
-	m = x.shape[0]
-	zero = np.zeros(m)
-	theta = cp.Variable(m)
-	obj_max = cp.Maximize(x @ theta)
-	Sigma_sqrt = np.linalg.cholesky(Sigma)
-	constraints = [cp.SOC(zero.T @ theta + c, Sigma_sqrt @ (theta - mu))]
-	constraints.append(Lambda @ theta >= l)
-	constraints.append(Lambda @ theta <= u)
-	prob = cp.Problem(obj_max, constraints)
-	prob.solve(solver=cp.MOSEK, verbose=False
-			   , mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.dual})
-	val = prob.value
-	theta = theta.value
-	return val, theta
+    r"""
+    solves the problem
+            max x^\top \theta
+            s.t. (\theta - \mu)Sigma(\theta - \mu) \leq c
+            l \leq Lambda \theta \leq u
+    """
+
+    m = x.shape[0]
+    zero = np.zeros(m)
+    theta = cp.Variable(m)
+    obj_max = cp.Maximize(x @ theta)
+    Sigma_sqrt = np.linalg.cholesky(Sigma)
+    constraints = [cp.SOC(zero.T @ theta + c, Sigma_sqrt @ (theta - mu))]
+    constraints.append(Lambda @ theta >= l)
+    constraints.append(Lambda @ theta <= u)
+    prob = cp.Problem(obj_max, constraints)
+    prob.solve(
+        solver=cp.MOSEK,
+        verbose=False,
+        mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.dual},
+    )
+    val = prob.value
+    theta = theta.value
+    return val, theta
 
 
 def maximize_matrix_quadratic_on_ellipse(X, Sigma, mu, c, threads=4):
-	"""
-	solves the problem
-		max \theta ^top Z \theta
-		s.t. (\theta - \mu)Sigma(\theta - \mu) \leq c
-	"""
-	a = -X @ mu.reshape(-1)
-	val, theta = QCQP_problem(-X, a, c, Sigma=Sigma, threads=threads)
-	val = -val + mu @ X @ mu
-	return val, theta
+    r"""
+    solves the problem
+            max \theta^\top X \theta
+            s.t. (\theta - \mu)Sigma(\theta - \mu) \leq c
+    """
+    a = -X @ mu.reshape(-1)
+    val, theta = QCQP_problem(-X, a, c, Sigma=Sigma, threads=threads)
+    val = -val + mu @ X @ mu
+    return val, theta
 
 
 def minimize_matrix_quadratic_on_ellipse(Z, Sigma, mu, c, threads=4):
-	"""
-	solves the problem
-		min \theta ^top Z \theta
-		s.t. (\theta - \mu)Sigma(\theta - \mu) \leq c
-	"""
-
-	m = Z.shape[0]
-	zero = np.zeros(m)
-	Sigma_sqrt = np.linalg.cholesky(Sigma)
-	theta = cp.Variable(m)
-	obj = cp.Minimize(cp.quad_form(theta, Z))
-	constraints = [cp.SOC(zero.T @ theta + c, Sigma_sqrt @ (theta - mu))]
-	prob = cp.Problem(obj, constraints)
-	prob.solve(solver=cp.MOSEK, verbose=False,
-			   mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-							 mosek.iparam.num_threads: threads})
-	val = prob.value
-	theta = theta.value
-	return val, theta
+    r"""
+    solves the problem
+            min \theta^\top Z \theta
+            s.t. (\theta - \mu)Sigma(\theta - \mu) \leq c
+    """
+
+    m = Z.shape[0]
+    zero = np.zeros(m)
+    Sigma_sqrt = np.linalg.cholesky(Sigma)
+    theta = cp.Variable(m)
+    obj = cp.Minimize(cp.quad_form(theta, Z))
+    constraints = [cp.SOC(zero.T @ theta + c, Sigma_sqrt @ (theta - mu))]
+    prob = cp.Problem(obj, constraints)
+    prob.solve(
+        solver=cp.MOSEK,
+        verbose=False,
+        mosek_params={
+            mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+            mosek.iparam.num_threads: threads,
+        },
+    )
+    val = prob.value
+    theta = theta.value
+    return val, theta
 
 
 def maximize_quadratic_on_ellipse(x, Sigma, mu, c, threads=4):
-	"""
-	solves the problem
-		max (x^\top \theta)^2
-		s.t. (\theta - \mu)Sigma(\theta - \mu) \leq c
-	"""
-	X = x.reshape(-1, 1) @ x.reshape(1, -1)
-	a = -X @ mu.reshape(-1)
-	val, theta = QCQP_problem(-X, a, c, Sigma=Sigma, threads=threads)
-	val = -val + mu @ X @ mu
-	return val, theta
+    r"""
+    solves the problem
+            max (x^\top \theta)^2
+            s.t. (\theta - \mu)Sigma(\theta - \mu) \leq c
+    """
+    X = x.reshape(-1, 1) @ x.reshape(1, -1)
+    a = -X @ mu.reshape(-1)
+    val, theta = QCQP_problem(-X, a, c, Sigma=Sigma, threads=threads)
+    val = -val + mu @ X @ mu
+    return val, theta
 
 
 def minimize_quadratic_on_ellipse(x, Sigma, mu, c, threads=4):
-	"""
-	solves the problem
-		min (x^\top \theta)^2
-		s.t. (\theta - \mu)Sigma(\theta - \mu) \leq c
-	"""
-
-	m = x.shape[0]
-	zero = np.zeros(m)
-	Sigma_sqrt = np.linalg.cholesky(Sigma)
-	theta = cp.Variable(m)
-	obj = cp.Minimize((x @ theta) ** 2)
-	constraints = [cp.SOC(zero.T @ theta + c, Sigma_sqrt @ (theta - mu))]
-	prob = cp.Problem(obj, constraints)
-	prob.solve(solver=cp.MOSEK, verbose=False,
-			   mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-							 mosek.iparam.num_threads: threads})
-	val = prob.value
-	theta = theta.value
-	return val, theta
+    r"""
+    solves the problem
+            min (x^\top \theta)^2
+            s.t. (\theta - \mu)Sigma(\theta - \mu) \leq c
+    """
+
+    m = x.shape[0]
+    zero = np.zeros(m)
+    Sigma_sqrt = np.linalg.cholesky(Sigma)
+    theta = cp.Variable(m)
+    obj = cp.Minimize((x @ theta) ** 2)
+    constraints = [cp.SOC(zero.T @ theta + c, Sigma_sqrt @ (theta - mu))]
+    prob = cp.Problem(obj, constraints)
+    prob.solve(
+        solver=cp.MOSEK,
+        verbose=False,
+        mosek_params={
+            mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+            mosek.iparam.num_threads: threads,
+        },
+    )
+    val = prob.value
+    theta = theta.value
+    return val, theta
 
 
 def KY_initialization(X):
-	(n, d) = X.shape
-	y = np.zeros(shape=(d, d,))
-	zs = []
-	c = np.random.randn(d)
-	for j in range(d):
-		id_max = np.argmax(X @ c)
-		id_min = np.argmin(X @ c)
-
-		z_max = X[np.argmax(X @ c), :]
-		z_min = X[np.argmin(X @ c), :]
-
-		zs = zs + [id_max, id_min]
-		y[j, :] = z_max - z_min
-
-		c = np.random.randn(d)
-		for i in range(j):
-			c = c - ((np.dot(c, y[i, :])) / (np.dot(y[i, :], y[i, :]))) * y[i, :]
-
-	mu = np.zeros(shape=(n))
-	mu[zs] = 1.
-	mu = mu / np.sum(mu)
-	return mu
+    (n, d) = X.shape
+    y = np.zeros(
+        shape=(
+            d,
+            d,
+        )
+    )
+    zs = []
+    c = np.random.randn(d)
+    for j in range(d):
+        id_max = np.argmax(X @ c)
+        id_min = np.argmin(X @ c)
+
+        z_max = X[np.argmax(X @ c), :]
+        z_min = X[np.argmin(X @ c), :]
+
+        zs = zs + [id_max, id_min]
+        y[j, :] = z_max - z_min
+
+        c = np.random.randn(d)
+        for i in range(j):
+            c = c - ((np.dot(c, y[i, :])) / (np.dot(y[i, :], y[i, :]))) * y[i, :]
+
+    mu = np.zeros(shape=(n))
+    mu[zs] = 1.0
+    mu = mu / np.sum(mu)
+    return mu
 
 
 def KY_initialization_modified(X):
-	(n, d) = X.shape
-	y = np.zeros(shape=(d, d,))
-	zs = []
-	c = np.random.randn(d)
-	for j in range(d):
-		id_max = np.argmax(X @ c)
-		id_min = np.argmin(X @ c)
-
-		z_max = X[np.argmax(X @ c), :]
-		z_min = X[np.argmin(X @ c), :]
-
-		zs = zs + [id_max]
-		y[j, :] = z_max - z_min
-
-		c = np.random.randn(d)
-		for i in range(j):
-			c = c - ((np.dot(c, y[i, :])) / (np.dot(y[i, :], y[i, :]))) * y[i, :]
-
-	mu = np.zeros(shape=(n))
-	mu[zs] = 1.
-	mu = mu / np.sum(mu)
-	return mu
-
-
-def plot_ellipse(offset, cov, scale=1, theta_num=1000, axis=None, plot_kwargs=None, fill=False, fill_kwargs=None,
-				 color='r'):
-	'''
-	offset = 2d array which gives center of ellipse
-	cov = covariance of ellipse
-	scale = scale ellipse by constant factor
-	theta_num = used for a linspace below, not sure exactly (?)
-
-	'''
-	# Get Ellipse Properties from cov matrix
-
-	eig_vec, eig_val, u = np.linalg.svd(cov)
-	# Make sure 0th eigenvector has positive x-coordinate
-	if eig_vec[0][0] < 0:
-		eig_vec[0] *= -1
-	semimaj = np.sqrt(eig_val[0])
-	semimin = np.sqrt(eig_val[1])
-	semimaj *= scale
-	semimin *= scale
-	phi = np.arccos(np.dot(eig_vec[0], np.array([1, 0])))
-	if eig_vec[0][1] < 0 and phi > 0:
-		phi *= -1
-
-	# Generate data for ellipse structure
-	theta = np.linspace(0, 2 * np.pi, theta_num)
-	r = 1 / np.sqrt((np.cos(theta)) ** 2 + (np.sin(theta)) ** 2)
-	x = r * np.cos(theta)
-	y = r * np.sin(theta)
-	data = np.array([x, y])
-	S = np.array([[semimaj, 0], [0, semimin]])
-	R = np.array([[np.cos(phi), -np.sin(phi)], [np.sin(phi), np.cos(phi)]])
-	T = np.dot(R, S)
-	data = np.dot(T, data)
-	data[0] += offset[0]
-	data[1] += offset[1]
-
-	# Plot!
-	return_fig = False
-	if axis is None:
-		axis = plt.gca()
-
-	if plot_kwargs is None:
-		p, = axis.plot(data[0], data[1], color=color, linestyle='-')
-	else:
-		p, = axis.plot(data[0], data[1], **plot_kwargs)
-
-	if fill == True:
-		if fill_kwargs is None:
-			fill_kwargs = dict()
-		axis.fill(data[0], data[1], alpha=0.2, color=color)
+    (n, d) = X.shape
+    y = np.zeros(
+        shape=(
+            d,
+            d,
+        )
+    )
+    zs = []
+    c = np.random.randn(d)
+    for j in range(d):
+        id_max = np.argmax(X @ c)
+        id_min = np.argmin(X @ c)
+
+        z_max = X[np.argmax(X @ c), :]
+        z_min = X[np.argmin(X @ c), :]
+
+        zs = zs + [id_max]
+        y[j, :] = z_max - z_min
+
+        c = np.random.randn(d)
+        for i in range(j):
+            c = c - ((np.dot(c, y[i, :])) / (np.dot(y[i, :], y[i, :]))) * y[i, :]
+
+    mu = np.zeros(shape=(n))
+    mu[zs] = 1.0
+    mu = mu / np.sum(mu)
+    return mu
+
+
+def plot_ellipse(
+    offset,
+    cov,
+    scale=1,
+    theta_num=1000,
+    axis=None,
+    plot_kwargs=None,
+    fill=False,
+    fill_kwargs=None,
+    color="r",
+):
+    """
+    offset = 2d array which gives center of ellipse
+    cov = covariance of ellipse
+    scale = scale ellipse by constant factor
+    theta_num = number of angles sampled on [0, 2*pi] to trace the ellipse outline
+
+    """
+    # Get Ellipse Properties from cov matrix
+
+    eig_vec, eig_val, u = np.linalg.svd(cov)
+    # Make sure 0th eigenvector has positive x-coordinate
+    if eig_vec[0][0] < 0:
+        eig_vec[0] *= -1
+    semimaj = np.sqrt(eig_val[0])
+    semimin = np.sqrt(eig_val[1])
+    semimaj *= scale
+    semimin *= scale
+    phi = np.arccos(np.dot(eig_vec[0], np.array([1, 0])))
+    if eig_vec[0][1] < 0 and phi > 0:
+        phi *= -1
+
+    # Generate data for ellipse structure
+    theta = np.linspace(0, 2 * np.pi, theta_num)
+    r = 1 / np.sqrt((np.cos(theta)) ** 2 + (np.sin(theta)) ** 2)
+    x = r * np.cos(theta)
+    y = r * np.sin(theta)
+    data = np.array([x, y])
+    S = np.array([[semimaj, 0], [0, semimin]])
+    R = np.array([[np.cos(phi), -np.sin(phi)], [np.sin(phi), np.cos(phi)]])
+    T = np.dot(R, S)
+    data = np.dot(T, data)
+    data[0] += offset[0]
+    data[1] += offset[1]
+
+    # Plot!
+    return_fig = False
+    if axis is None:
+        axis = plt.gca()
+
+    if plot_kwargs is None:
+        (p,) = axis.plot(data[0], data[1], color=color, linestyle="-")
+    else:
+        (p,) = axis.plot(data[0], data[1], **plot_kwargs)
+
+    if fill:
+        # caller-supplied fill_kwargs override the default alpha/color (previously ignored)
+        fill_style = {"alpha": 0.2, "color": color, **(fill_kwargs or {})}
+        axis.fill(data[0], data[1], **fill_style)
 
 
 if __name__ == "__main__":
-	d = 2
+    d = 2
 
-	s1 = 1
-	s2 = 1
+    s1 = 1
+    s2 = 1
 
-	A1 = np.random.randn(d, d)
-	A1 = A1.T @ A1
+    A1 = np.random.randn(d, d)
+    A1 = A1.T @ A1
 
-	A2 = np.random.randn(d, d)
-	A2 = A2.T @ A2
+    A2 = np.random.randn(d, d)
+    A2 = A2.T @ A2
 
-	center1 = np.zeros((d, 1))
-	center2 = np.ones((d, 1))
+    center1 = np.zeros((d, 1))
+    center2 = np.ones((d, 1))
 
-	b1 = - A1 @ center1
-	b2 = - A2 @ center2
+    b1 = -A1 @ center1
+    b2 = -A2 @ center2
 
-	c1 = -s1 + center1.T @ A1 @ center1
-	c2 = -s2 + center2.T @ A2 @ center2
+    c1 = -s1 + center1.T @ A1 @ center1
+    c2 = -s2 + center2.T @ A2 @ center2
 
-	# ellipsoids = [[A1,b1,c1],[A2,b2,c2]]
-	ellipsoids = [[A2, b2, c2]]
-	planes = [[center2, np.array([[0.]])]]
+    # ellipsoids = [[A1,b1,c1],[A2,b2,c2]]
+    ellipsoids = [[A2, b2, c2]]
+    planes = [[center2, np.array([[0.0]])]]
 
-	A, b = maximum_volume_ellipsoid_intersection_ellipsoids(ellipsoids, planes=planes)
-	# c = 1
+    A, b = maximum_volume_ellipsoid_intersection_ellipsoids(ellipsoids, planes=planes)
+    # c = 1
 
-	axis = plt.gca()
+    axis = plt.gca()
 
-	## the cov is
-	# (x-center)cov^{-1}(x-center)
-	# plot_ellipse(np.array([0.,0.]), cov=np.array([[2,0.],[0.0,2.]]), scale = 1., axis=axis, fill=True, color = 'purple')
+    ## the cov is
+    # (x-center)cov^{-1}(x-center)
+    # plot_ellipse(np.array([0.,0.]), cov=np.array([[2,0.],[0.0,2.]]), scale = 1., axis=axis, fill=True, color = 'purple')
 
-	plot_ellipse(center1.reshape(-1), cov=np.linalg.inv(A1), scale=1., axis=axis, fill=True)
-	plot_ellipse(center2.reshape(-1), cov=np.linalg.inv(A2), scale=1., axis=axis, fill=True, color='b')
+    plot_ellipse(
+        center1.reshape(-1), cov=np.linalg.inv(A1), scale=1.0, axis=axis, fill=True
+    )
+    plot_ellipse(
+        center2.reshape(-1),
+        cov=np.linalg.inv(A2),
+        scale=1.0,
+        axis=axis,
+        fill=True,
+        color="b",
+    )
 
-	plot_ellipse(b.reshape(-1), cov=np.linalg.inv(A), scale=1., axis=axis, fill=True, color='g')
+    plot_ellipse(
+        b.reshape(-1), cov=np.linalg.inv(A), scale=1.0, axis=axis, fill=True, color="g"
+    )
 
-	plt.xlim([-4, 4])
-	plt.ylim([-4, 4])
-	plt.show()
+    plt.xlim([-4, 4])
+    plt.ylim([-4, 4])
+    plt.show()
diff --git a/stpy/helpers/haarfisz_transform.py b/stpy/helpers/haarfisz_transform.py
index 0c975d6..3c95a8e 100644
--- a/stpy/helpers/haarfisz_transform.py
+++ b/stpy/helpers/haarfisz_transform.py
@@ -3,98 +3,103 @@
 
 
 """
+
 import numpy as np
 
 
 def haar_fisz_transform(data):
-	a = 2.
-	n = data.shape[0]
-	nhalf = n // 2
+    a = 2.0
+    n = data.shape[0]
+    nhalf = n // 2
 
-	J = np.log2(n)
-	res = data.copy()
-	sm = np.zeros(shape=nhalf, dtype=float)
-	det = sm.copy()
+    J = np.log2(n)
+    res = data.copy()
+    sm = np.zeros(shape=nhalf, dtype=float)
+    det = sm.copy()
 
-	for i in np.arange(0, J, 1):
-		indices = np.arange(0, nhalf, 1)
+    for i in np.arange(0, J, 1):
+        indices = np.arange(0, nhalf, 1)
 
-		sm[0:nhalf] = (res[2 * indices] + res[2 * indices + 1]) / a
-		det[0:nhalf] = (res[2 * indices] - res[2 * indices + 1]) / a
+        sm[0:nhalf] = (res[2 * indices] + res[2 * indices + 1]) / a
+        det[0:nhalf] = (res[2 * indices] - res[2 * indices + 1]) / a
 
-		det[sm > 0] = det[sm > 0] / np.sqrt(sm[sm > 0])
+        det[sm > 0] = det[sm > 0] / np.sqrt(sm[sm > 0])
 
-		res[0:nhalf] = sm[0:nhalf]
-		res[nhalf:n] = det[0:nhalf]
+        res[0:nhalf] = sm[0:nhalf]
+        res[nhalf:n] = det[0:nhalf]
 
-		n = n // 2
-		nhalf = nhalf // 2
-		sm = np.zeros(shape=nhalf)
-		det = sm.copy()
+        n = n // 2
+        nhalf = nhalf // 2
+        sm = np.zeros(shape=nhalf)
+        det = sm.copy()
 
-	nhalf = 1
-	n = 2
-	sm = np.zeros(shape=nhalf)
-	det = sm.copy()
-	for i in np.arange(0, J, 1):
-		indices = np.arange(0, nhalf, 1)
-		sm[indices] = res[indices]
-		det[indices] = res[nhalf:n]
-		res[2 * indices] = a / 2. * (sm[indices] + det[indices])
-		res[2 * indices + 1] = a / 2. * (sm[indices] - det[indices])
+    nhalf = 1
+    n = 2
+    sm = np.zeros(shape=nhalf)
+    det = sm.copy()
+    for i in np.arange(0, J, 1):
+        indices = np.arange(0, nhalf, 1)
+        sm[indices] = res[indices]
+        det[indices] = res[nhalf:n]
+        res[2 * indices] = a / 2.0 * (sm[indices] + det[indices])
+        res[2 * indices + 1] = a / 2.0 * (sm[indices] - det[indices])
 
-		n = 2 * n
-		nhalf = 2 * nhalf
+        n = 2 * n
+        nhalf = 2 * nhalf
 
-		sm = np.zeros(shape=nhalf)
-		det = sm.copy()
-	return res
+        sm = np.zeros(shape=nhalf)
+        det = sm.copy()
+    return res
 
 
 def inverse_haar_fisz_transform(data):
-	a = 2.
-	n = data.shape[0]
-	nhalf = n // 2
-	J = np.log2(n)
-	res = data.copy()
-	sm = np.zeros(shape=nhalf)
-	det = sm.copy()
-
-	for i in np.arange(0, J, 1):
-		indices = np.arange(0, nhalf, 1)
-
-		sm[0:nhalf] = (res[2 * indices] + res[2 * indices + 1]) / a
-		det[0:nhalf] = (res[2 * indices] - res[2 * indices + 1]) / a
-		res[0:nhalf] = sm[0:nhalf]
-		res[(nhalf):n] = det[0:nhalf]
-		n = n // 2
-		nhalf = nhalf // 2
-
-	nhalf = 1
-	n = 2
-
-	for i in np.arange(0, J, 1):
-		sm[0:nhalf] = res[0:nhalf]
-		det[0:nhalf] = res[nhalf:n]
-		indices = np.arange(0, nhalf, 1)
-
-		res[2 * indices] = (a / 2.) * (sm[0:nhalf] + det[0:nhalf] * np.sqrt(sm[0:nhalf]))
-		res[2 * indices + 1] = (a / 2.) * (sm[0:nhalf] - det[0:nhalf] * np.sqrt(sm[0:nhalf]))
-		res[res < 0.] = 0.
-		n = 2 * n
-		nhalf = 2 * nhalf
-	return res
+    a = 2.0
+    n = data.shape[0]
+    nhalf = n // 2
+    J = np.log2(n)
+    res = data.copy()
+    sm = np.zeros(shape=nhalf)
+    det = sm.copy()
+
+    for i in np.arange(0, J, 1):
+        indices = np.arange(0, nhalf, 1)
+
+        sm[0:nhalf] = (res[2 * indices] + res[2 * indices + 1]) / a
+        det[0:nhalf] = (res[2 * indices] - res[2 * indices + 1]) / a
+        res[0:nhalf] = sm[0:nhalf]
+        res[(nhalf):n] = det[0:nhalf]
+        n = n // 2
+        nhalf = nhalf // 2
+
+    nhalf = 1
+    n = 2
+
+    for i in np.arange(0, J, 1):
+        sm[0:nhalf] = res[0:nhalf]
+        det[0:nhalf] = res[nhalf:n]
+        indices = np.arange(0, nhalf, 1)
+
+        res[2 * indices] = (a / 2.0) * (
+            sm[0:nhalf] + det[0:nhalf] * np.sqrt(sm[0:nhalf])
+        )
+        res[2 * indices + 1] = (a / 2.0) * (
+            sm[0:nhalf] - det[0:nhalf] * np.sqrt(sm[0:nhalf])
+        )
+        res[res < 0.0] = 0.0
+        n = 2 * n
+        nhalf = 2 * nhalf
+    return res
 
 
 if __name__ == "__main__":
-	import matplotlib.pyplot as plt
-
-	s = np.random.poisson(5, 4) * 0 + 1
-	s2 = np.random.poisson(20, 4) * 0 + 3
-	s = np.concatenate((s, s2)).astype(float)
-	plt.plot(s)
-	v = haar_fisz_transform(s)
-	s_inv = inverse_haar_fisz_transform(v)
-	plt.plot(v)
-	plt.plot(s_inv, '--')
-	plt.show()
+    import matplotlib.pyplot as plt
+
+    s = np.random.poisson(5, 4) * 0 + 1
+    s2 = np.random.poisson(20, 4) * 0 + 3
+    s = np.concatenate((s, s2)).astype(float)
+    plt.plot(s)
+    v = haar_fisz_transform(s)
+    s_inv = inverse_haar_fisz_transform(v)
+    plt.plot(v)
+    plt.plot(s_inv, "--")
+    plt.show()
diff --git a/stpy/helpers/helper.py b/stpy/helpers/helper.py
index 26591ed..5371e61 100755
--- a/stpy/helpers/helper.py
+++ b/stpy/helpers/helper.py
@@ -8,531 +8,564 @@
 from torch.autograd.functional import jacobian
 
 
-def isin(element, test_elements, assume_unique=False, atol = 1e-10):
-	(n, d) = element.shape
-	(m, d) = test_elements.shape
-	maskFull = np.full((n), False, dtype=bool)
-	for j in range(m):
-		mask = np.full((n), True, dtype=bool)
-		for i in range(d):
-			# mask = np.logical_and(mask,np.in1d(element[:,i],test_elements[j,i], assume_unique=assume_unique))
-			mask = np.logical_and(mask, np.isclose(element[:, i], test_elements[j, i], atol=atol))
-		# print (j, i, mask)
-		maskFull = np.logical_or(mask, maskFull)
-	# print (maskFull)
-	return maskFull
-
-
-
-def cartesian(arrays, out=None, dtype = None):
-	"""
-	Generate a cartesian product of input arrays.
-
-	Parameters
-	----------
-	arrays : list of array-like
-			1-D arrays to form the cartesian product of.
-	out : ndarray
-			Array to place the cartesian product in.
-
-	Returns
-	-------
-	out : ndarray
-			2-D array of shape (M, len(arrays)) containing cartesian products
-			formed of input arrays.
-
-	"""
-	arrays = [np.asarray(x) for x in arrays]
-	if dtype is None:
-		dtype = arrays[0].dtype
-	n = np.prod([x.size for x in arrays])
-	if out is None:
-		out = np.zeros([n, len(arrays)], dtype=dtype)
-
-	m = n / arrays[0].size
-	m = int(m)
-	out[:, 0] = np.repeat(arrays[0], m)
-	if arrays[1:]:
-		cartesian(arrays[1:], out=out[0:m, 1:])
-		for j in range(1, arrays[0].size):
-			out[j * m:(j + 1) * m, 1:] = out[0:m, 1:]
-	return out
-
-
-def estimate_std(x: torch.Tensor, # x values used for uniqueness detection
-				 y: torch.Tensor, # y values
-				 truncation:Union[float,None] = None, # truncate at specific y
-				 verbose:bool = False, # verbosity level
-				 conservative:bool = False,
-				 return_all_residuals:bool = False # return
-				 ): #
-
-	out, indices, counts = torch.unique(x, dim=0, return_inverse=True, return_counts=True)
-	residuals_mean_list = []
-
-	for i in range(counts.size()[0]):
-		if counts[i] > 1:
-			mask = indices == i
-			mean = torch.mean(y[mask].view(-1))
-			residuals_mean_list.append(y[mask].view(-1)-mean.view(-1))
-	residuals_mean = torch.hstack(residuals_mean_list)
-
-	if verbose:
-		print ("Estimating variance from:",residuals_mean.size())
-
-	if truncation is not None:
-		residuals_mean_trunc = residuals_mean[torch.abs(residuals_mean)<truncation]
-		sigma_std = torch.std(residuals_mean_trunc)
-	else:
-		sigma_std = torch.std(residuals_mean)
-
-	if return_all_residuals:
-		return residuals_mean_list, out, counts, residuals_mean, indices
-	else:
-		return sigma_std
-
+def isin(element, test_elements, assume_unique=False, atol=1e-10):
+    (n, d) = element.shape
+    (m, d) = test_elements.shape
+    maskFull = np.full((n), False, dtype=bool)
+    for j in range(m):
+        mask = np.full((n), True, dtype=bool)
+        for i in range(d):
+            # mask = np.logical_and(mask,np.in1d(element[:,i],test_elements[j,i], assume_unique=assume_unique))
+            mask = np.logical_and(
+                mask, np.isclose(element[:, i], test_elements[j, i], atol=atol)
+            )
+        # print (j, i, mask)
+        maskFull = np.logical_or(mask, maskFull)
+    # print (maskFull)
+    return maskFull
+
+
+def cartesian(arrays, out=None, dtype=None):
+    """
+    Generate a cartesian product of input arrays.
+
+    Parameters
+    ----------
+    arrays : list of array-like
+                    1-D arrays to form the cartesian product of.
+    out : ndarray
+                    Array to place the cartesian product in.
+
+    Returns
+    -------
+    out : ndarray
+                    2-D array of shape (M, len(arrays)) containing cartesian products
+                    formed of input arrays.
+
+    """
+    arrays = [np.asarray(x) for x in arrays]
+    if dtype is None:
+        dtype = arrays[0].dtype
+    n = np.prod([x.size for x in arrays])
+    if out is None:
+        out = np.zeros([n, len(arrays)], dtype=dtype)
+
+    m = n / arrays[0].size
+    m = int(m)
+    out[:, 0] = np.repeat(arrays[0], m)
+    if arrays[1:]:
+        cartesian(arrays[1:], out=out[0:m, 1:])
+        for j in range(1, arrays[0].size):
+            out[j * m : (j + 1) * m, 1:] = out[0:m, 1:]
+    return out
+
+
+def estimate_std(
+    x: torch.Tensor,  # x values used for uniqueness detection
+    y: torch.Tensor,  # y values
+    truncation: Union[float, None] = None,  # truncate at specific y
+    verbose: bool = False,  # verbosity level
+    conservative: bool = False,
+    return_all_residuals: bool = False,  # return
+):  #
+
+    out, indices, counts = torch.unique(
+        x, dim=0, return_inverse=True, return_counts=True
+    )
+    residuals_mean_list = []
+
+    for i in range(counts.size()[0]):
+        if counts[i] > 1:
+            mask = indices == i
+            mean = torch.mean(y[mask].view(-1))
+            residuals_mean_list.append(y[mask].view(-1) - mean.view(-1))
+    residuals_mean = torch.hstack(residuals_mean_list)
+
+    if verbose:
+        print("Estimating variance from:", residuals_mean.size())
+
+    if truncation is not None:
+        residuals_mean_trunc = residuals_mean[torch.abs(residuals_mean) < truncation]
+        sigma_std = torch.std(residuals_mean_trunc)
+    else:
+        sigma_std = torch.std(residuals_mean)
+
+    if return_all_residuals:
+        return residuals_mean_list, out, counts, residuals_mean, indices
+    else:
+        return sigma_std
 
 
 def direct_sum(arrays):
-	dim = np.sum([array.shape[1] for array in arrays])
-	size = np.sum([array.shape[0] for array in arrays])
+    dim = np.sum([array.shape[1] for array in arrays])
+    size = np.sum([array.shape[0] for array in arrays])
 
-	out = np.zeros(shape=(size, dim))
-	dim = 0
-	n = 0
-	for j in range(len(arrays)):
-		new_n, new_dim = arrays[j].shape
-		out[n:n + new_n, dim:dim + new_dim] = arrays[j]
-		dim = dim + new_dim
-		n = n + new_n
+    out = np.zeros(shape=(size, dim))
+    dim = 0
+    n = 0
+    for j in range(len(arrays)):
+        new_n, new_dim = arrays[j].shape
+        out[n : n + new_n, dim : dim + new_dim] = arrays[j]
+        dim = dim + new_dim
+        n = n + new_n
 
-	return out
+    return out
 
 
 def symsqrt(matrix):
-	"""Compute the square root of a positive definite matrix."""
-	# perform the decomposition
-	# s, v = matrix.symeig(eigenvectors=True)
-	_, s, v = matrix.svd()  # passes torch.autograd.gradcheck()
-	# truncate small components
-	above_cutoff = s > s.max() * s.size(-1) * torch.finfo(s.dtype).eps
-	s = s[..., above_cutoff]
-	v = v[..., above_cutoff]
-	# compose the square root matrix
-	return (v * s.sqrt().unsqueeze(-2)) @ v.transpose(-2, -1)
+    """Compute the square root of a positive definite matrix."""
+    # perform the decomposition
+    # s, v = matrix.symeig(eigenvectors=True)
+    _, s, v = matrix.svd()  # passes torch.autograd.gradcheck()
+    # truncate small components
+    above_cutoff = s > s.max() * s.size(-1) * torch.finfo(s.dtype).eps
+    s = s[..., above_cutoff]
+    v = v[..., above_cutoff]
+    # compose the square root matrix
+    return (v * s.sqrt().unsqueeze(-2)) @ v.transpose(-2, -1)
 
 
 def interval(n, d, L_infinity_ball=1, offset=None):
-	if offset is None:
-		arrays = [np.linspace(-L_infinity_ball, L_infinity_ball, n).reshape(n, 1) for i in range(d)]
-		xtest = cartesian(arrays)
-	else:
-		arrays = [np.linspace(offset[i][0], offset[i][1], n).reshape(n, 1) for i in range(d)]
-		xtest = cartesian(arrays)
-	return xtest
+    if offset is None:
+        arrays = [
+            np.linspace(-L_infinity_ball, L_infinity_ball, n).reshape(n, 1)
+            for i in range(d)
+        ]
+        xtest = cartesian(arrays)
+    else:
+        arrays = [
+            np.linspace(offset[i][0], offset[i][1], n).reshape(n, 1) for i in range(d)
+        ]
+        xtest = cartesian(arrays)
+    return xtest
 
 
 def interval_torch(n, d, L_infinity_ball=1, offset=None):
-	return torch.from_numpy(interval(n, d, L_infinity_ball=L_infinity_ball, offset=offset))
+    return torch.from_numpy(
+        interval(n, d, L_infinity_ball=L_infinity_ball, offset=offset)
+    )
 
 
 def get_ecdf(x):
-	x = np.sort(x)
+    x = np.sort(x)
 
-	def result(v):
-		return np.searchsorted(x, v, side='right') / x.size
+    def result(v):
+        return np.searchsorted(x, v, side="right") / x.size
 
-	return result
+    return result
 
 
 def emprical_cdf(data):
-	"""
-	#>>> import numpy as np
-	#>>> emprical_cdf(np.array([1.,2.,3.,1.,2.]))
-	#[1.,2.,3.],[0.4,0.4,0.2]
-	"""
-
-	# create a sorted series of unique data
-	cdfx = np.sort(np.unique(data))
-	# x-data for the ECDF: evenly spaced sequence of the uniques
-	x_values = np.linspace(start=min(cdfx),
-						   stop=max(cdfx), num=len(cdfx))
-
-	# size of the x_values
-	size_data = data.shape[0]
-	# y-data for the ECDF:
-	y_values = []
-	for i in x_values:
-		# all the values in raw data less than the ith value in x_values
-		temp = data[data <= i]
-		# fraction of that value with respect to the size of the x_values
-		value = float(temp.shape[0]) / float(size_data)
-		# pushing the value in the y_values
-		y_values.append(value)
-	# return both x and y values
-	return x_values, np.array(y_values)
+    """
+    #>>> import numpy as np
+    #>>> emprical_cdf(np.array([1.,2.,3.,1.,2.]))
+    #[1.,2.,3.],[0.4,0.4,0.2]
+    """
+
+    # create a sorted series of unique data
+    cdfx = np.sort(np.unique(data))
+    # x-data for the ECDF: evenly spaced sequence of the uniques
+    x_values = np.linspace(start=min(cdfx), stop=max(cdfx), num=len(cdfx))
+
+    # size of the x_values
+    size_data = data.shape[0]
+    # y-data for the ECDF:
+    y_values = []
+    for i in x_values:
+        # all the values in raw data less than the ith value in x_values
+        temp = data[data <= i]
+        # fraction of that value with respect to the size of the x_values
+        value = float(temp.shape[0]) / float(size_data)
+        # pushing the value in the y_values
+        y_values.append(value)
+    # return both x and y values
+    return x_values, np.array(y_values)
 
 
 def batch_jacobian(f, x, create_graph=False, vectorize=False):
-	f_sum = lambda x: torch.sum(f(x), axis=0)
-	return jacobian(f_sum, x, create_graph=create_graph, vectorize=vectorize)
+    f_sum = lambda x: torch.sum(f(x), axis=0)
+    return jacobian(f_sum, x, create_graph=create_graph, vectorize=vectorize)
 
 
 def batch_hessian(f, x, create_graph=False, vectorize=False, vv=False):
-	J = lambda x: batch_jacobian(f, x, create_graph=True, vectorize=vectorize).transpose(0, 1)
-	H = batch_jacobian(J, x, create_graph=create_graph, vectorize=vv)
-	return H
+    J = lambda x: batch_jacobian(
+        f, x, create_graph=True, vectorize=vectorize
+    ).transpose(0, 1)
+    H = batch_jacobian(J, x, create_graph=create_graph, vectorize=vv)
+    return H
 
 
 def create_pull_back(low, high, inverse=False, to=[-1, 1]):
-	translate = lambda x: x * (to[0] - to[1]) / (low - high) + to[1] - ((to[0] - to[1]) * high) / (low - high)
-	if inverse:
-		translate_back = lambda x: x * (low - high) / (to[0] - to[1]) + high - to[1] * (low - high) / (to[0] - to[1])
-		return translate, translate_back
-	else:
-		return translate
+    translate = (
+        lambda x: x * (to[0] - to[1]) / (low - high)
+        + to[1]
+        - ((to[0] - to[1]) * high) / (low - high)
+    )
+    if inverse:
+        translate_back = (
+            lambda x: x * (low - high) / (to[0] - to[1])
+            + high
+            - to[1] * (low - high) / (to[0] - to[1])
+        )
+        return translate, translate_back
+    else:
+        return translate
 
 
 def hierarchical_distance(group1, group2):
-	group3 = copy.deepcopy(group2)
-	group4 = copy.deepcopy(group1)
-	for elem in group1:
-		try:
-			group3.remove(elem)
-			group4.remove(elem)
-		except:
-			pass
-	if len(group3) == 1 and len(group3[0]) == 1 and len(group4) == 0:
-		return 1
-
-	isin = lambda set, set2: []
-	for a, b in list(itertools.product(group1, group1)):
-		new_group = copy.deepcopy(group1)
-		if a != b:
-			new_group.remove(b)
-			new_group.remove(a)
-			new_group.append(a + b)
-			if len(new_group) == len(group2) and all(i in new_group for i in group2):
-				return 1
-	return 2
+    group3 = copy.deepcopy(group2)
+    group4 = copy.deepcopy(group1)
+    for elem in group1:
+        try:
+            group3.remove(elem)
+            group4.remove(elem)
+        except:
+            pass
+    if len(group3) == 1 and len(group3[0]) == 1 and len(group4) == 0:
+        return 1
+
+    isin = lambda set, set2: []
+    for a, b in list(itertools.product(group1, group1)):
+        new_group = copy.deepcopy(group1)
+        if a != b:
+            new_group.remove(b)
+            new_group.remove(a)
+            new_group.append(a + b)
+            if len(new_group) == len(group2) and all(i in new_group for i in group2):
+                return 1
+    return 2
 
 
 def valid_enlargement(curr, groups):
-	out = []
-	for index, group in enumerate(groups):
-		if hierarchical_distance(curr, group) == 1:
-			out.append(index)
-	return out
+    out = []
+    for index, group in enumerate(groups):
+        if hierarchical_distance(curr, group) == 1:
+            out.append(index)
+    return out
 
 
 def interval_groups(n, d, groups, L_infinity_ball=1):
-	arrays = [interval(n, len(groups[i]), L_infinity_ball=L_infinity_ball) for i in range(len(groups))]
-	xtest = direct_sum(arrays)
-	out = np.zeros(shape=(xtest.shape[0], d))
-	out[:, 0:xtest.shape[1]] = xtest
-	return out
+    arrays = [
+        interval(n, len(groups[i]), L_infinity_ball=L_infinity_ball)
+        for i in range(len(groups))
+    ]
+    xtest = direct_sum(arrays)
+    out = np.zeros(shape=(xtest.shape[0], d))
+    out[:, 0 : xtest.shape[1]] = xtest
+    return out
 
 
 def logsumexp(a, axis=None, b=None):
-	a = np.asarray(a)
-	if axis is None:
-		a = a.ravel()
-	else:
-		a = np.rollaxis(a, axis)
-	a_max = a.max(axis=0)
-	if b is not None:
-		b = np.asarray(b)
-		if axis is None:
-			b = b.ravel()
-		else:
-			b = np.rollaxis(b, axis)
-		out = np.log(np.sum(b * np.exp(a - a_max), axis=0))
-	else:
-		out = np.log(np.sum(np.exp(a - a_max), axis=0))
-	out += a_max
-	return out
+    a = np.asarray(a)
+    if axis is None:
+        a = a.ravel()
+    else:
+        a = np.rollaxis(a, axis)
+    a_max = a.max(axis=0)
+    if b is not None:
+        b = np.asarray(b)
+        if axis is None:
+            b = b.ravel()
+        else:
+            b = np.rollaxis(b, axis)
+        out = np.log(np.sum(b * np.exp(a - a_max), axis=0))
+    else:
+        out = np.log(np.sum(np.exp(a - a_max), axis=0))
+    out += a_max
+    return out
 
 
 class MyBounds(object):
-	def __init__(self, xmax=[1.1, 1.1], xmin=[-1.1, -1.1]):
-		self.xmax = np.array(xmax)
-		self.xmin = np.array(xmin)
+    def __init__(self, xmax=[1.1, 1.1], xmin=[-1.1, -1.1]):
+        self.xmax = np.array(xmax)
+        self.xmin = np.array(xmin)
 
-	def __call__(self, **kwargs):
-		x = kwargs["x_new"]
-		tmax = bool(np.all(x <= self.xmax))
-		tmin = bool(np.all(x >= self.xmin))
-		return tmax and tmin
+    def __call__(self, **kwargs):
+        x = kwargs["x_new"]
+        tmax = bool(np.all(x <= self.xmax))
+        tmin = bool(np.all(x >= self.xmin))
+        return tmax and tmin
 
 
 def full_group(d):
-	g = []
-	for i in range(d):
-		g.append([i])
-	return g
+    g = []
+    for i in range(d):
+        g.append([i])
+    return g
 
 
 def pair_groups(d):
-	g = []
-	for i in range(d - 1):
-		g.append([i, i + 1])
-	return g
+    g = []
+    for i in range(d - 1):
+        g.append([i, i + 1])
+    return g
 
 
 def conditional_decorator(dec, condition):
-	def decorator(func):
-		if not condition:
-			# Return the function unchanged, not decorated.
-			return func
-		return dec(func)
+    def decorator(func):
+        if not condition:
+            # Return the function unchanged, not decorated.
+            return func
+        return dec(func)
+
+    return decorator
 
-	return decorator
 
 def generate_all_pairs(d):
-	groups = []
-	for elem in range(d):
-		for elem2 in range(d):
-			groups.append([elem, elem2])
-	return groups
+    groups = []
+    for elem in range(d):
+        for elem2 in range(d):
+            groups.append([elem, elem2])
+    return groups
 
 
 def generate_groups(d, elements=None):
-	"""
-	returns a list of all possible groups combinations of d elements
-	:param d: integer
-	:return:
-	>>> generate_groups(1)
-	[[0]]
-	>>> generate_groups(2)
-	[[[0], [1]], [[1], [0]], [[0, 1]]]
-	"""
-	if elements is None:
-		elements = list(range(d))
-	g = []
-	if len(elements) == 1:
-		return [elements]
-
-	for r in range(1, d + 1, 1):
-		gn = [list(a) for a in list(itertools.combinations(elements, r))]
-		for i in gn:
-			elements2 = list(set(elements) - set(i))
-			g.append([i] + generate_groups(d, elements=elements2))
-	return g
+    """
+    returns a list of all possible groups combinations of d elements
+    :param d: integer
+    :return:
+    >>> generate_groups(1)
+    [[0]]
+    >>> generate_groups(2)
+    [[[0], [1]], [[1], [0]], [[0, 1]]]
+    """
+    if elements is None:
+        elements = list(range(d))
+    g = []
+    if len(elements) == 1:
+        return [elements]
+
+    for r in range(1, d + 1, 1):
+        gn = [list(a) for a in list(itertools.combinations(elements, r))]
+        for i in gn:
+            elements2 = list(set(elements) - set(i))
+            g.append([i] + generate_groups(d, elements=elements2))
+    return g
 
 
 class results:
-	def __init__(self):
-		self.x = 0
+    def __init__(self):
+        self.x = 0
 
 
 def proj(x, bounds):
-	y = np.zeros(shape=x.shape)
-	for ind, elem in enumerate(x):
-		if elem > bounds[ind][1]:
-			y[ind] = bounds[ind][1]
+    y = np.zeros(shape=x.shape)
+    for ind, elem in enumerate(x):
+        if elem > bounds[ind][1]:
+            y[ind] = bounds[ind][1]
 
-		elif elem < bounds[ind][0]:
-			y[ind] = bounds[ind][0]
+        elif elem < bounds[ind][0]:
+            y[ind] = bounds[ind][0]
 
-		else:
-			y[ind] = elem
-	return y
+        else:
+            y[ind] = elem
+    return y
 
 
 def lambda_coordinate(fun, x0, index, x):
-	x0[index] = x
-	r = fun(x0)
-	return r
-
-
-def projected_gradient_descent(fun, grad, x, bounds, maxit=10e23, verbose=False, tol=0.000001, nu=0.001):
-	i = 0
-	x_old = x + np.random.randn(x.shape[0])
-	while (i < maxit and np.linalg.norm(x - x_old) > tol):
-		x_old = x
-		x = x - (100 * nu) * grad(x)
-		x = proj(x, bounds)
-
-		if verbose == True:
-			print("Iteration: ", i, " ", fun(x))
-		i += 1
-	res = results()
-	res.x = x
-	return res
-
-
-def projected_gradient_descent(fun, grad, x, bounds, maxit=10e23, verbose=False, tol=0.000001, nu=0.001):
-	i = 0
-	x_old = x + np.random.randn(x.shape[0])
-	while (i < maxit and np.linalg.norm(x - x_old) > tol):
-		x_old = x
-		x = x - (100 * nu) * grad(x)
-		x = proj(x, bounds)
-
-		if verbose == True:
-			print("Iteration: ", i, " ", fun(x))
-		i += 1
-	res = results()
-	res.x = x
-	return res
+    x0[index] = x
+    r = fun(x0)
+    return r
+
+
+def projected_gradient_descent(
+    fun, grad, x, bounds, maxit=10e23, verbose=False, tol=0.000001, nu=0.001
+):
+    i = 0
+    x_old = x + np.random.randn(x.shape[0])
+    while i < maxit and np.linalg.norm(x - x_old) > tol:
+        x_old = x
+        x = x - (100 * nu) * grad(x)
+        x = proj(x, bounds)
+
+        if verbose == True:
+            print("Iteration: ", i, " ", fun(x))
+        i += 1
+    res = results()
+    res.x = x
+    return res
+
+
+def projected_gradient_descent(
+    fun, grad, x, bounds, maxit=10e23, verbose=False, tol=0.000001, nu=0.001
+):
+    i = 0
+    x_old = x + np.random.randn(x.shape[0])
+    while i < maxit and np.linalg.norm(x - x_old) > tol:
+        x_old = x
+        x = x - (100 * nu) * grad(x)
+        x = proj(x, bounds)
+
+        if verbose == True:
+            print("Iteration: ", i, " ", fun(x))
+        i += 1
+    res = results()
+    res.x = x
+    return res
 
 
 def complex_step_derivative(fun, h, x):
-	d = x.shape[1]
-	der = np.zeros(shape=(1, d))
-	for i in range(d):
-		one = np.zeros(shape=(1, d))
-		one[0, i] = 1.0
-		der[0, i] = np.imag((fun(x + 1j * h * one) - fun(x))) / h
-	return der
+    d = x.shape[1]
+    der = np.zeros(shape=(1, d))
+    for i in range(d):
+        one = np.zeros(shape=(1, d))
+        one[0, i] = 1.0
+        der[0, i] = np.imag((fun(x + 1j * h * one) - fun(x))) / h
+    return der
 
 
 def finite_differences(fun, h, x):
-	d = x.size()[1]
-	der = torch.zeros(size=(1, d), dtype=torch.float64)
-	for i in range(d):
-		one = torch.zeros(size=(1, d), dtype=torch.float64)
-		one[0, i] = 1.0
-		der[0, i] = (fun(x + one * h) - fun(x)) / h
-	return der
+    d = x.size()[1]
+    der = torch.zeros(size=(1, d), dtype=torch.float64)
+    for i in range(d):
+        one = torch.zeros(size=(1, d), dtype=torch.float64)
+        one[0, i] = 1.0
+        der[0, i] = (fun(x + one * h) - fun(x)) / h
+    return der
 
 
 def finite_differences_hessian(fun, h, x):
-	d = x.size()[1]
-	hess = torch.zeros(size=(d, d), dtype=torch.float64)
-	for i in range(d):
-		for j in range(d):
-			one_i = torch.zeros(size=(1, d), dtype=torch.float64)
-			one_j = torch.zeros(size=(1, d), dtype=torch.float64)
-			one_i[0, i] = 1.0
-			one_j[0, j] = 1.0
-			hess[i, j] = np.log(
-				np.abs(fun(x + h * one_i + h * one_j) - fun(x + h * one_i) - fun(x + h * one_j) + fun(x))) - 2 * np.log(
-				h)
-
-	hess = torch.exp(hess)
-	return (hess + torch.t(hess)) / 2.
+    d = x.size()[1]
+    hess = torch.zeros(size=(d, d), dtype=torch.float64)
+    for i in range(d):
+        for j in range(d):
+            one_i = torch.zeros(size=(1, d), dtype=torch.float64)
+            one_j = torch.zeros(size=(1, d), dtype=torch.float64)
+            one_i[0, i] = 1.0
+            one_j[0, j] = 1.0
+            hess[i, j] = np.log(
+                np.abs(
+                    fun(x + h * one_i + h * one_j)
+                    - fun(x + h * one_i)
+                    - fun(x + h * one_j)
+                    + fun(x)
+                )
+            ) - 2 * np.log(h)
+
+    hess = torch.exp(hess)
+    return (hess + torch.t(hess)) / 2.0
 
 
 def finite_differences_np(fun, h, x):
-	d = x.shape[0]
-	der = np.zeros(shape=(d))
-	for i in range(d):
-		one = np.zeros(shape=(d))
-		one[i] = 1.0
-		der[i] = (fun(x + one * h) - fun(x)) / h
-	return der
+    d = x.shape[0]
+    der = np.zeros(shape=(d))
+    for i in range(d):
+        one = np.zeros(shape=(d))
+        one[i] = 1.0
+        der[i] = (fun(x + one * h) - fun(x)) / h
+    return der
 
 
-def finite_differences_test(fun, fun_der, x, h_max=1.):
-	n = 10
-	for i in range(n):
-		h = 2 ** (-i) * h_max
-		approx_nabla = finite_differences_np(fun, h, x)
-		print(i, h, np.linalg.norm(approx_nabla - fun_der(x)))
+def finite_differences_test(fun, fun_der, x, h_max=1.0):
+    n = 10
+    for i in range(n):
+        h = 2 ** (-i) * h_max
+        approx_nabla = finite_differences_np(fun, h, x)
+        print(i, h, np.linalg.norm(approx_nabla - fun_der(x)))
 
 
 def sample_custom(inverse_cumulative_distribution, size=(1, 1)):
-	U = np.random.uniform(0, 1, size=size)
-	F = np.vectorize(inverse_cumulative_distribution)
-	Z = F(U)
-	return Z
+    U = np.random.uniform(0, 1, size=size)
+    F = np.vectorize(inverse_cumulative_distribution)
+    Z = F(U)
+    return Z
 
 
 def select_subset(M, S):
-	d = M.shape[0]
-	I = np.zeros(shape=(d, d))
-	I[S, S] = 1.
-	return I @ M @ I
+    d = M.shape[0]
+    I = np.zeros(shape=(d, d))
+    I[S, S] = 1.0
+    return I @ M @ I
 
 
 def select_subset_inv(M, S):
-	M = select_subset(M, S)
-	return np.linalg.pinv(M)
+    M = select_subset(M, S)
+    return np.linalg.pinv(M)
 
 
 def complement_set(S, size):
-	V = set(np.arange(0, size, 1))
-	s = V - set(S)
-	S_C = list(s)
-	return S_C
+    V = set(np.arange(0, size, 1))
+    s = V - set(S)
+    S_C = list(s)
+    return S_C
 
 
 def add_element(elements, new_element):
-	new_out = []
-	for element in elements:
-		new_out.append(element + [[new_element]])
-		new_out.append(element)
-		for j in element:
-			new = copy.deepcopy(element)
-			new.remove(j)
-			new.append(j + [new_element])
-			new_out.append(new)
+    new_out = []
+    for element in elements:
+        new_out.append(element + [[new_element]])
+        new_out.append(element)
+        for j in element:
+            new = copy.deepcopy(element)
+            new.remove(j)
+            new.append(j + [new_element])
+            new_out.append(new)
 
-	return new_out
+    return new_out
 
 
 def get_hierarchy(start=1, new_elements=[2, 3, 4]):
-	elements = [[[start]]]
-	for new_element in new_elements:
-		elements = add_element(elements, new_element)
-	l = []
-	for element in elements:
-		l.append(np.sum([3 ** len(e) for e in element]))
-	indices = np.argsort(l)
-	out = []
-	for index in indices:
-		out.append(elements[index])
-	return out
+    elements = [[[start]]]
+    for new_element in new_elements:
+        elements = add_element(elements, new_element)
+    l = []
+    for element in elements:
+        l.append(np.sum([3 ** len(e) for e in element]))
+    indices = np.argsort(l)
+    out = []
+    for index in indices:
+        out.append(elements[index])
+    return out
 
 
 def likelihood_bernoulli_test(alpha, delta, failure):
-	if alpha == 1.:
-		alpha = 0.99999
+    if alpha == 1.0:
+        alpha = 0.99999
 
-	p = (1 - (np.log(alpha / delta)) / np.log((1 - alpha) / (1 - delta))) ** (-1)
+    p = (1 - (np.log(alpha / delta)) / np.log((1 - alpha) / (1 - delta))) ** (-1)
 
-	dkl = p * np.log(p / delta) + (1 - p) * np.log((1 - p) / (1 - delta))
-	n = np.log(2 / failure) / dkl
-	k = n * p
-	return n, k
+    dkl = p * np.log(p / delta) + (1 - p) * np.log((1 - p) / (1 - delta))
+    n = np.log(2 / failure) / dkl
+    k = n * p
+    return n, k
 
 
 def median_of_means(list, delta=0.01):
-	r = list.shape[0]
-	if r > 3:
-		k = r
-		N = int(np.floor(r / k))
-		means = []
-		for j in range(k - 1):
-			means.append((1. / N) * np.sum(list[(j * N):(j + 1) * N]))
-		return np.median(means)
-	else:
-		return 0.
-
-
-def get_indices(xtest,x):
-	"""
-	Find location of vectors in a larger set
-	:param xtest: torch.Tensor, tensor to be located
-	:param x: torch.Tensor, to be located in xtest
-	:return: list, if None its means it was not found in the original tensor
-	"""
-
-	indices = []
-	for i in range(x.size()[0]):
-		xtrial = x[i,:]
-		mask = torch.all(xtest == xtrial, dim=1)
-		if torch.sum(mask) > 0:
-			index = int(torch.argmax(mask.int()))
-			indices.append(index)
-		else:
-			indices.append(None)
-
-	return indices
+    r = list.shape[0]
+    if r > 3:
+        k = r
+        N = int(np.floor(r / k))
+        means = []
+        for j in range(k - 1):
+            means.append((1.0 / N) * np.sum(list[(j * N) : (j + 1) * N]))
+        return np.median(means)
+    else:
+        return 0.0
+
+
+def get_indices(xtest, x):
+    """
+    Find location of vectors in a larger set
+    :param xtest: torch.Tensor, tensor to be located
+    :param x: torch.Tensor, to be located in xtest
+    :return: list, if None its means it was not found in the original tensor
+    """
+
+    indices = []
+    for i in range(x.size()[0]):
+        xtrial = x[i, :]
+        mask = torch.all(xtest == xtrial, dim=1)
+        if torch.sum(mask) > 0:
+            index = int(torch.argmax(mask.int()))
+            indices.append(index)
+        else:
+            indices.append(None)
+
+    return indices
+
 
 if __name__ == "__main__":
-	x = torch.arange(0,9,1).reshape(3,3)
-	xtrial = torch.Tensor([[0,1,2],[6,7,8],[3,4,5]])
-	print (x)
-	print (get_indices(x,xtrial))
+    x = torch.arange(0, 9, 1).reshape(3, 3)
+    xtrial = torch.tensor([[0, 1, 2], [6, 7, 8], [3, 4, 5]])
+    print(x)
+    print(get_indices(x, xtrial))
diff --git a/stpy/helpers/parallel_interpolation.py b/stpy/helpers/parallel_interpolation.py
new file mode 100644
index 0000000..c070333
--- /dev/null
+++ b/stpy/helpers/parallel_interpolation.py
@@ -0,0 +1,370 @@
+from torch.multiprocessing import Pool
+from os import cpu_count
+import torch
+import numpy as np
+from scipy.spatial import Delaunay, cKDTree
+
+shared_triangulation: Delaunay | None
+xtree: cKDTree
+
+
+def _initialize(tri: Delaunay, tree: cKDTree):
+    """Pool initializer: store the lookup structures in this process's globals."""
+    global shared_triangulation, xtree
+    xtree = tree
+    shared_triangulation = tri
+
+
+def _find_exact_or_simplex_batch(batch: np.ndarray, tri_local=None, xtree_local=None):
+    if tri_local is None:
+        tri_local = shared_triangulation
+    if xtree_local is None:
+        xtree_local = xtree
+    distances, idx = xtree_local.query(batch, k=1, distance_upper_bound=1e-7)
+    exact_match_mask = distances <= 1e-7
+    batch_remaining = batch[~exact_match_mask]
+    # Points without a data point within 1e-7 are looked up in the triangulation.
+    simplices = tri_local.find_simplex(batch_remaining)
+    outside_conv_hull_mask = simplices < 0
+    # Points for which no simplex was found (index < 0) are folded into the mask too.
+    simplices_remaining = simplices[~outside_conv_hull_mask]
+    exact_match_mask[~exact_match_mask] = outside_conv_hull_mask
+    # Masked points whose index equals the number of data points had no close match:
+    idx = idx[exact_match_mask]
+    no_match_mask = idx == len(xtree_local.data)
+    if no_match_mask.any():
+        _, idx_no_match = xtree_local.query(batch[exact_match_mask][no_match_mask], k=1)
+        idx[no_match_mask] = idx_no_match
+    # idx: data index per masked point; simplices_remaining: simplex per unmasked point.
+    return idx, simplices_remaining, exact_match_mask
+
+
+class InterpolatorArray:
+    """Keyed groups of InterpolatorND objects (one per column of phi), one pool per group."""
+
+    def __init__(self, x: torch.Tensor, phi: torch.Tensor, m: int, num_cpu_cores=None):
+        """
+        Args:
+            x: (N, D) input points, shared by all interpolators of group 0.
+            phi: values at x; column j (j < m) feeds the j-th interpolator.
+            num_cpu_cores: workers per pool; None -> cpu_count(), < 1 -> no pool.
+        """
+        # Created first so __del__ stays safe if building group 0 raises below.
+        self.pools = {}
+        self.interpolators = {}
+        if num_cpu_cores is None:
+            num_cpu_cores = cpu_count() or 1  # cpu_count() may return None
+        self.num_cpu_cores = num_cpu_cores
+        self.set(0, x, phi, m)
+
+    def __call__(self, j: int, q: torch.Tensor):
+        """Evaluate interpolator j (groups flattened in insertion order) at q; returns (B, 1)."""
+        flat = [ip for group in self.interpolators.values() for ip in group]
+        return flat[j](q).view(-1, 1)
+
+    def set(self, i: int, x: torch.Tensor, phi: torch.Tensor, m: int):
+        """Build (or replace) group i from points x and the first m columns of phi."""
+        # detach() lets tensors that require grad be exported to NumPy, as
+        # InterpolatorND does when it builds its own triangulation.
+        x_cpu = x.detach().cpu().numpy()
+        tri = Delaunay(x_cpu)
+        xtree = cKDTree(x_cpu)
+        if self.num_cpu_cores >= 1:
+            pool = Pool(self.num_cpu_cores, _initialize, [tri, xtree])
+        else:
+            pool = None
+        self.interpolators[i] = [
+            InterpolatorND(x, phi[:, j], tri, xtree, pool, self.num_cpu_cores)
+            for j in range(m)
+        ]
+        # Shut down the pool of the group being replaced before tracking the new one.
+        old_pool = self.pools.get(i)
+        if old_pool is not None:
+            old_pool.close()
+            old_pool.join()
+        self.pools[i] = pool
+
+    def __del__(self):
+        for pool in self.pools.values():
+            if pool is not None:
+                pool.close()
+                pool.join()
+
+
+class InterpolatorND:
+    """
+    Piecewise linear interpolator for N-dimensional data using Delaunay triangulation.
+    """
+
+    def __init__(
+        self,
+        x: torch.Tensor,
+        y: torch.Tensor,
+        tri=None,
+        xtree=None,
+        pool=None,
+        num_cpu_cores=None,
+    ):
+        """
+        Args:
+            x: (N, D) tensor of input points in D-dimensional space.
+            y: (N,) float64 tensor of function values at those points.
+            tri: Precomputed Delaunay triangulation; built from x here when None.
+            xtree: Precomputed cKDTree for nearest neighbor search (used with tri).
+            pool: Optional multiprocessing pool (used with tri; not closed by this object).
+            num_cpu_cores: Number of worker processes; without tri, None means cpu_count().
+        """
+        assert y.dtype == torch.float64
+
+        if tri is not None:
+            # Use the provided Delaunay triangulation and cKDTree
+            self.tri = tri
+            self.xtree = xtree
+            self.pool = pool
+            self.own_pool = False
+            self.num_cpu_cores = num_cpu_cores
+        else:
+            # Ensure x is on CPU for Delaunay
+            x_cpu = x.detach().cpu().numpy()
+
+            # Build the Delaunay triangulation on CPU
+            self.tri = Delaunay(x_cpu)
+            self.xtree = cKDTree(x_cpu)
+
+            if num_cpu_cores is None:
+                num_cpu_cores = cpu_count() or 1  # cpu_count() may return None
+            self.num_cpu_cores = num_cpu_cores
+            if self.num_cpu_cores >= 1:
+                self.pool = Pool(num_cpu_cores, _initialize, [self.tri, self.xtree])
+            else:
+                # No workers requested: lookups run sequentially in this process.
+                self.pool = None
+            # Only a pool created here is owned, i.e. closed again in __del__.
+            self.own_pool = self.pool is not None
+
+        self.x = x
+        self.y = y
+
+        # Convert triangle simplices to a torch tensor
+        simplices = torch.tensor(self.tri.simplices, device=y.device)
+        self.simplices = simplices  # Shape: (M, D+1), M = # of simplices
+
+        # Gather simplex vertex positions and function values
+        self.tri_pts = x[simplices]  # Shape: (M, D+1, D)
+        self.tri_y = y[simplices]  # Shape: (M, D+1)
+
+        # Precompute matrices for barycentric transformation
+        v0 = self.tri_pts[:, 0, :]  # First vertex of each simplex
+        T = self.tri_pts[:, 1:, :] - v0[:, None, :]  # (M, D, D)
+        T = T.transpose(-1, -2)
+        self.T_inv = torch.inverse(T)  # (M, D, D)
+        self.v0 = v0  # Store v0 for barycentric computation
+
+    def __del__(self):
+        # getattr: __init__ can fail (e.g. on the dtype assert) before own_pool exists.
+        if getattr(self, "own_pool", False) and self.pool is not None:
+            self.pool.close()
+            self.pool.join()
+
+    def __call__(self, xp: torch.Tensor) -> torch.Tensor:
+        """
+        Interpolate y-values at query points xp; out[k] belongs to xp[k].
+
+        Args:
+            xp: (B, D) tensor of query points in D-dimensional space.
+
+        Returns:
+            out: (B,) tensor of interpolated values.
+        """
+
+        xp_cpu = xp.detach().cpu().numpy()  # (B, D), detached copy for the CPU lookup
+
+        # 1) Locate each query on CPU: a data-point index or the containing simplex
+
+        # Split xp_cpu into batches for parallel processing
+        if self.pool is not None:
+            batches = np.array_split(xp_cpu, self.num_cpu_cores)
+            # Use multiprocessing to parallelize find_simplex
+            results = self.pool.map_async(
+                _find_exact_or_simplex_batch, batches
+            ).get(timeout=10)
+        else:
+            # Run find_simplex sequentially
+            results = [_find_exact_or_simplex_batch(xp_cpu, self.tri, self.xtree)]
+
+        # 2) Concatenate the per-batch results back into single arrays
+        # results = [(point_idx0, simplices0, point_mask0), (point_idx1, ...), ...]
+        exact_matches_idx_list = []
+        exact_match_mask_list = []
+        simplices_list = []
+        for exact_matches_idx, simplices, exact_match_mask in results:
+            exact_matches_idx_list.append(exact_matches_idx)
+            exact_match_mask_list.append(exact_match_mask)
+            simplices_list.append(simplices)
+
+        exact_matches_idx = np.concatenate(exact_matches_idx_list)
+        exact_matches_idx = torch.tensor(exact_matches_idx, device=self.y.device)
+        exact_matches_y = self.y[exact_matches_idx]
+        if len(exact_matches_y) == len(xp):
+            return exact_matches_y
+
+        exact_match_mask = np.concatenate(exact_match_mask_list)
+        exact_match_mask = torch.tensor(exact_match_mask, device=self.y.device)
+
+        simplices_remaining = np.concatenate(simplices_list)
+        simplices_remaining = torch.tensor(
+            simplices_remaining, device=self.y.device
+        )  # (Bv,), Bv = number of queries not covered by the mask
+
+        xp_remaining = xp[~exact_match_mask]
+
+        # 3) p - v0: (Bv, D)
+        p_v0 = xp_remaining - self.v0[simplices_remaining]
+
+        # alpha = T_inv @ (p - v0): (Bv, D)
+        T_inv_local = self.T_inv[simplices_remaining]  # (Bv, D, D)
+        # Batched matrix-vector product T_inv_local @ p_v0
+        bary_coords = torch.einsum("bij, bj -> bi", T_inv_local, p_v0)  # (Bv, D)
+
+        # 4) Prepend the coordinate belonging to v0: 1 - sum of the others
+        bary_coords = torch.cat(
+            [1 - bary_coords.sum(dim=-1, keepdim=True), bary_coords], dim=-1
+        )  # (Bv, D+1)
+
+        # 5) Interpolate y-values using barycentric coordinates
+        tri_y_local = self.tri_y[simplices_remaining]  # (Bv, D+1)
+        out_interpolated = (bary_coords * tri_y_local).sum(dim=-1)  # (Bv,)
+
+        # 6) Write both kinds of results back at the positions of their queries
+        result = torch.empty(len(xp), dtype=self.y.dtype, device=self.y.device)
+        result[exact_match_mask] = exact_matches_y
+        result[~exact_match_mask] = out_interpolated
+        return result
+
+
+def plot_simple_function():
+    """Demo: interpolate sin(pi x) cos(pi y) from a 21x21 grid and plot it (uses CUDA)."""
+    import matplotlib.pyplot as plt  # local: the module only imports it under __main__
+    # Define a simple 2D function
+    def simple_function(x, y):
+        return np.sin(np.pi * x) * np.cos(np.pi * y)
+
+    # Generate a grid of points for the original function
+    n_points = 21  # Number of points along each axis
+    x_vals = np.linspace(0, 1, n_points)
+    y_vals = np.linspace(0, 1, n_points)
+    x_grid, y_grid = np.meshgrid(x_vals, y_vals)
+    z_grid = simple_function(x_grid, y_grid)  # Compute function values
+
+    # Flatten the grid for input to the interpolator
+    x_flat = x_grid.flatten()
+    y_flat = y_grid.flatten()
+    z_flat = z_grid.flatten()
+
+    # Convert to PyTorch tensors
+    x_train = torch.tensor(
+        np.column_stack((x_flat, y_flat)), dtype=torch.float64, device="cuda"
+    )
+
+    y_train = torch.tensor(z_flat, dtype=torch.float64, device="cuda")
+
+    # Create the interpolator
+    interpolator = InterpolatorND(x_train, y_train, num_cpu_cores=0)
+
+    # Generate slightly offset query points
+    n_query = 21
+    x_query_vals = np.linspace(0.010, 1.01, n_query)
+    y_query_vals = np.linspace(0.010, 1.01, n_query)
+    x_query_grid, y_query_grid = np.meshgrid(x_query_vals, y_query_vals)
+    # Prepend one row of queries on y = 0; np.concatenate also works on NumPy < 2.0
+    x_query_grid = np.concatenate(
+        [np.linspace(0.0, 1.0, n_query).reshape(1, -1), x_query_grid]
+    )
+    y_query_grid = np.concatenate([np.zeros([1, n_query]), y_query_grid])
+    x_query_flat = x_query_grid.flatten()
+    y_query_flat = y_query_grid.flatten()
+
+    # Convert query points to PyTorch tensors
+    x_query = torch.tensor(
+        np.column_stack((x_query_flat, y_query_flat)),
+        dtype=torch.float64,
+        device="cuda",
+    )
+
+    # Perform interpolation
+    z_query = interpolator(x_query).cpu().numpy()  # Interpolated values
+
+    # Plot the original function as a scatter plot
+    plt.figure(figsize=(30, 10))
+    plt.subplot(1, 2, 1)
+    plt.scatter(x_flat, y_flat, c=z_flat, cmap="viridis", s=40)
+    plt.title("Original Function")
+    plt.colorbar()
+    plt.subplot(1, 2, 2)
+    plt.scatter(x_query_flat, y_query_flat, c=z_query, cmap="viridis", s=200)
+    plt.scatter(
+        x_flat,
+        y_flat,
+        c=z_flat,
+        cmap="viridis",
+        s=200,
+    )
+    plt.title("Interpolated Function with Original Points")
+    plt.colorbar()
+    # Show plots
+    plt.tight_layout()
+    plt.show()
+    print("hi")
+
+
+def interploate_between():
+    """Demo: interpolate six scattered values over the unit square and plot them (uses CUDA)."""
+    import matplotlib.pyplot as plt  # local: the module only imports it under __main__
+
+    # Define 6 points in a 1x1 field
+    x_points = np.array([[0.1, 0.1], [1, 0], [0, 1], [1, 1], [0.3, 0.3], [0.7, 0.7]])
+    # Function value attached to each of the six points
+    y_values = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+
+    # Convert to PyTorch tensors
+    x_train = torch.tensor(x_points, dtype=torch.float64, device="cuda")
+    y_train = torch.tensor(y_values, dtype=torch.float64, device="cuda")
+
+    # Create the interpolator
+    interpolator = InterpolatorND(x_train, y_train, num_cpu_cores=1)
+
+    # Generate a grid of query points
+    n_query = 10  # Number of query points along each axis
+    x_query_vals = np.linspace(0, 1, n_query)
+    y_query_vals = np.linspace(0, 1, n_query)
+    x_query_grid, y_query_grid = np.meshgrid(x_query_vals, y_query_vals)
+    x_query_flat = x_query_grid.flatten()
+    y_query_flat = y_query_grid.flatten()
+
+    # Convert query points to PyTorch tensors
+    x_query = torch.tensor(
+        np.column_stack((x_query_flat, y_query_flat)),
+        dtype=torch.float64,
+        device="cuda",
+    )
+
+    # Perform interpolation
+    z_query = interpolator(x_query).cpu().numpy()  # Interpolated values
+
+    # Plot the interpolated values
+    plt.figure(figsize=(10, 10))
+    plt.scatter(x_query_flat, y_query_flat, c=z_query, cmap="viridis", s=40)
+    plt.scatter(x_points[:, 0], x_points[:, 1], c=y_values, cmap="viridis", s=200)
+    plt.title("Interpolated Values")
+    plt.colorbar()
+    plt.show()
+    print("hi")
+
+
+if __name__ == "__main__":
+    # Script entry point: the imports below become module globals, then one demo runs.
+    import torch
+    import numpy as np
+    import matplotlib.pyplot as plt
+
+    plot_simple_function()
diff --git a/stpy/helpers/plot_helper.py b/stpy/helpers/plot_helper.py
index 1b2eece..3b891b7 100644
--- a/stpy/helpers/plot_helper.py
+++ b/stpy/helpers/plot_helper.py
@@ -4,116 +4,133 @@
 import webcolors
 
 
-def plot_ellipse(offset, cov, scale=1, theta_num=1e3, axis=None, plot_kwargs=None, fill=False, fill_kwargs=None):
-	'''
-	offset = 2d array which gives center of ellipse
-	cov = covariance of ellipse
-	scale = scale ellipse by constant factor
-	theta_num = used for a linspace below, not sure exactly (?)
-
-	'''
-	# Get Ellipse Properties from cov matrix
-
-	eig_vec, eig_val, u = np.linalg.svd(cov)
-	# Make sure 0th eigenvector has positive x-coordinate
-	if eig_vec[0][0] < 0:
-		eig_vec[0] *= -1
-
-	semimaj = np.sqrt(eig_val[0])
-	semimin = np.sqrt(eig_val[1])
-	semimaj *= scale
-	semimin *= scale
-
-	phi = np.arccos(np.dot(eig_vec[0], np.array([1, 0])))
-	if eig_vec[0][1] < 0 and phi > 0:
-		phi *= -1
-
-	# Generate data for ellipse structure
-	theta = np.linspace(0, 2 * np.pi, theta_num)
-	r = 1 / np.sqrt((np.cos(theta)) ** 2 + (np.sin(theta)) ** 2)
-	x = r * np.cos(theta)
-	y = r * np.sin(theta)
-	data = np.array([x, y])
-	S = np.array([[semimaj, 0], [0, semimin]])
-	R = np.array([[np.cos(phi), -np.sin(phi)], [np.sin(phi), np.cos(phi)]])
-	T = np.dot(R, S)
-	data = np.dot(T, data)
-	data[0] += offset[0]
-	data[1] += offset[1]
-
-	# Plot!
-	return_fig = False
-	if axis is None:
-		axis = plt.gca()
-
-	if plot_kwargs is None:
-		p, = axis.plot(data[0], data[1], color='r', linestyle='-')
-	else:
-		p, = axis.plot(data[0], data[1], **plot_kwargs)
-
-	if fill == True:
-		if fill_kwargs is None:
-			fill_kwargs = dict()
-		axis.fill(data[0], data[1], alpha=0.2, color='r')
+def plot_ellipse(
+    offset,
+    cov,
+    scale=1,
+    theta_num=1e3,
+    axis=None,
+    plot_kwargs=None,
+    fill=False,
+    fill_kwargs=None,
+):
+    """
+    offset = 2d array which gives center of ellipse
+    cov = covariance of ellipse (2x2)
+    scale = scale ellipse by constant factor
+    theta_num = number of points sampled along the outline (cast to int)
+    axis, plot_kwargs = axes to draw on (default plt.gca()) and kwargs for its plot call
+    fill, fill_kwargs = also fill the ellipse; kwargs override alpha=0.2 and color="r"
+    """
+    # Get Ellipse Properties from cov matrix
+    # NOTE(review): svd returns singular vectors as the columns of its first output,
+    # while row 0 is used below -- confirm this gives the intended orientation.
+    eig_vec, eig_val, u = np.linalg.svd(cov)
+    # Make sure 0th eigenvector has positive x-coordinate
+    if eig_vec[0][0] < 0:
+        eig_vec[0] *= -1
+
+    semimaj = np.sqrt(eig_val[0]) * scale
+    semimin = np.sqrt(eig_val[1]) * scale
+
+    phi = np.arccos(np.dot(eig_vec[0], np.array([1, 0])))
+    if eig_vec[0][1] < 0 and phi > 0:
+        phi *= -1
+
+    # Generate data for ellipse structure
+    # The default theta_num is the float 1e3, but np.linspace needs an integer count.
+    theta = np.linspace(0, 2 * np.pi, int(theta_num))
+    r = 1 / np.sqrt((np.cos(theta)) ** 2 + (np.sin(theta)) ** 2)
+    x = r * np.cos(theta)
+    y = r * np.sin(theta)
+    data = np.array([x, y])
+    S = np.array([[semimaj, 0], [0, semimin]])
+    R = np.array([[np.cos(phi), -np.sin(phi)], [np.sin(phi), np.cos(phi)]])
+    T = np.dot(R, S)
+    data = np.dot(T, data)
+    data[0] += offset[0]
+    data[1] += offset[1]
+
+    # Plot!
+    if axis is None:
+        axis = plt.gca()
+
+    if plot_kwargs is None:
+        (p,) = axis.plot(data[0], data[1], color="r", linestyle="-")
+    else:
+        (p,) = axis.plot(data[0], data[1], **plot_kwargs)
+
+    if fill:
+        # Caller-supplied fill_kwargs (previously ignored) override the defaults.
+        style = {"alpha": 0.2, "color": "r", **(fill_kwargs or {})}
+        axis.fill(data[0], data[1], **style)
 
 
 def closest_colour(requested_colour):
-	min_colours = {}
-	for name, key in webcolors.css3_hex_to_names.items():
-		r_c, g_c, b_c = webcolors.hex_to_rgb(key)
-		rd = (r_c - requested_colour[0]) ** 2
-		gd = (g_c - requested_colour[1]) ** 2
-		bd = (b_c - requested_colour[2]) ** 2
-		min_colours[(rd + gd + bd)] = name
-	return min_colours[min(min_colours.keys())]
+    min_colours = {}  # squared RGB distance -> colour name
+    for hex_code, name in webcolors.css3_hex_to_names.items():  # hex -> name mapping
+        r_c, g_c, b_c = webcolors.hex_to_rgb(hex_code)
+        rd = (r_c - requested_colour[0]) ** 2
+        gd = (g_c - requested_colour[1]) ** 2
+        bd = (b_c - requested_colour[2]) ** 2
+        min_colours[(rd + gd + bd)] = name
+    return min_colours[min(min_colours.keys())]
 
 
 def get_colour_name(requested_colour):
-	try:
-		closest_name = actual_name = webcolors.rgb_to_name(requested_colour)
-	except ValueError:
-		closest_name = closest_colour(requested_colour)
-		actual_name = None
-	return actual_name, closest_name
-
-
-def colorline(x, y, z=None, cmap=plt.get_cmap('copper'), norm=plt.Normalize(0.0, 1.0),
-			  linewidth=3, alpha=1.0):
-	"""
-	http://nbviewer.ipython.org/github/dpsanders/matplotlib-examples/blob/master/colorline.ipynb
-	http://matplotlib.org/examples/pylab_examples/multicolored_line.html
-	Plot a colored line with coordinates x and y
-	Optionally specify colors in the array z
-	Optionally specify a colormap, a norm function and a line width
-	"""
-
-	# Default colors equally spaced on [0,1]:
-	if z is None:
-		z = np.linspace(0.0, 1.0, len(x))
-
-	# Special case if a single number:
-	if not hasattr(z, "__iter__"):  # to check for numerical input -- this is a hack
-		z = np.array([z])
-
-	z = np.asarray(z)
-
-	segments = make_segments(x, y)
-	lc = mcoll.LineCollection(segments, array=z, cmap=cmap, norm=norm,
-							  linewidth=linewidth, alpha=alpha)
-
-	ax = plt.gca()
-	ax.add_collection(lc)
-
-	return lc
+    try:
+        closest_name = actual_name = webcolors.rgb_to_name(requested_colour)
+    except ValueError:
+        closest_name = closest_colour(requested_colour)
+        actual_name = None
+    return actual_name, closest_name
+
+
+def colorline(
+    x,
+    y,
+    z=None,
+    cmap=plt.get_cmap("copper"),
+    norm=plt.Normalize(0.0, 1.0),
+    linewidth=3,
+    alpha=1.0,
+):
+    """
+    http://nbviewer.ipython.org/github/dpsanders/matplotlib-examples/blob/master/colorline.ipynb
+    http://matplotlib.org/examples/pylab_examples/multicolored_line.html
+    Plot a colored line with coordinates x and y
+    Optionally specify colors in the array z (default: equally spaced on [0, 1])
+    Optionally specify a colormap, a norm function, a line width and an alpha
+    The LineCollection is added to the current axes (plt.gca()) and returned
+    """
+    # Default colors equally spaced on [0,1]:
+    if z is None:
+        z = np.linspace(0.0, 1.0, len(x))
+
+    # Special case if a single number:
+    if not hasattr(z, "__iter__"):  # to check for numerical input -- this is a hack
+        z = np.array([z])
+
+    z = np.asarray(z)
+
+    segments = make_segments(x, y)
+    lc = mcoll.LineCollection(
+        segments, array=z, cmap=cmap, norm=norm, linewidth=linewidth, alpha=alpha
+    )
+
+    ax = plt.gca()
+    ax.add_collection(lc)
+
+    return lc
 
 
 def make_segments(x, y):
-	"""
-	Create list of line segments from x and y coordinates, in the correct format
-	for LineCollection: an array of the form numlines x (points per line) x 2 (x
-	and y) array
-	"""
-
-	points = np.array([x, y]).T.reshape(-1, 1, 2)
-	segments = np.concatenate([points[:-1], points[1:]], axis=1)
-	return segments
+    """
+    Create list of line segments from x and y coordinates, in the correct format
+    for LineCollection: an array of the form numlines x (points per line) x 2 (x
+    and y) array
+    """
+
+    points = np.array([x, y]).T.reshape(-1, 1, 2)
+    segments = np.concatenate([points[:-1], points[1:]], axis=1)
+    return segments
diff --git a/stpy/helpers/plotting_helper.py b/stpy/helpers/plotting_helper.py
index 6a4fa7d..c350229 100644
--- a/stpy/helpers/plotting_helper.py
+++ b/stpy/helpers/plotting_helper.py
@@ -2,7 +2,18 @@
 import sklearn
 
 
-def plot_R2(vals, lcb, ucb, truth, s, truth_lcb=None, truth_ucb=None, show=False, save_file_name=None, name=None):
+def plot_R2(
+    vals,
+    lcb,
+    ucb,
+    truth,
+    s,
+    truth_lcb=None,
+    truth_ucb=None,
+    show=False,
+    save_file_name=None,
+    name=None,
+):
     r2 = sklearn.metrics.r2_score(truth, vals)
     if save_file_name is not None:
         filename = save_file_name
@@ -15,22 +26,28 @@ def plot_R2(vals, lcb, ucb, truth, s, truth_lcb=None, truth_ucb=None, show=False
     plt.xlabel("true")
     plt.ylabel("predicted")
 
-    plt.plot(truth, truth, 'k-')
-    plt.plot(truth, truth + s, 'k--')
-    plt.plot(truth, truth - s, 'k--')
-    plt.plot(truth, vals, color='k', marker='o', linestyle='')
-
-    plt.errorbar(truth, vals, yerr=vals - lcb, color='k', marker='o', linestyle='')
+    plt.plot(truth, truth, "k-")
+    plt.plot(truth, truth + s, "k--")
+    plt.plot(truth, truth - s, "k--")
+    plt.plot(truth, vals, color="k", marker="o", linestyle="")
 
+    plt.errorbar(truth, vals, yerr=vals - lcb, color="k", marker="o", linestyle="")
 
     if save_file_name is not None:
         plt.savefig(filename + "_0.png", dpi=150)
 
-    plt.errorbar(truth, vals, yerr=vals - lcb + 2 * s, color='r', marker='o', linestyle='', zorder = -10)
+    plt.errorbar(
+        truth,
+        vals,
+        yerr=vals - lcb + 2 * s,
+        color="r",
+        marker="o",
+        linestyle="",
+        zorder=-10,
+    )
 
     if save_file_name is not None:
         plt.savefig(filename + "_1.png", dpi=150)
 
-
     if show:
         plt.show()
diff --git a/stpy/helpers/posterior_sampling.py b/stpy/helpers/posterior_sampling.py
index 2318f27..46d3472 100644
--- a/stpy/helpers/posterior_sampling.py
+++ b/stpy/helpers/posterior_sampling.py
@@ -1,4 +1,7 @@
+import sys
 import numpy as np
+import scipy
+from tqdm import tqdm
 
 
 # Python implementation of "Exact Hamiltonian Monte Carlo for Truncated Multivariate Gaussian"
@@ -7,214 +10,221 @@
 
 
 class HmcSampler:
-	min_t = 0.00001
-
-	def __init__(self, dim, init, f, g, verbose):
-		"""
-
-		:param dim:  dimension
-		:param init: (dim, ), the initial value for HMC
-		:param f:    (q, dim), coefficient for linear constraints
-		:param g:    (q,), linear constraints: f*X+g >= 0
-		"""
-		self.dim = dim
-		self.lastSample = init
-		self.f = f
-		self.g = g
-		self.verbose = verbose
-
-	def getNextLinearHitTime(self, a, b):
-		"""
-		the position x(t) = a * sin(t) + b * cos(t)
-
-		:param a: (dim, ) initial value for a (initial velocity)
-		:param b: (dim, ) initial value for b (initial position)
-		:return: hit_time: the time for the hit
-				 cn : the cn-th constraint is active at hit time.
-		"""
-		hit_time = 0
-		cn = 0
-
-		if self.f is None:
-			return hit_time, cn
-
-		f = self.f
-		g = self.g
-		for i in range(f.shape[0]):
-			# constraints: f[i].dot(x)+g[i] >= 0
-			fa = f[i].dot(a)
-			fb = f[i].dot(b)
-			u = np.sqrt(fa * fa + fb * fb)
-			# if u > g[i] and u > -g[i]:
-			if -u < g[i] < u:
-				# otherwise the constrain will always be satisfied
-				phi = np.arctan2(-fa, fb)  # -pi < phi < pi
-				t1 = np.arccos(-g[i] / u) - phi  # -pi < t1 < 2*pi
-
-				if t1 < 0:
-					t1 += 2 * np.pi  # 0 < t1 < 2*pi
-				if np.abs(t1) < self.min_t or \
-						np.abs(t1 - 2 * np.pi) < self.min_t:
-					t1 = 0
-
-				t2 = -t1 - 2 * phi  # -4*pi < t2 < 2*pi
-				if t2 < 0:
-					t2 += 2 * np.pi  # -2*pi < t2 < 2*pi
-				if t2 < 0:
-					t2 += 2 * np.pi  # 0 < t2 < 2*pi
-
-				if np.abs(t2) < self.min_t or \
-						np.abs(t2 - 2 * np.pi) < self.min_t:
-					t2 = 0
-
-				if t1 == 0:
-					t = t2
-				elif t2 == 0:
-					t = t1
-				else:
-					t = np.minimum(t1, t2)
-
-				if self.min_t < t and (hit_time == 0 or t < hit_time):
-					hit_time = t
-					cn = i
-		return hit_time, cn
-
-	def verifyConstraints(self, b):
-		"""
-
-		:param b:
-		:return:
-		"""
-		if self.f is not None:
-			return np.min(self.f @ b + self.g)
-		else:
-			return 1
-
-	def sampleNext(self):
-		T = np.pi / 2  # how much time to move
-		b = self.lastSample
-		dim = self.dim
-
-		count_sample_vel = 0
-
-		while True:
-			velsign = 0
-			# sample new initial velocity
-			a = np.random.normal(0, 1, dim)
-
-			count_sample_vel += 1
-			if self.verbose and count_sample_vel % 50 == 0:
-				print("Has sampled %d times of initial velocity." % count_sample_vel)
-
-			tt = T  # the time left to move
-			while True:
-				t, c1 = self.getNextLinearHitTime(a, b)
-				# t: how much time to move to hit the boundary, if t == 0, move tt
-				# c1: the strict constraint at hit time
-
-				if t == 0 or tt < t:
-					# if no wall to be hit (t == 0) or not enough
-					# time left to hit the wall (tt < t)
-					break
-
-				tt -= t  # time left to move after hitting the wall
-				new_b = np.sin(t) * a + np.cos(t) * b  # hit location
-				hit_vel = np.cos(t) * a - np.sin(t) * b  # hit velocity
-				b = new_b
-				# reflect the velocity and verify that it points in the right direction
-				f2 = np.dot(self.f[c1], self.f[c1])
-				alpha = np.dot(self.f[c1], hit_vel) / f2
-				a = hit_vel - 2 * alpha * self.f[c1]  # reflected velocity
-
-				velsign = a.dot(self.f[c1])
-
-				if velsign < 0:
-					# get out of inner while, resample the velocity and start again
-					# this occurs rarelly, due to numerical instabilities
-					break
-
-			if velsign < 0:
-				# go to the beginning of outer while
-				continue
-
-			bb = np.sin(tt) * a + np.cos(tt) * b
-			check = self.verifyConstraints(bb)
-			if check >= 0:
-				# verify that we don't violate the constraints
-				# due to a numerical instability
-				if self.verbose:
-					print("total number of velocity samples : %d" % count_sample_vel)
-
-				self.lastSample = bb
-				return bb
+    min_t = 0.00001
+
+    def __init__(self, dim, init, f, g, verbose):
+        """Store the chain state; f=None means no constraints are applied.
+        :param dim:  dimension
+        :param init: (dim, ), the initial value for HMC
+        :param f:    (q, dim), coefficient for linear constraints
+        :param g:    (q,), linear constraints: f*X+g >= 0
+        :param verbose: if true, sampleNext prints a note every 50 velocity draws
+        """
+        self.dim = dim
+        self.lastSample = init
+        self.f = f
+        self.g = g
+        self.verbose = verbose
+
+    def getNextLinearHitTime(self, a, b):
+        """
+        the position x(t) = a * sin(t) + b * cos(t)
+
+        :param a: (dim, ) initial value for a (initial velocity)
+        :param b: (dim, ) initial value for b (initial position)
+        :return: hit_time: time of the earliest hit, 0 if no constraint is hit
+                 cn : the cn-th constraint is active at hit time.
+        """
+        hit_time = 0
+        cn = 0
+
+        if self.f is None:
+            return hit_time, cn
+
+        f = self.f
+        g = self.g
+        for i in range(f.shape[0]):
+            # constraints: f[i].dot(x)+g[i] >= 0
+            fa = f[i].dot(a)
+            fb = f[i].dot(b)
+            u = np.sqrt(fa * fa + fb * fb)
+            # if u > g[i] and u > -g[i]:
+            if -u < g[i] < u:
+                # otherwise f[i].dot(x(t))+g[i] never crosses zero on this trajectory
+                phi = np.arctan2(-fa, fb)  # -pi < phi < pi
+                t1 = np.arccos(-g[i] / u) - phi  # -pi < t1 < 2*pi
+
+                if t1 < 0:
+                    t1 += 2 * np.pi  # 0 < t1 < 2*pi
+                if np.abs(t1) < self.min_t or np.abs(t1 - 2 * np.pi) < self.min_t:
+                    t1 = 0
+
+                t2 = -t1 - 2 * phi  # -4*pi < t2 < 2*pi
+                if t2 < 0:
+                    t2 += 2 * np.pi  # -2*pi < t2 < 2*pi
+                if t2 < 0:
+                    t2 += 2 * np.pi  # 0 < t2 < 2*pi
+
+                if np.abs(t2) < self.min_t or np.abs(t2 - 2 * np.pi) < self.min_t:
+                    t2 = 0
+
+                if t1 == 0:
+                    t = t2
+                elif t2 == 0:
+                    t = t1
+                else:
+                    t = np.minimum(t1, t2)
+
+                if self.min_t < t and (hit_time == 0 or t < hit_time):
+                    hit_time = t
+                    cn = i
+        return hit_time, cn
+
+    def verifyConstraints(self, b):
+        """
+        Smallest constraint value at b; it is >= 0 iff every f*b+g >= 0 holds.
+        :param b: (dim, ) point to check
+        :return: minimum entry of f @ b + g, or 1 when there are no constraints
+        """
+        if self.f is not None:
+            return np.min(self.f @ b + self.g)
+        else:
+            return 1
+
+    def sampleNext(self):
+        """Draw the next sample; returns (sample, number of velocity draws it took)."""
+        T = np.pi / 2  # how much time to move
+        b = self.lastSample
+        dim = self.dim
+        count_sample_vel = 0
+        # NOTE(review): b is not reset to self.lastSample when the loop retries -- confirm.
+        while True:
+            velsign = 0
+            # sample new initial velocity
+            a = np.random.normal(0, 1, dim)
+
+            count_sample_vel += 1
+            if self.verbose and count_sample_vel % 50 == 0:
+                print("Has sampled %d times of initial velocity." % count_sample_vel)
+
+            tt = T  # the time left to move
+            while True:
+                t, c1 = self.getNextLinearHitTime(a, b)
+                # t: how much time to move to hit the boundary, if t == 0, move tt
+                # c1: the strict constraint at hit time
+
+                if t == 0 or tt < t:
+                    # if no wall to be hit (t == 0) or not enough
+                    # time left to hit the wall (tt < t)
+                    break
+
+                tt -= t  # time left to move after hitting the wall
+                new_b = np.sin(t) * a + np.cos(t) * b  # hit location
+                hit_vel = np.cos(t) * a - np.sin(t) * b  # hit velocity
+                b = new_b
+                # reflect the velocity and verify that it points in the right direction
+                f2 = np.dot(self.f[c1], self.f[c1])
+                alpha = np.dot(self.f[c1], hit_vel) / f2
+                a = hit_vel - 2 * alpha * self.f[c1]  # reflected velocity
+
+                velsign = a.dot(self.f[c1])
+
+                if velsign < 0:
+                    # get out of inner while, resample the velocity and start again
+                    # this occurs rarely, due to numerical instabilities
+                    break
+
+            if velsign < 0:
+                # go to the beginning of outer while
+                continue
+
+            bb = np.sin(tt) * a + np.cos(tt) * b
+            check = self.verifyConstraints(bb)
+            if check >= 0:
+                # verify that we don't violate the constraints
+                # due to a numerical instability
+                # (if check < 0 the outer loop draws a fresh velocity and retries)
+                self.lastSample = bb
+                return bb, count_sample_vel
 
 
 def tmg(n, mu, M, initial, f=None, g=None, burn_in=30, verbose=False):
-	"""
-	This function generates samples from a Markov chain whose equilibrium distribution is a d-dimensional
-	multivariate Gaussian truncated by linear inequalities. The log probability density is
-	log p(X) = -0.5 (X-mu)^T M^-1 (X-mu) + const
-	in terms of a covariance matrix M and a mean vector mu. The constraints are imposed as explained below.
-	The Markov chain is built using the Hamiltonian Monte Carlo technique.
-
-	:param n:       Number of samples.
-	:param mu:      (m,) vector for the mean of multivariate Gaussian density
-	:param M:       (m,m) covariance matrix of the multivariate Gaussian density
-	:param initial: (m,) vector with the initial value of the Markov chain. Must satisfy
-					the truncation inequalities strictly.
-	:param f:       (q,m) matrix, where q is the number of linear constraints. The constraints require each component
-					of the m-dimensional vector fX+g to be non-negative
-	:param g:       (q,) vector with the constant terms in the above linear constraints.
-	:param burn_in: The number of burn-in iterations. The Markov chain is sampled n + burn_in
-					times, and the last n samples are returned.
-	:param verbose:
-	:return: (n, m)
-	"""
-
-	dim = len(mu)
-	if M.shape[1] != dim:
-		raise ValueError("The covariance matrix must be square.")
-
-	if len(initial) != dim:
-		raise ValueError("Wrong length for initial value vector.")
-
-	# verify that M is positive definite, it will raise an error if M is not SPD
-	R = np.linalg.cholesky(M)
-
-	# we change variable to the canonical frame, and transform back after sampling
-	# X ~ N(mu, M), then R^-1(X-mu) ~ N(0, I)
-	init_trans = scipy.linalg.solve(R, initial - mu)  # the new initial value
-
-	if f is not None:
-		if f.shape[0] != len(g) or f.shape[1] != dim:
-			raise ValueError("Inconsistent linear constraints. f must \
-                              be an d-by-m matrix and g an d-dimensional vector.")
-		# g may contains infinity, extract valid constraints
-		valid = np.logical_and(g < np.inf, g > -np.inf)
-		g = g[valid]
-		f = f[valid]
-
-		# verify initial value satisfies linear constraints
-		if np.any(f @ initial + g < 0):
-			raise ValueError("Initial point violates linear constraints.")
-
-		# map linear constraints to canonical frame
-		f_trans = f @ R
-		g_trans = f @ mu + g
-
-		hmc = HmcSampler(dim, init_trans, f_trans, g_trans, verbose=verbose)
-	else:
-		hmc = HmcSampler(dim, init_trans, f, g, verbose=verbose)
-
-	samples = np.zeros((n, dim))
-	for i in range(burn_in):
-		if verbose:
-			print("=" * 30 + " (burn in) sample {} ".format(i) + "=" * 30)
-		hmc.sampleNext()
-	for i in range(n):
-		if verbose:
-			print("=" * 30 + " sample {} ".format(i) + "=" * 30)
-		samples[i] = hmc.sampleNext()
-
-	# transform back
-	return samples @ R.T + mu
+    """
+    This function generates samples from a Markov chain whose equilibrium distribution is a d-dimensional
+    multivariate Gaussian truncated by linear inequalities. The log probability density is
+    log p(X) = -0.5 (X-mu)^T M^-1 (X-mu) + const
+    in terms of a covariance matrix M and a mean vector mu. The constraints are imposed as explained below.
+    The Markov chain is built using the Hamiltonian Monte Carlo technique.
+
+    :param n:       Number of samples.
+    :param mu:      (m,) vector for the mean of multivariate Gaussian density
+    :param M:       (m,m) covariance matrix of the multivariate Gaussian density
+    :param initial: (m,) vector with the initial value of the Markov chain. Must satisfy
+                                    the truncation inequalities strictly.
+    :param f:       (q,m) matrix, where q is the number of linear constraints. The constraints require each component
+                                    of the m-dimensional vector fX+g to be non-negative
+    :param g:       (q,) vector with the constant terms in the above linear constraints.
+    :param burn_in: The number of burn-in iterations. The Markov chain is sampled n + burn_in
+                                    times, and the last n samples are returned.
+    :param verbose:
+    :return: (n, m)
+    """
+
+    dim = len(mu)
+    if M.shape[1] != dim:
+        raise ValueError("The covariance matrix must be square.")
+
+    if len(initial) != dim:
+        raise ValueError("Wrong length for initial value vector.")
+
+    # verify that M is positive definite, it will raise an error if M is not SPD
+    R = np.linalg.cholesky(M)
+
+    # we change variable to the canonical frame, and transform back after sampling
+    # X ~ N(mu, M), then R^-1(X-mu) ~ N(0, I)
+    init_trans = scipy.linalg.solve(R, initial - mu)  # the new initial value
+
+    if f is not None:
+        if f.shape[0] != len(g) or f.shape[1] != dim:
+            raise ValueError(
+                "Inconsistent linear constraints. f must                              "
+                " be an d-by-m matrix and g an d-dimensional vector."
+            )
+        # g may contain infinity, extract valid constraints
+        valid = np.logical_and(g < np.inf, g > -np.inf)
+        g = g[valid]
+        f = f[valid]
+
+        # verify initial value satisfies linear constraints
+        if np.any(f @ initial + g < 0):
+            raise ValueError("Initial point violates linear constraints.")
+
+        # map linear constraints to canonical frame
+        f_trans = f @ R
+        g_trans = f @ mu + g
+
+        hmc = HmcSampler(dim, init_trans, f_trans, g_trans, verbose=verbose)
+    else:
+        hmc = HmcSampler(dim, init_trans, f, g, verbose=verbose)
+
+    samples = np.zeros((n, dim))
+    for num_steps, desc in [(burn_in, "Burn-In"), (n, "sampling")]:
+        progress_bar = tqdm(range(num_steps), desc=desc, position=0)
+        numbers_bar = tqdm(total=1, bar_format="{desc}", position=1)
+        count_sample_vels = []
+
+        for i in progress_bar:
+            s, count_sample_vel = hmc.sampleNext()
+            if desc == "sampling":
+                samples[i] = s
+
+            if hmc.verbose:
+                count_sample_vels.append(count_sample_vel)
+                numbers_bar.set_description(
+                    "\rtotal number of velocity samples:"
+                    f" {', '.join(map(str, count_sample_vels))}"
+                )
+                numbers_bar.refresh()
+
+    # transform back
+    return samples @ R.T + mu
diff --git a/stpy/helpers/quadrature_helper.py b/stpy/helpers/quadrature_helper.py
index b03d28e..997cd27 100644
--- a/stpy/helpers/quadrature_helper.py
+++ b/stpy/helpers/quadrature_helper.py
@@ -6,253 +6,320 @@
 
 
 def integrate_sin_sin(a, b, omega1, omega2):
-	"""
-
-	:param a:
-	:param b:
-	:param omega1:
-	:param omega2:
-	:return:
-	>>> np.round(integrate_sin_sin(0.2,0.5,2,3),6)
-	0.164678
-	"""
-	eps = 10e-5
-	if np.abs(omega1 - omega2) < eps:
-		F = lambda x: x / 2 - np.sin(2 * omega1 * x) / (4 * omega1)
-	else:
-		F = lambda x: (omega2 * np.sin(omega1 * x) * np.cos(x * omega2) -
-					   omega1 * np.cos(omega1 * x) * np.sin(omega2 * x)) / (omega1 ** 2 - omega2 ** 2)
-	return F(b) - F(a)
+    """
+
+    :param a:
+    :param b:
+    :param omega1:
+    :param omega2:
+    :return:
+    >>> np.round(integrate_sin_sin(0.2,0.5,2,3),6)
+    0.164678
+    """
+    eps = 10e-5
+    if np.abs(omega1 - omega2) < eps:
+        F = lambda x: x / 2 - np.sin(2 * omega1 * x) / (4 * omega1)
+    else:
+        F = lambda x: (
+            omega2 * np.sin(omega1 * x) * np.cos(x * omega2)
+            - omega1 * np.cos(omega1 * x) * np.sin(omega2 * x)
+        ) / (omega1**2 - omega2**2)
+    return F(b) - F(a)
 
 
 def integrate_sin_cos(a, b, omega1, omega2):
-	"""
-
-	:param a:
-	:param b:
-	:param omega1:
-	:param omega2:
-	:return:
-	>>> np.round(integrate_sin_cos(0.2,0.5,2,3),6)
-	0.082903
-	"""
-	eps = 10e-5
-	if np.abs(omega1 - omega2) < eps:
-		F = lambda x: -np.cos(omega1 * x) ** 2 / (2 * omega1)
-	else:
-		F = lambda x: -(omega2 * np.sin(omega1 * x) * np.sin(x * omega2) +
-						omega1 * np.cos(omega1 * x) * np.cos(omega2 * x)) / (omega1 ** 2 - omega2 ** 2)
-	return F(b) - F(a)
+    """
+
+    :param a:
+    :param b:
+    :param omega1:
+    :param omega2:
+    :return:
+    >>> np.round(integrate_sin_cos(0.2,0.5,2,3),6)
+    0.082903
+    """
+    eps = 10e-5
+    if np.abs(omega1 - omega2) < eps:
+        F = lambda x: -np.cos(omega1 * x) ** 2 / (2 * omega1)
+    else:
+        F = lambda x: -(
+            omega2 * np.sin(omega1 * x) * np.sin(x * omega2)
+            + omega1 * np.cos(omega1 * x) * np.cos(omega2 * x)
+        ) / (omega1**2 - omega2**2)
+    return F(b) - F(a)
 
 
 def integrate_cos_cos(a, b, omega1, omega2):
-	"""
-
-	:param a:
-	:param b:
-	:param omega1:
-	:param omega2:
-	:return:
-	>>> np.round(integrate_cos_cos(0.2,0.5,2,3),6)
-	0.116078
-	"""
-	eps = 10e-5
-	if np.abs(omega1 - omega2) < eps:
-		F = lambda x: x / 2 + np.sin(2 * omega1 * x) / (4 * omega1)
-	else:
-		F = lambda x: (omega1 * np.sin(omega1 * x) * np.cos(x * omega2) -
-					   omega2 * np.cos(omega1 * x) * np.sin(omega2 * x)) / (omega1 ** 2 - omega2 ** 2)
-	return F(b) - F(a)
+    """
+
+    :param a:
+    :param b:
+    :param omega1:
+    :param omega2:
+    :return:
+    >>> np.round(integrate_cos_cos(0.2,0.5,2,3),6)
+    0.116078
+    """
+    eps = 10e-5
+    if np.abs(omega1 - omega2) < eps:
+        F = lambda x: x / 2 + np.sin(2 * omega1 * x) / (4 * omega1)
+    else:
+        F = lambda x: (
+            omega1 * np.sin(omega1 * x) * np.cos(x * omega2)
+            - omega2 * np.cos(omega1 * x) * np.sin(omega2 * x)
+        ) / (omega1**2 - omega2**2)
+    return F(b) - F(a)
 
 
 def integrate2d_sin_sin(A, B, C, D, a, b, c, d):
-	Cos = lambda x: np.cos(x)
-	val = (1 / (2 * (b - d) * (b + d))) * (-(((b + d) * (Cos(a * A - A * c + b * C - C * d) -
-														 Cos(a * B - B * c + b * C - C * d))) / (a - c)) + (
-													   (b + d) * (Cos(a * A - A * c + b * D - d * D) -
-																  Cos(a * B - B * c + b * D - d * D))) / (a - c) + (
-													   1 / (
-													   a + c)) * (b - d) * (Cos(A * (a + c) + C * (b + d)) - Cos(
-		B * (a + c) + C * (b + d)) - Cos(A * (a + c) + (b + d) *
-										 D) + Cos(B * (a + c) + (b + d) * D)))
-	return val
+    Cos = lambda x: np.cos(x)
+    val = (1 / (2 * (b - d) * (b + d))) * (
+        -(
+            (
+                (b + d)
+                * (
+                    Cos(a * A - A * c + b * C - C * d)
+                    - Cos(a * B - B * c + b * C - C * d)
+                )
+            )
+            / (a - c)
+        )
+        + (
+            (b + d)
+            * (Cos(a * A - A * c + b * D - d * D) - Cos(a * B - B * c + b * D - d * D))
+        )
+        / (a - c)
+        + (1 / (a + c))
+        * (b - d)
+        * (
+            Cos(A * (a + c) + C * (b + d))
+            - Cos(B * (a + c) + C * (b + d))
+            - Cos(A * (a + c) + (b + d) * D)
+            + Cos(B * (a + c) + (b + d) * D)
+        )
+    )
+    return val
 
 
 def integrate2d_sin_cos(A, B, C, D, a, b, c, d):
-	Sin = lambda x: np.sin(x)
-	val = (1 / (2 * (b - d) * (b + d))) * (((b + d) * (-Sin(a * A - A * c + b * C - C * d) +
-													   Sin(a * B - B * c + b * C - C * d))) / (a - c) + (
-													   (b + d) * (Sin(a * A - A * c + b * D - d * D) -
-																  Sin(a * B - B * c + b * D - d * D))) / (a - c) - (
-													   1 / (a + c)) * (b - d) * (Sin(A * (a + c) + C * (b + d)) -
-																				 Sin(B * (a + c) + C * (b + d)) - Sin(
-				A * (a + c) + (b + d) * D) +
-																				 Sin(B * (a + c) + (b + d) * D)))
-	return val
+    Sin = lambda x: np.sin(x)
+    val = (1 / (2 * (b - d) * (b + d))) * (
+        (
+            (b + d)
+            * (-Sin(a * A - A * c + b * C - C * d) + Sin(a * B - B * c + b * C - C * d))
+        )
+        / (a - c)
+        + (
+            (b + d)
+            * (Sin(a * A - A * c + b * D - d * D) - Sin(a * B - B * c + b * D - d * D))
+        )
+        / (a - c)
+        - (1 / (a + c))
+        * (b - d)
+        * (
+            Sin(A * (a + c) + C * (b + d))
+            - Sin(B * (a + c) + C * (b + d))
+            - Sin(A * (a + c) + (b + d) * D)
+            + Sin(B * (a + c) + (b + d) * D)
+        )
+    )
+    return val
 
 
 def integrate2d_cos_cos(A, B, C, D, a, b, c, d):
-	Cos = lambda x: np.cos(x)
-	val = -(1 / (2 * (b - d) * (b + d))) * (((b + d)(Cos(a * A - A * c + b * C - C * d) -
-													 Cos(a * B - B * c + b * C - C * d))) / (
-													a - c) - ((b + d) * (Cos(a * A - A * c + b * D - d * D) -
-																		 Cos(a * B - B * c + b * D - d * D))) / (
-														a - c) + (1 / (
-			a + c)) * (b - d) * (Cos(A * (a + c) + C * (b + d)) -
-								 Cos(B * (a + c) + C * (b + d)) - Cos(A * (a + c) + (b + d) * D) + Cos(
-				B * (a + c) + (b + d) * D)))
-	return val
+    Cos = lambda x: np.cos(x)
+    val = -(1 / (2 * (b - d) * (b + d))) * (
+        (
+            (b + d)(
+                Cos(a * A - A * c + b * C - C * d) - Cos(a * B - B * c + b * C - C * d)
+            )
+        )
+        / (a - c)
+        - (
+            (b + d)
+            * (Cos(a * A - A * c + b * D - d * D) - Cos(a * B - B * c + b * D - d * D))
+        )
+        / (a - c)
+        + (1 / (a + c))
+        * (b - d)
+        * (
+            Cos(A * (a + c) + C * (b + d))
+            - Cos(B * (a + c) + C * (b + d))
+            - Cos(A * (a + c) + (b + d) * D)
+            + Cos(B * (a + c) + (b + d) * D)
+        )
+    )
+    return val
 
 
 def integrate_sin_multidimensional(a, b, omegas):
-	"""
-
-	:param a: bounds start
-	:param b: bounds end
-	:param omegas: frequencies
-	:return:
-	>>> np.round(integrate_sin_multidimensional(np.array([0.5]),np.array([1.]),np.array([2.])),5)
-	0.47822
-	>>> np.round(integrate_sin_multidimensional(np.array([0.5,0.3]),np.array([1.,4.]),np.array([2.,5.])),5)
-	-0.01037
-	>>> np.round(integrate_sin_multidimensional(np.array([0.5,0.3,0.8]),np.array([1.,4.,3.1]),np.array([2.,5.,1.5])),5)
-	0.02232
-	"""
-	d = omegas.shape[0]
-
-	z = np.array([omegas * b, omegas * a])
-	sign = np.array([omegas * 0, omegas * 0 + 1])
-	ar = cartesian([z[:, i] for i in range(z.shape[1])])
-	signs = cartesian([sign[:, i] for i in range(sign.shape[1])])
-	signs = np.sum(signs, axis=1)
-	ar = np.sum(ar, axis=1)
-	k = 1. / np.prod(omegas)
-	# print (ar)
-
-	if d % 2 == 1:
-		r = np.cos(ar)
-		if d % 4 == 1:
-			r = -r
-		for i in range(r.shape[0]):
-			if signs[i] % 2 == 1:
-				r[i] = -r[i]
-	else:
-		r = np.sin(ar)
-		if d % 4 == 3:
-			r = -r
-		for i in range(r.shape[0]):
-			if signs[i] % 2 == 0:
-				r[i] = -r[i]
-	return k * np.sum(r)
+    """
+
+    :param a: bounds start
+    :param b: bounds end
+    :param omegas: frequencies
+    :return:
+    >>> np.round(integrate_sin_multidimensional(np.array([0.5]),np.array([1.]),np.array([2.])),5)
+    0.47822
+    >>> np.round(integrate_sin_multidimensional(np.array([0.5,0.3]),np.array([1.,4.]),np.array([2.,5.])),5)
+    -0.01037
+    >>> np.round(integrate_sin_multidimensional(np.array([0.5,0.3,0.8]),np.array([1.,4.,3.1]),np.array([2.,5.,1.5])),5)
+    0.02232
+    """
+    d = omegas.shape[0]
+
+    z = np.array([omegas * b, omegas * a])
+    sign = np.array([omegas * 0, omegas * 0 + 1])
+    ar = cartesian([z[:, i] for i in range(z.shape[1])])
+    signs = cartesian([sign[:, i] for i in range(sign.shape[1])])
+    signs = np.sum(signs, axis=1)
+    ar = np.sum(ar, axis=1)
+    k = 1.0 / np.prod(omegas)
+    # print (ar)
+
+    if d % 2 == 1:
+        r = np.cos(ar)
+        if d % 4 == 1:
+            r = -r
+        for i in range(r.shape[0]):
+            if signs[i] % 2 == 1:
+                r[i] = -r[i]
+    else:
+        r = np.sin(ar)
+        if d % 4 == 3:
+            r = -r
+        for i in range(r.shape[0]):
+            if signs[i] % 2 == 0:
+                r[i] = -r[i]
+    return k * np.sum(r)
 
 
 def integrate_cos_multidimensional(a, b, omegas):
-	"""
-
-	:param a: bounds start
-	:param b: bounds end
-	:param omegas: frequencies
-	:return:
-	>>> np.round(integrate_cos_multidimensional(np.array([0.5]),np.array([1.]),np.array([2.])),5)
-	0.03391
-	>>> np.round(integrate_cos_multidimensional(np.array([0.5,0.3]),np.array([1.,4.]),np.array([2.,5.])),5)
-	0.03169
-	>>> np.round(integrate_cos_multidimensional(np.array([0.5,0.3,0.8]),np.array([1.,4.,3.1]),np.array([2.,5.,1.5])),5)
-	-0.03784
-	"""
-	d = omegas.shape[0]
-
-	z = np.array([omegas * b, omegas * a])
-	sign = np.array([omegas * 0, omegas * 0 + 1])
-	# print(z)
-	ar = cartesian([z[:, i] for i in range(z.shape[1])])
-	signs = cartesian([sign[:, i] for i in range(sign.shape[1])])
-	signs = np.sum(signs, axis=1)
-	ar = np.sum(ar, axis=1)
-	k = 1. / np.prod(omegas)
-	# print (ar)
-
-	if d % 2 == 1:
-		r = np.sin(ar)
-		if d % 4 == 3:
-			r = -r
-		for i in range(r.shape[0]):
-			if signs[i] % 2 == 1:
-				r[i] = -r[i]
-	else:
-		r = np.cos(ar)
-		if d % 4 == 1:
-			r = -r
-		for i in range(r.shape[0]):
-			if signs[i] % 2 == 0:
-				r[i] = -r[i]
-
-	return k * np.sum(r)
+    """
+
+    :param a: bounds start
+    :param b: bounds end
+    :param omegas: frequencies
+    :return:
+    >>> np.round(integrate_cos_multidimensional(np.array([0.5]),np.array([1.]),np.array([2.])),5)
+    0.03391
+    >>> np.round(integrate_cos_multidimensional(np.array([0.5,0.3]),np.array([1.,4.]),np.array([2.,5.])),5)
+    0.03169
+    >>> np.round(integrate_cos_multidimensional(np.array([0.5,0.3,0.8]),np.array([1.,4.,3.1]),np.array([2.,5.,1.5])),5)
+    -0.03784
+    """
+    d = omegas.shape[0]
+
+    z = np.array([omegas * b, omegas * a])
+    sign = np.array([omegas * 0, omegas * 0 + 1])
+    # print(z)
+    ar = cartesian([z[:, i] for i in range(z.shape[1])])
+    signs = cartesian([sign[:, i] for i in range(sign.shape[1])])
+    signs = np.sum(signs, axis=1)
+    ar = np.sum(ar, axis=1)
+    k = 1.0 / np.prod(omegas)
+    # print (ar)
+
+    if d % 2 == 1:
+        r = np.sin(ar)
+        if d % 4 == 3:
+            r = -r
+        for i in range(r.shape[0]):
+            if signs[i] % 2 == 1:
+                r[i] = -r[i]
+    else:
+        r = np.cos(ar)
+        if d % 4 == 1:
+            r = -r
+        for i in range(r.shape[0]):
+            if signs[i] % 2 == 0:
+                r[i] = -r[i]
+
+    return k * np.sum(r)
 
 
 def romberg2d(func, x1, x2, y1, y2):
-	"""
-
-	:param func:
-	:param x1:
-	:param x2:
-	:param y1:
-	:param y2:
-	:return:
-	>>> np.round(romberg2d(lambda x,y:2*x**2+y**2,0,1,1,2),5)
-	3.0
-	"""
-	func2 = lambda y, a, b: integrate.romberg(func, a, b, args=(y,))
-	return integrate.romberg(func2, y1, y2, args=(x1, x2))
-
-
-def quadvec2(func, x1, x2, y1, y2, epsabs=1e-200, epsrel=1e-08, limit=1000, workers=1, quadrature='gk21'):
-	"""
-	>>> alpha = np.linspace(0.0, 2.0, num=30)
-	>>> np.round(quadvec2(lambda x,y: x**alpha + y**alpha,0,1,1,2)[0],5)
-	2.0
-	>>> np.round(quadvec2(lambda x,y: 2*x**alpha + y**alpha,0,1,1,2)[-1],5)
-	3.0
-	"""
-	func2 = lambda y: \
-	integrate.quad_vec(lambda x: func(x, y), x1, x2, epsabs=epsabs, epsrel=epsrel, limit=limit, quadrature=quadrature)[
-		0]
-	res = integrate.quad_vec(func2, y1, y2, epsabs=epsabs, epsrel=epsrel, limit=limit, quadrature=quadrature)
-	return res[0]
+    """
+
+    :param func:
+    :param x1:
+    :param x2:
+    :param y1:
+    :param y2:
+    :return:
+    >>> np.round(romberg2d(lambda x,y:2*x**2+y**2,0,1,1,2),5)
+    3.0
+    """
+    func2 = lambda y, a, b: integrate.romberg(func, a, b, args=(y,))
+    return integrate.romberg(func2, y1, y2, args=(x1, x2))
+
+
+def quadvec2(
+    func,
+    x1,
+    x2,
+    y1,
+    y2,
+    epsabs=1e-200,
+    epsrel=1e-08,
+    limit=1000,
+    workers=1,
+    quadrature="gk21",
+):
+    """
+    >>> alpha = np.linspace(0.0, 2.0, num=30)
+    >>> np.round(quadvec2(lambda x,y: x**alpha + y**alpha,0,1,1,2)[0],5)
+    2.0
+    >>> np.round(quadvec2(lambda x,y: 2*x**alpha + y**alpha,0,1,1,2)[-1],5)
+    3.0
+    """
+    func2 = lambda y: integrate.quad_vec(
+        lambda x: func(x, y),
+        x1,
+        x2,
+        epsabs=epsabs,
+        epsrel=epsrel,
+        limit=limit,
+        quadrature=quadrature,
+    )[0]
+    res = integrate.quad_vec(
+        func2, y1, y2, epsabs=epsabs, epsrel=epsrel, limit=limit, quadrature=quadrature
+    )
+    return res[0]
 
 
 def AvgEig(Phi, xtest):
-	n = Phi(xtest[0].view(1, -1)).size()[0]
-	A = torch.zeros(size=(n, n), dtype=torch.float64)
-	for x in xtest:
-		v = Phi(x.view(1, -1)).view(-1, 1)
-		A = A + v @ v.T
-	A = A / xtest.size()[0]
-	# import matplotlib.pyplot as plt
-	# plt.imshow(A)
-	# plt.colorbar()
-	# plt.show()
-	maxeig = torch.min(torch.symeig(A)[0])
-	return maxeig
+    n = Phi(xtest[0].view(1, -1)).size()[0]
+    A = torch.zeros(size=(n, n), dtype=torch.float64)
+    for x in xtest:
+        v = Phi(x.view(1, -1)).view(-1, 1)
+        A = A + v @ v.T
+    A = A / xtest.size()[0]
+    # import matplotlib.pyplot as plt
+    # plt.imshow(A)
+    # plt.colorbar()
+    # plt.show()
+    maxeig = torch.min(torch.symeig(A)[0])
+    return maxeig
 
 
 def volume_eig(Phi, xtest, alpha=0.5):
-	n = Phi(xtest[0].view(1, -1)).size()[0]
-	A = torch.zeros(size=(n, n), dtype=torch.float64)
-	for x in xtest:
-		v = Phi(x.view(1, -1)).view(-1, 1)
-		mineig = torch.min(torch.symeig(v @ v.T)[0])
-		print(mineig)
-	vol = 0
-	return vol
+    n = Phi(xtest[0].view(1, -1)).size()[0]
+    A = torch.zeros(size=(n, n), dtype=torch.float64)
+    for x in xtest:
+        v = Phi(x.view(1, -1)).view(-1, 1)
+        mineig = torch.min(torch.symeig(v @ v.T)[0])
+        print(mineig)
+    vol = 0
+    return vol
 
 
 def chebyschev_nodes(n, d=1, L_infinity_ball=1):
-	nodes, w = np.polynomial.chebyshev.chebgauss(n)
-	arrays = [nodes.reshape(n, 1) for i in range(d)]
-	xtest = cartesian(arrays)
-	return xtest
+    nodes, w = np.polynomial.chebyshev.chebgauss(n)
+    arrays = [nodes.reshape(n, 1) for i in range(d)]
+    xtest = cartesian(arrays)
+    return xtest
 
 
 if __name__ == "__main__":
-	pass
+    pass
diff --git a/stpy/helpers/scores.py b/stpy/helpers/scores.py
index 133db9c..af1cf97 100644
--- a/stpy/helpers/scores.py
+++ b/stpy/helpers/scores.py
@@ -1,4 +1,5 @@
 import torch
 
-def r_score_std(y_true, y_pred, std, alpha = 1.):
-	return 1 - torch.mean((y_true - y_pred)**2)/(alpha*std**2)
\ No newline at end of file
+
+def r_score_std(y_true, y_pred, std, alpha=1.0):
+    return 1 - torch.mean((y_true - y_pred) ** 2) / (alpha * std**2)
diff --git a/stpy/helpers/transformations.py b/stpy/helpers/transformations.py
index 6a77ee1..2ea4eb1 100644
--- a/stpy/helpers/transformations.py
+++ b/stpy/helpers/transformations.py
@@ -5,40 +5,48 @@
 
 
 def transform(X, low=-1, high=1, functions=True, offsets=None):
-	n, d = X.size()
-	Y = X.clone()
-	transforms = []
-	inv_transforms = []
-
-	for i in range(d):
-
-		if offsets is None:
-			xmin = torch.min(X[:, i]).clone().numpy()
-			xmax = torch.max(X[:, i]).clone().numpy()
-		else:
-			xmin = offsets[i][0]
-			xmax = offsets[i][1]
-
-		k = copy.copy(float((xmin - xmax) / ((low - high))))
-		q = copy.copy(float(xmin - low * k))
-
-		k2 = copy.copy(float((low - high) / (xmin - xmax)))
-		q2 = copy.copy(float(high - xmax * k2))
-
-		inv_transform = lambda a, k=k, q=q: k * a + q
-		transform = lambda a, k2=k2, q2=q2: k2 * a + q2
-
-		transforms.append(copy.copy(transform))
-		inv_transforms.append(copy.copy(inv_transform))
-
-		Y[:, i] = torch.from_numpy(np.apply_along_axis(transform, 0, X[:, i].numpy()))
-
-	trans = lambda Z: torch.stack(
-		[torch.from_numpy(np.apply_along_axis(transforms[i], 0, Z[:, i].numpy())) for i in range(d)]).T
-	inv_trans = lambda Y: torch.stack(
-		[torch.from_numpy(np.apply_along_axis(inv_transforms[i], 0, Y[:, i].numpy())) for i in range(d)]).T
-
-	if functions == True:
-		return Y, trans, inv_trans, transforms, inv_transforms
-	else:
-		return Y
+    n, d = X.size()
+    Y = X.clone()
+    transforms = []
+    inv_transforms = []
+
+    for i in range(d):
+
+        if offsets is None:
+            xmin = torch.min(X[:, i]).clone().numpy()
+            xmax = torch.max(X[:, i]).clone().numpy()
+        else:
+            xmin = offsets[i][0]
+            xmax = offsets[i][1]
+
+        k = copy.copy(float((xmin - xmax) / ((low - high))))
+        q = copy.copy(float(xmin - low * k))
+
+        k2 = copy.copy(float((low - high) / (xmin - xmax)))
+        q2 = copy.copy(float(high - xmax * k2))
+
+        inv_transform = lambda a, k=k, q=q: k * a + q
+        transform = lambda a, k2=k2, q2=q2: k2 * a + q2
+
+        transforms.append(copy.copy(transform))
+        inv_transforms.append(copy.copy(inv_transform))
+
+        Y[:, i] = torch.from_numpy(np.apply_along_axis(transform, 0, X[:, i].numpy()))
+
+    trans = lambda Z: torch.stack(
+        [
+            torch.from_numpy(np.apply_along_axis(transforms[i], 0, Z[:, i].numpy()))
+            for i in range(d)
+        ]
+    ).T
+    inv_trans = lambda Y: torch.stack(
+        [
+            torch.from_numpy(np.apply_along_axis(inv_transforms[i], 0, Y[:, i].numpy()))
+            for i in range(d)
+        ]
+    ).T
+
+    if functions == True:
+        return Y, trans, inv_trans, transforms, inv_transforms
+    else:
+        return Y
diff --git a/stpy/helpers/voxel_grid.py b/stpy/helpers/voxel_grid.py
new file mode 100644
index 0000000..b4c309e
--- /dev/null
+++ b/stpy/helpers/voxel_grid.py
@@ -0,0 +1,63 @@
+from typing import List, Optional, Union
+
+
+import torch
+from torch import Tensor
+
+from torch_cluster import grid_cluster
+
+
+def _get_n_voxels(x, size: float):
+    size = torch.full([x.shape[1]], size)
+    indices = grid_cluster(x, size)
+    return indices.unique().numel()
+
+
+def voxel_grid(
+    x: Tensor,
+    size: Union[float, Tensor, None] = None,
+    max_n_voxels: int | None = None,
+) -> Tensor:
+
+    # Do binary search to find the right voxel size that yields <= max_n_voxels
+    if size is None:
+        assert max_n_voxels is not None, "One of size, n_voxels must be given"
+        max_size = (x.max(dim=0).values - x.min(dim=0).values).max().item()
+        tol = max_size / 1e7
+        low, high = 0, max_size
+        while high - low > tol:
+            mid = (low + high) / 2
+            n_voxels = _get_n_voxels(x, mid)
+            if n_voxels > max_n_voxels:
+                low = mid
+            else:
+                high = mid
+        size = high
+
+    if isinstance(size, float):
+        size = torch.full([x.shape[1]], size)
+    indices = grid_cluster(x, size).unsqueeze(1).expand(-1, x.shape[1])
+    out = torch.full(
+        [indices.max() + 1, x.shape[1]], torch.nan, dtype=x.dtype, device=x.device
+    )
+    averaged = out.scatter_reduce(0, indices, x, reduce="mean", include_self=False)
+    return averaged[~torch.isnan(averaged).any(dim=1)]
+
+
+if __name__ == "__main__":
+
+    # Example usage of voxel_grid
+    x = torch.tensor(
+        [
+            [0.1, 0.2, 0.3],
+            [2, 0, 0],
+            [0, 2, 0],
+            [0, 0, 2],
+            [2.1, 2.2, 2.3],
+            [3, 3, 3],
+        ]
+    )
+    size = 1.0
+
+    result = voxel_grid(x, max_n_voxels=3)
+    print(result)
diff --git a/stpy/helpers/wavelets.py b/stpy/helpers/wavelets.py
index 9a378af..5872fde 100644
--- a/stpy/helpers/wavelets.py
+++ b/stpy/helpers/wavelets.py
@@ -1,26 +1,26 @@
 from mpmath import *
 
 phi = lambda x: (0 <= x < 1)  # scaling fct
-psi = lambda x: (0 <= x < .5) - (.5 <= x < 1)  # wavelet fct
-phi_j_k = lambda x, j, k: 2 ** (j / 2) * phi(2 ** j * x - k)
-psi_j_k = lambda x, j, k: 2 ** (j / 2) * psi(2 ** j * x - k)
+psi = lambda x: (0 <= x < 0.5) - (0.5 <= x < 1)  # wavelet fct
+phi_j_k = lambda x, j, k: 2 ** (j / 2) * phi(2**j * x - k)
+psi_j_k = lambda x, j, k: 2 ** (j / 2) * psi(2**j * x - k)
 
 
 def haar(f, interval, level):
-	c0 = quadgl(lambda t: f(t) * phi_j_k(t, 0, 0), interval)
+    c0 = quadgl(lambda t: f(t) * phi_j_k(t, 0, 0), interval)
 
-	coef = []
-	for j in xrange(0, level):
-		for k in xrange(0, 2 ** j):
-			djk = quadgl(lambda t: f(t) * psi_j_k(t, j, k), interval)
-			coef.append((j, k, djk))
+    coef = []
+    for j in xrange(0, level):
+        for k in xrange(0, 2**j):
+            djk = quadgl(lambda t: f(t) * psi_j_k(t, j, k), interval)
+            coef.append((j, k, djk))
 
-	return c0, coef
+    return c0, coef
 
 
 def haarval(haar_coef, x):
-	c0, coef = haar_coef
-	s = c0 * phi_j_k(x, 0, 0)
-	for j, k, djk in coef:
-		s += djk * psi_j_k(x, j, k)
-	return s
+    c0, coef = haar_coef
+    s = c0 * phi_j_k(x, 0, 0)
+    for j, k, djk in coef:
+        s += djk * psi_j_k(x, j, k)
+    return s
diff --git a/stpy/kernel_functions/additive_decorator.py b/stpy/kernel_functions/additive_decorator.py
index 718ef62..a477f67 100644
--- a/stpy/kernel_functions/additive_decorator.py
+++ b/stpy/kernel_functions/additive_decorator.py
@@ -1,5 +1,6 @@
 def additive(func):
     def wrapper():
 
-	    func()
-    return wrapper
\ No newline at end of file
+        func()
+
+    return wrapper
diff --git a/stpy/kernel_functions/ard_kernel.py b/stpy/kernel_functions/ard_kernel.py
index 353cdc7..21a1ba7 100644
--- a/stpy/kernel_functions/ard_kernel.py
+++ b/stpy/kernel_functions/ard_kernel.py
@@ -3,91 +3,91 @@
 
 
 def ard_kernel(a, b, **kwargs):
-	p = KernelParams(kwargs)
-	p.assert_existence(["ard_gamma", "kappa", "group"])
+    p = KernelParams(kwargs)
+    p.assert_existence(["ard_gamma", "kappa", "group"])
 
-	a = a[:, p.group]
-	b = b[:, p.group]
+    a = a[:, p.group]
+    b = b[:, p.group]
 
-	D = torch.diag(1. / (p.ard_gamma[p.group]))
+    D = torch.diag(1.0 / (p.ard_gamma[p.group]))
 
-	a = torch.mm(a, D)
-	b = torch.mm(b, D)
+    a = torch.mm(a, D)
+    b = torch.mm(b, D)
 
-	normx = torch.sum(a ** 2, dim=1).reshape(-1, 1)
-	normy = torch.sum(b ** 2, dim=1).reshape(-1, 1)
+    normx = torch.sum(a**2, dim=1).reshape(-1, 1)
+    normy = torch.sum(b**2, dim=1).reshape(-1, 1)
 
-	product = torch.mm(b, torch.t(a))
-	sqdist = -2 * product + torch.t(normx) + normy
-	arg = - 0.5 * sqdist
-	res = torch.exp(arg)
-	return p.kappa * res
+    product = torch.mm(b, torch.t(a))
+    sqdist = -2 * product + torch.t(normx) + normy
+    arg = -0.5 * sqdist
+    res = torch.exp(arg)
+    return p.kappa * res
 
 
 def ard_kernel_diag(a, b, **kwargs):
-	p = KernelParams(kwargs)
-	p.assert_existence(["ard_gamma", "kappa", "group"])
+    p = KernelParams(kwargs)
+    p.assert_existence(["ard_gamma", "kappa", "group"])
 
-	a = a[:, p.group]
-	b = b[:, p.group]
+    a = a[:, p.group]
+    b = b[:, p.group]
 
-	D = torch.diag(1. / (p.ard_gamma[p.group]))
-	a = torch.mm(a, D)
-	b = torch.mm(b, D)
-	normx = torch.sum(a ** 2, dim=1).reshape(-1, 1)
-	normy = torch.sum(b ** 2, dim=1).reshape(-1, 1)
+    D = torch.diag(1.0 / (p.ard_gamma[p.group]))
+    a = torch.mm(a, D)
+    b = torch.mm(b, D)
+    normx = torch.sum(a**2, dim=1).reshape(-1, 1)
+    normy = torch.sum(b**2, dim=1).reshape(-1, 1)
 
-	product = torch.mm(b, torch.t(a))
-	sqdist = -2 * product + torch.t(normx) + normy
-	arg = - 0.5 * sqdist
-	res = torch.exp(arg)
-	return p.kappa * res
+    product = torch.mm(b, torch.t(a))
+    sqdist = -2 * product + torch.t(normx) + normy
+    arg = -0.5 * sqdist
+    res = torch.exp(arg)
+    return p.kappa * res
 
 
 def ard_per_group_kernel_additive(self, a, b, **kwargs):
-	if 'kappa' in kwargs.keys():
-		kappa = kwargs['kappa']
-	else:
-		kappa = self.kappa
-
-	if 'groups' in kwargs.keys():
-		groups = kwargs['groups']
-	else:
-		groups = self.groups
-
-	if 'ard_per_group' in kwargs.keys():
-		ard_per_group = kwargs['ard_per_group']
-	else:
-		raise AssertionError("This kernel requires 'ard_per_group' initial parameters")
-
-	(n, z) = tuple(a.size())
-	(q, m) = tuple(b.size())
-
-	r = torch.zeros(size=(q, n), dtype=torch.float64)
-	groups_index = 0
-
-	for group_add in groups:
-		kwargs['group'] = group_add
-
-		size_group = len(group_add)
-		# use per group lenghtscale
-		# kwargs['ard_gamma'] = ard_per_group[groups_index:groups_index+size_group]
-		gamma = ard_per_group[groups_index:groups_index + size_group]
-		groups_index += size_group
-
-		ax = a[:, group_add]
-		bx = b[:, group_add]
-		D = torch.diag(1. / (gamma))
-		ax = torch.mm(ax, D)
-		bx = torch.mm(bx, D)
-		normx = torch.sum(ax ** 2, dim=1).reshape(-1, 1)
-		normy = torch.sum(bx ** 2, dim=1).reshape(-1, 1)
-		product = torch.mm(bx, torch.t(ax))
-		# sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
-		sqdist = -2 * product + torch.t(normx) + normy
-		arg = - 0.5 * sqdist
-		res = torch.exp(arg)
-		r = r + res
-
-	r = r / float(len(groups))
-	return kappa * r
\ No newline at end of file
+    if "kappa" in kwargs.keys():
+        kappa = kwargs["kappa"]
+    else:
+        kappa = self.kappa
+
+    if "groups" in kwargs.keys():
+        groups = kwargs["groups"]
+    else:
+        groups = self.groups
+
+    if "ard_per_group" in kwargs.keys():
+        ard_per_group = kwargs["ard_per_group"]
+    else:
+        raise AssertionError("This kernel requires 'ard_per_group' initial parameters")
+
+    (n, z) = tuple(a.size())
+    (q, m) = tuple(b.size())
+
+    r = torch.zeros(size=(q, n), dtype=torch.float64)
+    groups_index = 0
+
+    for group_add in groups:
+        kwargs["group"] = group_add
+
+        size_group = len(group_add)
+        # use per group lengthscale
+        # kwargs['ard_gamma'] = ard_per_group[groups_index:groups_index+size_group]
+        gamma = ard_per_group[groups_index : groups_index + size_group]
+        groups_index += size_group
+
+        ax = a[:, group_add]
+        bx = b[:, group_add]
+        D = torch.diag(1.0 / (gamma))
+        ax = torch.mm(ax, D)
+        bx = torch.mm(bx, D)
+        normx = torch.sum(ax**2, dim=1).reshape(-1, 1)
+        normy = torch.sum(bx**2, dim=1).reshape(-1, 1)
+        product = torch.mm(bx, torch.t(ax))
+        # sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
+        sqdist = -2 * product + torch.t(normx) + normy
+        arg = -0.5 * sqdist
+        res = torch.exp(arg)
+        r = r + res
+
+    r = r / float(len(groups))
+    return kappa * r
diff --git a/stpy/kernel_functions/covar_kernel.py b/stpy/kernel_functions/covar_kernel.py
index 070cdc3..2ebdecc 100644
--- a/stpy/kernel_functions/covar_kernel.py
+++ b/stpy/kernel_functions/covar_kernel.py
@@ -1,20 +1,21 @@
 import torch
 from stpy.kernel_functions.kernel_params import KernelParams
 
+
 def covar_kernel(a, b, **kwargs):
-	p = KernelParams(kwargs)
-	p.assert_existence(["cov", "kappa", "group"])
+    p = KernelParams(kwargs)
+    p.assert_existence(["cov", "kappa", "group"])
 
-	a = a[:, p.group]
-	b = b[:, p.group]
-	a = torch.mm(a, p.cov)
-	b = torch.mm(b, p.cov)
+    a = a[:, p.group]
+    b = b[:, p.group]
+    a = torch.mm(a, p.cov)
+    b = torch.mm(b, p.cov)
 
-	normx = torch.sum(a ** 2, dim=1).reshape(-1, 1)
-	normy = torch.sum(b ** 2, dim=1).reshape(-1, 1)
-	product = torch.mm(b, torch.t(a))
+    normx = torch.sum(a**2, dim=1).reshape(-1, 1)
+    normy = torch.sum(b**2, dim=1).reshape(-1, 1)
+    product = torch.mm(b, torch.t(a))
 
-	sqdist = -2 * product + torch.t(normx) + normy
-	arg = - 0.5 * sqdist
-	res = torch.exp(arg)
-	return p.kappa * res
\ No newline at end of file
+    sqdist = -2 * product + torch.t(normx) + normy
+    arg = -0.5 * sqdist
+    res = torch.exp(arg)
+    return p.kappa * res
diff --git a/stpy/kernel_functions/custom_map_kernel.py b/stpy/kernel_functions/custom_map_kernel.py
index 62cc068..0e2ddea 100644
--- a/stpy/kernel_functions/custom_map_kernel.py
+++ b/stpy/kernel_functions/custom_map_kernel.py
@@ -1,14 +1,15 @@
 from stpy.kernel_functions.kernel_params import KernelParams
 from stpy.kernel_functions.linear_kernel import linear_kernel
 
+
 def custom_map_kernel(a, b, **kwargs):
-	p = KernelParams(kwargs)
-	p.assert_existence(["map", "kappa", "group"])
+    p = KernelParams(kwargs)
+    p.assert_existence(["map", "kappa", "group"])
 
-	a = a[:, p.group]
-	b = b[:, p.group]
+    a = a[:, p.group]
+    b = b[:, p.group]
 
-	if map is not None:
-		return p.kappa * linear_kernel(torch.t(p.map(a)), torch.t(p.map(b))).detach()
-	else:
-		return p.kappa * linear_kernel(a, b)
\ No newline at end of file
+    if p.map is not None:  # test the supplied map; the builtin `map` is never None
+        return p.kappa * linear_kernel(torch.t(p.map(a)), torch.t(p.map(b))).detach()
+    else:
+        return p.kappa * linear_kernel(a, b)
diff --git a/stpy/kernel_functions/gibbs_custom_kernel.py b/stpy/kernel_functions/gibbs_custom_kernel.py
index d3f3a37..dd6b153 100644
--- a/stpy/kernel_functions/gibbs_custom_kernel.py
+++ b/stpy/kernel_functions/gibbs_custom_kernel.py
@@ -1,23 +1,24 @@
 from stpy.kernel_functions.kernel_params import KernelParams
 import torch
 
+
 def gibbs_custom_kernel(a, b, **kwargs):
 
-	p = KernelParams(kwargs)
-	p.assert_existence(["gamma_fun", "kappa", "group"])
+    p = KernelParams(kwargs)
+    p.assert_existence(["gamma_fun", "kappa", "group"])
 
-	a = a[:, p.group]
-	b = b[:, p.group]
-	#	print (a.shape, b.shape)
-	normx = torch.sum(a ** 2, dim=1).view(-1, 1)
-	normy = torch.sum(b ** 2, dim=1).view(-1, 1)
+    a = a[:, p.group]
+    b = b[:, p.group]
+    # Squared norm of every row, kept as a column vector.
+    normx = torch.sum(a**2, dim=1).view(-1, 1)
+    normy = torch.sum(b**2, dim=1).view(-1, 1)
 
-	product = torch.mm(b, torch.t(a))
-	# sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
-	sqdist = -2 * product + torch.t(normx) + normy
+    product = torch.mm(b, torch.t(a))
+    # Pairwise squared distances via ||b_j||^2 + ||a_i||^2 - 2 <b_j, a_i>.
+    sqdist = -2 * product + torch.t(normx) + normy
 
-	lengthscales = p.gamma_fun(a, b)
+    lengthscales = p.gamma_fun(a, b)
 
-	arg = (-0.5 / lengthscales) * sqdist
-	res = torch.exp(arg)
-	return p.kappa * res
\ No newline at end of file
+    arg = (-0.5 / lengthscales) * sqdist
+    res = torch.exp(arg)
+    return p.kappa * res
diff --git a/stpy/kernel_functions/gibbs_kernel.py b/stpy/kernel_functions/gibbs_kernel.py
index dbb4dc6..c9d9eca 100644
--- a/stpy/kernel_functions/gibbs_kernel.py
+++ b/stpy/kernel_functions/gibbs_kernel.py
@@ -1,24 +1,25 @@
 import torch
 from stpy.kernel_functions.kernel_params import KernelParams
 
+
 def gibbs_kernel(a, b, **kwargs):
-	p = KernelParams(kwargs)
-	p.assert_existence(["gamma_fun", "kappa", "group"])
+    p = KernelParams(kwargs)
+    p.assert_existence(["gamma_fun", "kappa", "group"])
 
-	a = a[:, p.group]
-	b = b[:, p.group]
-	#	print (a.shape, b.shape)
-	normx = torch.sum(a ** 2, dim=1).view(-1, 1)
-	normy = torch.sum(b ** 2, dim=1).view(-1, 1)
+    a = a[:, p.group]
+    b = b[:, p.group]
+    # Squared norm of every row, kept as a column vector.
+    normx = torch.sum(a**2, dim=1).view(-1, 1)
+    normy = torch.sum(b**2, dim=1).view(-1, 1)
 
-	product = torch.mm(b, torch.t(a))
-	# sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
-	sqdist = -2 * product + torch.t(normx) + normy
+    product = torch.mm(b, torch.t(a))
+    # Pairwise squared distances via ||b_j||^2 + ||a_i||^2 - 2 <b_j, a_i>.
+    sqdist = -2 * product + torch.t(normx) + normy
 
-	lengthscales = (p.gamma_fun(a) ** 2 + p.gamma_fun(b).T ** 2)
+    lengthscales = p.gamma_fun(a) ** 2 + p.gamma_fun(b).T ** 2
 
-	print(lengthscales)
+    # sqdist is scaled element-wise by -0.5 / lengthscales below.
 
-	arg = (-0.5 / lengthscales) * sqdist
-	res = torch.exp(arg)
-	return p.kappa * res
\ No newline at end of file
+    arg = (-0.5 / lengthscales) * sqdist
+    res = torch.exp(arg)
+    return p.kappa * res
diff --git a/stpy/kernel_functions/kernel_params.py b/stpy/kernel_functions/kernel_params.py
index 047091b..9e9f766 100644
--- a/stpy/kernel_functions/kernel_params.py
+++ b/stpy/kernel_functions/kernel_params.py
@@ -1,11 +1,10 @@
+class KernelParams:
 
-class KernelParams():
+    def __init__(self, param_dict):
+        for key in param_dict:
+            setattr(self, key, param_dict[key])
 
-	def __init__(self, param_dict):
-		for key in param_dict:
-			setattr(self, key, param_dict[key])
-
-	def assert_existence(self, names):
-		for name in names:
-			if not hasattr(self, name):
-				raise AttributeError("Missing attribute of the kernel %s" % str(name))
+    def assert_existence(self, names):
+        for name in names:
+            if not hasattr(self, name):
+                raise AttributeError("Missing attribute of the kernel %s" % str(name))
diff --git a/stpy/kernel_functions/laplace_kernel.py b/stpy/kernel_functions/laplace_kernel.py
index a1f1ce0..ffa34ee 100644
--- a/stpy/kernel_functions/laplace_kernel.py
+++ b/stpy/kernel_functions/laplace_kernel.py
@@ -3,12 +3,13 @@
 from sklearn.metrics.pairwise import check_pairwise_arrays, manhattan_distances
 from stpy.kernel_functions.kernel_params import KernelParams
 
+
 def laplace_kernel(a, b, **kwargs):
-	p = KernelParams(kwargs)
-	p.assert_existence(["gamma", "kappa", "group"])
+    p = KernelParams(kwargs)
+    p.assert_existence(["gamma", "kappa", "group"])
 
-	a = a[:, p.group]
-	b = b[:, p.group]
-	K = - manhattan_distances(a, b) / p.gamma ** 2
-	K = np.exp(K)  # exponentiate K in-place
-	return p.kappa * torch.from_numpy(K).T
\ No newline at end of file
+    a = a[:, p.group]
+    b = b[:, p.group]
+    K = -manhattan_distances(a, b) / p.gamma**2
+    K = np.exp(K)  # element-wise exponential; np.exp returns a new array here
+    return p.kappa * torch.from_numpy(K).T
diff --git a/stpy/kernel_functions/linear_kernel.py b/stpy/kernel_functions/linear_kernel.py
index 93403c4..1c0603b 100644
--- a/stpy/kernel_functions/linear_kernel.py
+++ b/stpy/kernel_functions/linear_kernel.py
@@ -1,15 +1,16 @@
 from stpy.kernel_functions.kernel_params import KernelParams
 
+
 def linear_kernel(a, b, **kwargs):
-	"""
-	linear kernl
-	:param a:
-	:param b:
-	:param kwargs:
-	:return:
-	"""
-	p = KernelParams(kwargs)
-	p.assert_existence(["kappa", "group"])
-	a = a[:, group]
-	b = b[:, group]
-	return kappa * (b @ a.T)
\ No newline at end of file
+    """
+    Linear kernel: kappa * (b[:, group] @ a[:, group].T).
+    :param a: 2-D input; only the columns listed in ``group`` are used
+    :param b: 2-D input; only the columns listed in ``group`` are used
+    :param kwargs: must include kappa (multiplicative scale) and group (column indices)
+    :return: kappa * (b @ a.T), one row per row of b and one column per row of a
+    """
+    p = KernelParams(kwargs)
+    p.assert_existence(["kappa", "group"])
+    a = a[:, p.group]
+    b = b[:, p.group]
+    return p.kappa * (b @ a.T)
diff --git a/stpy/kernel_functions/squared_exponential_kernel.py b/stpy/kernel_functions/squared_exponential_kernel.py
index 0297a99..7b6fe26 100644
--- a/stpy/kernel_functions/squared_exponential_kernel.py
+++ b/stpy/kernel_functions/squared_exponential_kernel.py
@@ -1,38 +1,131 @@
-import numpy as np
 import torch
 from stpy.kernel_functions.kernel_params import KernelParams
 
+
 def squared_exponential_kernel(a, b, **kwargs):
-	"""
-
-	:param a:
-	:param b:
-	:param kwargs: must include gamma, kappa, group
-	:return:
-	"""
-	p = KernelParams(kwargs)
-	p.assert_existence(["gamma", "kappa", "group"])
-
-	a = a[:, p.group]
-	b = b[:, p.group]
-	#	print (a.shape, b.shape)
-	normx = torch.sum(a ** 2, dim=1).view(-1, 1)
-	normy = torch.sum(b ** 2, dim=1).view(-1, 1)
-
-	product = torch.mm(b, torch.t(a))
-	# sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
-	sqdist = -2 * product + torch.t(normx) + normy
-	arg = (-0.5 / (p.gamma * p.gamma)) * sqdist
-	res = torch.exp(arg)
-	return p.kappa * res
-
-def squared_exponential_kernel_diag(a,b, **kwargs):
-	p = KernelParams(kwargs)
-	p.assert_existence(["gamma", "kappa", "group"])
-
-	a = a[:, p.group]
-	b = b[:, p.group]
-	sqdist = (a-b)**2
-	arg = (-0.5 / (p.gamma * p.gamma)) * sqdist
-	res = torch.exp(arg)
-	return p.kappa * res
\ No newline at end of file
+    """
+    Squared exponential kernel: kappa * exp(-||b_j - a_i||^2 / (2 * gamma^2)).
+    :param a: 2-D tensor; only the columns listed in ``group`` are used
+    :param b: 2-D tensor; only the columns listed in ``group`` are used
+    :param kwargs: must include gamma, kappa, group
+    :return: tensor whose entry [j, i] is the kernel value for row j of b and row i of a
+    """
+    p = KernelParams(kwargs)
+    p.assert_existence(["gamma", "kappa", "group"])
+
+    a = a[:, p.group]
+    b = b[:, p.group]
+    # Squared norm of every row, kept as a column vector.
+    normx = torch.sum(a**2, dim=1).view(-1, 1)
+    normy = torch.sum(b**2, dim=1).view(-1, 1)
+
+    product = torch.mm(b, torch.t(a))
+    # Pairwise squared distances via ||b_j||^2 + ||a_i||^2 - 2 <b_j, a_i>.
+    sqdist = -2 * product + torch.t(normx) + normy
+    arg = (-0.5 / (p.gamma * p.gamma)) * sqdist
+    res = torch.exp(arg)
+    return p.kappa * res
+
+
+def squared_exponential_kernel_diag(a, b, **kwargs):
+    p = KernelParams(kwargs)
+    p.assert_existence(["gamma", "kappa", "group"])
+
+    a = a[:, p.group]
+    b = b[:, p.group]
+    sqdist = (a - b) ** 2
+    arg = (-0.5 / (p.gamma * p.gamma)) * sqdist
+    res = torch.exp(arg)
+    return p.kappa * res
+
+
+def squared_exponential_integral(a_x, a_y, b_x, b_y, **kwargs):
+    r"""
+    Returns $g(x) = \int_{a_x}^{b_x} \int_{a_y}^{b_y} \kappa \cdot \exp(-\gamma \| x - s \|^2) \, ds$
+    NOTE(review): ``g`` below evaluates $\exp(-\| x - s \|^2 / (2 \gamma^2))$ instead
+    (gamma acting as a lengthscale) -- confirm which of the two forms is intended.
+
+    Parameters:
+    - a_x, a_y: torch.Tensor, lower bounds in the x- and y-dimension (shape: [N] each)
+    - b_x, b_y: torch.Tensor, upper bounds in the x- and y-dimension (shape: [N] each)
+    - kwargs: should give attributes gamma (float) and kappa (float)
+
+    Returns:
+    - A function such that squared_exponential_integral(a_x, a_y, b_x, b_y)(x)[i][j]
+        is equal to $g(x_j)$ where $g$ is created from a_x[i], a_y[i], b_x[i], b_y[i]
+    """
+    p = KernelParams(kwargs)
+    p.assert_existence(["gamma", "kappa"])
+    gamma = p.gamma
+    kappa = p.kappa
+    sqrt_2 = torch.sqrt(torch.tensor(2.0))
+
+    def g(x):
+        """
+        Compute the integral g(x) for multiple 2D points x.
+
+        Parameters:
+        - x: torch.Tensor, input points of shape [M, 2] where each row is a 2D point.
+
+        Returns:
+        - torch.Tensor of shape [len(a_x), len(x)], where result[i][j] is g(x_j) for bounds from a_x[i], a_y[i], b_x[i], b_y[i].
+        """
+        x1, x2 = x[:, 0], x[:, 1]  # Split the points into their two coordinates
+        a_x_broadcast = a_x.unsqueeze(1)  # Shape [N, 1]
+        a_y_broadcast = a_y.unsqueeze(1)  # Shape [N, 1]
+        b_x_broadcast = b_x.unsqueeze(1)  # Shape [N, 1]
+        b_y_broadcast = b_y.unsqueeze(1)  # Shape [N, 1]
+
+        # Error function at each bound, with offsets scaled by 1 / (gamma * sqrt(2))
+        erf_x1_a = torch.erf((a_x_broadcast - x1) / (gamma * sqrt_2))
+        erf_x1_b = torch.erf((b_x_broadcast - x1) / (gamma * sqrt_2))
+        erf_x2_a = torch.erf((a_y_broadcast - x2) / (gamma * sqrt_2))
+        erf_x2_b = torch.erf((b_y_broadcast - x2) / (gamma * sqrt_2))
+
+        # Both factors are (lower - upper), so the two sign flips cancel in the product
+        integral_values = (erf_x1_a - erf_x1_b) * (erf_x2_a - erf_x2_b)
+
+        # Scale by pi * kappa * gamma^2 / 2
+        result = (torch.pi * kappa * (gamma**2) / 2.0) * integral_values
+
+        return result
+
+    return g
+
+
+if __name__ == "__main__":
+    # Self-test. NOTE(review): expected values correspond to exp(-gamma * ||x - s||^2); verify against g().
+    a_x = torch.tensor([-float("inf"), -float("inf")])
+    a_y = torch.tensor([-float("inf"), -float("inf")])
+    b_x = torch.tensor([float("inf"), float("inf")])
+    b_y = torch.tensor([float("inf"), float("inf")])
+
+    gamma = 1.0
+    kappa = 1.0
+    g = squared_exponential_integral(a_x, a_y, b_x, b_y, gamma=gamma, kappa=kappa)
+    x = torch.tensor([[87, 0], [1123, 11]])
+    assert torch.allclose(g(x), torch.tensor([torch.pi, torch.pi]))
+
+    # Unit square (x in [0,1], y in [0,1]) with a very small gamma
+    a_x = torch.tensor([0.0])
+    a_y = torch.tensor([0.0])
+    b_x = torch.tensor([1.0])
+    b_y = torch.tensor([1.0])
+
+    g = squared_exponential_integral(a_x, a_y, b_x, b_y, gamma=10e-6, kappa=kappa)
+    x = torch.tensor([[0.5, 0.5], [0.25, 0.75]])
+    assert torch.allclose(g(x), torch.tensor([1.0, 1.0]))
+
+    a_x = torch.tensor([0.0, 1.0])
+    a_y = torch.tensor([0.0, 2.0])
+    b_x = torch.tensor([1.0, 3.0])
+    b_y = torch.tensor([1.0, 4.0])
+
+    g = squared_exponential_integral(a_x, a_y, b_x, b_y, gamma=0.5, kappa=3.0)
+    x = torch.tensor([[0.5, 0.5], [2.0, 3.0]])
+    result = g(x)
+    assert torch.allclose(
+        result, torch.tensor([[2.7639, 0.0548], [0.3794, 8.7851]]), atol=1e-4
+    )
+
+    torch.ones(())
diff --git a/stpy/kernel_functions/step_kernel.py b/stpy/kernel_functions/step_kernel.py
index 0ddfa78..0a643d4 100644
--- a/stpy/kernel_functions/step_kernel.py
+++ b/stpy/kernel_functions/step_kernel.py
@@ -1,20 +1,21 @@
 from stpy.kernel_functions.kernel_params import KernelParams
 import torch
 
+
 def step_kernel(a, b, **kwargs):
-	p = KernelParams(kwargs)
-	p.assert_existence(["kappa", "group"])
+    p = KernelParams(kwargs)
+    p.assert_existence(["kappa", "group"])
 
-	a = a[:, p.group]
-	b = b[:, p.group]
+    a = a[:, p.group]
+    b = b[:, p.group]
 
-	n, d = a.size()
-	m, d = b.size()
+    n, d = a.size()
+    m, d = b.size()
 
-	K = torch.zeros(size=(n, m)).double()
+    K = torch.zeros(size=(n, m)).double()
 
-	for i in range(n):
-		for j in range(m):
-			K[i, j] = a[i, :] + b[j, :] - torch.abs(a[i, :] - b[j, :])
+    for i in range(n):
+        for j in range(m):
+            K[i, j] = a[i, :] + b[j, :] - torch.abs(a[i, :] - b[j, :])
 
-	return p.kappa * K.T
\ No newline at end of file
+    return p.kappa * K.T
diff --git a/stpy/kernels.py b/stpy/kernels.py
index c05e32b..a151417 100755
--- a/stpy/kernels.py
+++ b/stpy/kernels.py
@@ -5,1119 +5,1174 @@
 from scipy.spatial.distance import cdist
 from scipy.special import kv
 from sklearn.metrics.pairwise import check_pairwise_arrays, manhattan_distances
-from stpy.kernel_functions.squared_exponential_kernel import squared_exponential_kernel_diag
+from stpy.kernel_functions.squared_exponential_kernel import (
+    squared_exponential_integral,
+    squared_exponential_kernel_diag,
+)
+
 
 class KernelFunction:
 
-	def __init__(self, kernel_function=None, kernel_name="squared_exponential", \
-				 freq=None, groups=None, d=1, gamma=1, ard_gamma=None, nu=1.5, kappa=1, map=None, power=2,
-				 cov=None, params=None, group=None, offset = 0. ):
-
-		if kernel_function is not None:
-			self.kernel_function = kernel_function
-			self.optkernel = "custom"
-			self.kappa = kappa
-			self.offset = offset
-			if params is None:
-				self.params = {'kappa': self.kappa}
-			else:
-				self.params = params
-			self.initial_params = self.params
-
-			if group is None:
-				self.group = [i for i in range(d)]
-			else:
-				self.group = group
-			self.d = d
-		else:
-			self.offset = offset
-			self.optkernel = kernel_name
-			self.gamma = gamma
-			if ard_gamma is None:
-				self.ard_gamma = torch.ones(d).double()
-			else:
-				try:
-					self.ard_gamma = torch.Tensor([ard_gamma]).double()
-				except:
-					self.ard_gamma = ard_gamma
-			self.power = power
-			self.v = nu
-
-			if params is not None:
-				self.initial_params = params
-			else:
-				self.initial_params = {'kappa':kappa}
-
-			if cov is None:
-				self.cov = torch.eye(d).double()
-			else:
-				self.cov = cov
-
-			if group is None:
-				self.group = [i for i in range(d)]
-			else:
-				self.group = group
-
-			self.map = map
-			self.groups = groups
-			self.kappa = kappa
-			self.freq = freq
-			self.d = d
-			self.add = False
-
-		self.kernel_function_list = [self.get_kernel_internal()]
-		self.kernel_diag_function_list = [self.get_kernel_internal(diag = True)]
-		self.optkernel_list = [self.optkernel]
-		self.params_dict = {'0': self.params}
-		self.kernel_items = 1
-
-		self.operations = ["-"]
-
-	def __combine__(self, second_kernel_object):
-		self.kernel_function_list = self.kernel_function_list + second_kernel_object.kernel_function_list
-		self.optkernel_list = self.optkernel_list + second_kernel_object.optkernel_list
-		self.operations = self.operations + second_kernel_object.operations[1:]
-		for key, value in second_kernel_object.params_dict.items():
-			self.params_dict[str(self.kernel_items)] = value
-			self.kernel_items += 1
-
-	def __add__(self, second_kernel_object):
-		self.__combine__(second_kernel_object)
-		diff = len(set(second_kernel_object.group) - set(self.group))
-		self.d += diff
-		self.operations.append("+")
-		return self
-
-	def __mul__(self, second_kernel_object):
-		self.__combine__(second_kernel_object)
-		self.operations.append("*")
-		return self
-
-	def description(self):
-		desc = "Kernel description:"
-		for index in range(0, self.kernel_items, 1):
-			desc = desc + "\n\n\tkernel: " + self.optkernel_list[index]
-			desc = desc + "\n\toperation: " + self.operations[index]
-			desc = desc + "\n\t" + "\n\t".join(
-				["{0}={1}".format(key, value) for key, value in self.params_dict[str(index)].items()])
-		return desc
-
-	def add_groups(self, dict):
-		for a in self.params_dict.keys():
-			if a not in dict.keys():
-				dict[a] = {}
-			dict[a]['group'] = self.params_dict[a]['group']
-		return dict
-
-	def kernel_diag(self, a,b, **kwargs):
-		if len(kwargs) > 0:
-			# params_dict = list(kwargs)
-			# we need to send
-			params_dict = kwargs
-			self.add_groups(params_dict)
-		else:
-			params_dict = self.params_dict
-
-		for i in range(0, len(self.kernel_function_list), 1):
-			k = self.kernel_diag_function_list[i]
-			if str(i) in params_dict.keys():
-				arg = params_dict[str(i)]
-			else:
-				arg = {}
-			if self.operations[i] == "+":
-				output = output + k(a, b, **arg)
-			elif self.operations[i] == "*":
-				output = output * k(a, b, **arg)
-			else:
-				output = k(a, b, **arg)
-
-		return output
-
-	def kernel(self, a, b, **kwargs):
-
-		if len(kwargs) > 0:
-			# params_dict = list(kwargs)
-			# we need to send
-			params_dict = kwargs
-			self.add_groups(params_dict)
-		else:
-			params_dict = self.params_dict
-
-		for i in range(0, len(self.kernel_function_list), 1):
-			k = self.kernel_function_list[i]
-			if str(i) in params_dict.keys():
-				arg = params_dict[str(i)]
-			else:
-				arg = {}
-			if self.operations[i] == "+":
-				output = output + k(a, b, **arg)
-			elif self.operations[i] == "*":
-				output = output * k(a, b, **arg)
-			else:
-				output = k(a, b, **arg)
-
-		return output
-
-	def get_param_refs(self):
-		return self.params_dict
-
-	def get_kernel(self):
-		return self.kernel
-
-	def get_kernel_internal(self, diag = False):
-
-		self.params = {**self.initial_params, 'kappa': self.kappa, 'group': self.group, 'offset': self.offset}
-
-		if self.optkernel == "squared_exponential":
-			self.params = dict(**self.params, **{'gamma': self.gamma})
-			if diag:
-				return squared_exponential_kernel_diag
-			else:
-				return self.squared_exponential_kernel
-
-		elif self.optkernel == "ard" and (self.groups is None):
-			self.params = dict(**self.params, **{'ard_gamma': self.ard_gamma})
-			if diag:
-				return self.ard_kernel
-			else:
-				return self.ard_kernel_diag
-
-
-		elif self.optkernel == "linear":
-			return self.linear_kernel
-
-		elif self.optkernel == "laplace":
-			self.params = dict(**self.params, **{'gamma': self.gamma})
-			return self.laplace_kernel
-
-		elif self.optkernel == "modified_matern":
-			self.params = dict(**self.params, **{'gamma': self.gamma, 'nu': self.v})
-			return self.modified_matern_kernel
-
-		elif self.optkernel == "custom":
-			return self.kernel_function
-
-		elif self.optkernel == "tanh":
-			return self.tanh_kernel
-
-		elif self.optkernel == 'step':
-			return self.step_kernel
-
-		elif self.optkernel == "angsim":
-			return self.angsim_kernel
-
-		elif self.optkernel == "matern":
-			self.params = dict(**self.params, **{'gamma': self.gamma, 'nu': self.v})
-			return self.matern_kernel
-
-		elif self.optkernel == "ard_matern":
-			self.params = dict(**self.params, **{'ard_gamma': self.ard_gamma, 'nu': self.v})
-
-			if diag:
-				return self.ard_matern_kernel_diag
-			else:
-				return self.ard_matern_kernel
-
-		elif self.optkernel == "full_covariance_se":
-			self.params = dict(**self.params, **{'cov': self.cov})
-			return self.covar_kernel
-
-		elif self.optkernel == "full_covariance_matern":
-			self.params = dict(**self.params, **{'cov': self.cov, 'nu': self.v})
-			return self.covar_kernel_matern
-
-		elif (self.optkernel == "polynomial") and (self.groups is None):
-			self.params = dict(**self.params, **{'degree': self.power})
-			return self.polynomial_kernel
-
-		elif (self.optkernel == "polynomial") and (self.groups is not None):
-			self.params = dict(**self.params, **{'degree': self.power, 'groups': self.groups})
-			return self.polynomial_additive_kernel
-
-		elif self.optkernel == "ard" and (self.groups is not None):
-			self.params = dict(**self.params, **{'ard_gamma': self.ard_gamma, 'groups': self.groups})
-			return self.ard_kernel_additive
-
-		elif self.optkernel == "squared_exponential_per_group" and (self.groups is not None):
-			self.params = dict(**self.params, **{'groups': self.groups})
-			return self.squared_exponential_per_group_kernel_additive
-		
-		elif self.optkernel == "ard_per_group" and (self.groups is not None):
-			self.params = dict(**self.params, **{'groups': self.groups})
-			return self.ard_per_group_kernel_additive
-
-		elif self.optkernel == "gibbs":
-			self.params = dict(**self.params, **{'groups': self.groups})
-			return self.gibbs_kernel
-
-		elif self.optkernel == "gibbs_custom":
-			self.params = dict(**self.params, **{'groups': self.groups})
-			return self.gibbs_custom_kernel
-
-		elif self.optkernel == "random_map":
-			return self.random_map_kernel
-
-		else:
-			raise AssertionError("Kernel not implemented.")
-
-	def embed(self, x):
-		if self.optkernel == "linear":
-			return x
-		else:
-			raise AttributeError("This type of kernel does not support a finite dimensional embedding")
-
-	def get_basis_size(self):
-		if self.optkernel == "linear":
-			return self.d
-		else:
-			raise AttributeError("This type of kernel does not support a finite dimensional embedding")
-
-	def step_kernel(self, a, b, **kwargs):
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-
-		n, d = a.size()
-		m, d = b.size()
-
-		K = torch.zeros(size=(n, m)).double()
-
-		for i in range(n):
-			for j in range(m):
-				K[i, j] = a[i, :] + b[j, :] - torch.abs(a[i, :] - b[j, :])
-
-		return kappa * K.T
-
-	def linear_kernel(self, a, b, **kwargs):
-		"""
-			GP linear kernel
-		"""
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		if 'offset' in kwargs.keys():
-			offset = kwargs['offset']
-		else:
-			offset = self.offset
-		a = a[:, group]
-		b = b[:, group]
-		return kappa * (b @ a.T) + offset
-
-	def custom_map_kernel(self, a, b, **kwargs):
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-
-		if 'map' in kwargs.keys():
-			map = kwargs['map']
-		else:
-			map = self.map
-
-		a = a[:, group]
-		b = b[:, group]
-
-		if map is not None:
-			return kappa * self.linear_kernel(torch.t(self.map.map(a)), torch.t(self.map.map(b))).detach()
-		else:
-			return kappa * self.linear_kernel(a, b)
-
-	def laplace_kernel(self, a, b, **kwargs):
-		if 'gamma' in kwargs.keys():
-			gamma = kwargs['gamma']
-		else:
-			gamma = self.gamma
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-		K = - manhattan_distances(a, b) / gamma ** 2
-		K = np.exp(K)  # exponentiate K in-place
-		return kappa * torch.from_numpy(K).T
-
-	def squared_exponential_kernel(self, a, b, **kwargs):
-		"""
-			GP squared exponential kernel
-		"""
-		if 'gamma' in kwargs.keys():
-			gamma = kwargs['gamma']
-		else:
-			gamma = self.gamma
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-		#	print (a.shape, b.shape)
-		normx = torch.sum(a ** 2, dim=1).view(-1, 1)
-		normy = torch.sum(b ** 2, dim=1).view(-1, 1)
-
-		product = torch.mm(b, torch.t(a))
-		# sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
-		sqdist = -2 * product + torch.t(normx) + normy
-		arg = (-0.5 / (gamma * gamma)) * sqdist
-		res = torch.exp(arg)
-		return kappa * res
-
-	def gibbs_custom_kernel(self, a, b, **kwargs):
-		if 'gamma_fun' in kwargs.keys():
-			gamma_fun = kwargs['gamma_fun']
-		else:
-			raise AttributeError("Missing gamma_fun in Gibbs kernel definition.")
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-		#	print (a.shape, b.shape)
-		normx = torch.sum(a ** 2, dim=1).view(-1, 1)
-		normy = torch.sum(b ** 2, dim=1).view(-1, 1)
-
-		product = torch.mm(b, torch.t(a))
-		# sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
-		sqdist = -2 * product + torch.t(normx) + normy
-
-		lengthscales = gamma_fun(a, b)
-
-		arg = (-0.5 / lengthscales) * sqdist
-		res = torch.exp(arg)
-		return kappa * res
-
-	def gibbs_kernel(self, a, b, **kwargs):
-		if 'gamma_fun' in kwargs.keys():
-			gamma_fun = kwargs['gamma_fun']
-		else:
-			raise AttributeError("Missing gamma_fun in Gibbs kernel definition.")
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-		#	print (a.shape, b.shape)
-		normx = torch.sum(a ** 2, dim=1).view(-1, 1)
-		normy = torch.sum(b ** 2, dim=1).view(-1, 1)
-
-		product = torch.mm(b, torch.t(a))
-		# sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
-		sqdist = -2 * product + torch.t(normx) + normy
-
-		lengthscales = (gamma_fun(a) ** 2 + gamma_fun(b).T ** 2)
-
-		print(lengthscales)
-
-		arg = (-0.5 / lengthscales) * sqdist
-		res = torch.exp(arg)
-		return kappa * res
-
-	def covar_kernel(self, a, b, **kwargs):
-		"""
-		:param a:
-		:param b:
-		:param cov: square-root of the covariance matrix
-		:return:
-		"""
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'cov' in kwargs.keys():
-			cov = kwargs['cov']
-		else:
-			cov = self.cov
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-		a = torch.mm(a, cov)
-		b = torch.mm(b, cov)
-
-		normx = torch.sum(a ** 2, dim=1).reshape(-1, 1)
-		normy = torch.sum(b ** 2, dim=1).reshape(-1, 1)
-
-		product = torch.mm(b, torch.t(a))
-		sqdist = -2 * product + torch.t(normx) + normy
-		arg = - 0.5 * sqdist
-		res = torch.exp(arg)
-		return kappa * res
-
-
-	def covar_kernel_matern(self, a, b, **kwargs):
-		"""
-		:param a:
-		:param b:
-		:param cov: square-root of the covariance matrix
-		:return:
-		"""
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'cov' in kwargs.keys():
-			cov = kwargs['cov']
-		else:
-			cov = self.cov
-		if 'v' in kwargs.keys():
-			v = kwargs['v']
-		else:
-			v = self.v
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-		a = torch.mm(a, cov)
-		b = torch.mm(b, cov)
-
-		dists = torch.cdist(a, b, p=2).T
-
-		if v == 0.5:
-			K = torch.exp(-dists)
-		elif v == 1.5:
-			K = dists * np.sqrt(3)
-			K = (1. + K) * torch.exp(-K)
-		elif v == 2.5:
-			K = dists * np.sqrt(5)
-			K = (1. + K + K ** 2 / 3.0) * torch.exp(-K)
-		else:  # general case; expensive to evaluate
-			K = dists
-			K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
-			tmp = (np.sqrt(2 * v) * K)
-			K.fill((2 ** (1. - v)) / math.gamma(v))
-			K *= tmp ** v
-			K *= kv(v, tmp)
-		return kappa * K
-
-
-	def ard_kernel(self, a, b, **kwargs):
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'ard_gamma' in kwargs.keys():
-			gamma = kwargs['ard_gamma']
-		else:
-			gamma = self.ard_gamma
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-
-		D = torch.diag(1. / (gamma[group]))
-		a = torch.mm(a, D)
-		b = torch.mm(b, D)
-		normx = torch.sum(a ** 2, dim=1).reshape(-1, 1)
-		normy = torch.sum(b ** 2, dim=1).reshape(-1, 1)
-
-		product = torch.mm(b, torch.t(a))
-		# sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
-		sqdist = -2 * product + torch.t(normx) + normy
-		arg = - 0.5 * sqdist
-		res = torch.exp(arg)
-		return kappa * res
-
-	def ard_kernel_diag(self, a, b, **kwargs):
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'ard_gamma' in kwargs.keys():
-			gamma = kwargs['ard_gamma']
-		else:
-			gamma = self.ard_gamma
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-
-		D = torch.diag(1. / (gamma[group]))
-		a = torch.mm(a, D)
-		b = torch.mm(b, D)
-		normx = torch.sum(a ** 2, dim=1).reshape(-1, 1)
-		normy = torch.sum(b ** 2, dim=1).reshape(-1, 1)
-
-		product = torch.mm(b, torch.t(a))
-		# sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
-		sqdist = -2 * product + torch.t(normx) + normy
-		arg = - 0.5 * sqdist
-		res = torch.exp(arg)
-		return kappa * res
-
-
-
-	def ard_per_group_kernel_additive(self,a,b,**kwargs):
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-		
-		if 'groups' in kwargs.keys():
-			groups = kwargs['groups']
-		else:
-			groups = self.groups
-
-		if 'ard_per_group' in kwargs.keys():
-			ard_per_group = kwargs['ard_per_group']
-		else:
-			raise AssertionError("This kernel requires 'ard_per_group' initial parameters")
-
-		(n, z) = tuple(a.size())
-		(q, m) = tuple(b.size())
-
-		r = torch.zeros(size=(q, n), dtype=torch.float64)
-		groups_index = 0
-
-		for group_add in groups:
-			kwargs['group'] = group_add
-			
-			size_group = len(group_add)
-			# use per group lenghtscale 
-			#kwargs['ard_gamma'] = ard_per_group[groups_index:groups_index+size_group]
-			gamma = ard_per_group[groups_index:groups_index+size_group]
-			groups_index +=size_group
-
-			ax = a[:, group_add]
-			bx = b[:, group_add]
-			D = torch.diag(1. / (gamma))
-			ax = torch.mm(ax, D)
-			bx = torch.mm(bx, D)
-			normx = torch.sum(ax ** 2, dim=1).reshape(-1, 1)
-			normy = torch.sum(bx ** 2, dim=1).reshape(-1, 1)
-			product = torch.mm(bx, torch.t(ax))
-			# sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
-			sqdist = -2 * product + torch.t(normx) + normy
-			arg = - 0.5 * sqdist
-			res = torch.exp(arg)
-			r = r + res
-
-		r = r / float(len(groups))
-		return kappa*r
-
-	def squared_exponential_per_group_kernel_additive(self,a,b,**kwargs):
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-		
-		if 'groups' in kwargs.keys():
-			groups = kwargs['groups']
-		else:
-			groups = self.groups
-
-		if 'gamma_per_group' in kwargs.keys():
-			gamma_per_group = kwargs['gamma_per_group']
-		else:
-			raise AssertionError("This kernel requires 'gamma_per_group' initial parameters")
-
-		(n, z) = tuple(a.size())
-		(q, m) = tuple(b.size())
-
-		r = torch.zeros(size=(q, n), dtype=torch.float64)
-
-		for group_add, gamma in zip(groups,gamma_per_group):
-			kwargs['group'] = group_add
-			
-			# use per group lenghtscale 
-			kwargs['gamma'] = gamma
-
-			r = r + self.squared_exponential_kernel(a, b, **kwargs)
-
-		r = kappa * r / float(len(groups))
-		return r
-
-	def ard_kernel_additive(self, a, b, **kwargs):
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-		
-		if 'groups' in kwargs.keys():
-			groups = kwargs['groups']
-		else:
-			groups = self.groups
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-
-		(n, z) = tuple(a.size())
-		(q, m) = tuple(b.size())
-
-		r = torch.zeros(size=(q, n), dtype=torch.float64)
-
-		for group_add in groups:
-			kwargs['group'] = group_add
-			r = r + self.ard_kernel(a, b, **kwargs)
-
-		r = r / float(len(groups))
-		return r
-
-	def tanh_kernel(self, a, b, **kwargs):
-		"""
-			GP squared exponential kernel
-		"""
-		#	print (a.shape, b.shape)
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-
-		X, Y = check_pairwise_arrays(a.numpy(), b.numpy())
-		K = manhattan_distances(a.numpy(), b.numpy())
-		K = K.T
-		eps = 10e-10
-		q = 3
-		A = (np.tanh(K) ** q) / (eps + K ** q)
-		return kappa * torch.from_numpy(A)
-
-	def angsim_kernel(self, a, b, **kwargs):
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		return kappa * (2. / np.pi) * np.arcsin((a.dot(b)) / (a.norm() * b.norm()))
-
-	def polynomial_kernel(self, a, b, **kwargs):
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-		if 'degree' in kwargs.keys():
-			power = kwargs['degree']
-		else:
-			power = self.power
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-
-		K = (torch.mm(b, torch.t(a)) + 1) ** power
-		return kappa * K
-
-	def polynomial_additive_kernel(self, a, b, **kwargs):
-
-		if 'groups' in kwargs.keys():
-			groups = kwargs['groups']
-		else:
-			groups = self.groups
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-
-		(n, z) = tuple(a.size())
-		(q, m) = tuple(b.size())
-		no_groups = float(len(groups))
-		r = torch.zeros(size=(q, n), dtype=torch.float64)
-		for i, group in enumerate(groups):
-			z = self.polynomial_kernel(a[:, group], b[:, group], **kwargs)
-			r = r + z
-		r = r / no_groups
-		return r
-
-
-	def matern_kernel(self, a, b, **kwargs):
-		"""
-		:param a: matrices
-		:param b: matrices
-		:param gamma: smoothness
-		:param v: Bessel function type
-		:return:
-		"""
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'nu' in kwargs.keys():
-			v = kwargs['nu']
-		else:
-			v = self.v
-
-		if 'gamma' in kwargs.keys():
-			gamma = kwargs['gamma']
-		else:
-			gamma = self.gamma
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group].numpy()
-		b = b[:, group].numpy()
-
-		dists = cdist(a / gamma, b / gamma, metric='euclidean').T
-		if v == 0.5:
-			K = np.exp(-dists)
-		elif v == 1.5:
-			K = dists * math.sqrt(3)
-			K = (1. + K) * np.exp(-K)
-		elif v == 2.5:
-			K = dists * math.sqrt(5)
-			K = (1. + K + K ** 2 / 3.0) * np.exp(-K)
-		else:  # general case; expensive to evaluate
-			K = dists
-			K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
-			tmp = (math.sqrt(2 * v) * K)
-			K.fill((2 ** (1. - v)) / math.gamma(v))
-			K *= tmp ** v
-			K *= kv(v, tmp)
-		return kappa * torch.from_numpy(K)
-
-
-	def ard_matern_kernel_diag(self, a, b, **kwargs):
-		"""
-		:param a: matrices
-		:param b: matrices
-		:param gamma: smoothness
-		:param v: Bessel function type
-		:return:
-		"""
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'nu' in kwargs.keys():
-			v = kwargs['nu']
-		else:
-			v = self.v
-
-		if 'ard_gamma' in kwargs.keys():
-			ard_gamma = kwargs['ard_gamma']
-		else:
-			ard_gamma = self.ard_gamma
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		D = torch.diag(1. / (ard_gamma[group]))
-		a = torch.mm(a, D)
-		b = torch.mm(b, D)
-
-		a = a[:, group]
-		b = b[:, group]
-
-		#dists = torch.cdist(a , b , p = 2).T
-		dists = torch.sqrt(torch.sum((a - b)**2))
-
-		if v == 0.5:
-			K = torch.exp(-dists)
-		elif v == 1.5:
-			K = dists * np.sqrt(3)
-			K = (1. + K) * torch.exp(-K)
-		elif v == 2.5:
-			K = dists * np.sqrt(5)
-			K = (1. + K + K ** 2 / 3.0) * torch.exp(-K)
-		else:  # general case; expensive to evaluate
-			K = dists
-			K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
-			tmp = (np.sqrt(2 * v) * K)
-			K.fill((2 ** (1. - v)) / math.gamma(v))
-			K *= tmp ** v
-			K *= kv(v, tmp)
-		return kappa * K
-
-	def ard_matern_kernel(self, a, b, **kwargs):
-		"""
-		:param a: matrices
-		:param b: matrices
-		:param gamma: smoothness
-		:param v: Bessel function type
-		:return:
-		"""
-
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'nu' in kwargs.keys():
-			v = kwargs['nu']
-		else:
-			v = self.v
-
-		if 'ard_gamma' in kwargs.keys():
-			ard_gamma = kwargs['ard_gamma']
-		else:
-			ard_gamma = self.ard_gamma
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		D = torch.diag(1. / (ard_gamma[group]))
-		a = torch.mm(a, D)
-		b = torch.mm(b, D)
-
-		a = a[:, group]
-		b = b[:, group]
-
-		dists = torch.cdist(a , b , p = 2).T
-
-		if v == 0.5:
-			K = torch.exp(-dists)
-		elif v == 1.5:
-			K = dists * np.sqrt(3)
-			K = (1. + K) * torch.exp(-K)
-		elif v == 2.5:
-			K = dists * np.sqrt(5)
-			K = (1. + K + K ** 2 / 3.0) * torch.exp(-K)
-		else:  # general case; expensive to evaluate
-			K = dists
-			K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
-			tmp = (np.sqrt(2 * v) * K)
-			K.fill((2 ** (1. - v)) / math.gamma(v))
-			K *= tmp ** v
-			K *= kv(v, tmp)
-		return kappa * K
-
-	def modified_matern_kernel(self, X, Y, **kwargs):
-		"""
-		:param a: matrices
-		:param b: matrices
-		:param gamma: smoothness
-		:param v: Bessel function type
-		:return:
-		"""
-		if 'kappa' in kwargs.keys():
-			kappa = kwargs['kappa']
-		else:
-			kappa = self.kappa
-
-		if 'nu' in kwargs.keys():
-			v = kwargs['nu']
-		else:
-			v = self.v
-
-		if 'gamma' in kwargs.keys():
-			gamma = kwargs['gamma']
-		else:
-			gamma = self.gamma
-
-		if 'group' in kwargs.keys():
-			group = kwargs['group']
-		else:
-			group = self.group
-
-		a = a[:, group]
-		b = b[:, group]
-
-		d = X.size()[1]
-		# Z = np.ones(shape = (X.shape[0],Y.shape[0]))
-		Z = torch.ones(size=(Y.size()[0], X.size()[0]), dtype=torch.float64)
-		for i in range(d):
-			a = X[:, i].view(-1, 1)
-			b = Y[:, i].view(-1, 1)
-			# dists = cdist(a/gamma,b/gamma,metric='cityblock').T
-			dists = cdist(a.numpy() / gamma, b.numpy() / gamma, metric='euclidean').T
-			# dists = manhattan_distances(a, b).T/ gamma
-			dists = torch.from_numpy(dists)
-			if v == 1:
-				K = torch.exp(-dists)
-			elif v == 2:
-				K = (1 + dists) * torch.exp(-dists)
-			elif v == 3:
-				K = (dists ** 2 + 3 * torch.abs(dists) + 3) * torch.exp(-dists) / 3.
-			elif v == 4:
-				K = (dists ** 3 + 6 * dists ** 2 + 15 * torch.abs(dists) + 15) * torch.exp(-dists) / 15.
-			else:
-				raise AssertionError("Kernel with nu = " + str(v) + "not implemented.")
-			Z = Z * K
-		return kappa * Z
-
-	def spectral_kernel(self, a, b):
-		if self.freq is not None:
-			(n, d) = a.size()
-			(m, d) = b.size()
-			dist = torch.zeros(size=(n, m), dtype=torch.float64)
-			c = 0
-			for x in a:
-				z = 0
-				for y in b:
-					dist[c, z] = torch.sum(torch.cos(torch.mm(x.view(1, 1) - y.view(1, 1), self.freq)))
-					z = z + 1
-				c = c + 1
-			N = self.freq.size()[0]
-			return torch.t(dist) / N
-		else:
-			raise AssertionError("No frequencies passed")
-
-	def wiener_kernel(self, a, b):
-		"""
-			Wiener process kernel
-			k(x,y) = min(x,y)
-			k(x,y) = \sum_i min(x_i,y_i)
-		"""
-		(n, d) = a.size()
-		(m, d) = b.size()
-		dist = torch.zeros(size=(n, m))
-		# dist = 0.1*np.eye(max(n,m))[0:m,0:n]
-		c = 0
-		for x in a:
-			z = 0
-			for y in b:
-				print(x, y)
-				dist[c, z] = torch.from_numpy(np.sum(np.min(np.array([x, y]), axis=0)))
-				z = z + 1
-			c = c + 1
-
-		# print (dist)
-		return dist.T
-
-	def derivative_1(self, fixed, x):
-		"""
-
-		"""
-		d = x.size()[1]
-		n = x.size()[0]
-
-		size = fixed.size()[0]
-
-		if self.optkernel == "squared_exponential":
-			k_original = self.squared_exponential_kernel(fixed, x)
-			second = fixed.unsqueeze(1) - x
-			second = second / self.gamma ** 2
-			res = self.kappa * torch.einsum('ij,jik->ijk', k_original, second)
-		else:
-			raise AssertionError("Not implemented for this kernel")
-
-		# result should be (n,d)
-		return res
-
-	def derivative_2(self, fixed, x):
-		"""
-
-		"""
-		d = x.size()[1]
-		n = x.size()[0]
-
-		size = fixed.size()[0]
-
-		if self.optkernel == "squared_exponential":
-			k_original = self.squared_exponential_kernel(fixed, x)
-			second = fixed.unsqueeze(1) - x
-			second = second / self.gamma ** 2
-			second2 = torch.einsum('ijk,ijl->ijkl', second, second)
-			res1 = torch.einsum('ij,jikl->ijkl', k_original, second2)
-
-			ones = torch.zeros(size=(size, n, d, d))
-			for j in range(d):
-				ones[:, :, j, j] = 1.
-			ones = -ones / self.gamma ** 2
-			res2 = torch.einsum('ij,jikl->ijkl', k_original, ones)
-			res = self.kappa * (res1 + res2)
-		# res = self.kappa * res2
-		else:
-			raise AssertionError("Not implemented for this kernel")
-
-		return res
-
-	def square_dist(self, a, b):
-		if (a.shape == b.shape):
-			normx = np.sum(a ** 2, axis=1).reshape(-1, 1)
-			normy = np.sum(b ** 2, axis=1).reshape(-1, 1)
-		else:
-			normx = np.sum(a ** 2, axis=1).reshape(-1, 1)
-			normy = np.sum(b ** 2, axis=1).reshape(-1, 1)
-
-		product = b.dot(a.T)
-		sqdist = np.tile(normx, b.shape[0]).T + np.tile(normy, a.shape[0]) - 2 * product
-		return sqdist
+    def __init__(
+        self,
+        kernel_function=None,
+        kernel_name="squared_exponential",
+        freq=None,
+        groups=None,
+        d=1,
+        gamma: float = 1.0,
+        ard_gamma=None,
+        nu=1.5,
+        kappa: float = 1.0,
+        map=None,
+        power=2,
+        cov=None,
+        params=None,
+        group=None,
+        offset=0.0,
+    ):
+
+        if kernel_function is not None:
+            self.kernel_function = kernel_function
+            self.optkernel = "custom"
+            self.kappa = kappa
+            self.offset = offset
+            if params is None:
+                self.params = {"kappa": self.kappa}
+            else:
+                self.params = params
+            self.initial_params = self.params
+
+            if group is None:
+                self.group = [i for i in range(d)]
+            else:
+                self.group = group
+            self.d = d
+        else:
+            self.offset = offset
+            self.optkernel = kernel_name
+            self.gamma = gamma
+            if ard_gamma is None:
+                self.ard_gamma = torch.ones(d).double()
+            else:
+                try:
+                    self.ard_gamma = torch.tensor([ard_gamma]).double()
+                except:
+                    self.ard_gamma = ard_gamma
+            self.power = power
+            self.v = nu
+
+            if params is not None:
+                self.initial_params = params
+            else:
+                self.initial_params = {"kappa": kappa}
+
+            if cov is None:
+                self.cov = torch.eye(d).double()
+            else:
+                self.cov = cov
+
+            if group is None:
+                self.group = [i for i in range(d)]
+            else:
+                self.group = group
+
+            self.map = map
+            self.groups = groups
+            self.kappa = kappa
+            self.freq = freq
+            self.d = d
+            self.add = False
+
+        self.kernel_function_list = [self.get_kernel_internal()]
+        self.kernel_diag_function_list = [self.get_kernel_internal(diag=True)]
+        self.optkernel_list = [self.optkernel]
+        self.params_dict = {"0": self.params}
+        self.kernel_items = 1
+
+        self.operations = ["-"]
+
+    def __combine__(self, second_kernel_object):
+        self.kernel_function_list = (
+            self.kernel_function_list + second_kernel_object.kernel_function_list
+        )
+        self.optkernel_list = self.optkernel_list + second_kernel_object.optkernel_list
+        self.operations = self.operations + second_kernel_object.operations[1:]
+        for key, value in second_kernel_object.params_dict.items():
+            self.params_dict[str(self.kernel_items)] = value
+            self.kernel_items += 1
+
+    def __add__(self, second_kernel_object):
+        self.__combine__(second_kernel_object)
+        diff = len(set(second_kernel_object.group) - set(self.group))
+        self.d += diff
+        self.operations.append("+")
+        return self
+
+    def __mul__(self, second_kernel_object):
+        self.__combine__(second_kernel_object)
+        self.operations.append("*")
+        return self
+
+    def description(self):
+        desc = "Kernel description:"
+        for index in range(0, self.kernel_items, 1):
+            desc = desc + "\n\n\tkernel: " + self.optkernel_list[index]
+            desc = desc + "\n\toperation: " + self.operations[index]
+            desc = (
+                desc
+                + "\n\t"
+                + "\n\t".join(
+                    [
+                        "{0}={1}".format(key, value)
+                        for key, value in self.params_dict[str(index)].items()
+                    ]
+                )
+            )
+        return desc
+
+    def add_groups(self, dict):
+        for a in self.params_dict.keys():
+            if a not in dict.keys():
+                dict[a] = {}
+            dict[a]["group"] = self.params_dict[a]["group"]
+        return dict
+
+    def kernel_diag(self, a, b, **kwargs):
+        if len(kwargs) > 0:
+            # params_dict = list(kwargs)
+            # we need to send
+            params_dict = kwargs
+            self.add_groups(params_dict)
+        else:
+            params_dict = self.params_dict
+
+        for i in range(0, len(self.kernel_function_list), 1):
+            k = self.kernel_diag_function_list[i]
+            if str(i) in params_dict.keys():
+                arg = params_dict[str(i)]
+            else:
+                arg = {}
+            if self.operations[i] == "+":
+                output = output + k(a, b, **arg)
+            elif self.operations[i] == "*":
+                output = output * k(a, b, **arg)
+            else:
+                output = k(a, b, **arg)
+
+        return output
+
+    def kernel(self, a, b, **kwargs):
+
+        if len(kwargs) > 0:
+            # params_dict = list(kwargs)
+            # we need to send
+            params_dict = kwargs
+            self.add_groups(params_dict)
+        else:
+            params_dict = self.params_dict
+
+        for i in range(0, len(self.kernel_function_list), 1):
+            k = self.kernel_function_list[i]
+            if str(i) in params_dict.keys():
+                arg = params_dict[str(i)]
+            else:
+                arg = {}
+            if self.operations[i] == "+":
+                output = output + k(a, b, **arg)
+            elif self.operations[i] == "*":
+                output = output * k(a, b, **arg)
+            else:
+                output = k(a, b, **arg)
+
+        return output
+
+    def get_param_refs(self):
+        return self.params_dict
+
+    def get_kernel(self):
+        return self.kernel
+
+    def integral(self, a_x, a_y, b_x, b_y):
+        if self.optkernel == "squared_exponential":
+            return squared_exponential_integral(a_x, a_y, b_x, b_y, **self.params)
+        else:
+            raise NotImplementedError()
+
+    def get_kernel_internal(self, diag=False):
+
+        self.params = {
+            **self.initial_params,
+            "kappa": self.kappa,
+            "group": self.group,
+            "offset": self.offset,
+        }
+
+        if self.optkernel == "squared_exponential":
+            self.params = dict(**self.params, **{"gamma": self.gamma})
+            if diag:
+                return squared_exponential_kernel_diag
+            else:
+                return self.squared_exponential_kernel
+
+        elif self.optkernel == "ard" and (self.groups is None):
+            self.params = dict(**self.params, **{"ard_gamma": self.ard_gamma})
+            if diag:
+                return self.ard_kernel
+            else:
+                return self.ard_kernel_diag
+
+        elif self.optkernel == "linear":
+            return self.linear_kernel
+
+        elif self.optkernel == "laplace":
+            self.params = dict(**self.params, **{"gamma": self.gamma})
+            return self.laplace_kernel
+
+        elif self.optkernel == "modified_matern":
+            self.params = dict(**self.params, **{"gamma": self.gamma, "nu": self.v})
+            return self.modified_matern_kernel
+
+        elif self.optkernel == "custom":
+            return self.kernel_function
+
+        elif self.optkernel == "tanh":
+            return self.tanh_kernel
+
+        elif self.optkernel == "step":
+            return self.step_kernel
+
+        elif self.optkernel == "angsim":
+            return self.angsim_kernel
+
+        elif self.optkernel == "matern":
+            self.params = dict(**self.params, **{"gamma": self.gamma, "nu": self.v})
+            return self.matern_kernel
+
+        elif self.optkernel == "ard_matern":
+            self.params = dict(
+                **self.params, **{"ard_gamma": self.ard_gamma, "nu": self.v}
+            )
+
+            if diag:
+                return self.ard_matern_kernel_diag
+            else:
+                return self.ard_matern_kernel
+
+        elif self.optkernel == "full_covariance_se":
+            self.params = dict(**self.params, **{"cov": self.cov})
+            return self.covar_kernel
+
+        elif self.optkernel == "full_covariance_matern":
+            self.params = dict(**self.params, **{"cov": self.cov, "nu": self.v})
+            return self.covar_kernel_matern
+
+        elif (self.optkernel == "polynomial") and (self.groups is None):
+            self.params = dict(**self.params, **{"degree": self.power})
+            return self.polynomial_kernel
+
+        elif (self.optkernel == "polynomial") and (self.groups is not None):
+            self.params = dict(
+                **self.params, **{"degree": self.power, "groups": self.groups}
+            )
+            return self.polynomial_additive_kernel
+
+        elif self.optkernel == "ard" and (self.groups is not None):
+            self.params = dict(
+                **self.params, **{"ard_gamma": self.ard_gamma, "groups": self.groups}
+            )
+            return self.ard_kernel_additive
+
+        elif self.optkernel == "squared_exponential_per_group" and (
+            self.groups is not None
+        ):
+            self.params = dict(**self.params, **{"groups": self.groups})
+            return self.squared_exponential_per_group_kernel_additive
+
+        elif self.optkernel == "ard_per_group" and (self.groups is not None):
+            self.params = dict(**self.params, **{"groups": self.groups})
+            return self.ard_per_group_kernel_additive
+
+        elif self.optkernel == "gibbs":
+            self.params = dict(**self.params, **{"groups": self.groups})
+            return self.gibbs_kernel
+
+        elif self.optkernel == "gibbs_custom":
+            self.params = dict(**self.params, **{"groups": self.groups})
+            return self.gibbs_custom_kernel
+
+        elif self.optkernel == "random_map":
+            return self.random_map_kernel
+
+        else:
+            raise AssertionError("Kernel not implemented.")
+
+    def embed(self, x):
+        if self.optkernel == "linear":
+            return x
+        else:
+            raise AttributeError(
+                "This type of kernel does not support a finite dimensional embedding"
+            )
+
+    def get_basis_size(self):
+        if self.optkernel == "linear":
+            return self.d
+        else:
+            raise AttributeError(
+                "This type of kernel does not support a finite dimensional embedding"
+            )
+
+    def step_kernel(self, a, b, **kwargs):
+        if "kappa" in kwargs.keys():
+            kappa = kwargs["kappa"]
+        else:
+            kappa = self.kappa
+
+        if "group" in kwargs.keys():
+            group = kwargs["group"]
+        else:
+            group = self.group
+
+        a = a[:, group]
+        b = b[:, group]
+
+        n, d = a.size()
+        m, d = b.size()
+
+        K = torch.zeros(size=(n, m)).double()
+
+        for i in range(n):
+            for j in range(m):
+                K[i, j] = a[i, :] + b[j, :] - torch.abs(a[i, :] - b[j, :])
+
+        return kappa * K.T
+
+    def linear_kernel(self, a, b, **kwargs):
+        """
+        GP linear kernel
+        """
+        if "kappa" in kwargs.keys():
+            kappa = kwargs["kappa"]
+        else:
+            kappa = self.kappa
+
+        if "group" in kwargs.keys():
+            group = kwargs["group"]
+        else:
+            group = self.group
+
+        if "offset" in kwargs.keys():
+            offset = kwargs["offset"]
+        else:
+            offset = self.offset
+        a = a[:, group]
+        b = b[:, group]
+        return kappa * (b @ a.T) + offset
+
+    def custom_map_kernel(self, a, b, **kwargs):
+        if "kappa" in kwargs.keys():
+            kappa = kwargs["kappa"]
+        else:
+            kappa = self.kappa
+
+        if "group" in kwargs.keys():
+            group = kwargs["group"]
+        else:
+            group = self.group
+
+        if "map" in kwargs.keys():
+            map = kwargs["map"]
+        else:
+            map = self.map
+
+        a = a[:, group]
+        b = b[:, group]
+
+        if map is not None:
+            return (
+                kappa
+                * self.linear_kernel(
+                    torch.t(self.map.map(a)), torch.t(self.map.map(b))
+                ).detach()
+            )
+        else:
+            return kappa * self.linear_kernel(a, b)
+
+    def laplace_kernel(self, a, b, **kwargs):
+        if "gamma" in kwargs.keys():
+            gamma = kwargs["gamma"]
+        else:
+            gamma = self.gamma
+
+        if "kappa" in kwargs.keys():
+            kappa = kwargs["kappa"]
+        else:
+            kappa = self.kappa
+        if "group" in kwargs.keys():
+            group = kwargs["group"]
+        else:
+            group = self.group
+
+        a = a[:, group]
+        b = b[:, group]
+        K = -manhattan_distances(a, b) / gamma**2
+        K = np.exp(K)  # exponentiate K in-place
+        return kappa * torch.from_numpy(K).T
+
+    def squared_exponential_kernel(self, a, b, **kwargs):
+        """
+        GP squared exponential kernel
+        """
+        if "gamma" in kwargs.keys():
+            gamma = kwargs["gamma"]
+        else:
+            gamma = self.gamma
+
+        if "kappa" in kwargs.keys():
+            kappa = kwargs["kappa"]
+        else:
+            kappa = self.kappa
+
+        if "group" in kwargs.keys():
+            group = kwargs["group"]
+        else:
+            group = self.group
+
+        a = a[:, group]
+        b = b[:, group]
+        # 	print (a.shape, b.shape)
+        normx = torch.sum(a**2, dim=1).view(-1, 1)
+        normy = torch.sum(b**2, dim=1).view(-1, 1)
+
+        product = torch.mm(b, torch.t(a))
+        # sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
+        sqdist = -2 * product + torch.t(normx) + normy
+        arg = (-0.5 / (gamma * gamma)) * sqdist
+        res = torch.exp(arg)
+        return kappa * res
+
+    def gibbs_custom_kernel(self, a, b, **kwargs):
+        if "gamma_fun" in kwargs.keys():
+            gamma_fun = kwargs["gamma_fun"]
+        else:
+            raise AttributeError("Missing gamma_fun in Gibbs kernel definition.")
+
+        if "kappa" in kwargs.keys():
+            kappa = kwargs["kappa"]
+        else:
+            kappa = self.kappa
+        if "group" in kwargs.keys():
+            group = kwargs["group"]
+        else:
+            group = self.group
+
+        a = a[:, group]
+        b = b[:, group]
+        # 	print (a.shape, b.shape)
+        normx = torch.sum(a**2, dim=1).view(-1, 1)
+        normy = torch.sum(b**2, dim=1).view(-1, 1)
+
+        product = torch.mm(b, torch.t(a))
+        # sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
+        sqdist = -2 * product + torch.t(normx) + normy
+
+        lengthscales = gamma_fun(a, b)
+
+        arg = (-0.5 / lengthscales) * sqdist
+        res = torch.exp(arg)
+        return kappa * res
+
+    def gibbs_kernel(self, a, b, **kwargs):
+        if "gamma_fun" in kwargs.keys():
+            gamma_fun = kwargs["gamma_fun"]
+        else:
+            raise AttributeError("Missing gamma_fun in Gibbs kernel definition.")
+
+        if "kappa" in kwargs.keys():
+            kappa = kwargs["kappa"]
+        else:
+            kappa = self.kappa
+        if "group" in kwargs.keys():
+            group = kwargs["group"]
+        else:
+            group = self.group
+
+        a = a[:, group]
+        b = b[:, group]
+        # 	print (a.shape, b.shape)
+        normx = torch.sum(a**2, dim=1).view(-1, 1)
+        normy = torch.sum(b**2, dim=1).view(-1, 1)
+
+        product = torch.mm(b, torch.t(a))
+        # sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
+        sqdist = -2 * product + torch.t(normx) + normy
+
+        lengthscales = gamma_fun(a) ** 2 + gamma_fun(b).T ** 2
+
+        print(lengthscales)
+
+        arg = (-0.5 / lengthscales) * sqdist
+        res = torch.exp(arg)
+        return kappa * res
+
+    def covar_kernel(self, a, b, **kwargs):
+        """
+        :param a:
+        :param b:
+        :param cov: square-root of the covariance matrix
+        :return:
+        """
+
+        if "kappa" in kwargs.keys():
+            kappa = kwargs["kappa"]
+        else:
+            kappa = self.kappa
+
+        if "cov" in kwargs.keys():
+            cov = kwargs["cov"]
+        else:
+            cov = self.cov
+        if "group" in kwargs.keys():
+            group = kwargs["group"]
+        else:
+            group = self.group
+
+        a = a[:, group]
+        b = b[:, group]
+        a = torch.mm(a, cov)
+        b = torch.mm(b, cov)
+
+        normx = torch.sum(a**2, dim=1).reshape(-1, 1)
+        normy = torch.sum(b**2, dim=1).reshape(-1, 1)
+
+        product = torch.mm(b, torch.t(a))
+        sqdist = -2 * product + torch.t(normx) + normy
+        arg = -0.5 * sqdist
+        res = torch.exp(arg)
+        return kappa * res
+
+    def covar_kernel_matern(self, a, b, **kwargs):
+        """
+        :param a:
+        :param b:
+        :param cov: square-root of the covariance matrix
+        :return:
+        """
+
+        if "kappa" in kwargs.keys():
+            kappa = kwargs["kappa"]
+        else:
+            kappa = self.kappa
+
+        if "cov" in kwargs.keys():
+            cov = kwargs["cov"]
+        else:
+            cov = self.cov
+        if "v" in kwargs.keys():
+            v = kwargs["v"]
+        else:
+            v = self.v
+        if "group" in kwargs.keys():
+            group = kwargs["group"]
+        else:
+            group = self.group
+
+        a = a[:, group]
+        b = b[:, group]
+        a = torch.mm(a, cov)
+        b = torch.mm(b, cov)
+
+        dists = torch.cdist(a, b, p=2).T
+
+        if v == 0.5:
+            K = torch.exp(-dists)
+        elif v == 1.5:
+            K = dists * np.sqrt(3)
+            K = (1.0 + K) * torch.exp(-K)
+        elif v == 2.5:
+            K = dists * np.sqrt(5)
+            K = (1.0 + K + K**2 / 3.0) * torch.exp(-K)
+        else:  # general case; expensive to evaluate
+            K = dists
+            K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
+            tmp = np.sqrt(2 * v) * K
+            K.fill((2 ** (1.0 - v)) / math.gamma(v))
+            K *= tmp**v
+            K *= kv(v, tmp)
+        return kappa * K
+
+    def ard_kernel(self, a, b, **kwargs):
+
+        if "kappa" in kwargs.keys():
+            kappa = kwargs["kappa"]
+        else:
+            kappa = self.kappa
+
+        if "ard_gamma" in kwargs.keys():
+            gamma = kwargs["ard_gamma"]
+        else:
+            gamma = self.ard_gamma
+
+        if "group" in kwargs.keys():
+            group = kwargs["group"]
+        else:
+            group = self.group
+
+        a = a[:, group]
+        b = b[:, group]
+
+        D = torch.diag(1.0 / (gamma[group]))
+        a = torch.mm(a, D)
+        b = torch.mm(b, D)
+        normx = torch.sum(a**2, dim=1).reshape(-1, 1)
+        normy = torch.sum(b**2, dim=1).reshape(-1, 1)
+
+        product = torch.mm(b, torch.t(a))
+        # sqdist = torch.tile(normx, b.shape[0]).T + torch.tile(normy, a.shape[0]) - 2 * product
+        sqdist = -2 * product + torch.t(normx) + normy
+        arg = -0.5 * sqdist
+        res = torch.exp(arg)
+        return kappa * res
+
+    def ard_kernel_diag(self, a, b, **kwargs):
+        """
+        ARD squared-exponential kernel between the rows of a and the rows of b.
+
+        NOTE(review): despite the name, this evaluates the full cross matrix
+        with the same computation as ard_kernel, not only a diagonal --
+        confirm which one callers expect.
+
+        :param a: 2-D tensor, one point per row
+        :param b: 2-D tensor, one point per row
+        :param kwargs: optional "kappa", "ard_gamma" and "group" overrides of
+            the instance attributes with the same names
+        :return: kappa * exp(-0.5 * scaled squared distance), with one row per
+            row of b and one column per row of a
+        """
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        lengthscales = kwargs["ard_gamma"] if "ard_gamma" in kwargs else self.ard_gamma
+        group = kwargs["group"] if "group" in kwargs else self.group
+
+        # restrict to the active columns and rescale them by 1 / lengthscale
+        a_active, b_active = a[:, group], b[:, group]
+        inv_scale = torch.diag(1.0 / (lengthscales[group]))
+        a_scaled = torch.mm(a_active, inv_scale)
+        b_scaled = torch.mm(b_active, inv_scale)
+
+        # squared distances from ||a||^2 + ||b||^2 - 2 <b, a> via broadcasting
+        sq_norm_a = torch.sum(a_scaled**2, dim=1).reshape(1, -1)
+        sq_norm_b = torch.sum(b_scaled**2, dim=1).reshape(-1, 1)
+        cross = torch.mm(b_scaled, torch.t(a_scaled))
+        sqdist = -2 * cross + sq_norm_a + sq_norm_b
+
+        return kappa * torch.exp(-0.5 * sqdist)
+
+    def ard_per_group_kernel_additive(self, a, b, **kwargs):
+        """
+        Average of ARD squared-exponential kernels, one per group of columns.
+
+        The lengthscales of all groups are stored back to back in
+        "ard_per_group": the first len(groups[0]) entries belong to the first
+        group, the next len(groups[1]) entries to the second, and so on.
+
+        :param a: 2-D tensor, one point per row
+        :param b: 2-D tensor, one point per row
+        :param kwargs: must contain "ard_per_group"; "kappa" and "groups"
+            optionally override the instance attributes of the same name
+        :return: kappa times the mean of the per-group kernel matrices, with
+            one row per row of b and one column per row of a (float64 zeros
+            are used as the accumulator)
+        :raises AssertionError: if "ard_per_group" is not supplied
+        """
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        groups = kwargs["groups"] if "groups" in kwargs else self.groups
+
+        if "ard_per_group" not in kwargs:
+            raise AssertionError(
+                "This kernel requires 'ard_per_group' initial parameters"
+            )
+        ard_per_group = kwargs["ard_per_group"]
+
+        (n, _) = tuple(a.size())
+        (q, _) = tuple(b.size())
+        total = torch.zeros(size=(q, n), dtype=torch.float64)
+
+        offset = 0  # start of the current group's slice of ard_per_group
+        for columns in groups:
+            width = len(columns)
+            lengthscales = ard_per_group[offset : offset + width]
+            offset += width
+
+            # select the group's columns and divide them by their lengthscales
+            a_active, b_active = a[:, columns], b[:, columns]
+            inv_scale = torch.diag(1.0 / (lengthscales))
+            a_scaled = torch.mm(a_active, inv_scale)
+            b_scaled = torch.mm(b_active, inv_scale)
+
+            # ||b - a||^2 = ||a||^2 + ||b||^2 - 2 <b, a>, via broadcasting
+            sq_norm_a = torch.sum(a_scaled**2, dim=1).reshape(1, -1)
+            sq_norm_b = torch.sum(b_scaled**2, dim=1).reshape(-1, 1)
+            cross = torch.mm(b_scaled, torch.t(a_scaled))
+            total = total + torch.exp(-0.5 * (-2 * cross + sq_norm_a + sq_norm_b))
+
+        return kappa * (total / float(len(groups)))
+
+    def squared_exponential_per_group_kernel_additive(self, a, b, **kwargs):
+        """
+        Average of squared-exponential kernels, one per group of columns, each
+        evaluated with its own lengthscale.
+
+        Every group is delegated to squared_exponential_kernel with "group" and
+        "gamma" set in the forwarded keyword arguments.
+
+        NOTE(review): a supplied "kappa" is forwarded to the per-group call as
+        well as applied to the average below -- check it is not counted twice.
+
+        :param a: tensor, one point per row
+        :param b: tensor, one point per row
+        :param kwargs: must contain "gamma_per_group" (iterated in lockstep with
+            the groups); "kappa" and "groups" optionally override the attributes
+        :return: kappa * (sum of per-group kernels) / number of groups
+        :raises AssertionError: if "gamma_per_group" is not supplied
+        """
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        groups = kwargs["groups"] if "groups" in kwargs else self.groups
+        if "gamma_per_group" not in kwargs:
+            raise AssertionError(
+                "This kernel requires 'gamma_per_group' initial parameters"
+            )
+
+        (n, _) = tuple(a.size())
+        (q, _) = tuple(b.size())
+        total = torch.zeros(size=(q, n), dtype=torch.float64)
+        for columns, lengthscale in zip(groups, kwargs["gamma_per_group"]):
+            kwargs["group"] = columns
+            kwargs["gamma"] = lengthscale  # per-group lengthscale
+            total = total + self.squared_exponential_kernel(a, b, **kwargs)
+        return kappa * total / float(len(groups))
+
+    def ard_kernel_additive(self, a, b, **kwargs):
+        """
+        Average of ard_kernel evaluations, one per group of columns.
+
+        The inputs are first restricted to "group"; every entry of "groups"
+        is then passed to ard_kernel as its "group", so it indexes into the
+        already restricted columns.
+
+        :param a: 2-D tensor, one point per row
+        :param b: 2-D tensor, one point per row
+        :param kwargs: optional "groups" and "group" overrides of the instance
+            attributes; all keyword arguments are forwarded to ard_kernel
+        :return: sum of the per-group kernels divided by the number of groups
+            (the scale kappa is applied inside ard_kernel)
+        """
+        # the value is unused below; ard_kernel applies its own kappa
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        groups = kwargs["groups"] if "groups" in kwargs else self.groups
+        group = kwargs["group"] if "group" in kwargs else self.group
+
+        a, b = a[:, group], b[:, group]
+        (n, _) = tuple(a.size())
+        (q, _) = tuple(b.size())
+
+        total = torch.zeros(size=(q, n), dtype=torch.float64)
+        for columns in groups:
+            kwargs["group"] = columns
+            total = total + self.ard_kernel(a, b, **kwargs)
+
+        return total / float(len(groups))
+
+    def tanh_kernel(self, a, b, **kwargs):
+        """
+        Kernel built from the Manhattan distance D between rows:
+        kappa * tanh(D)^3 / (1e-9 + D^3), evaluated in numpy and converted
+        back to a torch tensor.
+
+        :param a: 2-D tensor, one point per row
+        :param b: 2-D tensor, one point per row
+        :param kwargs: optional "kappa" and "group" overrides of the instance
+            attributes of the same name
+        :return: tensor holding the transposed distance-based matrix times kappa
+        """
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        group = kwargs["group"] if "group" in kwargs else self.group
+
+        a_np = a[:, group].numpy()
+        b_np = b[:, group].numpy()
+
+        # presumably called for its input validation; the result is unused
+        X, Y = check_pairwise_arrays(a_np, b_np)
+        dist = manhattan_distances(a_np, b_np).T
+
+        eps = 10e-10  # keeps the ratio finite where the distance is zero
+        q = 3
+        ratio = (np.tanh(dist) ** q) / (eps + dist**q)
+        return kappa * torch.from_numpy(ratio)
+
+    def angsim_kernel(self, a, b, **kwargs):
+        """
+        Angular similarity: kappa * (2 / pi) * arcsin of the cosine between a and b.
+        """
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        cosine = (a.dot(b)) / (a.norm() * b.norm())
+        return kappa * (2.0 / np.pi) * np.arcsin(cosine)
+
+    def polynomial_kernel(self, a, b, **kwargs):
+        """
+        Polynomial kernel kappa * (<b_i, a_j> + 1) ** degree.
+
+        :param a: 2-D tensor, one point per row
+        :param b: 2-D tensor, one point per row
+        :param kwargs: optional overrides "kappa" (self.kappa), "degree"
+            (self.power) and "group" (self.group, the columns to use)
+        :return: tensor with one row per row of b and one column per row of a
+        """
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        power = kwargs["degree"] if "degree" in kwargs else self.power
+        group = kwargs["group"] if "group" in kwargs else self.group
+
+        a_active = a[:, group]
+        b_active = b[:, group]
+
+        gram = torch.mm(b_active, torch.t(a_active))
+        return kappa * (gram + 1) ** power
+
+    def polynomial_additive_kernel(self, a, b, **kwargs):
+        """
+        Average of polynomial_kernel evaluations over groups of columns.
+
+        NOTE(review): inputs are restricted to "group", sliced again per entry
+        of "groups", and polynomial_kernel then applies its own "group"
+        selection -- confirm the nested indexing is intended.
+
+        :param a: 2-D tensor, one point per row
+        :param b: 2-D tensor, one point per row
+        :param kwargs: optional "groups"/"group" overrides; forwarded unchanged
+        :return: sum of the per-group kernels divided by the number of groups
+        """
+        groups = kwargs["groups"] if "groups" in kwargs else self.groups
+        group = kwargs["group"] if "group" in kwargs else self.group
+        a, b = a[:, group], b[:, group]
+        (n, _) = tuple(a.size())
+        (q, _) = tuple(b.size())
+        no_groups = float(len(groups))
+        total = torch.zeros(size=(q, n), dtype=torch.float64)
+        for cols in groups:
+            total = total + self.polynomial_kernel(a[:, cols], b[:, cols], **kwargs)
+        return total / no_groups
+
+    def matern_kernel(self, a, b, **kwargs):
+        """
+        Matern kernel on Euclidean distances with a single lengthscale.
+
+        nu in {0.5, 1.5, 2.5} uses a closed form; any other value goes through
+        the general formula with kv (presumably scipy.special.kv), which is
+        more expensive. The computation runs in numpy and the result is
+        converted back with torch.from_numpy.
+
+        :param a: 2-D tensor, one point per row
+        :param b: 2-D tensor, one point per row
+        :param kwargs: optional overrides "kappa" (self.kappa), "nu" (self.v),
+            "gamma" (self.gamma, divides the inputs) and "group" (self.group,
+            the columns to use)
+        :return: kappa * K as a tensor, with one row per row of b and one
+            column per row of a
+        """
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        v = kwargs["nu"] if "nu" in kwargs else self.v
+        gamma = kwargs["gamma"] if "gamma" in kwargs else self.gamma
+        group = kwargs["group"] if "group" in kwargs else self.group
+
+        # select the active columns and switch to numpy for cdist
+        a_np = a[:, group].numpy()
+        b_np = b[:, group].numpy()
+        # distances between the rescaled points, transposed so rows follow b
+        dists = cdist(a_np / gamma, b_np / gamma, metric="euclidean").T
+
+        # closed forms for the common half-integer smoothness values
+        if v == 0.5:
+            K = np.exp(-dists)
+        elif v == 1.5:
+            scaled = dists * math.sqrt(3)
+            K = (1.0 + scaled) * np.exp(-scaled)
+        elif v == 2.5:
+            scaled = dists * math.sqrt(5)
+            K = (1.0 + scaled + scaled**2 / 3.0) * np.exp(-scaled)
+        else:
+            # general case; expensive to evaluate
+            # K aliases dists, so the distance array is modified in place
+            K = dists
+            K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
+            scaled = math.sqrt(2 * v) * K
+            # overwrite K in place with the constant 2^(1 - v) / Gamma(v)
+            K.fill((2 ** (1.0 - v)) / math.gamma(v))
+            K *= scaled**v
+            K *= kv(v, scaled)
+
+        return kappa * torch.from_numpy(K)
+
+    def ard_matern_kernel_diag(self, a, b, **kwargs):
+        """
+        Matern kernel value for ARD-scaled inputs, based on one overall distance.
+
+        The selected columns of a and b are divided by their lengthscales and a
+        single Euclidean norm of (a - b) is taken over all entries, so a and b
+        must broadcast against each other and K is a 0-d tensor.
+
+        NOTE(review): the name suggests a per-pair diagonal; the reduction over
+        all entries yields one number instead -- confirm against the callers.
+
+        :param a: 2-D tensor, one point per row
+        :param b: 2-D tensor, one point per row
+        :param kwargs: optional overrides "kappa" (self.kappa), "nu" (self.v),
+            "ard_gamma" (self.ard_gamma, lengthscales indexed by group) and
+            "group" (self.group, the columns to use)
+        :return: kappa * K
+        """
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        v = kwargs["nu"] if "nu" in kwargs else self.v
+        ard_gamma = kwargs["ard_gamma"] if "ard_gamma" in kwargs else self.ard_gamma
+        group = kwargs["group"] if "group" in kwargs else self.group
+
+        # Select the active columns BEFORE rescaling (as ard_kernel does): the
+        # diagonal matrix has len(group) entries, so multiplying the unsliced
+        # inputs failed for a proper subset of columns and paired columns with
+        # the wrong lengthscale for a reordered group.
+        D = torch.diag(1.0 / (ard_gamma[group]))
+        a = torch.mm(a[:, group], D)
+        b = torch.mm(b[:, group], D)
+
+        # one norm over every entry of the difference (no per-row reduction)
+        dists = torch.sqrt(torch.sum((a - b) ** 2))
+
+        if v == 0.5:
+            K = torch.exp(-dists)
+        elif v == 1.5:
+            K = dists * np.sqrt(3)
+            K = (1.0 + K) * torch.exp(-K)
+        elif v == 2.5:
+            K = dists * np.sqrt(5)
+            K = (1.0 + K + K**2 / 3.0) * torch.exp(-K)
+        else:  # general case; expensive to evaluate
+            # NOTE(review): this branch mirrors the numpy code of matern_kernel
+            # (in-place K.fill, kv applied to the data) on a torch tensor --
+            # verify it runs for torch inputs before relying on it.
+            K = dists
+            K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
+            tmp = np.sqrt(2 * v) * K
+            K.fill((2 ** (1.0 - v)) / math.gamma(v))
+            K *= tmp**v
+            K *= kv(v, tmp)
+
+        return kappa * K
+
+    def ard_matern_kernel(self, a, b, **kwargs):
+        """
+        Matern kernel with one lengthscale per selected column (ARD).
+
+        Pairwise Euclidean distances between the rescaled rows come from
+        torch.cdist; nu in {0.5, 1.5, 2.5} then uses a closed form and any
+        other value falls through to the general Bessel-function branch.
+
+        :param a: 2-D tensor, one point per row
+        :param b: 2-D tensor, one point per row
+        :param kwargs: optional overrides "kappa" (self.kappa), "nu" (self.v),
+            "ard_gamma" (self.ard_gamma, lengthscales indexed by group) and
+            "group" (self.group, the columns to use)
+        :return: kappa * K, with one row per row of b and one column per row
+            of a
+        """
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        v = kwargs["nu"] if "nu" in kwargs else self.v
+        ard_gamma = kwargs["ard_gamma"] if "ard_gamma" in kwargs else self.ard_gamma
+        group = kwargs["group"] if "group" in kwargs else self.group
+
+        # Select the active columns BEFORE rescaling (as ard_kernel does): the
+        # diagonal matrix has len(group) entries, so multiplying the unsliced
+        # inputs failed for a proper subset of columns and paired columns with
+        # the wrong lengthscale for a reordered group.
+        D = torch.diag(1.0 / (ard_gamma[group]))
+        a = torch.mm(a[:, group], D)
+        b = torch.mm(b[:, group], D)
+
+        # cdist returns one row per row of a; the transpose makes the rows
+        # follow b and the columns follow a, like the other kernels here
+        dists = torch.cdist(a, b, p=2).T
+
+        # closed forms for the common half-integer smoothness values
+        if v == 0.5:
+            K = torch.exp(-dists)
+        elif v == 1.5:
+            K = dists * np.sqrt(3)
+            K = (1.0 + K) * torch.exp(-K)
+        elif v == 2.5:
+            K = dists * np.sqrt(5)
+            K = (1.0 + K + K**2 / 3.0) * torch.exp(-K)
+        else:  # general case; expensive to evaluate
+            # NOTE(review): this branch mirrors the numpy code of matern_kernel
+            # (in-place K.fill, kv applied to the data) on a torch tensor --
+            # verify it runs for torch inputs before relying on it.
+            K = dists
+            K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
+            tmp = np.sqrt(2 * v) * K
+            K.fill((2 ** (1.0 - v)) / math.gamma(v))
+            K *= tmp**v
+            K *= kv(v, tmp)
+
+        return kappa * K
+
+    def modified_matern_kernel(self, X, Y, **kwargs):
+        """
+        Product over the selected columns of one-dimensional Matern-type kernels.
+
+        For each column the coordinate distances, computed on inputs divided by
+        gamma, go through a closed form chosen by nu; the per-column kernels are
+        multiplied entrywise. The distances are computed with cdist on numpy
+        arrays and converted back to torch tensors.
+
+        :param X: 2-D tensor, one point per row
+        :param Y: 2-D tensor, one point per row
+        :param kwargs: optional overrides "kappa" (self.kappa), "nu" (self.v,
+            must be 1, 2, 3 or 4), "gamma" (self.gamma) and "group"
+            (self.group, the columns to use)
+        :return: tensor kappa * Z, with one row per row of Y and one column
+            per row of X
+        :raises AssertionError: if nu is not one of the implemented values
+        """
+        kappa = kwargs["kappa"] if "kappa" in kwargs else self.kappa
+        v = kwargs["nu"] if "nu" in kwargs else self.v
+        gamma = kwargs["gamma"] if "gamma" in kwargs else self.gamma
+        group = kwargs["group"] if "group" in kwargs else self.group
+
+        # The inputs are called X and Y in this method; slicing the undefined
+        # names a and b here raised UnboundLocalError on every call.
+        X = X[:, group]
+        Y = Y[:, group]
+
+        d = X.size()[1]
+        # running entrywise product of the per-column kernels
+        Z = torch.ones(size=(Y.size()[0], X.size()[0]), dtype=torch.float64)
+
+        for i in range(d):
+            # column i of each input as a single-column matrix for cdist
+            a = X[:, i].view(-1, 1)
+            b = Y[:, i].view(-1, 1)
+            # for single-column inputs the Euclidean distance is |a - b|
+            dists = cdist(a.numpy() / gamma, b.numpy() / gamma, metric="euclidean").T
+            dists = torch.from_numpy(dists)
+
+            if v == 1:
+                K = torch.exp(-dists)
+            elif v == 2:
+                K = (1 + dists) * torch.exp(-dists)
+            elif v == 3:
+                K = (dists**2 + 3 * torch.abs(dists) + 3) * torch.exp(-dists) / 3.0
+            elif v == 4:
+                K = (
+                    (dists**3 + 6 * dists**2 + 15 * torch.abs(dists) + 15)
+                    * torch.exp(-dists)
+                    / 15.0
+                )
+            else:
+                raise AssertionError("Kernel with nu = " + str(v) + "not implemented.")
+            Z = Z * K
+
+        return kappa * Z
+
+    def spectral_kernel(self, a, b):
+        """
+        Kernel from the frequencies in self.freq: for each pair of points, the
+        sum of cos((x - y) * freq), divided by self.freq.size()[0].
+        Each row is viewed as a 1 x 1 matrix, so only single-column inputs work.
+
+        :return: float64 tensor, one row per row of b, one column per row of a
+        :raises AssertionError: if self.freq is None
+        """
+        if self.freq is None:
+            raise AssertionError("No frequencies passed")
+        (n, _), (m, _) = a.size(), b.size()
+        dist = torch.zeros(size=(n, m), dtype=torch.float64)
+        for row, x in enumerate(a):
+            for col, y in enumerate(b):
+                phase = torch.mm(x.view(1, 1) - y.view(1, 1), self.freq)
+                dist[row, col] = torch.sum(torch.cos(phase))
+        return torch.t(dist) / self.freq.size()[0]
+
+    def wiener_kernel(self, a, b):
+        """
+        Wiener process kernel
+        k(x,y) = min(x,y)
+        k(x,y) = sum_i min(x_i,y_i)
+
+        :param a: 2-D tensor of shape (n, d), one point per row
+        :param b: 2-D tensor of shape (m, d), one point per row
+        :return: tensor of shape (m, n), in the dtype of the inputs, whose entry
+            (j, i) is sum_k min(a[i, k], b[j, k])
+        """
+        # The previous per-pair loop printed every pair and handed a numpy
+        # scalar to torch.from_numpy, which only accepts ndarrays. Broadcasting
+        # (n, 1, d) against (1, m, d) yields all coordinate-wise minima at once.
+        pairwise_min = torch.min(a.unsqueeze(1), b.unsqueeze(0))
+
+        # sum over the coordinate axis: (n, m, d) -> (n, m)
+        dist = pairwise_min.sum(dim=2)
+
+        # rows follow b and columns follow a, as in the other kernels
+        return dist.T
+
+    def derivative_1(self, fixed, x):
+        """
+        First derivative of the squared-exponential kernel (presumably w.r.t. x).
+
+        :param fixed: 2-D tensor of shape (size, d)
+        :param x: 2-D tensor of shape (n, d)
+        :return: tensor of shape (n, size, d) holding
+            kappa * k[i, j] * (fixed[j] - x[i]) / gamma**2
+        :raises AssertionError: if self.optkernel is not "squared_exponential"
+        """
+        if self.optkernel != "squared_exponential":
+            raise AssertionError("Not implemented for this kernel")
+
+        k_original = self.squared_exponential_kernel(fixed, x)
+        # (size, 1, d) - (n, d) broadcasts to (size, n, d)
+        second = (fixed.unsqueeze(1) - x) / self.gamma**2
+        return self.kappa * torch.einsum("ij,jik->ijk", k_original, second)
+
+    def derivative_2(self, fixed, x):
+        """
+        Second-derivative tensor of the squared-exponential kernel:
+        kappa * k * (s s^T - I / gamma**2) with s = (fixed - x) / gamma**2.
+
+        :param fixed: 2-D tensor of shape (size, d)
+        :param x: 2-D tensor of shape (n, d)
+        :return: tensor of shape (n, size, d, d)
+        :raises AssertionError: if self.optkernel is not "squared_exponential"
+        """
+        d = x.size()[1]
+        n = x.size()[0]
+        size = fixed.size()[0]
+        if self.optkernel != "squared_exponential":
+            raise AssertionError("Not implemented for this kernel")
+
+        k_original = self.squared_exponential_kernel(fixed, x)
+        second = (fixed.unsqueeze(1) - x) / self.gamma**2
+        outer = torch.einsum("ijk,ijl->ijkl", second, second)
+        res1 = torch.einsum("ij,jikl->ijkl", k_original, outer)
+        # -I / gamma**2, repeated for every (fixed, x) pair
+        identity = torch.eye(d).expand(size, n, d, d)
+        curvature = -identity / self.gamma**2
+        res2 = torch.einsum("ij,jikl->ijkl", k_original, curvature)
+        return self.kappa * (res1 + res2)
+
+    def square_dist(self, a, b):
+        """
+        Pairwise squared Euclidean distances between rows of two numpy arrays.
+
+        :param a: array of shape (n, d)
+        :param b: array of shape (q, d)
+        :return: array of shape (q, n); entry (i, j) is ||b_i - a_j||^2
+        """
+        sq_a = np.sum(a**2, axis=1).reshape(1, -1)
+        sq_b = np.sum(b**2, axis=1).reshape(-1, 1)
+        return sq_a + sq_b - 2 * b.dot(a.T)
diff --git a/stpy/legacy/integral_kernels.py b/stpy/legacy/integral_kernels.py
index fef208c..d56aa98 100755
--- a/stpy/legacy/integral_kernels.py
+++ b/stpy/legacy/integral_kernels.py
@@ -9,576 +9,614 @@
 
 class IntegralKernel:
 
-	def __init__(self, dataset, s=0.1):
-
-		self.x = dataset[0]
-		self.y = dataset[1]
-
-		self.s = s
-		self.gamma = 1.0
-		self.distibution = lambda size: torch.from_numpy(np.random.normal(size=size) * (1. / self.gamma))
-
-		self.n = self.x.size()[0]
-		self.d = self.x.size()[1]
-
-		self.basis_func = lambda x, theta: torch.cat((torch.cos(torch.mm(theta, x)), torch.sin(torch.mm(theta, x))), 1)
-		self.size = 2
-
-		self.set = []
-		self.weights = []
-		self.params = []
-		self.active_basis = None
-
-	def set_distribution(self, distibution):
-		self.distibution = distibution
-
-	def set_basis_function(self, fun, size):
-		self.basis_func = fun
-		self.size = size
-
-	def sample_basis_function(self):
-		param = self.distibution(self.d).view(-1, 1)
-		return [self.get_basis_function(param), param]
-
-	def sample_basis_function_qmc(self, size=1):
-		inv_cum_dist = lambda x: norm.ppf(x) * (1. / 1.)
-		params = torch.from_numpy(sample_qmc_halton(inv_cum_dist, size=(size, self.d)))
-		return params
-
-	def sample_basis_vector(self):
-		fun = self.sample_basis_function()[0]
-		return fun(self.x).view(-1) / np.sqrt(self.n)
-
-	def get_basis_function(self, param):
-		return lambda x: self.basis_func(param, x)
-
-	def add_to_basis(self, fun, weight, param):
-		self.set.append(fun)
-		self.weights.append(weight)
-		self.params.append(param)
-
-	def basis_func_dataset(self, param):
-		return self.basis_func(param, self.x).view(-1) / np.sqrt(self.n)
-
-	def basis_map_set(self, x, set, weights):
-		value = torch.zeros(len(set), x.size()[0] * self.size, dtype=torch.float64)
-		# print (value.size(),x.size(),self.set[0](x).view(-1).size())
-		for index, elem in enumerate(set):
-			# print (np.sqrt(np.array(self.weights[index]).astype(complex)))
-			value[index, :] = elem(x).view(-1) / np.sqrt(self.n)  # * np.sqrt(weights[index])
-		return value
-
-	def empty(self):
-		self.active_basis = None
-		self.set = []
-		self.weights = []
-		self.params = []
-
-	def empty_add_random(self):
-		self.empty()
-		self.random_increase(1)
-
-	def basis_map(self, x):
-		return self.basis_map_set(x, self.set, self.weights)
-
-	def kernel(self, x, y, noise=True):
-		value = torch.zeros(x.size()[0], y.size()[0], dtype=torch.float64)
-
-		for index, elem in enumerate(self.set):
-			value += torch.mm(elem(x), torch.t(elem(y))) * self.weights[index]
-		if noise == True:
-			value = value + self.s * self.s * torch.eye(x.size()[0], y.size()[0], dtype=torch.float64)
-
-		return value
-
-	def outer_kernel(self, x):
-		Phi = self.basis_map(x)
-		value = torch.mm(Phi, torch.t(Phi))
-		return value
-
-	def expected_phi(self, x, base=10000):
-		Ephi = torch.zeros(x.size()[0] * self.size, dtype=torch.float64)
-		for _ in range(base):
-			Ephi += self.sample_basis_function()[0](x).view(-1) / np.sqrt(self.n)
-		Ephi = Ephi / base
-		return Ephi
-
-	def expected_phi_squared(self, x, fun, base=10000):
-		prod = 0
-		v = fun(x).view(-1) / np.sqrt(self.n)
-		for _ in range(base):
-			sample = self.sample_basis_function()[0](x).view(-1) / np.sqrt(self.n)
-			prod += torch.dot(sample, v) ** 2
-		prod = prod / base
-		return prod
-
-	def expected_phi_squared_set(self, x, base=10000):
-		v = self.active_basis
-
-		prod = torch.zeros(x.size()[0], )
-		for _ in range(base):
-			sample = self.sample_basis_function()[0](x).view(-1) / np.sqrt(self.n)
-			prod += torch.mm(sample, v) ** 2
-		prod = prod / base
-		return prod
-
-	def update_basis(self):
-		if self.active_basis is None:
-			Phi = self.basis_map(self.x)
-			self.active_basis = Phi
-			W = torch.mm(Phi, torch.t(Phi)) + self.s * self.s * torch.eye(len(self.set), dtype=torch.float64)
-			self.W_inv = torch.inverse(W)
-		else:
-			v = self.set[-1](self.x).view(1, -1) / np.sqrt(self.x.size()[0])
-			self.active_basis = torch.cat((self.active_basis, v), dim=0)
-			W = torch.mm(self.active_basis, torch.t(self.active_basis)) + self.s * self.s * torch.eye(len(self.set),
-																									  dtype=torch.float64)
-			self.W_inv = torch.inverse(W)
-
-	"""
+    def __init__(self, dataset, s=0.1):
+        """Store the dataset and set up the default sampler and cos/sin basis."""
+        self.x = dataset[0]
+        self.y = dataset[1]
+        # s enters the kernel and Gram matrices as s * s on the diagonal
+        self.s = s
+        self.gamma = 1.0
+        self.distibution = lambda size: torch.from_numpy(
+            np.random.normal(size=size) * (1.0 / self.gamma)
+        )
+        # n rows and d columns of the stored inputs x
+        self.n = self.x.size()[0]
+        self.d = self.x.size()[1]
+        # NOTE(review): invoked as basis_func(param, data), so "x" below is the parameter
+        self.basis_func = lambda x, theta: torch.cat(
+            (torch.cos(torch.mm(theta, x)), torch.sin(torch.mm(theta, x))), 1
+        )
+        self.size = 2
+        # selected basis callables with their weights and parameters (see add_to_basis)
+        self.set = []
+        self.weights = []
+        self.params = []
+        self.active_basis = None
+
+    def set_distribution(self, distibution):  # replace the parameter sampler
+        self.distibution = distibution  # called as distibution(self.d) when sampling
+
+    def set_basis_function(self, fun, size):  # replace the basis function
+        self.basis_func = fun  # invoked as fun(param, x) by get_basis_function
+        self.size = size  # flattened fun output is expected to have x.size()[0] * size entries
+
+    def sample_basis_function(self):  # draw one random basis function
+        param = self.distibution(self.d).view(-1, 1)  # sampled parameter as a column
+        return [self.get_basis_function(param), param]  # [callable, its parameter]
+
+    def sample_basis_function_qmc(self, size=1):  # parameters from sample_qmc_halton
+        inv_cum_dist = lambda x: norm.ppf(x) * (1.0 / 1.0)  # inverse CDF (norm is presumably scipy.stats.norm)
+        params = torch.from_numpy(sample_qmc_halton(inv_cum_dist, size=(size, self.d)))
+        return params  # parameters only; no basis callables are built here
+
+    def sample_basis_vector(self):  # random basis function evaluated on the stored inputs
+        fun = self.sample_basis_function()[0]  # keep the callable, drop the parameter
+        return fun(self.x).view(-1) / np.sqrt(self.n)  # flattened, scaled by 1 / sqrt(n)
+
+    def get_basis_function(self, param):  # bind param into a one-argument callable
+        return lambda x: self.basis_func(param, x)  # self.basis_func is looked up at call time
+
+    def add_to_basis(self, fun, weight, param):  # append one basis element
+        self.set.append(fun)  # the three lists are extended together, index-aligned
+        self.weights.append(weight)
+        self.params.append(param)
+
+    def basis_func_dataset(self, param):  # basis function for param on the stored inputs
+        return self.basis_func(param, self.x).view(-1) / np.sqrt(self.n)  # flattened, / sqrt(n)
+
+    def basis_map_set(self, x, set, weights):
+        """
+        Stack the flattened basis functions evaluated on x, one per row, each
+        divided by sqrt(self.n). The weights argument is currently unused.
+        """
+        value = torch.zeros(len(set), x.size()[0] * self.size, dtype=torch.float64)
+        for row, basis in enumerate(set):
+            value[row, :] = basis(x).view(-1) / np.sqrt(self.n)
+        return value
+
+    def empty(self):
+        """Drop every selected basis function and the cached active basis."""
+        self.active_basis = None
+        # three fresh, independent lists
+        self.set, self.weights, self.params = [], [], []
+
+    def empty_add_random(self):  # reset the basis, then grow it again by one element
+        self.empty()
+        self.random_increase(1)  # presumably adds one randomly drawn basis function
+
+    def basis_map(self, x):  # feature matrix of the current basis at x, one row per element
+        return self.basis_map_set(x, self.set, self.weights)
+
+    def kernel(self, x, y, noise=True):
+        """Sum of weights[i] * f_i(x) f_i(y)^T, plus s * s * I when noise == True."""
+        shape = (x.size()[0], y.size()[0])
+        value = torch.zeros(*shape, dtype=torch.float64)
+
+        for index, elem in enumerate(self.set):
+            value += torch.mm(elem(x), torch.t(elem(y))) * self.weights[index]
+
+        if noise == True:
+            value = value + self.s * self.s * torch.eye(*shape, dtype=torch.float64)
+        return value
+
+    def outer_kernel(self, x):
+        """Gram matrix Phi Phi^T of the basis map evaluated at x."""
+        Phi = self.basis_map(x)
+        return torch.mm(Phi, torch.t(Phi))
+
+    def expected_phi(self, x, base=10000):
+        """Monte Carlo mean of the flattened, sqrt(n)-scaled basis over base draws."""
+        total = torch.zeros(x.size()[0] * self.size, dtype=torch.float64)
+        for _ in range(base):
+            total += self.sample_basis_function()[0](x).view(-1) / np.sqrt(self.n)
+        return total / base
+
+    def expected_phi_squared(self, x, fun, base=10000):
+        """Monte Carlo mean of <phi_sample(x), fun(x)>^2, both scaled by 1 / sqrt(n)."""
+        target = fun(x).view(-1) / np.sqrt(self.n)
+        acc = 0
+        for _ in range(base):
+            draw = self.sample_basis_function()[0](x).view(-1) / np.sqrt(self.n)
+            acc += torch.dot(draw, target) ** 2
+        return acc / base
+
+    def expected_phi_squared_set(self, x, base=10000):  # Monte Carlo estimate against the active basis
+        v = self.active_basis
+        # NOTE(review): sample is 1-D after view(-1), but torch.mm takes 2-D matrices -- verify this runs
+        prod = torch.zeros(
+            x.size()[0],
+        )
+        for _ in range(base):
+            sample = self.sample_basis_function()[0](x).view(-1) / np.sqrt(self.n)
+            prod += torch.mm(sample, v) ** 2
+        prod = prod / base
+        return prod
+
+    def update_basis(self):
+        """
+        Refresh self.active_basis and self.W_inv after the basis changed.
+
+        With no cached basis the whole set is mapped onto self.x; otherwise only
+        the last function is appended. W_inv = (B B^T + s * s * I)^-1, B = basis.
+        """
+        if self.active_basis is None:
+            self.active_basis = self.basis_map(self.x)
+        else:
+            v = self.set[-1](self.x).view(1, -1) / np.sqrt(self.x.size()[0])
+            self.active_basis = torch.cat((self.active_basis, v), dim=0)
+        basis = self.active_basis
+        noise = self.s * self.s * torch.eye(len(self.set), dtype=torch.float64)
+        self.W_inv = torch.inverse(torch.mm(basis, torch.t(basis)) + noise)
+
+    """
 		Scores
 	"""
 
-	def leverage_score(self, fun, adding=True, weighted=False, variance=True):
-
-		if adding == True:
-			print(fun(self.x).size())
-			v = fun(self.x) / np.sqrt(self.x.size()[0])
-			new_active_basis = torch.cat((self.active_basis, v), dim=0)
-			W = torch.mm(new_active_basis, torch.t(new_active_basis)) + self.s * self.s * torch.eye(len(self.set) + 1,
-																									dtype=torch.float64)
-			W_inv = torch.inverse(W)
-			Phi = new_active_basis
-		else:
-			W_inv = self.W_inv
-			Phi = self.active_basis
-
-		if weighted == True:
-			S = torch.diag(torch.sqrt(torch.from_numpy(np.array(self.weights))))
-			Phi = torch.mm(S, Phi)
-		else:
-			pass
-		# solve leverage score problem
-		A = torch.mm(torch.t(Phi), torch.mm(W_inv, Phi))
-		rhs = fun(self.x).view(-1, 1) / np.sqrt(self.x.size()[0])
-		# print (torch.mm(torch.t(rhs),rhs), torch.mm(torch.t(rhs),torch.mm(A,rhs)))
-		if variance == True:
-			leverage_score = np.abs(torch.mm(torch.t(rhs), rhs) - torch.mm(torch.t(rhs), torch.mm(A, rhs))) / (
-						self.s ** 2)
-		else:
-			leverage_score = np.abs(torch.mm(torch.t(rhs), rhs) - torch.mm(torch.t(rhs), torch.mm(A, rhs)))
-
-		return leverage_score
-
-	def bayes_quad_score(self, fun, base=1000, Ephi=None):
-		"""
-			Implements score Phi(set,X)E[Phi(x)]K^{-1}E[Phi(x)]Phi(X,set)
-
-		:param fun: new basis function
-		:param base: size of the basis to approximate the expected mapping
-		:return:
-		"""
-		if Ephi is None:
-			Ephi = self.expected_phi(self.x, base=base).view(-1, 1)
-		else:
-			pass
-		new_set = self.set.copy()
-		new_set.append(fun)
-		new_Phi = self.basis_map_set(self.x, new_set, np.ones(len(new_set)).tolist())
-		W = torch.mm(new_Phi, torch.t(new_Phi)) + self.s * self.s * torch.eye(len(new_set), dtype=torch.float64)
-		W_inv = torch.inverse(W)
-		v = torch.mm(new_Phi, Ephi)
-		score = torch.mm(torch.t(v), torch.mm(W_inv, v))
-		return score
-
-	def greedy_score(self, candidates):
-		K = self.kernel(self.x, self.x, noise=False)
-		scores = torch.zeros(len(candidates), dtype=torch.float64)
-		for j in range(len(candidates)):
-			fun = candidates[j]
-			score = torch.norm(torch.mm(fun, torch.t(fun)) - K)
-			# print(torch.norm(torch.mm(fun,torch.t(fun))),torch.norm(K))
-			scores[j] = score
-		return scores
-
-	def herding_score(self, fun, base=1000, Ephi=None):
-		# if Ephi is None:
-		# 	Ephi = self.expected_phi(self.x, base=base).view(-1,1)
-		# else:
-		# 	pass
-		#
-		phi = fun(self.x).view(-1) / np.sqrt(self.n)
-		Phi = self.active_basis
-		n, m = Phi.size()
-		v = 0.0
-		for j in range(n):
-			v = v + torch.dot(Phi[j, :], phi) ** 2
-		v = (1. / (n + 1)) * v
-		z = self.expected_phi_squared(self.x, fun, base=base)
-		r = z - v
-		return r
-
-	def variance_scores(self, set=None):
-		if set is None:
-			Phi = self.basis_map_set(self.x, self.set, np.ones(len(self.set)).tolist())
-			W = torch.mm(Phi, torch.t(Phi)) + self.s * self.s * torch.eye(len(self.set), dtype=torch.float64)
-		else:
-			Phi = self.basis_map_set(self.x, set, np.ones(len(set)).tolist())
-			W = torch.mm(Phi, torch.t(Phi)) + self.s * self.s * torch.eye(len(set), dtype=torch.float64)
-		W_inv = torch.inverse(W)
-		vars = torch.einsum('ji,ij->j', W, W_inv).view(-1, 1)
-		return vars
-
-	###############################
-	## Increasing the basis size ##
-	###############################
-
-	def seq_bayes_quad_increase_heuristic(self, size=1, candidates=10, base=100):
-		"""
-		Implements sequential bayes quadrature with inexact optimization
-		:param size:
-		:param base:
-		:return:
-		"""
-		Ephi = self.expected_phi(self.x, base=base).view(-1, 1)
-		for _ in range(size):
-			funs = []
-			scores = torch.zeros(candidates, dtype=torch.float64)
-			params = []
-			for j in range(candidates):
-				fun, param = self.sample_basis_function()
-				leverage_score = self.bayes_quad_score(fun, Ephi=Ephi)
-				funs.append(fun)
-				scores[j] = leverage_score
-				params.append(param)
-			argmax = torch.argmax(scores)
-			self.add_to_basis(funs[argmax], 1.0, params[argmax])
-		self.quadrature_weights()
-
-	# def herding_exact_increase(self, size = 1):
-	# 	"""
-	# 	Solves exactly the herding problem with a non-linear solver
-	# 	:param size: size of the basis to be increase
-	# 	:return: None
-	# 	"""
-	# 	for _ in range(size):
-	# 		#fun = lambda x: self.basis_func(param,x)
-	# 		p = lambda omega: np.exp(-np.sum(omega ** 2, axis=1).reshape(-1, 1) / 2 * (self.gamma ** 2)) * np.power(
-	# 			(self.gamma / np.sqrt(2 * np.pi)), 1.) * np.power(np.pi / 2, 1.)
-	# 		ls = lambda param: -self.leverage_score(self.get_basis_function(torch.from_numpy(param).view(-1,1))).numpy()[0]*p(param.reshape(-1,1))[0]
-	# 		# plot ls
-	#
-	#
-	# 		# optimize leverage score
-	# 		from scipy.optimize import minimize
-	# 		start = self.distibution(self.d).view(-1, 1).numpy()
-	# 		res = minimize(ls, start , method="L-BFGS-B", tol=0.0000001, bounds=[[-5,5]])
-	# 		solution = torch.from_numpy(res.x).view(-1,1)
-	#
-	# 		#print (start, solution)
-	# 		# params = np.linspace(-10, 10, 1000).reshape(-1, 1)
-	# 		# lss = []
-	# 		#
-	# 		# for param in params:
-	# 		# 	#print (param, p(param.reshape(-1,1))[0])
-	# 		# 	lss.append(ls(param)*p(param.reshape(-1,1))[0])
-	# 		# index = np.argmin(np.array(lss))
-	# 		# solution = torch.from_numpy(params[index]).view(-1,1)
-	# 		# plt.plot(params, lss)
-	# 		# plt.plot(start,ls(start),'ro')
-	# 		# plt.plot(solution.numpy(),ls(solution.numpy()),'go')
-	# 		#plt.show()
-	# 		#print(start, solution)
-	# 		self.add_to_basis(self.get_basis_function(solution), 1., solution)
-
-	def herding_increase_heuristic(self, size=1, candidates=100, base=1000):
-		"""
-
-			:param size:
-			:param base:
-			:return:
-			"""
-		Ephi = self.expected_phi(self.x, base=base)
-		for _ in range(size):
-			# print (_)
-			self.update_basis()
-			funs = []
-			scores = torch.zeros(candidates, dtype=torch.float64)
-			params = []
-			for j in range(candidates):
-				fun, param = self.sample_basis_function()
-				leverage_score = self.herding_score(fun, Ephi=Ephi)
-				# print (j, leverage_score)
-				funs.append(fun)
-				scores[j] = leverage_score
-				params.append(param)
-			argmax = torch.argmax(scores)
-			self.add_to_basis(funs[argmax], 1., params[argmax])
-		self.uniformize_weights()
-
-	def herding_increase_heuristic_group(self, size=1, candidates=100, base=1000):
-		"""
-
-		:param size:
-		:param base:
-		:return:
-		"""
-		Ephi = self.expected_phi(self.x, base=base)
-		for _ in range(size):
-			# print (_)
-			self.update_basis()
-			funs = []
-			params = []
-			cand = torch.zeros(candidates, self.n * self.size, dtype=torch.float)
-			for j in range(candidates):
-				fun, param = self.sample_basis_function()
-				funs.append(fun)
-				cand[j, :] = fun(self.x).view(-1) / np.sqrt(self.n)
-			leverage_scores = self.herding_score_group(cand)
-
-			argmax = torch.argmax(leverage_scores)
-			self.add_to_basis(funs[argmax], 1., params[argmax])
-
-		self.uniformize_weights()
-
-	def dpp_increase(self, size=1, candidates=1000):
-		from dppy.finite_dpps import FiniteDPP
-		funs = []
-		params = []
-		cand = torch.zeros(candidates, self.n * self.size, dtype=torch.float64)
-
-		for j in range(candidates):
-			fun, param = self.sample_basis_function()
-			funs.append(fun)
-			params.append(param)
-			cand[j, :] = fun(self.x).view(-1) / np.sqrt(self.n)
-
-		# Random feature vectors
-		Phi = torch.t(cand)
-		L = Phi.numpy().T.dot(Phi.numpy()) + self.s * self.s * torch.eye(candidates, candidates,
-																		 dtype=torch.float64).numpy()
-		DPP = FiniteDPP('likelihood', **{'L': L})
-		DPP.flush_samples()
-		DPP.sample_exact_k_dpp(size=size)
-		sample_ind = DPP.list_of_samples[0]
-		for sample in sample_ind:
-			self.add_to_basis(funs[sample], 1., params[sample])
-		self.uniformize_weights()
-
-	def leverage_score_sampling(self, size=1):
-		count = 0
-		self.update_basis()
-		while count < size:
-
-			fun, param = self.sample_basis_function()
-			leverage_score = self.leverage_score(fun)
-			q_bar = size
-
-			q = np.random.binomial(q_bar, float(leverage_score))
-			# print(count, q, leverage_score)
-			if q > 0:
-				w = (q / q_bar) / leverage_score
-
-				self.add_to_basis(fun, w, param)
-				self.update_basis()
-				# print("adding", w.float(), param)
-				count += 1
-			else:
-				pass
-		# print ("reject", q)
-		# print ("sum", np.sum(self.weights))
-		# self.uniformize_weights()
-		# self.quadrature_weights()
-		# self.leverage_weights()
-		self.normalize_weights()
-
-	# optimize omp weights
-
-	def hermite_quadrature_basis(self, size=1):
-		self.set = []
-		self.weights = []
-		self.params = []
-
-		(nodes, weights) = np.polynomial.hermite.hermgauss(int(size))
-		nodes = torch.from_numpy(np.sqrt(2) * nodes / self.gamma)
-		weights = weights / np.sqrt(np.pi)
-		# self.weights = weights.tolist()
-		# print (self.weights)
-		for index in range(size):
-			fun = self.get_basis_function(nodes[index].view(self.d, -1))
-			self.add_to_basis(fun, weights[index], nodes[index])
-
-	def greedy_increase(self, size=1, base=100):
-		for _ in range(size):
-			# print (_)
-			self.update_basis()
-			funs = []
-			params = []
-			cand = torch.zeros(base, self.n, self.size, dtype=torch.float64)
-			for j in range(base):
-				fun, param = self.sample_basis_function()
-				funs.append(fun)
-				params.append(param)
-				cand[j, :] = fun(self.x)  # / np.sqrt(self.n)
-
-			scores = self.greedy_score(cand)
-			argmax = torch.argmin(scores)
-			self.add_to_basis(funs[argmax], 1., params[argmax])
-			self.normalize_weights()
-
-	# print (self.params)
-
-	def random_increase(self, size=1):
-		for _ in range(size):
-			f, param = self.sample_basis_function()
-			self.add_to_basis(f, 1., param)
-		self.uniformize_weights()
-
-	def qmc_increase(self, size=1):
-		params = self.sample_basis_function_qmc(size=size)
-		n = params.size()[0]
-		for j in range(n):
-			param = params[j, :].view(1, -1)
-			# print (params)
-			self.add_to_basis(self.get_basis_function(param), 1., param)
-		self.uniformize_weights()
-
-	def bach_algortihm(self, size=1, candidates=100):
-		for _ in range(size):
-			set = []
-			params = []
-			for j in range(candidates):
-				f, param = self.sample_basis_function()
-				set.append(f)
-				params.append(param)
-			vars = self.variance_scores(set=set)
-			index = np.argmax(-vars)
-			self.add_to_basis(set[index], 1., params[index])
-			vars = self.variance_scores()
-		self.weights = vars.view(-1).tolist()
-		self.normalize_weights()
-
-	def pca(self, kernel, size=1):
-		if size > self.n:
-			size = self.n
-		GP = NystromFeatures(kernel, m=torch.Tensor([size]), s=self.s, approx="svd")
-		GP.fit_gp(self.x, self.y)
-		return GP.outer_kernel()
-
-	def nystrom(self, kernel, size=1):
-		if size > self.n:
-			size = self.n
-		GP = NystromFeatures(kernel, m=torch.Tensor([size]), s=self.s, approx="uniform")
-		GP.fit_gp(self.x, self.y)
-		return GP.outer_kernel()
-
-	###########################
-	## weights optimization  ##
-	###########################
-
-	def normalize_weights(self):
-
-		# self.weights = np.ones(len(self.set))/len(self.set)
-		sum = np.sum(np.array(self.weights))
-		self.weights = np.array(self.weights) / sum
-		self.weights = self.weights.tolist()
-
-	# print (self.weights)
-
-	def uniformize_weights(self):
-		self.weights = np.ones(len(self.set)) / len(self.set)
-		self.weights = self.weights.tolist()
-
-	# print (self.weights)
-
-	def bayesian_quadrature_weights(self, base=1000):
-		"""
-		Bayesian Quadrature weights
-			two possible kernels
-		:return:
-		"""
-
-		phi = fun(self.x).view(-1) / np.sqrt(self.n)
-		Phi = self.active_basis
-		n, m = Phi.size()
-
-		Z = self.expected_phi_squared_set(self.x, base=base)
-
-		# assemble kernel
-		K = self.outer_kernel(self.x) * self.outer_kernel(self.x)
-		# invert kernel
-		self.weights = torch.mm(torch.mm(Z, torch.pinverse(K)), Z)
-		self.weights = self.weights.tolist()
-
-	def leverage_weights(self):
-
-		Phi = self.basis_map(self.x)
-		self.active_basis = Phi
-		W = torch.mm(Phi, torch.t(Phi)) + self.s * self.s * torch.eye(len(self.set), dtype=torch.float64)
-		self.W_inv = torch.inverse(W)
-
-		new_weights = []
-		n = len(self.set)
-		for fun in self.set:
-			leverage_score = self.leverage_score(fun, adding=False, variance=True, weighted=False)
-			# print (leverage_score)
-			new_weights.append(leverage_score)
-		self.weights = new_weights
-		self.normalize_weights()
-
-	def leverage_weights_experimental(self, Kinv):
-
-		Phi = self.basis_map(self.x)
-		self.active_basis = Phi
-		W = torch.mm(Phi, torch.t(Phi)) + self.s * self.s * torch.eye(len(self.set), dtype=torch.float64)
-		W_outer = torch.mm(torch.t(Phi), Phi) + self.s * self.s * torch.eye(self.n * 2, dtype=torch.float64)
-		W_outer_inv = torch.inverse(W_outer)
-		self.W_inv = torch.inverse(W)
-
-		print(torch.norm(W_outer - Kinv))
-
-		# print (Kinv)
-		new_weights = []
-		n = len(self.set)
-		for fun in self.set:
-			# leverage_score = self.leverage_score(fun, adding = False, variance = False, weighted= True)
-			v = fun(self.x).view(-1, 1) / np.sqrt(self.n)
-			# print (torch.trace(torch.mm(torch.t(v),v)))
-			mat = torch.mm(torch.t(v), torch.mm(W_outer_inv, v))
-			# print (mat)
-			leverage_score = torch.trace(mat)
-			if leverage_score > 0.0:
-				# print ("Violation!")
-				lv = self.leverage_score(fun, adding=False, variance=True, weighted=False)
-				print(float(leverage_score), float(lv))
-			# new_weights.append(float(2./(n*leverage_score)))
-			new_weights.append(1. / (n * leverage_score))
-		self.weights = new_weights
-		self.normalize_weights()
-
-	# print (self.weights)
-	# print (self.params)
-	# print(self.weights)
-	def omp_optimize(self, size=1):
-		pass
+    def leverage_score(self, fun, adding=True, weighted=False, variance=True):
+
+        if adding == True:
+            print(fun(self.x).size())
+            v = fun(self.x) / np.sqrt(self.x.size()[0])
+            new_active_basis = torch.cat((self.active_basis, v), dim=0)
+            W = torch.mm(
+                new_active_basis, torch.t(new_active_basis)
+            ) + self.s * self.s * torch.eye(len(self.set) + 1, dtype=torch.float64)
+            W_inv = torch.inverse(W)
+            Phi = new_active_basis
+        else:
+            W_inv = self.W_inv
+            Phi = self.active_basis
+
+        if weighted == True:
+            S = torch.diag(torch.sqrt(torch.from_numpy(np.array(self.weights))))
+            Phi = torch.mm(S, Phi)
+        else:
+            pass
+        # solve leverage score problem
+        A = torch.mm(torch.t(Phi), torch.mm(W_inv, Phi))
+        rhs = fun(self.x).view(-1, 1) / np.sqrt(self.x.size()[0])
+        # print (torch.mm(torch.t(rhs),rhs), torch.mm(torch.t(rhs),torch.mm(A,rhs)))
+        if variance == True:
+            leverage_score = np.abs(
+                torch.mm(torch.t(rhs), rhs) - torch.mm(torch.t(rhs), torch.mm(A, rhs))
+            ) / (self.s**2)
+        else:
+            leverage_score = np.abs(
+                torch.mm(torch.t(rhs), rhs) - torch.mm(torch.t(rhs), torch.mm(A, rhs))
+            )
+
+        return leverage_score
+
+    def bayes_quad_score(self, fun, base=1000, Ephi=None):
+        """
+                Implements score Phi(set,X)E[Phi(x)]K^{-1}E[Phi(x)]Phi(X,set)
+
+        :param fun: new basis function
+        :param base: size of the basis to approximate the expected mapping
+        :return:
+        """
+        if Ephi is None:
+            Ephi = self.expected_phi(self.x, base=base).view(-1, 1)
+        else:
+            pass
+        new_set = self.set.copy()
+        new_set.append(fun)
+        new_Phi = self.basis_map_set(self.x, new_set, np.ones(len(new_set)).tolist())
+        W = torch.mm(new_Phi, torch.t(new_Phi)) + self.s * self.s * torch.eye(
+            len(new_set), dtype=torch.float64
+        )
+        W_inv = torch.inverse(W)
+        v = torch.mm(new_Phi, Ephi)
+        score = torch.mm(torch.t(v), torch.mm(W_inv, v))
+        return score
+
+    def greedy_score(self, candidates):
+        K = self.kernel(self.x, self.x, noise=False)
+        scores = torch.zeros(len(candidates), dtype=torch.float64)
+        for j in range(len(candidates)):
+            fun = candidates[j]
+            score = torch.norm(torch.mm(fun, torch.t(fun)) - K)
+            # print(torch.norm(torch.mm(fun,torch.t(fun))),torch.norm(K))
+            scores[j] = score
+        return scores
+
+    def herding_score(self, fun, base=1000, Ephi=None):
+        # if Ephi is None:
+        # 	Ephi = self.expected_phi(self.x, base=base).view(-1,1)
+        # else:
+        # 	pass
+        #
+        phi = fun(self.x).view(-1) / np.sqrt(self.n)
+        Phi = self.active_basis
+        n, m = Phi.size()
+        v = 0.0
+        for j in range(n):
+            v = v + torch.dot(Phi[j, :], phi) ** 2
+        v = (1.0 / (n + 1)) * v
+        z = self.expected_phi_squared(self.x, fun, base=base)
+        r = z - v
+        return r
+
+    def variance_scores(self, set=None):
+        if set is None:
+            Phi = self.basis_map_set(self.x, self.set, np.ones(len(self.set)).tolist())
+            W = torch.mm(Phi, torch.t(Phi)) + self.s * self.s * torch.eye(
+                len(self.set), dtype=torch.float64
+            )
+        else:
+            Phi = self.basis_map_set(self.x, set, np.ones(len(set)).tolist())
+            W = torch.mm(Phi, torch.t(Phi)) + self.s * self.s * torch.eye(
+                len(set), dtype=torch.float64
+            )
+        W_inv = torch.inverse(W)
+        vars = torch.einsum("ji,ij->j", W, W_inv).view(-1, 1)
+        return vars
+
+    ###############################
+    ## Increasing the basis size ##
+    ###############################
+
+    def seq_bayes_quad_increase_heuristic(self, size=1, candidates=10, base=100):
+        """
+        Implements sequential bayes quadrature with inexact optimization
+        :param size:
+        :param base:
+        :return:
+        """
+        Ephi = self.expected_phi(self.x, base=base).view(-1, 1)
+        for _ in range(size):
+            funs = []
+            scores = torch.zeros(candidates, dtype=torch.float64)
+            params = []
+            for j in range(candidates):
+                fun, param = self.sample_basis_function()
+                leverage_score = self.bayes_quad_score(fun, Ephi=Ephi)
+                funs.append(fun)
+                scores[j] = leverage_score
+                params.append(param)
+            argmax = torch.argmax(scores)
+            self.add_to_basis(funs[argmax], 1.0, params[argmax])
+        self.quadrature_weights()
+
+    # def herding_exact_increase(self, size = 1):
+    # 	"""
+    # 	Solves exactly the herding problem with a non-linear solver
+    # 	:param size: size of the basis to be increase
+    # 	:return: None
+    # 	"""
+    # 	for _ in range(size):
+    # 		#fun = lambda x: self.basis_func(param,x)
+    # 		p = lambda omega: np.exp(-np.sum(omega ** 2, axis=1).reshape(-1, 1) / 2 * (self.gamma ** 2)) * np.power(
+    # 			(self.gamma / np.sqrt(2 * np.pi)), 1.) * np.power(np.pi / 2, 1.)
+    # 		ls = lambda param: -self.leverage_score(self.get_basis_function(torch.from_numpy(param).view(-1,1))).numpy()[0]*p(param.reshape(-1,1))[0]
+    # 		# plot ls
+    #
+    #
+    # 		# optimize leverage score
+    # 		from scipy.optimize import minimize
+    # 		start = self.distibution(self.d).view(-1, 1).numpy()
+    # 		res = minimize(ls, start , method="L-BFGS-B", tol=0.0000001, bounds=[[-5,5]])
+    # 		solution = torch.from_numpy(res.x).view(-1,1)
+    #
+    # 		#print (start, solution)
+    # 		# params = np.linspace(-10, 10, 1000).reshape(-1, 1)
+    # 		# lss = []
+    # 		#
+    # 		# for param in params:
+    # 		# 	#print (param, p(param.reshape(-1,1))[0])
+    # 		# 	lss.append(ls(param)*p(param.reshape(-1,1))[0])
+    # 		# index = np.argmin(np.array(lss))
+    # 		# solution = torch.from_numpy(params[index]).view(-1,1)
+    # 		# plt.plot(params, lss)
+    # 		# plt.plot(start,ls(start),'ro')
+    # 		# plt.plot(solution.numpy(),ls(solution.numpy()),'go')
+    # 		#plt.show()
+    # 		#print(start, solution)
+    # 		self.add_to_basis(self.get_basis_function(solution), 1., solution)
+
+    def herding_increase_heuristic(self, size=1, candidates=100, base=1000):
+        """
+
+        :param size:
+        :param base:
+        :return:
+        """
+        Ephi = self.expected_phi(self.x, base=base)
+        for _ in range(size):
+            # print (_)
+            self.update_basis()
+            funs = []
+            scores = torch.zeros(candidates, dtype=torch.float64)
+            params = []
+            for j in range(candidates):
+                fun, param = self.sample_basis_function()
+                leverage_score = self.herding_score(fun, Ephi=Ephi)
+                # print (j, leverage_score)
+                funs.append(fun)
+                scores[j] = leverage_score
+                params.append(param)
+            argmax = torch.argmax(scores)
+            self.add_to_basis(funs[argmax], 1.0, params[argmax])
+        self.uniformize_weights()
+
+    def herding_increase_heuristic_group(self, size=1, candidates=100, base=1000):
+        """
+
+        :param size:
+        :param base:
+        :return:
+        """
+        Ephi = self.expected_phi(self.x, base=base)
+        for _ in range(size):
+            # print (_)
+            self.update_basis()
+            funs = []
+            params = []
+            cand = torch.zeros(candidates, self.n * self.size, dtype=torch.float)
+            for j in range(candidates):
+                fun, param = self.sample_basis_function()
+                funs.append(fun)
+                cand[j, :] = fun(self.x).view(-1) / np.sqrt(self.n)
+            leverage_scores = self.herding_score_group(cand)
+
+            argmax = torch.argmax(leverage_scores)
+            self.add_to_basis(funs[argmax], 1.0, params[argmax])
+
+        self.uniformize_weights()
+
+    def dpp_increase(self, size=1, candidates=1000):
+        from dppy.finite_dpps import FiniteDPP
+
+        funs = []
+        params = []
+        cand = torch.zeros(candidates, self.n * self.size, dtype=torch.float64)
+
+        for j in range(candidates):
+            fun, param = self.sample_basis_function()
+            funs.append(fun)
+            params.append(param)
+            cand[j, :] = fun(self.x).view(-1) / np.sqrt(self.n)
+
+        # Random feature vectors
+        Phi = torch.t(cand)
+        L = (
+            Phi.numpy().T.dot(Phi.numpy())
+            + self.s
+            * self.s
+            * torch.eye(candidates, candidates, dtype=torch.float64).numpy()
+        )
+        DPP = FiniteDPP("likelihood", **{"L": L})
+        DPP.flush_samples()
+        DPP.sample_exact_k_dpp(size=size)
+        sample_ind = DPP.list_of_samples[0]
+        for sample in sample_ind:
+            self.add_to_basis(funs[sample], 1.0, params[sample])
+        self.uniformize_weights()
+
+    def leverage_score_sampling(self, size=1):
+        count = 0
+        self.update_basis()
+        while count < size:
+
+            fun, param = self.sample_basis_function()
+            leverage_score = self.leverage_score(fun)
+            q_bar = size
+
+            q = np.random.binomial(q_bar, float(leverage_score))
+            # print(count, q, leverage_score)
+            if q > 0:
+                w = (q / q_bar) / leverage_score
+
+                self.add_to_basis(fun, w, param)
+                self.update_basis()
+                # print("adding", w.float(), param)
+                count += 1
+            else:
+                pass
+        # print ("reject", q)
+        # print ("sum", np.sum(self.weights))
+        # self.uniformize_weights()
+        # self.quadrature_weights()
+        # self.leverage_weights()
+        self.normalize_weights()
+
+    # optimize omp weights
+
+    def hermite_quadrature_basis(self, size=1):
+        self.set = []
+        self.weights = []
+        self.params = []
+
+        (nodes, weights) = np.polynomial.hermite.hermgauss(int(size))
+        nodes = torch.from_numpy(np.sqrt(2) * nodes / self.gamma)
+        weights = weights / np.sqrt(np.pi)
+        # self.weights = weights.tolist()
+        # print (self.weights)
+        for index in range(size):
+            fun = self.get_basis_function(nodes[index].view(self.d, -1))
+            self.add_to_basis(fun, weights[index], nodes[index])
+
+    def greedy_increase(self, size=1, base=100):
+        for _ in range(size):
+            # print (_)
+            self.update_basis()
+            funs = []
+            params = []
+            cand = torch.zeros(base, self.n, self.size, dtype=torch.float64)
+            for j in range(base):
+                fun, param = self.sample_basis_function()
+                funs.append(fun)
+                params.append(param)
+                cand[j, :] = fun(self.x)  # / np.sqrt(self.n)
+
+            scores = self.greedy_score(cand)
+            argmax = torch.argmin(scores)
+            self.add_to_basis(funs[argmax], 1.0, params[argmax])
+            self.normalize_weights()
+
+    # print (self.params)
+
+    def random_increase(self, size=1):
+        for _ in range(size):
+            f, param = self.sample_basis_function()
+            self.add_to_basis(f, 1.0, param)
+        self.uniformize_weights()
+
+    def qmc_increase(self, size=1):
+        params = self.sample_basis_function_qmc(size=size)
+        n = params.size()[0]
+        for j in range(n):
+            param = params[j, :].view(1, -1)
+            # print (params)
+            self.add_to_basis(self.get_basis_function(param), 1.0, param)
+        self.uniformize_weights()
+
+    def bach_algortihm(self, size=1, candidates=100):
+        for _ in range(size):
+            set = []
+            params = []
+            for j in range(candidates):
+                f, param = self.sample_basis_function()
+                set.append(f)
+                params.append(param)
+            vars = self.variance_scores(set=set)
+            index = np.argmax(-vars)
+            self.add_to_basis(set[index], 1.0, params[index])
+            vars = self.variance_scores()
+        self.weights = vars.view(-1).tolist()
+        self.normalize_weights()
+
+    def pca(self, kernel, size=1):
+        if size > self.n:
+            size = self.n
+        GP = NystromFeatures(kernel, m=torch.tensor([size]), s=self.s, approx="svd")
+        GP.fit_gp(self.x, self.y)
+        return GP.outer_kernel()
+
+    def nystrom(self, kernel, size=1):
+        if size > self.n:
+            size = self.n
+        GP = NystromFeatures(kernel, m=torch.tensor([size]), s=self.s, approx="uniform")
+        GP.fit_gp(self.x, self.y)
+        return GP.outer_kernel()
+
+    ###########################
+    ## weights optimization  ##
+    ###########################
+
+    def normalize_weights(self):
+
+        # self.weights = np.ones(len(self.set))/len(self.set)
+        sum = np.sum(np.array(self.weights))
+        self.weights = np.array(self.weights) / sum
+        self.weights = self.weights.tolist()
+
+    # print (self.weights)
+
+    def uniformize_weights(self):
+        self.weights = np.ones(len(self.set)) / len(self.set)
+        self.weights = self.weights.tolist()
+
+    # print (self.weights)
+
+    def bayesian_quadrature_weights(self, base=1000):
+        """
+        Bayesian Quadrature weights
+                two possible kernels
+        :return:
+        """
+
+        phi = fun(self.x).view(-1) / np.sqrt(self.n)
+        Phi = self.active_basis
+        n, m = Phi.size()
+
+        Z = self.expected_phi_squared_set(self.x, base=base)
+
+        # assemble kernel
+        K = self.outer_kernel(self.x) * self.outer_kernel(self.x)
+        # invert kernel
+        self.weights = torch.mm(torch.mm(Z, torch.pinverse(K)), Z)
+        self.weights = self.weights.tolist()
+
+    def leverage_weights(self):
+
+        Phi = self.basis_map(self.x)
+        self.active_basis = Phi
+        W = torch.mm(Phi, torch.t(Phi)) + self.s * self.s * torch.eye(
+            len(self.set), dtype=torch.float64
+        )
+        self.W_inv = torch.inverse(W)
+
+        new_weights = []
+        n = len(self.set)
+        for fun in self.set:
+            leverage_score = self.leverage_score(
+                fun, adding=False, variance=True, weighted=False
+            )
+            # print (leverage_score)
+            new_weights.append(leverage_score)
+        self.weights = new_weights
+        self.normalize_weights()
+
+    def leverage_weights_experimental(self, Kinv):
+
+        Phi = self.basis_map(self.x)
+        self.active_basis = Phi
+        W = torch.mm(Phi, torch.t(Phi)) + self.s * self.s * torch.eye(
+            len(self.set), dtype=torch.float64
+        )
+        W_outer = torch.mm(torch.t(Phi), Phi) + self.s * self.s * torch.eye(
+            self.n * 2, dtype=torch.float64
+        )
+        W_outer_inv = torch.inverse(W_outer)
+        self.W_inv = torch.inverse(W)
+
+        print(torch.norm(W_outer - Kinv))
+
+        # print (Kinv)
+        new_weights = []
+        n = len(self.set)
+        for fun in self.set:
+            # leverage_score = self.leverage_score(fun, adding = False, variance = False, weighted= True)
+            v = fun(self.x).view(-1, 1) / np.sqrt(self.n)
+            # print (torch.trace(torch.mm(torch.t(v),v)))
+            mat = torch.mm(torch.t(v), torch.mm(W_outer_inv, v))
+            # print (mat)
+            leverage_score = torch.trace(mat)
+            if leverage_score > 0.0:
+                # print ("Violation!")
+                lv = self.leverage_score(
+                    fun, adding=False, variance=True, weighted=False
+                )
+                print(float(leverage_score), float(lv))
+            # new_weights.append(float(2./(n*leverage_score)))
+            new_weights.append(1.0 / (n * leverage_score))
+        self.weights = new_weights
+        self.normalize_weights()
+
+    # print (self.weights)
+    # print (self.params)
+    # print(self.weights)
+    def omp_optimize(self, size=1):
+        pass
 
 
 if __name__ == "__main__":
-	d = 1
-	n = 1024
-	N = 100
-	L_infinity_ball = 1
-	s = 0.001
-	xtest = torch.from_numpy(interval(n, d))
-	# x = torch.from_numpy(np.random.uniform(-L_infinity_ball, L_infinity_ball, size=(N, d)))
-	x = torch.from_numpy(np.linspace(-1, 1, N)).view(N, d)
-	f = lambda q: torch.sin(torch.sum(q * 4, dim=1)).view(-1, 1)
-	y = f(x)
-
-	IK = IntegralKernel([x, y], s=s)
-	IK.random_increase(1000)
-	IK.uniformize_weights()
-	IK.quadrature_weights()
-
-	fun = IK.sample_basis_function()[0]
-	print(IK.bayes_quad_score(fun))
+    d = 1
+    n = 1024
+    N = 100
+    L_infinity_ball = 1
+    s = 0.001
+    xtest = torch.from_numpy(interval(n, d))
+    # x = torch.from_numpy(np.random.uniform(-L_infinity_ball, L_infinity_ball, size=(N, d)))
+    x = torch.from_numpy(np.linspace(-1, 1, N)).view(N, d)
+    f = lambda q: torch.sin(torch.sum(q * 4, dim=1)).view(-1, 1)
+    y = f(x)
+
+    IK = IntegralKernel([x, y], s=s)
+    IK.random_increase(1000)
+    IK.uniformize_weights()
+    IK.quadrature_weights()
+
+    fun = IK.sample_basis_function()[0]
+    print(IK.bayes_quad_score(fun))
diff --git a/stpy/legacy/integral_kernels2.py b/stpy/legacy/integral_kernels2.py
index d442d8d..b5699b5 100755
--- a/stpy/legacy/integral_kernels2.py
+++ b/stpy/legacy/integral_kernels2.py
@@ -7,133 +7,143 @@
 
 class IntegralKernel:
 
-	def __init__(self, dataset, s=0.1):
+    def __init__(self, dataset, s=0.1):
 
-		self.x = dataset[0]
-		self.y = dataset[1]
+        self.x = dataset[0]
+        self.y = dataset[1]
 
-		self.s = s
-		self.gamma = 1.0
-		self.distibution = lambda size: torch.from_numpy(np.random.normal(size=size) * (1. / self.gamma))
+        self.s = s
+        self.gamma = 1.0
+        self.distibution = lambda size: torch.from_numpy(
+            np.random.normal(size=size) * (1.0 / self.gamma)
+        )
 
-		self.n = self.x.size()[0]
-		self.d = self.x.size()[1]
+        self.n = self.x.size()[0]
+        self.d = self.x.size()[1]
 
-		self.basis_func = lambda x, theta: torch.cat((torch.cos(torch.mm(theta, x)), torch.sin(torch.mm(theta, x))), 1)
-		self.size = 2
+        self.basis_func = lambda x, theta: torch.cat(
+            (torch.cos(torch.mm(theta, x)), torch.sin(torch.mm(theta, x))), 1
+        )
+        self.size = 2
 
-		self.set = []
-		self.weights = []
-		self.params = []
-		self.active_basis = None
+        self.set = []
+        self.weights = []
+        self.params = []
+        self.active_basis = None
 
-	def set_distribution(self, distibution):
-		self.distibution = distibution
+    def set_distribution(self, distibution):
+        self.distibution = distibution
 
-	def set_basis_function(self, fun, size):
-		self.basis_func = fun
-		self.size = size
+    def set_basis_function(self, fun, size):
+        self.basis_func = fun
+        self.size = size
 
-	def sample_basis_function(self):
-		param = self.distibution(self.d).view(-1, 1)
-		return [self.get_basis_function(param), param]
+    def sample_basis_function(self):
+        param = self.distibution(self.d).view(-1, 1)
+        return [self.get_basis_function(param), param]
 
-	def sample_basis_function_qmc(self, size=1):
-		inv_cum_dist = lambda x: norm.ppf(x) * (1. / 1.)
-		params = torch.from_numpy(sample_qmc_halton(inv_cum_dist, size=(size, self.d)))
-		return params
+    def sample_basis_function_qmc(self, size=1):
+        inv_cum_dist = lambda x: norm.ppf(x) * (1.0 / 1.0)
+        params = torch.from_numpy(sample_qmc_halton(inv_cum_dist, size=(size, self.d)))
+        return params
 
-	def sample_basis_vector(self):
-		fun = self.sample_basis_function()[0]
-		return fun(self.x).view(-1) / np.sqrt(self.n)
+    def sample_basis_vector(self):
+        fun = self.sample_basis_function()[0]
+        return fun(self.x).view(-1) / np.sqrt(self.n)
 
-	def get_basis_function(self, param):
-		return lambda x: self.basis_func(param, x)
+    def get_basis_function(self, param):
+        return lambda x: self.basis_func(param, x)
 
-	def add_to_basis(self, fun, weight, param):
-		self.set.append(fun)
-		self.weights.append(weight)
-		self.params.append(param)
+    def add_to_basis(self, fun, weight, param):
+        self.set.append(fun)
+        self.weights.append(weight)
+        self.params.append(param)
 
-	def empty(self):
-		self.active_basis = None
-		self.set = []
-		self.weights = []
-		self.params = []
+    def empty(self):
+        self.active_basis = None
+        self.set = []
+        self.weights = []
+        self.params = []
 
-	def empty_add_random(self):
-		self.empty()
-		self.random_increase(1)
+    def empty_add_random(self):
+        self.empty()
+        self.random_increase(1)
 
-	def kernel(self, x, y, noise=True):
-		value = torch.zeros(x.size()[0], y.size()[0], dtype=torch.float64)
+    def kernel(self, x, y, noise=True):
+        value = torch.zeros(x.size()[0], y.size()[0], dtype=torch.float64)
 
-		for index, elem in enumerate(self.set):
-			value += torch.mm(elem(x), torch.t(elem(y))) * self.weights[index]
+        for index, elem in enumerate(self.set):
+            value += torch.mm(elem(x), torch.t(elem(y))) * self.weights[index]
 
-		if noise == True:
-			value = value + self.s * self.s * torch.eye(x.size()[0], y.size()[0], dtype=torch.float64)
+        if noise == True:
+            value = value + self.s * self.s * torch.eye(
+                x.size()[0], y.size()[0], dtype=torch.float64
+            )
 
-		return value
+        return value
 
-	def random_basis(self, size=1):
-		for _ in range(size):
-			f, param = self.sample_basis_function()
-			self.add_to_basis(f, 1., param)
-		self.uniformize_weights()
+    def random_basis(self, size=1):
+        for _ in range(size):
+            f, param = self.sample_basis_function()
+            self.add_to_basis(f, 1.0, param)
+        self.uniformize_weights()
 
-	def leverage_socre(self, fun):
-		v = fun(self.x) / np.sqrt(self.x.size()[0])
-		new_set = self.set
+    def leverage_socre(self, fun):
+        v = fun(self.x) / np.sqrt(self.x.size()[0])
+        new_set = self.set
 
-	def basis_map_set(self, x, set):
-		value = torch.zeros(len(set), x.size()[0] * self.size, dtype=torch.float64)
-		for index, elem in enumerate(set):
-			value[index, :] = elem(x).view(-1) / np.sqrt(self.n)  # * np.sqrt(weights[index])
-		return value
+    def basis_map_set(self, x, set):
+        value = torch.zeros(len(set), x.size()[0] * self.size, dtype=torch.float64)
+        for index, elem in enumerate(set):
+            value[index, :] = elem(x).view(-1) / np.sqrt(
+                self.n
+            )  # * np.sqrt(weights[index])
+        return value
 
-	def outer_kernel(self, x):
-		Phi = self.basis_map_set(x, self.set)
-		value = torch.mm(Phi, torch.t(Phi))
-		return value
+    def outer_kernel(self, x):
+        Phi = self.basis_map_set(x, self.set)
+        value = torch.mm(Phi, torch.t(Phi))
+        return value
 
-	def leverage_score(self, fun):
+    def leverage_score(self, fun):
 
-		return 1.0
+        return 1.0
 
-	def leverage_score_basis(self, size=1):
-		count = 0
+    def leverage_score_basis(self, size=1):
+        count = 0
 
-		while count < size:
-			fun, param = self.sample_basis_function()
-			leverage_score = self.leverage_score(fun)
-			q_bar = size
+        while count < size:
+            fun, param = self.sample_basis_function()
+            leverage_score = self.leverage_score(fun)
+            q_bar = size
 
-			q = np.random.binomial(q_bar, float(leverage_score))
-			if q > 0:
-				w = (q / q_bar) / leverage_score
+            q = np.random.binomial(q_bar, float(leverage_score))
+            if q > 0:
+                w = (q / q_bar) / leverage_score
 
-				self.add_to_basis(fun, w, param)
-				count += 1
-			else:
-				pass
+                self.add_to_basis(fun, w, param)
+                count += 1
+            else:
+                pass
 
-		self.normalize_weights()
+        self.normalize_weights()
 
-	def normalize_weights(self):
+    def normalize_weights(self):
 
-		# self.weights = np.ones(len(self.set))/len(self.set)
-		sum = np.sum(np.array(self.weights))
-		self.weights = np.array(self.weights) / sum
-		self.weights = self.weights.tolist()
+        # self.weights = np.ones(len(self.set))/len(self.set)
+        sum = np.sum(np.array(self.weights))
+        self.weights = np.array(self.weights) / sum
+        self.weights = self.weights.tolist()
+
+    # print (self.weights)
+
+    def uniformize_weights(self):
+        self.weights = np.ones(len(self.set)) / len(self.set)
+        self.weights = self.weights.tolist()
 
-	# print (self.weights)
 
-	def uniformize_weights(self):
-		self.weights = np.ones(len(self.set)) / len(self.set)
-		self.weights = self.weights.tolist()
 # print (self.weights)
 
 
 if __name__ == "__main__":
-	pass
+    pass
diff --git a/stpy/optim/cost_functions.py b/stpy/optim/cost_functions.py
index 9248763..a31ba01 100755
--- a/stpy/optim/cost_functions.py
+++ b/stpy/optim/cost_functions.py
@@ -3,51 +3,62 @@
 
 class CostFunction:
 
-	def __init__(self, cost, number_args=1):
-		self.cost = cost
-		self.number_args = number_args
-
-	def joined_egrad(self, Xx):
-		for X in Xx:
-			X.requires_grad_(True)
-		y = self.cost(Xx)
-		y.backward(retain_graph=True)
-		output = []
-		for X in Xx:
-			output.append(X.grad)
-		return output
-
-	def joined_hess(self, Xx, Uu):
-		for X in zip(Xx):
-			X.requires_grad_(True)
-		y = self.joined_egrad(Xx)
-		y.backward(retain_graph=True)
-		output = []
-		for X, U in zip(Xx, Uu):
-			output.append(torch.mm(X.grad, Uu))
-		return output
-
-	def egrad(self, X):
-		X.requires_grad_(True)
-		y = self.cost(X)
-		y.backward(retain_graph=True)
-		return X.grad
-
-	def ehess(self, X, U):
-		X.requires_grad_(True)
-		y = self.egrad(X)
-		y.backward(retain_graph=True)
-		return torch.mm(X.grad, U)
-
-	def define(self):
-		if self.number_args == 1:
-			cost_numpy = lambda X: self.cost(torch.from_numpy(X)).data.numpy()
-			grad_numpy = lambda X: self.egrad(torch.from_numpy(X)).data.numpy()
-			hess_numpy = lambda X, U: self.ehess(torch.from_numpy(X), torch.from_numpy(U)).data.numpy()
-			return [cost_numpy, grad_numpy, hess_numpy]
-		else:
-			cost_numpy = lambda Xx: self.cost([torch.from_numpy(X) for X in Xx]).data.numpy()
-			grad_numpy = lambda Xx: [z.data.numpy() for z in self.joined_egrad([torch.from_numpy(X) for X in Xx])]
-			hess_numpy = lambda Xx, Uu: [z.data.numpy() for z in self.joined_ehess([torch.from_numpy(X) for X in Xx],
-																				   [torch.from_numpy(U) for U in Uu])]
-			return [cost_numpy, grad_numpy, hess_numpy]
+    def __init__(self, cost, number_args=1):
+        self.cost = cost
+        self.number_args = number_args
+
+    def joined_egrad(self, Xx):
+        for X in Xx:
+            X.requires_grad_(True)
+        y = self.cost(Xx)
+        y.backward(retain_graph=True)
+        output = []
+        for X in Xx:
+            output.append(X.grad)
+        return output
+
+    def joined_hess(self, Xx, Uu):
+        for X in zip(Xx):
+            X.requires_grad_(True)
+        y = self.joined_egrad(Xx)
+        y.backward(retain_graph=True)
+        output = []
+        for X, U in zip(Xx, Uu):
+            output.append(torch.mm(X.grad, Uu))
+        return output
+
+    def egrad(self, X):
+        X.requires_grad_(True)
+        y = self.cost(X)
+        y.backward(retain_graph=True)
+        return X.grad
+
+    def ehess(self, X, U):
+        X.requires_grad_(True)
+        y = self.egrad(X)
+        y.backward(retain_graph=True)
+        return torch.mm(X.grad, U)
+
+    def define(self):
+        if self.number_args == 1:
+            cost_numpy = lambda X: self.cost(torch.from_numpy(X)).data.numpy()
+            grad_numpy = lambda X: self.egrad(torch.from_numpy(X)).data.numpy()
+            hess_numpy = lambda X, U: self.ehess(
+                torch.from_numpy(X), torch.from_numpy(U)
+            ).data.numpy()
+            return [cost_numpy, grad_numpy, hess_numpy]
+        else:
+            cost_numpy = lambda Xx: self.cost(
+                [torch.from_numpy(X) for X in Xx]
+            ).data.numpy()
+            grad_numpy = lambda Xx: [
+                z.data.numpy()
+                for z in self.joined_egrad([torch.from_numpy(X) for X in Xx])
+            ]
+            hess_numpy = lambda Xx, Uu: [
+                z.data.numpy()
+                for z in self.joined_ehess(
+                    [torch.from_numpy(X) for X in Xx], [torch.from_numpy(U) for U in Uu]
+                )
+            ]
+            return [cost_numpy, grad_numpy, hess_numpy]
diff --git a/stpy/optim/custom_optimizers.py b/stpy/optim/custom_optimizers.py
index 568802e..2db66c2 100644
--- a/stpy/optim/custom_optimizers.py
+++ b/stpy/optim/custom_optimizers.py
@@ -4,327 +4,340 @@
 import torch
 
 
-def bisection(g, a, b, N, version='stop'):
-	'''Approximate solution of g(x)=0 on interval [a,b] by bisection method.
-
-	Parameters
-	----------
-	g : function
-		The function for which we are trying to approximate a solution g(x)=0.
-	a,b : numbers
-		The interval in which to search for a solution. The function returns
-		None if g(a)*g(b) >= 0 since a solution is not guaranteed.
-	N : (positive) integer
-		The number of iterations to implement.
-
-	Returns
-	-------
-	x_N : number
-		The midpoint of the Nth interval computed by the bisection method. The
-		initial interval [a_0,b_0] is given by [a,b]. If f(m_n) == 0 for some
-		midpoint m_n = (a_n + b_n)/2, then the function returns this solution.
-		If all signs of values f(a_n), f(b_n) and f(m_n) are the same at any
-		iteration, the bisection method fails and return None.
-
-	Examples
-	--------
-	>>> f = lambda x: x**2 - x - 1
-	>>> bisection(f,1,2,25)
-	1.618033990263939
-	>>> f = lambda x: (2*x - 1)*(x - 3)
-	>>> bisection(f,0,1,10)
-	0.5
-	'''
-	d = {}
-
-	def f(x):
-		if x in d:
-			return d[x]
-		else:
-			d[x] = g(x)
-			return d[x]
-
-	if version == 'stop':
-		if f(a) < 0.:
-			return a
-	if f(a) * f(b) > 0.:
-		print("Bisection method fails.")
-		return None
-
-	a_n = a
-	b_n = b
-	dict = {}
-	for n in range(1, N + 1):
-		m_n = (a_n + b_n) / 2.
-		f_m_n = f(m_n)
-		if f(a_n) * f_m_n < 0:
-			a_n = a_n
-			b_n = m_n
-		elif f(b_n) * f_m_n < 0:
-			a_n = m_n
-			b_n = b_n
-		elif f_m_n == 0:
-			print("Found exact solution.")
-			return m_n
-		else:
-			return a_n
-			print("Bisection method fails.")
-			return None
-	return (a_n + b_n) / 2.
+def bisection(g, a, b, N, version="stop"):
+    """Approximate solution of g(x)=0 on interval [a,b] by bisection method.
+
+    Parameters
+    ----------
+    g : function
+            The function for which we are trying to approximate a solution g(x)=0.
+    a,b : numbers
+            The interval in which to search for a solution. The function returns
+            None if g(a)*g(b) >= 0 since a solution is not guaranteed.
+    N : (positive) integer
+            The number of iterations to implement.
+
+    Returns
+    -------
+    x_N : number
+            The midpoint of the Nth interval computed by the bisection method. The
+            initial interval [a_0,b_0] is given by [a,b]. If f(m_n) == 0 for some
+            midpoint m_n = (a_n + b_n)/2, then the function returns this solution.
+            If all signs of values f(a_n), f(b_n) and f(m_n) are the same at any
+            iteration, the bisection method fails and return None.
+
+    Examples
+    --------
+    >>> f = lambda x: x**2 - x - 1
+    >>> bisection(f,1,2,25)
+    1.618033990263939
+    >>> f = lambda x: (2*x - 1)*(x - 3)
+    >>> bisection(f,0,1,10)
+    0.5
+    """
+    d = {}
+
+    def f(x):
+        if x in d:
+            return d[x]
+        else:
+            d[x] = g(x)
+            return d[x]
+
+    if version == "stop":
+        if f(a) < 0.0:
+            return a
+    if f(a) * f(b) > 0.0:
+        print("Bisection method fails.")
+        return None
+
+    a_n = a
+    b_n = b
+    dict = {}
+    for n in range(1, N + 1):
+        m_n = (a_n + b_n) / 2.0
+        f_m_n = f(m_n)
+        if f(a_n) * f_m_n < 0:
+            a_n = a_n
+            b_n = m_n
+        elif f(b_n) * f_m_n < 0:
+            a_n = m_n
+            b_n = b_n
+        elif f_m_n == 0:
+            print("Found exact solution.")
+            return m_n
+        else:
+            return a_n
+            print("Bisection method fails.")
+            return None
+    return (a_n + b_n) / 2.0
 
 
 def greedy_per_step(fun, add, ground_set, min=True):
-	scores = []
-	for elem in range(ground_set.size()[0]):
-		new = add(ground_set[elem, :].view(1, -1))
-		scores.append(fun(new))
-	if min:
-		j = np.argmin(scores)
-	else:
-		j = np.argmax(scores)
-	return [j]
+    scores = []
+    for elem in range(ground_set.size()[0]):
+        new = add(ground_set[elem, :].view(1, -1))
+        scores.append(fun(new))
+    if min:
+        j = np.argmin(scores)
+    else:
+        j = np.argmax(scores)
+    return [j]
 
 
 def QPQC_problem(A, a, s, Sigma=None):
-	n = A.shape[0]
-	if Sigma is None:
-		I = np.eye(n)
-		Sigma = I
+    n = A.shape[0]
+    if Sigma is None:
+        I = np.eye(n)
+        Sigma = I
 
-	# SDP relaxation
-	M = np.zeros(shape=(n + 1, n + 1))
+    # SDP relaxation
+    M = np.zeros(shape=(n + 1, n + 1))
 
-	M[0, 1:] = -a.reshape(-1)
-	M[1:, 0] = -a.T.reshape(-1)
-	M[1:, 1:] = A
+    M[0, 1:] = -a.reshape(-1)
+    M[1:, 0] = -a.T.reshape(-1)
+    M[1:, 1:] = A
 
-	# print (M)
+    # print (M)
 
-	Meqconst = np.eye(n + 1)
-	Meqconst[1:, 1:] = Sigma
-	Meqconst[0][0] = 0
+    Meqconst = np.eye(n + 1)
+    Meqconst[1:, 1:] = Sigma
+    Meqconst[0][0] = 0
 
-	# print (Meqconst)
+    # print (Meqconst)
 
-	First = np.zeros(shape=(n + 1, n + 1))
-	First[0, 0] = 1.
+    First = np.zeros(shape=(n + 1, n + 1))
+    First[0, 0] = 1.0
 
-	X = cp.Variable((n + 1, n + 1))
+    X = cp.Variable((n + 1, n + 1))
 
-	objective = cp.Maximize(cp.trace(M @ X))
+    objective = cp.Maximize(cp.trace(M @ X))
 
-	constraints = [X >> 0]
-	constraints += [cp.trace(Meqconst @ X) >= s]
-	constraints += [cp.trace(First @ X) == 1]
+    constraints = [X >> 0]
+    constraints += [cp.trace(Meqconst @ X) >= s]
+    constraints += [cp.trace(First @ X) == 1]
 
-	prob = cp.Problem(objective, constraints)
-	prob.solve()
+    prob = cp.Problem(objective, constraints)
+    prob.solve()
 
-	# print (X.value[1:,1:])
-	eigvals, eigvecs = np.linalg.eig(X.value[1:, 1:])
+    # print (X.value[1:,1:])
+    eigvals, eigvecs = np.linalg.eig(X.value[1:, 1:])
 
-	index = np.argmax(eigvals)
-	val = np.max(eigvals)
-	x = np.real(eigvecs[index] * np.sqrt(val))
-	return val, x
+    index = np.argmax(eigvals)
+    val = np.max(eigvals)
+    x = np.real(eigvecs[index] * np.sqrt(val))
+    return val, x
 
 
 def convex_QCQP(A, a, s, Sigma=None, threads=4, verbose=False):
-	"""
-	Solving
-
-	min xAx - 2ax
-	s.t. xSigmax \leq s
-	A, Sigma psd.
-
-	:param A:
-	:param a:
-	:param s:
-	:param Sigma:
-	:return:
-	"""
-	n = A.shape[0]
-
-	if Sigma is None:
-		I = np.eye(n)
-		Sigma = I
-
-	x = cp.Variable(n)
-	objective = cp.Minimize(cp.quad_form(x, A) - 2 * x @ a)
-	zero = np.zeros(n)
-	# constraints = [ cp.SOC(zero@x + np.array([np.sqrt(s)]), Sigma @ x)]
-	constraints = [cp.quad_form(x, Sigma) <= s]
-	prob = cp.Problem(objective, constraints)
-	prob.solve(solver=cp.MOSEK, mosek_params={mosek.iparam.num_threads: threads,
-											  mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-											  mosek.dparam.intpnt_co_tol_pfeas: 1e-8,
-											  mosek.dparam.intpnt_co_tol_dfeas: 1e-8,
-											  mosek.dparam.intpnt_co_tol_rel_gap: 1e-8},
-			   verbose=verbose)
-
-	x_no_const = x.value.reshape(-1, 1)
-	val = prob.value
-	return val, x_no_const
+    r"""
+    Solving
+
+    min xAx - 2ax
+    s.t. xSigmax \leq s
+    A, Sigma psd.
+
+    :param A:
+    :param a:
+    :param s:
+    :param Sigma:
+    :return:
+    """
+    n = A.shape[0]
+
+    if Sigma is None:
+        I = np.eye(n)
+        Sigma = I
+
+    x = cp.Variable(n)
+    objective = cp.Minimize(cp.quad_form(x, A) - 2 * x @ a)
+    zero = np.zeros(n)
+    # constraints = [ cp.SOC(zero@x + np.array([np.sqrt(s)]), Sigma @ x)]
+    constraints = [cp.quad_form(x, Sigma) <= s]
+    prob = cp.Problem(objective, constraints)
+    prob.solve(
+        solver=cp.MOSEK,
+        mosek_params={
+            mosek.iparam.num_threads: threads,
+            mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+            mosek.dparam.intpnt_co_tol_pfeas: 1e-8,
+            mosek.dparam.intpnt_co_tol_dfeas: 1e-8,
+            mosek.dparam.intpnt_co_tol_rel_gap: 1e-8,
+        },
+        verbose=verbose,
+    )
+
+    x_no_const = x.value.reshape(-1, 1)
+    val = prob.value
+    return val, x_no_const
 
 
 def QCQP_problem(A, a, s, Sigma=None, threads=4, verbose=False):
-	"""
-	Solving
-
-	min xAx - 2ax
-	s.t. xSigmax == s
-
-
-	:param A:
-	:param a:
-	:param s:
-	:param Sigma:
-	:return:
-	"""
-	n = A.shape[0]
-	lam = cp.Variable(1)
-	if Sigma is None:
-		I = np.eye(n)
-		Sigma = I
-
-	objective = cp.Maximize(lam * s - cp.matrix_frac(a, A - lam * Sigma))
-	constraints = [A - lam * Sigma >> 0]
-	prob = cp.Problem(objective, constraints)
-	prob.solve(solver=cp.MOSEK, mosek_params={mosek.iparam.num_threads: threads,
-											  mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-											  mosek.dparam.intpnt_co_tol_pfeas: 1e-12,
-											  mosek.dparam.intpnt_co_tol_dfeas: 1e-12,
-											  mosek.dparam.intpnt_co_tol_rel_gap: 1e-12},
-			   verbose=verbose)
-
-	x_no_const = np.linalg.solve(A - lam.value * Sigma, a)
-	val = prob.value
-	return val, x_no_const
+    """
+    Solving
+
+    min xAx - 2ax
+    s.t. xSigmax == s
+
+
+    :param A:
+    :param a:
+    :param s:
+    :param Sigma:
+    :return:
+    """
+    n = A.shape[0]
+    lam = cp.Variable(1)
+    if Sigma is None:
+        I = np.eye(n)
+        Sigma = I
+
+    objective = cp.Maximize(lam * s - cp.matrix_frac(a, A - lam * Sigma))
+    constraints = [A - lam * Sigma >> 0]
+    prob = cp.Problem(objective, constraints)
+    prob.solve(
+        solver=cp.MOSEK,
+        mosek_params={
+            mosek.iparam.num_threads: threads,
+            mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+            mosek.dparam.intpnt_co_tol_pfeas: 1e-12,
+            mosek.dparam.intpnt_co_tol_dfeas: 1e-12,
+            mosek.dparam.intpnt_co_tol_rel_gap: 1e-12,
+        },
+        verbose=verbose,
+    )
+
+    x_no_const = np.linalg.solve(A - lam.value * Sigma, a)
+    val = prob.value
+    return val, x_no_const
 
 
 def solve_mpi(Q, c, tau, verbose=True, up=None, low=None, xwarm=None):
-	"""
-	Solve MIP program
-
-
-	"""
-	if verbose == True:
-		print("Starting Acq. Fucn solver...")
-		print("Resolution: ")
-	# Grid
-
-	# tau = torch.from_numpy(np.arange(-n, n + 1, 1).astype(np.double)) / n
-	N = tau.size()[0]
-	d = Q.size()[0]
-	s = torch.ones(N)
-	Tau = torch.zeros(size=(d, d * N), dtype=torch.float64)
-	S = torch.zeros(size=(d, d * N), dtype=torch.float64)
-
-	for j in range(d):
-		Tau[j, j * N:(j + 1) * N] = tau
-		S[j, j * N:(j + 1) * N] = s
-
-	B = Q @ Tau
-
-	if (up is not None) or (low is not None):
-		G = torch.cat((B, -B, S, -S, torch.t(c), -torch.t(c)))
-		h = torch.ones(4 * d + 2)
-		h[0:2 * d] = 1
-		h[3 * d:4 * d] = -1
-		h[4 * d] = up
-		h[4 * d + 1] = -low
-	else:
-		G = torch.cat((B, -B, S, -S))
-		h = torch.ones(4 * d)
-		h[0:2 * d] = 1
-		h[3 * d:4 * d] = -1
-	# Indicator variables
-
-	x = cp.Variable(d * N, boolean=True)
-	if xwarm is not None:
-		x.value = xwarm.detach().numpy()
-	c = c.view(-1).detach().numpy()
-
-	objective = cp.Minimize(-c * x)
-	constraints = [0 <= x, x <= 1, G.detach().numpy() * x <= h.view(-1).detach().numpy()]
-	prob = cp.Problem(objective, constraints)
-	prob.solve(solver=cp.MOSEK, verbose=verbose, warm_start=True)
-
-	# print (x.value)
-
-	return (torch.from_numpy(Tau.numpy() @ x.value), np.dot(c, x.value))
+    """
+    Solve MIP program
+
+
+    """
+    if verbose == True:
+        print("Starting Acq. Fucn solver...")
+        print("Resolution: ")
+    # Grid
+
+    # tau = torch.from_numpy(np.arange(-n, n + 1, 1).astype(np.double)) / n
+    N = tau.size()[0]
+    d = Q.size()[0]
+    s = torch.ones(N)
+    Tau = torch.zeros(size=(d, d * N), dtype=torch.float64)
+    S = torch.zeros(size=(d, d * N), dtype=torch.float64)
+
+    for j in range(d):
+        Tau[j, j * N : (j + 1) * N] = tau
+        S[j, j * N : (j + 1) * N] = s
+
+    B = Q @ Tau
+
+    if (up is not None) or (low is not None):
+        G = torch.cat((B, -B, S, -S, torch.t(c), -torch.t(c)))
+        h = torch.ones(4 * d + 2)
+        h[0 : 2 * d] = 1
+        h[3 * d : 4 * d] = -1
+        h[4 * d] = up
+        h[4 * d + 1] = -low
+    else:
+        G = torch.cat((B, -B, S, -S))
+        h = torch.ones(4 * d)
+        h[0 : 2 * d] = 1
+        h[3 * d : 4 * d] = -1
+    # Indicator variables
+
+    x = cp.Variable(d * N, boolean=True)
+    if xwarm is not None:
+        x.value = xwarm.detach().numpy()
+    c = c.view(-1).detach().numpy()
+
+    objective = cp.Minimize(-c * x)
+    constraints = [
+        0 <= x,
+        x <= 1,
+        G.detach().numpy() * x <= h.view(-1).detach().numpy(),
+    ]
+    prob = cp.Problem(objective, constraints)
+    prob.solve(solver=cp.MOSEK, verbose=verbose, warm_start=True)
+
+    # print (x.value)
+
+    return (torch.from_numpy(Tau.numpy() @ x.value), np.dot(c, x.value))
 
 
 def newton_solve(f, x0, eps=1e-3, maxiter=100, verbose=False, grad=None):
-	"""
-	>>> newton_solve(lambda x: x**2,torch.Tensor([2.0,1.0]).double().view(-1))
-	tensor([0., 0.], dtype=torch.float64)
-	"""
-	lam = 1.
-	d = int(x0.size()[0])
-	x0.requires_grad_(True)
-	x = torch.zeros(size=(d, 1), requires_grad=True).view(-1).double()
-	x.data = x0.data
-	res = f(x) ** 2
-	i = 0
-	s = 1.
-
-	while torch.max(res) > eps and i < maxiter:
-		i = i + 1
-
-		if grad is None:
-			nabla_f = torch.autograd.functional.jacobian(f, x, strict=True)
-		else:
-			nabla_f = grad(x)
-
-		if verbose:
-			print(i, "err:", torch.max(res), s)
-			print(nabla_f.size())
-			print("-----------------------")
-
-		xn = x - torch.linalg.solve(nabla_f + torch.eye(d).double() * s, f(x).view(-1, 1)).view(-1)
-		resn = f(xn) ** 2
-
-		if torch.max(resn) < torch.max(res):
-			x = xn.requires_grad_(True)
-			# lam = np.minimum(lam * 2,1)
-			s = s / 2
-			res = resn
-
-		else:
-			s = s * 2
-	# lam = lam /2.
-	return x
+    """
+    >>> newton_solve(lambda x: x**2,torch.tensor([2.0,1.0]).double().view(-1))
+    tensor([0., 0.], dtype=torch.float64)
+    """
+    lam = 1.0
+    d = int(x0.size()[0])
+    x0.requires_grad_(True)
+    x = torch.zeros(size=(d, 1), requires_grad=True).view(-1).double()
+    x.data = x0.data
+    res = f(x) ** 2
+    i = 0
+    s = 1.0
+
+    while torch.max(res) > eps and i < maxiter:
+        i = i + 1
+
+        if grad is None:
+            nabla_f = torch.autograd.functional.jacobian(f, x, strict=True)
+        else:
+            nabla_f = grad(x)
+
+        if verbose:
+            print(i, "err:", torch.max(res), s)
+            print(nabla_f.size())
+            print("-----------------------")
+
+        xn = x - torch.linalg.solve(
+            nabla_f + torch.eye(d).double() * s, f(x).view(-1, 1)
+        ).view(-1)
+        resn = f(xn) ** 2
+
+        if torch.max(resn) < torch.max(res):
+            x = xn.requires_grad_(True)
+            # lam = np.minimum(lam * 2,1)
+            s = s / 2
+            res = resn
+
+        else:
+            s = s * 2
+    # lam = lam /2.
+    return x
 
 
 def matrix_recovery_hermitian_trace_regression(X, b, eps=1e-5):
-	"""
+    """
 
-	:param X: list of matrices
-	:param b: vector of resposnes
-	:param eps: constraint tolerance
-	:return: reocvered matrix
-	"""
+    :param X: list of matrices
+    :param b: vector of resposnes
+    :param eps: constraint tolerance
+    :return: reocvered matrix
+    """
 
-	d = X[0].shape[0]
-	N = len(X)
-	Z = cp.Variable((d, d), symmetric=True)
+    d = X[0].shape[0]
+    N = len(X)
+    Z = cp.Variable((d, d), symmetric=True)
 
-	constraints = [Z >> 0]
-	constraints += [
-		cp.trace(X[i] @ Z) >= b[i] - eps for i in range(N)
-	]
-	constraints += [
-		cp.trace(X[i] @ Z) <= b[i] + eps for i in range(N)
-	]
+    constraints = [Z >> 0]
+    constraints += [cp.trace(X[i] @ Z) >= b[i] - eps for i in range(N)]
+    constraints += [cp.trace(X[i] @ Z) <= b[i] + eps for i in range(N)]
 
-	prob = cp.Problem(cp.Minimize(cp.norm(Z, "nuc")),
-					  constraints)
+    prob = cp.Problem(cp.Minimize(cp.norm(Z, "nuc")), constraints)
 
-	prob.solve()
+    prob.solve()
 
-	return Z.value
+    return Z.value
 
 
 if __name__ == "__main__":
-	newton_solve(lambda x: x ** 2, torch.Tensor([2.0, 1.0]).double().view(-1), verbose=True)
+    newton_solve(
+        lambda x: x**2, torch.tensor([2.0, 1.0]).double().view(-1), verbose=True
+    )
diff --git a/stpy/optim/frank_wolfe.py b/stpy/optim/frank_wolfe.py
index ebee977..0e303e4 100644
--- a/stpy/optim/frank_wolfe.py
+++ b/stpy/optim/frank_wolfe.py
@@ -3,57 +3,57 @@
 from scipy.optimize import minimize_scalar
 
 
-def step_frank_wolfe_simplex(F, nablaF, x, step_size = 'opt'):
-	d = x.shape[0]
-	nabla = nablaF(x)
-	index = np.argmax(nabla)
-	e = np.zeros(d)
-	e[index] = 1.
-	if step_size == 'opt':
-		fn = lambda h: -F(x * h + (1 - h) * e)
-		res = minimize_scalar(fn, bounds=(10e-8, 1 - 10e-8), method='bounded')
-		gamma = res.x
-	else:
-		gamma = 1.
-	x = x * gamma + (1 - gamma) * e
-	return x
-
-
-def step_exponential_gradient_descent(nablaF, x, eta=1.):
-	"""
-
-	:param nablaF:
-	:param x:
-	:param eta:
-	:return:
-	"""
-	x = x * torch.exp(eta * nablaF(x))
-	x = x / torch.sum(x)
-	return x
+def step_frank_wolfe_simplex(F, nablaF, x, step_size="opt"):
+    d = x.shape[0]
+    nabla = nablaF(x)
+    index = np.argmax(nabla)
+    e = np.zeros(d)
+    e[index] = 1.0
+    if step_size == "opt":
+        fn = lambda h: -F(x * h + (1 - h) * e)
+        res = minimize_scalar(fn, bounds=(10e-8, 1 - 10e-8), method="bounded")
+        gamma = res.x
+    else:
+        gamma = 1.0
+    x = x * gamma + (1 - gamma) * e
+    return x
+
+
+def step_exponential_gradient_descent(nablaF, x, eta=1.0):
+    """
+
+    :param nablaF:
+    :param x:
+    :param eta:
+    :return:
+    """
+    x = x * torch.exp(eta * nablaF(x))
+    x = x / torch.sum(x)
+    return x
 
 
 def step_wa_simlex(F, nablaF, x, optimality):
-	d = x.shape[0]
-	nabla = nablaF(x)
-	e_plus = np.max(nabla)
-	e_minus = np.min(nabla)
-	i_minus = np.argmin(nabla)
-	i_plus = np.argmax(nabla)
-	e = np.zeros(d)
-
-	if (e_plus - optimality) / optimality > (optimality - e_minus) / optimality:
-		index = i_plus
-		e[index] = 1.
-		fn = lambda h: -F(x * h + (1 - h) * e)
-		res = minimize_scalar(fn, bounds=(10e-8, 1 - 10e-8), method='bounded')
-		gamma = res.x
-		x = x * gamma + (1. - gamma) * e
-	else:
-		index = i_minus
-		e[index] = 1.
-		fn = lambda h: -F((x + h * e) / (1 + h))
-		# res = minimize_scalar(fn,bounds=(0.,1/(1-x[index])),method='bounded')
-		res = minimize_scalar(fn, bounds=(-x[index], 1 - x[index]), method='bounded')
-		gamma = res.x
-		x = (x + gamma * e) / (1 + gamma)
-	return x
+    d = x.shape[0]
+    nabla = nablaF(x)
+    e_plus = np.max(nabla)
+    e_minus = np.min(nabla)
+    i_minus = np.argmin(nabla)
+    i_plus = np.argmax(nabla)
+    e = np.zeros(d)
+
+    if (e_plus - optimality) / optimality > (optimality - e_minus) / optimality:
+        index = i_plus
+        e[index] = 1.0
+        fn = lambda h: -F(x * h + (1 - h) * e)
+        res = minimize_scalar(fn, bounds=(10e-8, 1 - 10e-8), method="bounded")
+        gamma = res.x
+        x = x * gamma + (1.0 - gamma) * e
+    else:
+        index = i_minus
+        e[index] = 1.0
+        fn = lambda h: -F((x + h * e) / (1 + h))
+        # res = minimize_scalar(fn,bounds=(0.,1/(1-x[index])),method='bounded')
+        res = minimize_scalar(fn, bounds=(-x[index], 1 - x[index]), method="bounded")
+        gamma = res.x
+        x = (x + gamma * e) / (1 + gamma)
+    return x
diff --git a/stpy/optim/hyper_parameter_opt.py b/stpy/optim/hyper_parameter_opt.py
index aa25b28..33c1eed 100755
--- a/stpy/optim/hyper_parameter_opt.py
+++ b/stpy/optim/hyper_parameter_opt.py
@@ -6,127 +6,147 @@
 
 class HyperParameterOpt:
 
-	def __init__(self, obj, x, y, fun, params):
-
-		self.mode = obj
-		self.x = x
-		self.y = y
-		self.fun = fun
-		self.params = params
-
-	def optimize(self, type, optimizer, restarts):
-
-		## Bandwidth optimization
-		def bandwidth_opt(X):
-			gamma = X
-			Rot = torch.eye(self.x.size()[1], dtype=torch.float64)
-			return self.log_marginal_likelihood(gamma, Rot, 1.0, kernel=" ")
-
-		def bandwidth_opt_handler():
-			manifold = Euclidean(self.kernel_object.gamma.size()[0])
-			C = CostFunction(bandwidth_opt, number_args=1)
-			xinit = lambda: np.random.randn() ** 2 + np.abs(
-				torch.zeros(self.kernel_object.gamma.size()[0], dtype=torch.float64).numpy())
-			return optimize(manifold, C, 1, xinit)
-
-		def bandwidth_kappa_opt(X):
-			gamma = X[0]
-			kappa = X[1]
-			Rot = torch.eye(self.x.size()[1], dtype=torch.float64)
-			return self.log_marginal_likelihood(gamma, Rot, kappa, kernel=" ")
-
-		def bandwidth_kappa_opt_handler():
-			manifold1 = Euclidean(self.kernel_object.gamma.size()[0])
-			manifold2 = Euclidean(1)
-			manifold = Product((manifold1, manifold2))
-			C = CostFunction(bandwidth_kappa_opt, number_args=2)
-			xinit = lambda x: [torch.randn(self.kernel_object.gamma.size()[0], dtype=torch.float64).numpy(),
-							   np.abs(torch.randn(1, dtype=torch.float64).numpy())]
-			return optimize(manifold, C, 2, xinit)
-
-		## Rotations optimization
-		def rotations_opt(X):
-			Rot = X
-			return self.log_marginal_likelihood(self.kernel_object.gamma, Rot, self.kernel_object.kappa, kernel=" ")
-
-		def rotations_opt_handler():
-			rots = Rotations(self.kernel_object.gamma.size()[0])
-			manifold = rots
-			xinit = lambda: torch.qr(torch.randn(self.x.size()[1], self.x.size()[1], dtype=torch.float64))[0].numpy()
-			C = CostFunction(rotations_opt, number_args=1)
-			return optimize(manifold, C, 1, xinit)
-
-		## Bandwidth and Rotations optimization
-		def bandwith_rotations_opt(X):
-			gamma = X[0]
-			Rot = X[1]
-			return self.log_marginal_likelihood(gamma, Rot, 0.1, kernel=" ")
-
-		def bandwidth_rotations_opt_handler():
-			eucl = Euclidean(self.kernel_object.gamma.size()[0])
-			rots = Rotations(self.kernel_object.gamma.size()[0])
-			manifold = Product((eucl, rots))
-			xinit = lambda: [torch.randn(self.kernel_object.gamma.size()[0], dtype=torch.float64).numpy(),
-							 torch.qr(torch.randn(self.x.size()[1], self.x.size()[1], dtype=torch.float64))[0].numpy()]
-			C = CostFunction(bandwith_rotations_opt, number_args=2)
-			return optimize(manifold, C, 2, xinit)
-
-		## Bandwidth and Rotations optimization
-		def bandwith_kappa_rotations_opt(X):
-			gamma = X[0]
-			kappa = X[1]
-			Rot = X[2]
-			return self.log_marginal_likelihood(gamma, Rot, kappa, kernel=" ")
-
-		def bandwidth_kappa_rotations_opt_handler():
-			eucl = Euclidean(self.kernel_object.gamma.size()[0])
-			eucl2 = Euclidean(1)
-			rots = Rotations(self.kernel_object.gamma.size()[0])
-			manifold = Product((eucl, eucl2, rots))
-			xinit = [self.kernel_object.gamma.numpy(), torch.eye(self.x.size()[1], dtype=torch.float64).numpy()]
-			C = CostFunction(bandwith_kappa_rotations_opt, number_args=2)
-			return optimize(manifold, C, 2, xinit)
-
-		# Finalize
-		if type == "bandwidth":
-			best_params = bandwidth_opt_handler()
-			self.kernel_object.gamma = torch.abs(best_params[0]).detach()
-
-		elif type == "rots":
-			best_params = rotations_opt_handler()
-			Rot = best_params[0].detach()
-			print("Rotation:", Rot)
-			self.Rot = Rot
-			self.x = torch.mm(self.x, Rot).detach()
-
-		elif type == "bandwidth+kappa":
-			best_params = bandwidth_kappa_opt_handler()
-			self.kernel_object.gamma = torch.abs(best_params[0]).detach()
-			self.s = torch.abs(best_params[1]).detach()
-
-		elif type == "bandwidth+rots":
-			best_params = bandwidth_rotations_opt_handler()
-			self.kernel_object.gamma = torch.abs(best_params[0]).detach()
-			Rot = best_params[1].detach()
-			print("Rotation:", Rot)
-			self.Rot = Rot
-			self.x = torch.mm(self.x, Rot).detach()
-
-		elif type == "bandwidth+kappa+rots":
-			best_params = bandwidth_kappa_rotations_opt_handler()
-			self.kernel_object.gamma = torch.abs(best_params[0]).detach()
-			self.s = torch.abs(best_params[1]).detach()
-			Rot = best_params[2].detach()
-			print("Rotation:", Rot)
-			self.Rot = Rot
-			self.x = torch.mm(self.x, Rot).detach()
-
-		else:
-			raise AttributeError("Optimization scheme not implemented")
-
-		self.back_prop = False
-		self.fit = False
-		self.fit_gp(self.x, self.y)
-		print(self.description())
-
-		return True
+    def __init__(self, obj, x, y, fun, params):
+
+        self.mode = obj
+        self.x = x
+        self.y = y
+        self.fun = fun
+        self.params = params
+
+    def optimize(self, type, optimizer, restarts):
+
+        ## Bandwidth optimization
+        def bandwidth_opt(X):
+            gamma = X
+            Rot = torch.eye(self.x.size()[1], dtype=torch.float64)
+            return self.log_marginal_likelihood(gamma, Rot, 1.0, kernel=" ")
+
+        def bandwidth_opt_handler():
+            manifold = Euclidean(self.kernel_object.gamma.size()[0])
+            C = CostFunction(bandwidth_opt, number_args=1)
+            xinit = lambda: np.random.randn() ** 2 + np.abs(
+                torch.zeros(
+                    self.kernel_object.gamma.size()[0], dtype=torch.float64
+                ).numpy()
+            )
+            return optimize(manifold, C, 1, xinit)
+
+        def bandwidth_kappa_opt(X):
+            gamma = X[0]
+            kappa = X[1]
+            Rot = torch.eye(self.x.size()[1], dtype=torch.float64)
+            return self.log_marginal_likelihood(gamma, Rot, kappa, kernel=" ")
+
+        def bandwidth_kappa_opt_handler():
+            manifold1 = Euclidean(self.kernel_object.gamma.size()[0])
+            manifold2 = Euclidean(1)
+            manifold = Product((manifold1, manifold2))
+            C = CostFunction(bandwidth_kappa_opt, number_args=2)
+            xinit = lambda x: [
+                torch.randn(
+                    self.kernel_object.gamma.size()[0], dtype=torch.float64
+                ).numpy(),
+                np.abs(torch.randn(1, dtype=torch.float64).numpy()),
+            ]
+            return optimize(manifold, C, 2, xinit)
+
+        ## Rotations optimization
+        def rotations_opt(X):
+            Rot = X
+            return self.log_marginal_likelihood(
+                self.kernel_object.gamma, Rot, self.kernel_object.kappa, kernel=" "
+            )
+
+        def rotations_opt_handler():
+            rots = Rotations(self.kernel_object.gamma.size()[0])
+            manifold = rots
+            xinit = lambda: torch.qr(
+                torch.randn(self.x.size()[1], self.x.size()[1], dtype=torch.float64)
+            )[0].numpy()
+            C = CostFunction(rotations_opt, number_args=1)
+            return optimize(manifold, C, 1, xinit)
+
+        ## Bandwidth and Rotations optimization
+        def bandwith_rotations_opt(X):
+            gamma = X[0]
+            Rot = X[1]
+            return self.log_marginal_likelihood(gamma, Rot, 0.1, kernel=" ")
+
+        def bandwidth_rotations_opt_handler():
+            eucl = Euclidean(self.kernel_object.gamma.size()[0])
+            rots = Rotations(self.kernel_object.gamma.size()[0])
+            manifold = Product((eucl, rots))
+            xinit = lambda: [
+                torch.randn(
+                    self.kernel_object.gamma.size()[0], dtype=torch.float64
+                ).numpy(),
+                torch.qr(
+                    torch.randn(self.x.size()[1], self.x.size()[1], dtype=torch.float64)
+                )[0].numpy(),
+            ]
+            C = CostFunction(bandwith_rotations_opt, number_args=2)
+            return optimize(manifold, C, 2, xinit)
+
+        ## Bandwidth and Rotations optimization
+        def bandwith_kappa_rotations_opt(X):
+            gamma = X[0]
+            kappa = X[1]
+            Rot = X[2]
+            return self.log_marginal_likelihood(gamma, Rot, kappa, kernel=" ")
+
+        def bandwidth_kappa_rotations_opt_handler():
+            eucl = Euclidean(self.kernel_object.gamma.size()[0])
+            eucl2 = Euclidean(1)
+            rots = Rotations(self.kernel_object.gamma.size()[0])
+            manifold = Product((eucl, eucl2, rots))
+            xinit = [
+                self.kernel_object.gamma.numpy(),
+                torch.eye(self.x.size()[1], dtype=torch.float64).numpy(),
+            ]
+            C = CostFunction(bandwith_kappa_rotations_opt, number_args=2)
+            return optimize(manifold, C, 2, xinit)
+
+        # Finalize
+        if type == "bandwidth":
+            best_params = bandwidth_opt_handler()
+            self.kernel_object.gamma = torch.abs(best_params[0]).detach()
+
+        elif type == "rots":
+            best_params = rotations_opt_handler()
+            Rot = best_params[0].detach()
+            print("Rotation:", Rot)
+            self.Rot = Rot
+            self.x = torch.mm(self.x, Rot).detach()
+
+        elif type == "bandwidth+kappa":
+            best_params = bandwidth_kappa_opt_handler()
+            self.kernel_object.gamma = torch.abs(best_params[0]).detach()
+            self.s = torch.abs(best_params[1]).detach()
+
+        elif type == "bandwidth+rots":
+            best_params = bandwidth_rotations_opt_handler()
+            self.kernel_object.gamma = torch.abs(best_params[0]).detach()
+            Rot = best_params[1].detach()
+            print("Rotation:", Rot)
+            self.Rot = Rot
+            self.x = torch.mm(self.x, Rot).detach()
+
+        elif type == "bandwidth+kappa+rots":
+            best_params = bandwidth_kappa_rotations_opt_handler()
+            self.kernel_object.gamma = torch.abs(best_params[0]).detach()
+            self.s = torch.abs(best_params[1]).detach()
+            Rot = best_params[2].detach()
+            print("Rotation:", Rot)
+            self.Rot = Rot
+            self.x = torch.mm(self.x, Rot).detach()
+
+        else:
+            raise AttributeError("Optimization scheme not implemented")
+
+        self.back_prop = False
+        self.fit = False
+        self.fit_gp(self.x, self.y)
+        print(self.description())
+
+        return True
diff --git a/stpy/optim/manifold_optimization.py b/stpy/optim/manifold_optimization.py
index acc25c9..3ade0eb 100644
--- a/stpy/optim/manifold_optimization.py
+++ b/stpy/optim/manifold_optimization.py
@@ -5,41 +5,50 @@
 
 
 def optimize(manifold, cost_function, number_args, sampling_func, optimizer, restarts):
-	[cost_numpy, egrad_numpy, ehess_numpy] = cost_function.define()
-
-	if optimizer == "pymanopt":
-		problem = Problem(manifold=manifold, cost=cost_numpy, egrad=egrad_numpy, ehess=ehess_numpy, verbosity=1)
-		solver = SteepestDescent(maxiter=100, mingradnorm=1e-8, minstepsize=1e-10)
-
-		def solve(problem, x=None):
-			return solver.solve(problem, x=x)
-
-	elif optimizer == "scipy":
-		problem = None
-
-		def solve(problem, x=None):
-			res = minimize(cost_numpy, xinit, method="L-BFGS-B", jac=egrad_numpy, tol=0.0001)
-			return res.x
-	else:
-		raise NotImplementedError
-
-	# optimization
-	repeats = restarts
-	best = 10e10
-	best_params = [i for i in range(number_args)]
-
-	for _ in range(repeats):
-		xinit = sampling_func()
-		# try:
-		Xopt = solve(problem, x=xinit)
-		print(xinit)
-		cost = cost_numpy(Xopt)
-		print("Run:", _, " cost: ", cost)
-		if cost < best:
-			best = cost
-			if len(best_params) > 1:
-				for j in range(number_args):
-					best_params[j] = torch.from_numpy(Xopt[j])
-			else:
-				best_params[0] = torch.from_numpy(Xopt)
-	return best_params
+    """Minimize cost_function from `restarts` sampled starting points and return the
+    lowest-cost point found as a list of `number_args` torch tensors."""
+    [cost_numpy, egrad_numpy, ehess_numpy] = cost_function.define()
+
+    if optimizer == "pymanopt":
+        problem = Problem(
+            manifold=manifold,
+            cost=cost_numpy,
+            egrad=egrad_numpy,
+            ehess=ehess_numpy,
+            verbosity=1,
+        )
+        solver = SteepestDescent(maxiter=100, mingradnorm=1e-8, minstepsize=1e-10)
+
+        def solve(problem, x=None):
+            return solver.solve(problem, x=x)
+
+    elif optimizer == "scipy":
+        problem = None
+
+        def solve(problem, x=None):
+            res = minimize(
+                cost_numpy, x, method="L-BFGS-B", jac=egrad_numpy, tol=0.0001
+            )
+            return res.x
+
+    else:
+        raise NotImplementedError("Unknown optimizer: " + str(optimizer))
+
+    # Random restarts: keep the lowest-cost solution seen so far.
+    best = 10e10  # a run is only recorded when its cost is below this bound
+    best_params = [i for i in range(number_args)]  # integer placeholders until then
+
+    for run in range(restarts):
+        xinit = sampling_func()
+        Xopt = solve(problem, x=xinit)
+        print(xinit)
+        cost = cost_numpy(Xopt)
+        print("Run:", run, " cost: ", cost)
+        if cost < best:
+            best = cost
+            if len(best_params) > 1:
+                for j in range(number_args):
+                    best_params[j] = torch.from_numpy(Xopt[j])
+            else:
+                best_params[0] = torch.from_numpy(Xopt)
+    return best_params
diff --git a/stpy/point_processes/binomial/binomial_process.py b/stpy/point_processes/binomial/binomial_process.py
index ddbf89e..f3ce7f2 100644
--- a/stpy/point_processes/binomial/binomial_process.py
+++ b/stpy/point_processes/binomial/binomial_process.py
@@ -1,38 +1,38 @@
 import torch
 
 
-class BernoulliPointProcess():
+class BernoulliPointProcess:
 
-	def __init__(self, basic_sets, d=1, rate=None):
-		self.basic_sets = basic_sets
-		self.rate = rate
-		self.d = d
+    def __init__(self, basic_sets, d=1, rate=None):
+        self.basic_sets = basic_sets  # iterated by is_basic() and sample()
+        self.rate = rate  # called as rate(S); its result is passed to torch.bernoulli
+        self.d = d  # stored only; not read by any method of this class
 
-	def is_basic(self, S):
-		"""
-		:return:
-		"""
-		for set in self.basic_sets:
-			if hash(set) == hash(S):
-				return True
-		return False
+    def is_basic(self, S):
+        """
+        Tell whether S is one of the basic sets, comparing by hash only.
+
+        :param S: candidate set; must be hashable
+        :return: True if hash(S) equals the hash of some basic set
+        """
+        return any(hash(basic) == hash(S) for basic in self.basic_sets)
 
-	def sample(self, S, t=None, dt=None):
-		if self.is_basic(S):
-			rv = torch.bernoulli(self.rate(S))
-			if rv > 0.5:
-				return (S, 1., 1., dt, t)
-			else:
-				return (S, 0., 1., dt, t)
-		else:
-			# iterate over all sets that contain it
-			outcome = 0.
-			for set in self.basic_sets:
-				if S.inside(set):
-					rv = float(torch.bernoulli(self.rate(S)))
-					outcome = max(rv, 0.)
-				if outcome > 0.5:
-					return (S, 1., 1., dt, t)
-				else:
-					return (S, 0., 1., dt, t)
-			pass
+    def sample(self, S, t=None, dt=None):
+        """
+        Draw one Bernoulli observation for the set S.
+
+        :return: tuple (S, outcome, 1.0, dt, t) with outcome 0.0 or 1.0
+        """
+        if self.is_basic(S):
+            outcome = float(torch.bernoulli(self.rate(S)))
+        else:
+            # S is not basic: look at every basic set that contains it. The
+            # previous code returned inside the first iteration, so only the
+            # first basic set was ever examined.
+            outcome = 0.0
+            for basic_set in self.basic_sets:
+                if S.inside(basic_set):
+                    rv = float(torch.bernoulli(self.rate(S)))
+                    outcome = max(rv, outcome)
+        event = 1.0 if outcome > 0.5 else 0.0
+        return (S, event, 1.0, dt, t)
diff --git a/stpy/point_processes/binomial/binomial_process_estimator.py b/stpy/point_processes/binomial/binomial_process_estimator.py
index bf88e90..2096af3 100644
--- a/stpy/point_processes/binomial/binomial_process_estimator.py
+++ b/stpy/point_processes/binomial/binomial_process_estimator.py
@@ -12,473 +12,653 @@
 
 
 class BernoulliRateEstimator(RateEstimator):
-	"""
-		without link function, but with inequality constraints
-	"""
-
-	def __init__(self, hierarchy, d=1, m=100, kernel_object=None, B=1., s=1., jitter=10e-8, b=0., basis='triangle',
-				 offset=0.1, uncertainty='laplace'):
-
-		self.d = d
-		self.s = s
-		self.b = b
-		self.B = B
-		self.uncertainty = uncertainty
-		self.hierarchy = hierarchy
-		self.kernel_object = kernel_object
-		self.packing = TriangleEmbedding(d, m, kernel_object=kernel_object, B=1., b=0., offset=offset,
-										 s=np.sqrt(jitter))
-		self.feedback = "histogram"
-		self.data = None
-
-		self.basic_sets = self.hierarchy.get_sets_level(self.hierarchy.levels)
-		self.varphis = torch.zeros(size=(len(self.basic_sets), self.get_m())).double()
-
-		for index_set, set in enumerate(self.basic_sets):
-			self.varphis[index_set, :] = self.embed_set(set)
-
-	def embed_set(self, S):
-		return self.packing.integral(S).view(1, -1)
-
-	def load_data(self, data):
-		"""
-
-		:param data: (S, no_events, out_of, duration, time)
-		:return:
-		"""
-		self.data = []
-		self.phis = None
-		for datapoint in data:
-			self.add_data_point(datapoint)
-
-	def add_data_point(self, datapoint):
-
-		if self.data is None:
-			self.load_data([datapoint])
-		else:
-
-			# add
-			self.data.append(datapoint)
-
-			S, count, pool, duration, time = datapoint
-			phi = self.embed_set(S)
-
-			if self.phis is not None:
-				self.counts = torch.cat((self.counts, torch.Tensor([count])))
-				self.pool = torch.cat((self.pool, torch.Tensor([pool])))
-				self.phis = torch.cat((self.phis, phi), dim=0)
-			else:
-				self.counts = torch.Tensor([count]).double()
-				self.pool = torch.Tensor([pool]).double()
-				self.phis = phi
-
-	def nabla(self, theta):
-		# defining objective
-		if self.data is not None:
-			return - torch.einsum('i,ij,i->j', self.counts, self.phis, 1. / (self.phis @ theta).view(-1)).view(-1, 1) + \
-				   torch.einsum('i,ij,i->j', self.pool - self.counts, self.phis,
-								1. / (1. - self.phis @ theta).view(-1)).view(-1, 1) \
-				   + self.s * theta.view(-1, 1)
-		else:
-			return self.s * theta.view(-1, 1)
-
-	def sample(self, steps=10, verbose=False):
-		"""
-		Langevin dynamics to sample from constrained GP prior
-
-		:param steps: Number of iterations
-		:return:
-		"""
-		l = np.zeros(shape=(len(self.basic_sets)))
-		u = np.zeros(shape=(len(self.basic_sets))) + 1.
-
-		# prox operator
-		def prox(x):
-			res = solve_qp(np.eye(self.get_m()), x.numpy().reshape(-1),
-						   C=np.vstack((-self.varphis.numpy(), self.varphis.numpy())).T,
-						   b=np.hstack((-u, l)), factorized=True)
-			return torch.from_numpy(res[0]).view(-1, 1)
-
-		# initialization
-		if self.rate is not None:
-			theta = self.rate.view(-1, 1)
-		else:
-			theta = self.b + 0.05 * torch.rand(size=(self.get_m(), 1), dtype=torch.float64, requires_grad=False).view(
-				-1, 1) ** 2
-
-		# loop
-		for k in range(steps):
-			w = torch.randn(size=(self.get_m(), 1)).double()
-
-			# calculate proper step-size
-			W = self.construct_covariance(theta=theta)
-			L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-3))
-			eta = 0.5 / L
-
-			theta = 0.5 * theta - eta * self.nabla(theta) + 0.5 * prox(theta) + np.sqrt(2 * eta) * w
-			if verbose == True:
-				print("Iter:", k, theta.T)
-
-		self.sampled_theta = prox(theta)
-
-	def construct_covariance(self, theta):
-		D1 = torch.diag(self.counts / (self.phis @ theta).view(-1) ** 2)
-		D2 = torch.diag((self.pool - self.counts) / (1 - self.phis @ theta).view(-1) ** 2)
-
-		W = self.phis.T @ (D1 + D2) @ self.phis + self.s * torch.eye(self.get_m()).double()
-		return W
-
-	def construct_confidence(self):
-		self.W = self.construct_covariance(self.rate)
-		self.invW = torch.pinverse(self.W)
-
-	def construct_likelihood_ratio(self, method='full'):
-		# for data
-		phis = self.phis.numpy()
-		counts = self.counts.numpy()
-
-		# for constraints
-		varphis = self.varphis.numpy()
-
-		# current fit
-		mean_theta = self.rate.numpy()
-
-		if method == 'split':
-			pass
-		elif method == 'full':
-			self.likelihood = - counts @ np.log(phis @ mean_theta) - (1 - counts) @ np.log(1 - phis @ mean_theta) \
-							  + self.s * 0.5 * np.sum(mean_theta - 0.5) ** 2
-		elif method == 'cv':
-			pass
-
-	def ucb(self, S, beta=8., delta=0.1):
-		if self.uncertainty == 'laplace':
-			ucb = self.embed_set(S) @ self.rate + beta * self.embed_set(S) @ self.invW @ self.embed_set(S).T
-			return torch.minimum(torch.Tensor([[1.]]).double(), ucb)
-
-		elif self.uncertainty == "ratio":
-			phi = self.embed_set(S)
-			phis = self.phis.numpy()
-			varphis = self.varphis.numpy()
-
-			counts = self.counts.numpy()
-			theta = cp.Variable(self.get_m())
-
-			objective = cp.Maximize(phi @ theta)
-
-			v = np.log(1. / delta) + self.likelihood
-			constraints = [- counts @ cp.log(phis @ theta) - (1 - counts) @ cp.log(1 - phis @ theta)
-						   + self.s * 0.5 * cp.sum_squares(theta - 0.5) <= v]
-
-			# every set has probability between 0-1.
-			constraints.append(varphis @ theta >= np.zeros(varphis.shape[0]))
-			constraints.append(varphis @ theta <= np.ones(varphis.shape[0]))
-
-			prob = cp.Problem(objective, constraints)
-			prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-					   mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-									 mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
-									 mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
-									 mosek.dparam.intpnt_co_tol_rel_gap: 1e-6})
-			return torch.minimum(torch.Tensor([[1.]]).double(), torch.from_numpy(np.array(prob.value)))
-
-	def lcb(self, S, beta=8., delta=0.1):
-		if self.uncertainty == 'laplace':
-			lcb = self.embed_set(S) @ self.rate - beta * self.embed_set(S) @ self.invW @ self.embed_set(S).T
-			return torch.maximum(torch.Tensor([[0.]]).double(), lcb)
-
-		elif self.uncertainty == "ratio":
-			phi = self.embed_set(S)
-			phis = self.phis.numpy()
-			varphis = self.varphis.numpy()
-
-			counts = self.counts.numpy()
-			theta = cp.Variable(self.get_m())
-
-			objective = cp.Minimize(phi @ theta)
-			v = np.log(1. / delta) + self.likelihood
-			constraints = [- counts @ cp.log(phis @ theta) - (1 - counts) @ cp.log(1 - phis @ theta)
-						   + self.s * 0.5 * cp.sum_squares(theta - 0.5) <= v]
-
-			# every set has probability between 0-1.
-			constraints.append(varphis @ theta >= np.zeros(varphis.shape[0]))
-			constraints.append(varphis @ theta <= np.ones(varphis.shape[0]))
-
-			prob = cp.Problem(objective, constraints)
-			prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-					   mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-									 mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
-									 mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
-									 mosek.dparam.intpnt_co_tol_rel_gap: 1e-6})
-
-			return torch.maximum(torch.Tensor([[0.]]).double(), torch.from_numpy(np.array(prob.value)))
-
-	def fit_gp(self, threads=4):
-
-		phis = self.phis.numpy()
-		varphis = self.varphis.numpy()
-
-		counts = self.counts.numpy()
-		theta = cp.Variable(self.get_m())
-		objective = cp.Minimize(- counts @ cp.log(phis @ theta) - (1 - counts) @ cp.log(1 - phis @ theta)
-								+ self.s * 0.5 * cp.sum_squares(theta - 0.5))
-
-		# probability constraints
-		constraints = []
-
-		# every set has probability between 0-1.
-		constraints.append(varphis @ theta >= np.zeros(varphis.shape[0]))
-		constraints.append(varphis @ theta <= np.ones(varphis.shape[0]))
-
-		prob = cp.Problem(objective, constraints)
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: threads,
-								 mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-								 mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
-								 mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
-								 mosek.dparam.intpnt_co_tol_rel_gap: 1e-6})
-		self.rate = torch.from_numpy(theta.value)
-		return self.rate
+    """
+    Bernoulli rate estimator without a link function; set probabilities are kept in [0, 1] by inequality constraints.
+    """
+
+    def __init__(
+        self,
+        hierarchy,
+        d=1,
+        m=100,
+        kernel_object=None,
+        B=1.0,
+        s=1.0,
+        jitter=10e-8,
+        b=0.0,
+        basis="triangle",
+        offset=0.1,
+        uncertainty="laplace",
+    ):
+        """Store hyper-parameters, build the embedding and embed every basic set."""
+        self.d = d
+        self.s = s
+        self.b = b
+        self.B = B
+        self.uncertainty = uncertainty
+        self.hierarchy = hierarchy
+        self.kernel_object = kernel_object
+        self.packing = TriangleEmbedding(
+            d,
+            m,
+            kernel_object=kernel_object,
+            B=1.0,  # NOTE(review): the B argument is not forwarded here - confirm intended
+            b=0.0,  # NOTE(review): the b argument is not forwarded here - confirm intended
+            offset=offset,
+            s=np.sqrt(jitter),
+        )
+        self.feedback = "histogram"
+        self.data = None
+        self.rate = None  # no fit yet; sample() branches on `self.rate is not None`
+        self.basic_sets = self.hierarchy.get_sets_level(self.hierarchy.levels)
+        self.varphis = torch.zeros(size=(len(self.basic_sets), self.get_m())).double()
+
+        for index_set, basic_set in enumerate(self.basic_sets):
+            self.varphis[index_set, :] = self.embed_set(basic_set)
+
+    def embed_set(self, S):
+        return self.packing.integral(S).view(1, -1)  # embedding integral of S, reshaped to one row
+
+    def load_data(self, data):
+        """
+        Discard the stored observations and add the given ones one by one.
+        :param data: iterable of tuples (S, no_events, out_of, duration, time)
+        :return: None
+        """
+        self.data = []
+        self.phis = None  # add_data_point() starts fresh buffers when this is None
+        for datapoint in data:
+            self.add_data_point(datapoint)
+
+    def add_data_point(self, datapoint):
+        """Append one observation (S, count, pool, duration, time) to the stored data."""
+        if self.data is None:
+            self.load_data([datapoint])  # resets the buffers, then calls back into here
+        else:
+            self.data.append(datapoint)
+            S, count, pool, duration, time = datapoint
+            phi = self.embed_set(S)
+            # Build float64 tensors every time so torch.cat never mixes dtypes.
+            count_t = torch.tensor([count], dtype=torch.float64)
+            pool_t = torch.tensor([pool], dtype=torch.float64)
+
+            if self.phis is not None:
+                self.counts = torch.cat((self.counts, count_t))
+                self.pool = torch.cat((self.pool, pool_t))
+                self.phis = torch.cat((self.phis, phi), dim=0)
+            else:
+                self.counts = count_t
+                self.pool = pool_t
+                self.phis = phi
+
+    def nabla(self, theta):
+        """
+        Evaluate the gradient used by sample() at theta.
+
+        :return: column vector; only the s * theta term when no data is loaded
+        """
+        penalty = self.s * theta.view(-1, 1)
+        if self.data is None:
+            return penalty
+
+        fitted = (self.phis @ theta).view(-1)
+        # term weighted by the observed counts
+        events = -torch.einsum(
+            "i,ij,i->j", self.counts, self.phis, 1.0 / fitted
+        ).view(-1, 1)
+        # term weighted by the remaining (pool - counts) trials
+        misses = torch.einsum(
+            "i,ij,i->j", self.pool - self.counts, self.phis, 1.0 / (1.0 - fitted)
+        ).view(-1, 1)
+        return events + misses + penalty
+
+    def sample(self, steps=10, verbose=False):
+        """
+        Langevin dynamics to sample from constrained GP prior
+
+        :param steps: Number of iterations
+        :return: None; the projected final iterate is stored in self.sampled_theta
+        """
+        l = np.zeros(shape=(len(self.basic_sets)))
+        u = np.zeros(shape=(len(self.basic_sets))) + 1.0
+
+        # prox operator: QP with identity quadratic term and constraints l <= varphis @ theta <= u
+        def prox(x):
+            res = solve_qp(
+                np.eye(self.get_m()),
+                x.numpy().reshape(-1),
+                C=np.vstack((-self.varphis.numpy(), self.varphis.numpy())).T,
+                b=np.hstack((-u, l)),
+                factorized=True,
+            )
+            return torch.from_numpy(res[0]).view(-1, 1)
+
+        # initialization: start from the current fit if present, else a small random vector
+        if self.rate is not None:
+            theta = self.rate.view(-1, 1)
+        else:
+            theta = (
+                self.b
+                + 0.05
+                * torch.rand(
+                    size=(self.get_m(), 1), dtype=torch.float64, requires_grad=False
+                ).view(-1, 1)
+                ** 2
+            )
+
+        # loop: one noisy gradient step per iteration, averaged with the projection
+        for k in range(steps):
+            w = torch.randn(size=(self.get_m(), 1)).double()
+
+            # step size eta = 0.5 / (largest-magnitude eigenvalue of W at the current theta)
+            W = self.construct_covariance(theta=theta)
+            L = float(
+                scipy.sparse.linalg.eigsh(
+                    W.numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-3
+                )
+            )
+            eta = 0.5 / L
+
+            theta = (
+                0.5 * theta
+                - eta * self.nabla(theta)
+                + 0.5 * prox(theta)
+                + np.sqrt(2 * eta) * w
+            )
+            if verbose == True:
+                print("Iter:", k, theta.T)
+
+        self.sampled_theta = prox(theta)
+
+    def construct_covariance(self, theta):
+        """
+        Build phis^T (A + B) phis + s * I for the given theta, where A and B are
+        the diagonal weight matrices of the event and non-event terms.
+        """
+        fitted = (self.phis @ theta).view(-1)
+        event_w = torch.diag(self.counts / fitted**2)
+        miss_w = torch.diag((self.pool - self.counts) / (1 - fitted) ** 2)
+        ridge = self.s * torch.eye(self.get_m()).double()
+        gram = self.phis.T @ (event_w + miss_w) @ self.phis
+        return gram + ridge
+
+    def construct_confidence(self):
+        self.W = self.construct_covariance(self.rate)  # evaluated at the fitted rate
+        self.invW = torch.pinverse(self.W)  # pseudo-inverse read by the laplace ucb/lcb
+
+    def construct_likelihood_ratio(self, method="full"):
+        """
+        Store in self.likelihood the regularised negative log-likelihood at self.rate.
+
+        Only method="full" is implemented; "split" and "cv" are accepted but do nothing.
+        """
+        phis = self.phis.numpy()
+        counts = self.counts.numpy()
+        mean_theta = self.rate.numpy()
+
+        if method == "split":
+            pass
+        elif method == "full":
+            # Regulariser is a sum of squares, matching cp.sum_squares(theta - 0.5)
+            self.likelihood = (
+                -counts @ np.log(phis @ mean_theta)
+                - (1 - counts) @ np.log(1 - phis @ mean_theta)
+                + self.s * 0.5 * np.sum((mean_theta - 0.5) ** 2)
+            )
+        elif method == "cv":
+            pass
+
+    def ucb(self, S, beta=8.0, delta=0.1):  # upper bound for S; falls through (None) for other modes
+        if self.uncertainty == "laplace":
+            ucb = (
+                self.embed_set(S) @ self.rate
+                + beta * self.embed_set(S) @ self.invW @ self.embed_set(S).T
+            )
+            return torch.minimum(torch.tensor([[1.0]]).double(), ucb)  # capped at 1
+
+        elif self.uncertainty == "ratio":
+            phi = self.embed_set(S)
+            phis = self.phis.numpy()
+            varphis = self.varphis.numpy()
+
+            counts = self.counts.numpy()
+            theta = cp.Variable(self.get_m())
+
+            objective = cp.Maximize(phi @ theta)
+
+            v = np.log(1.0 / delta) + self.likelihood  # self.likelihood comes from construct_likelihood_ratio()
+            constraints = [
+                -counts @ cp.log(phis @ theta)
+                - (1 - counts) @ cp.log(1 - phis @ theta)
+                + self.s * 0.5 * cp.sum_squares(theta - 0.5)
+                <= v
+            ]
+
+            # every set has probability between 0-1.
+            constraints.append(varphis @ theta >= np.zeros(varphis.shape[0]))
+            constraints.append(varphis @ theta <= np.ones(varphis.shape[0]))
+
+            prob = cp.Problem(objective, constraints)
+            prob.solve(
+                solver=cp.MOSEK,
+                warm_start=False,
+                verbose=False,
+                mosek_params={
+                    mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
+                    mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
+                    mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
+                    mosek.dparam.intpnt_co_tol_rel_gap: 1e-6,
+                },
+            )
+            return torch.minimum(
+                torch.tensor([[1.0]]).double(), torch.from_numpy(np.array(prob.value))
+            )
+
+    def lcb(self, S, beta=8.0, delta=0.1):  # lower bound for S; falls through (None) for other modes
+        if self.uncertainty == "laplace":
+            lcb = (
+                self.embed_set(S) @ self.rate
+                - beta * self.embed_set(S) @ self.invW @ self.embed_set(S).T
+            )
+            return torch.maximum(torch.tensor([[0.0]]).double(), lcb)  # floored at 0
+
+        elif self.uncertainty == "ratio":
+            phi = self.embed_set(S)
+            phis = self.phis.numpy()
+            varphis = self.varphis.numpy()
+
+            counts = self.counts.numpy()
+            theta = cp.Variable(self.get_m())
+
+            objective = cp.Minimize(phi @ theta)
+            v = np.log(1.0 / delta) + self.likelihood  # self.likelihood comes from construct_likelihood_ratio()
+            constraints = [
+                -counts @ cp.log(phis @ theta)
+                - (1 - counts) @ cp.log(1 - phis @ theta)
+                + self.s * 0.5 * cp.sum_squares(theta - 0.5)
+                <= v
+            ]
+
+            # every set has probability between 0-1.
+            constraints.append(varphis @ theta >= np.zeros(varphis.shape[0]))
+            constraints.append(varphis @ theta <= np.ones(varphis.shape[0]))
+
+            prob = cp.Problem(objective, constraints)
+            prob.solve(
+                solver=cp.MOSEK,
+                warm_start=False,
+                verbose=False,
+                mosek_params={
+                    mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
+                    mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
+                    mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
+                    mosek.dparam.intpnt_co_tol_rel_gap: 1e-6,
+                },
+            )
+
+            return torch.maximum(
+                torch.tensor([[0.0]]).double(), torch.from_numpy(np.array(prob.value))
+            )
+
+    def fit_gp(self, threads=4):
+        """Fit theta by constrained, regularised maximum likelihood (cvxpy + MOSEK); store and return self.rate."""
+        phis = self.phis.numpy()
+        varphis = self.varphis.numpy()
+
+        counts = self.counts.numpy()
+        theta = cp.Variable(self.get_m())
+        objective = cp.Minimize(
+            -counts @ cp.log(phis @ theta)
+            - (1 - counts) @ cp.log(1 - phis @ theta)
+            + self.s * 0.5 * cp.sum_squares(theta - 0.5)
+        )
+
+        # probability constraints
+        constraints = []
+
+        # every set has probability between 0-1.
+        constraints.append(varphis @ theta >= np.zeros(varphis.shape[0]))
+        constraints.append(varphis @ theta <= np.ones(varphis.shape[0]))
+
+        prob = cp.Problem(objective, constraints)
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-6,
+            },
+        )
+        self.rate = torch.from_numpy(theta.value)  # theta.value is the solver's numpy solution
+        return self.rate
 
 
 class LinkBernoulliRateEstimator(BernoulliRateEstimator):
 
-	def construct_covariance(self, theta):
-		D1 = torch.diag(self.counts / (self.phis @ theta).view(-1) ** 2)
-		D2 = torch.diag((self.pool - self.counts) / (1 - self.phis @ theta).view(-1) ** 2)
-
-		W = self.phis.T @ (D1 + D2) @ self.phis + self.s * torch.eye(self.get_m()).double()
-		return W
-
-	def log_marginal(self, kernel, X):
-		func = kernel.get_kernel()
-		K = func(self.x, self.x, **X) + torch.eye(self.n, dtype=torch.float64) * self.s * self.s
-
-		L = torch.linalg.cholesky(K)
-		logdet = -0.5 * 2 * torch.sum(torch.log(torch.diag(L)))
-		alpha = torch.solve(self.y, K)[0]
-		logprob = -0.5 * torch.mm(torch.t(self.y), alpha) + logdet
-		logprob = -logprob
-		return logprob
-
-	def construct_likelihood_ratio(self, method='full'):
-		# for data
-		phis = self.phis.numpy()
-		counts = self.counts.numpy()
-
-		# for constraints
-		varphis = self.varphis.numpy()
-
-		# current fit
-		mean_theta = self.rate.numpy()
-
-		if method == 'split':
-			pass
-		elif method == 'full':
-			self.likelihood = - counts @ phis @ mean_theta + np.log(1 + np.exp(phis @ mean_theta)) \
-							  + self.s * 0.5 * np.sum(mean_theta) ** 2
-		elif method == 'cv':
-			pass
-
-	def fit_gp(self, threads=4):
-		phis = self.phis.numpy()
-
-		counts = self.counts.numpy()
-		theta = cp.Variable(self.get_m())
-		objective = cp.Minimize(-cp.sum(cp.multiply(counts, phis @ theta)) + cp.sum(cp.logistic(phis @ theta))
-								+ self.s * 0.5 * cp.sum_squares(theta))
-
-		# probability constraints
-		constraints = []
-
-		prob = cp.Problem(objective, constraints)
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: threads,
-								 mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-								 mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
-								 mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
-								 mosek.dparam.intpnt_co_tol_rel_gap: 1e-6})
-		self.rate = torch.from_numpy(theta.value)
-		return self.rate
-
-	def link(self, x):
-		return 1. / (1. + torch.exp(-x))
-
-	def mean_set(self, S):
-		return self.link(self.embed_set(S) @ self.rate)
-
-	def ucb(self, S, beta=8., delta=0.1):
-		if self.uncertainty == "laplace":
-			ucb = self.embed_set(S) @ self.rate + beta * self.embed_set(S) @ self.invW @ self.embed_set(S).T
-			return self.link(ucb)
-		elif self.uncertainty == "martingale":
-			phi = self.embed_set(S)
-			hat_theta = self.rate.numpy()
-
-			def constraint_value_gradient(theta, beta):
-				y = cp.Variable(self.get_m())
-				v = (theta - hat_theta)
-				objective2 = cp.Maximize(y @ v - cp.sum(cp.abs(self.phis @ y)) - beta)
-
-				prob = cp.Problem(objective2)
-				prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-						   mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-										 mosek.dparam.intpnt_co_tol_pfeas: 1e-4,
-										 mosek.dparam.intpnt_co_tol_dfeas: 1e-4,
-										 mosek.dparam.intpnt_co_tol_rel_gap: 1e-4})
-				print(prob.status)
-				return prob.value, y.value
-
-			beta = 2.
-			iters = 10
-			gamma = 0.00000001
-			theta = hat_theta
-			print(theta)
-
-			for k in range(iters):
-				print("Iter:", k)
-				d = cp.Variable(self.get_m())
-				objective = cp.Minimize(phi @ d.T)
-				val, nabla = constraint_value_gradient(theta, beta)
-				constraints = [val + nabla.reshape(1, -1) @ d <= 0., cp.sum_squares(d) <= gamma]
-				prob = cp.Problem(objective, constraints)
-				prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False)
-				theta = theta + d.value
-				print(theta)
-
-			return phi @ theta
-
-		elif self.uncertainty == "ratio":
-			phi = self.embed_set(S)
-			phis = self.phis.numpy()
-
-			counts = self.counts.numpy()
-			theta = cp.Variable(self.get_m())
-
-			objective = cp.Maximize(phi @ theta)
-			v = np.log(1. / delta) + self.likelihood
-			constraints = [-cp.sum(cp.multiply(counts, phis @ theta)) + cp.sum(cp.logistic(phis @ theta))
-						   + self.s * 0.5 * cp.sum_squares(theta) <= v]
-
-			prob = cp.Problem(objective, constraints)
-			prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-					   mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-									 mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
-									 mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
-									 mosek.dparam.intpnt_co_tol_rel_gap: 1e-6})
-			return self.link(phi @ theta.value)
-
-	def lcb(self, S, beta=8., delta=0.1):
-		if self.uncertainty == "laplace":
-			lcb = self.embed_set(S) @ self.rate - beta * self.embed_set(S) @ self.invW @ self.embed_set(S).T
-			return self.link(lcb)
-		elif self.uncertainty == "ratio":
-			phi = self.embed_set(S)
-			phis = self.phis.numpy()
-
-			counts = self.counts.numpy()
-			theta = cp.Variable(self.get_m())
-
-			objective = cp.Minimize(phi @ theta)
-			v = np.log(1. / delta) + self.likelihood
-			constraints = [-cp.sum(cp.multiply(counts, phis @ theta)) + cp.sum(cp.logistic(phis @ theta))
-						   + self.s * 0.5 * cp.sum_squares(theta) <= v]
-
-			prob = cp.Problem(objective, constraints)
-			prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-					   mosek_params={mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-									 mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
-									 mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
-									 mosek.dparam.intpnt_co_tol_rel_gap: 1e-6})
-			return self.link(phi @ theta.value)
-
-	def nabla(self, theta):
-		if self.data is not None:
-			return -torch.einsum('i,ij->j', self.counts, self.phis).view(-1, 1) + \
-				   torch.einsum('i,ij,i->j', self.pool, self.phis,
-								1. / (1. + torch.exp(self.phis @ theta).view(-1))).view(-1, 1) \
-				   + self.s * theta.view(-1, 1)
-		else:
-			return self.s * theta.view(-1, 1)
-
-	def construct_covariance(self, theta):
-		W = torch.eye(self.get_m()).double() * self.s + torch.einsum('i,ij,ik->jk',
-																	 torch.exp(self.phis @ theta).view(-1) / (
-																				 1 + torch.exp(self.phis @ theta)).view(
-																		 -1) ** 2, self.phis, self.phis)
-		return W
+    def construct_covariance(self, theta):
+        D1 = torch.diag(self.counts / (self.phis @ theta).view(-1) ** 2)
+        D2 = torch.diag(
+            (self.pool - self.counts) / (1 - self.phis @ theta).view(-1) ** 2
+        )
+        # NOTE(review): the pre-change class defined construct_covariance a second time further down; if it still does, this copy is shadowed - confirm
+        W = (
+            self.phis.T @ (D1 + D2) @ self.phis
+            + self.s * torch.eye(self.get_m()).double()
+        )
+        return W
+
+    def log_marginal(self, kernel, X):
+        """Negative log marginal likelihood of self.y under `kernel` (constant term dropped)."""
+        func = kernel.get_kernel()
+        K = (
+            func(self.x, self.x, **X)
+            + torch.eye(self.n, dtype=torch.float64) * self.s * self.s
+        )
+        L = torch.linalg.cholesky(K)
+        logdet = -0.5 * 2 * torch.sum(torch.log(torch.diag(L)))
+        # torch.solve(B, A) was removed from PyTorch; torch.linalg.solve(A, B) replaces it
+        alpha = torch.linalg.solve(K, self.y)
+        logprob = -0.5 * torch.mm(torch.t(self.y), alpha) + logdet
+        return -logprob
+
+    def construct_likelihood_ratio(self, method="full"):
+        """
+        Store in self.likelihood the regularised logistic negative log-likelihood at self.rate.
+
+        Only method="full" is implemented; "split" and "cv" are accepted but do nothing.
+        """
+        phis = self.phis.numpy()
+        counts = self.counts.numpy()
+        mean_theta = self.rate.numpy()
+
+        if method == "split":
+            pass
+        elif method == "full":
+            # Scalar value of the fit_gp objective: summed logistic term, squared-norm penalty
+            self.likelihood = (
+                -counts @ phis @ mean_theta
+                + np.sum(np.logaddexp(0.0, phis @ mean_theta))
+                + self.s * 0.5 * np.sum(mean_theta**2)
+            )
+        elif method == "cv":
+            pass
+
+    def fit_gp(self, threads=4):
+        phis = self.phis.numpy()
+        # regularised logistic-regression fit of theta; result is stored in self.rate
+        counts = self.counts.numpy()
+        theta = cp.Variable(self.get_m())
+        objective = cp.Minimize(
+            -cp.sum(cp.multiply(counts, phis @ theta))
+            + cp.sum(cp.logistic(phis @ theta))
+            + self.s * 0.5 * cp.sum_squares(theta)
+        )
+
+        # no explicit probability constraints are added in this variant
+        constraints = []
+
+        prob = cp.Problem(objective, constraints)
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-6,
+            },
+        )
+        self.rate = torch.from_numpy(theta.value)  # theta.value is the solver's numpy solution
+        return self.rate
+
+    def link(self, x):  # logistic link 1 / (1 + exp(-x)), applied elementwise
+        return 1.0 / (torch.exp(-x) + 1.0)
+
+    def mean_set(self, S):  # self.link applied to embed_set(S) @ rate
+        return self.link(self.embed_set(S) @ self.rate)
+
+    def ucb(self, S, beta=8.0, delta=0.1):  # upper bound for set S; branch chosen by self.uncertainty
+        if self.uncertainty == "laplace":
+            ucb = (
+                self.embed_set(S) @ self.rate
+                + beta * self.embed_set(S) @ self.invW @ self.embed_set(S).T
+            )
+            return self.link(ucb)
+        elif self.uncertainty == "martingale":
+            phi = self.embed_set(S)
+            hat_theta = self.rate.numpy()
+            # helper: solves an inner maximization over y, returns (optimal value, optimal y)
+            def constraint_value_gradient(theta, beta):
+                y = cp.Variable(self.get_m())
+                v = theta - hat_theta
+                objective2 = cp.Maximize(y @ v - cp.sum(cp.abs(self.phis @ y)) - beta)
+
+                prob = cp.Problem(objective2)
+                prob.solve(
+                    solver=cp.MOSEK,
+                    warm_start=False,
+                    verbose=False,
+                    mosek_params={
+                        mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
+                        mosek.dparam.intpnt_co_tol_pfeas: 1e-4,
+                        mosek.dparam.intpnt_co_tol_dfeas: 1e-4,
+                        mosek.dparam.intpnt_co_tol_rel_gap: 1e-4,
+                    },
+                )
+                print(prob.status)
+                return prob.value, y.value
+            # iterations: each step moves theta by a small d with ||d||^2 <= gamma
+            beta = 2.0  # NOTE(review): overrides the beta argument in this branch
+            iters = 10
+            gamma = 0.00000001
+            theta = hat_theta
+            print(theta)
+
+            for k in range(iters):
+                print("Iter:", k)
+                d = cp.Variable(self.get_m())
+                objective = cp.Minimize(phi @ d.T)
+                val, nabla = constraint_value_gradient(theta, beta)
+                constraints = [
+                    val + nabla.reshape(1, -1) @ d <= 0.0,
+                    cp.sum_squares(d) <= gamma,
+                ]
+                prob = cp.Problem(objective, constraints)
+                prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False)
+                theta = theta + d.value
+                print(theta)
+
+            return phi @ theta  # NOTE(review): self.link is not applied here, unlike the other branches
+
+        elif self.uncertainty == "ratio":  # any other mode falls through and returns None
+            phi = self.embed_set(S)
+            phis = self.phis.numpy()
+
+            counts = self.counts.numpy()
+            theta = cp.Variable(self.get_m())
+            # maximize the linear score of S over the likelihood-level set below
+            objective = cp.Maximize(phi @ theta)
+            v = np.log(1.0 / delta) + self.likelihood  # reads self.likelihood, so it must be set beforehand
+            constraints = [
+                -cp.sum(cp.multiply(counts, phis @ theta))
+                + cp.sum(cp.logistic(phis @ theta))
+                + self.s * 0.5 * cp.sum_squares(theta)
+                <= v
+            ]
+
+            prob = cp.Problem(objective, constraints)
+            prob.solve(
+                solver=cp.MOSEK,
+                warm_start=False,
+                verbose=False,
+                mosek_params={
+                    mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
+                    mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
+                    mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
+                    mosek.dparam.intpnt_co_tol_rel_gap: 1e-6,
+                },
+            )
+            return self.link(phi @ theta.value)
+
+    def lcb(self, S, beta=8.0, delta=0.1):
+        """Lower confidence bound for S after self.link; None for unhandled modes."""
+        mode = self.uncertainty
+        if mode == "laplace":
+            center = self.embed_set(S) @ self.rate
+            width = beta * self.embed_set(S) @ self.invW @ self.embed_set(S).T
+            return self.link(center - width)
+        if mode != "ratio":
+            return None
+
+        phi = self.embed_set(S)
+        features = self.phis.numpy()
+        successes = self.counts.numpy()
+        theta = cp.Variable(self.get_m())
+        objective = cp.Minimize(phi @ theta)
+        # admissible thetas: penalized loss at most log(1/delta) above self.likelihood
+        threshold = np.log(1.0 / delta) + self.likelihood
+        penalized_loss = (
+            -cp.sum(cp.multiply(successes, features @ theta))
+            + cp.sum(cp.logistic(features @ theta))
+            + self.s * 0.5 * cp.sum_squares(theta)
+        )
+
+        prob = cp.Problem(objective, [penalized_loss <= threshold])
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-6,
+            },
+        )
+        return self.link(phi @ theta.value)
+
+    def nabla(self, theta):  # column vector; only the s * theta term when self.data is None
+        if self.data is not None:
+            return (
+                -torch.einsum("i,ij->j", self.counts, self.phis).view(-1, 1)  # -sum_i counts_i * phis_i
+                + torch.einsum(
+                    "i,ij,i->j",
+                    self.pool,
+                    self.phis,
+                    1.0 / (1.0 + torch.exp(self.phis @ theta).view(-1)),  # NOTE(review): this is sigmoid(-z); d/dz log(1 + e^z) is sigmoid(z) - confirm sign
+                ).view(-1, 1)
+                + self.s * theta.view(-1, 1)  # ridge term
+            )
+        else:
+            return self.s * theta.view(-1, 1)
+
+    def construct_covariance(self, theta):
+        """Return s * I plus the weighted sum of outer products of the rows of phis."""
+        exp_scores = torch.exp(self.phis @ theta)
+        # per-row weight exp(z) / (1 + exp(z))^2 with z = phis @ theta
+        weights = exp_scores.view(-1) / (1 + exp_scores).view(-1) ** 2
+        ridge = torch.eye(self.get_m()).double() * self.s
+        gram = torch.einsum("i,ij,ik->jk", weights, self.phis, self.phis)
+        # ridge stays the left operand, as in the original sum
+        return ridge + gram
 
 
 if __name__ == "__main__":
-	import matplotlib.pyplot as plt
-	from stpy.point_processes.binomial.binomial_process import BernoulliPointProcess
-
-	d = 1
-	gamma = 0.1
-	n = 64
-	m = 128
-	levels = 7
-	k = KernelFunction(gamma=gamma, kappa=1.)
-
-	hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
-	actions = hierarchical_structure.get_sets_level(levels)
-	dummy = torch.zeros(size=(1, d)).double()
-
-	estimator = BernoulliRateEstimator(hierarchical_structure, m=64, kernel_object=k, s=0.001, uncertainty='ratio')
-	estimator_link = LinkBernoulliRateEstimator(hierarchical_structure, m=64, kernel_object=k, s=0.001,
-												uncertainty="ratio")
-
-	rate = lambda S: np.sin(np.pi * S.return_discretization(n=1) ** 2) * 0.5
-	process = BernoulliPointProcess(hierarchical_structure.get_sets_level(levels), rate=rate)
-
-	N = 100
-
-	data = []
-	for i in range(N):
-		data.append(process.sample(actions[torch.randint(0, len(actions), size=(1, 1))]))
-
-	estimator.load_data(data)
-	estimator_link.load_data(data)
-
-	estimator.fit_gp()
-	estimator_link.fit_gp()
-
-	# plot observations
-	for datapoint in data:
-		S, v, _, _, _ = datapoint
-		x = S.return_discretization(n)
-		if v == 1:
-			plt.plot(x, x * 0, 'ko')
-		else:
-			plt.plot(x, x * 0, 'ro')
-
-	xtest = hierarchical_structure.top_node.return_discretization(64)
-	plt.plot(xtest, estimator.mean_rate(hierarchical_structure.top_node, 64) * actions[0].volume(), 'tab:blue')
-
-	samples = 0
-	for i in range(samples):
-		estimator.sample(steps=100, verbose=False)
-		plt.plot(xtest, estimator.sample_path(hierarchical_structure.top_node, 64) * actions[0].volume(), 'g--')
-
-	estimator.construct_confidence()
-	estimator.construct_likelihood_ratio()
-
-	estimator_link.construct_confidence()
-	estimator_link.construct_likelihood_ratio()
-	# plot function
-	for action in actions:
-		val = estimator.mean_set(action)
-		val_link = estimator_link.mean_set(action)
-
-		ucb, lcb = float(estimator.ucb(action)), float(estimator.lcb(action))
-		ucb_link, lcb_link = float(estimator_link.ucb(action, delta=0.5)), float(estimator_link.lcb(action, delta=0.5))
-		x = action.return_discretization(64)
-		plt.plot(x, x * 0 + rate(action), color='tab:red')
-		x = x.view(-1)
-
-		plt.plot(x, x * 0 + val, color='tab:blue', linestyle='--')
-		plt.plot(x, x * 0 + val_link, color='tab:pink', linestyle='--')
-		plt.fill_between(x, x * 0 + lcb, x * 0 + ucb, color='tab:blue', alpha=0.2)
-		plt.fill_between(x, x * 0 + lcb_link, x * 0 + ucb_link, color='tab:pink', alpha=0.2)
-
-	plt.show()
+    import matplotlib.pyplot as plt
+    from stpy.point_processes.binomial.binomial_process import BernoulliPointProcess
+    # demo configuration (the variables m and dummy are not used below)
+    d = 1
+    gamma = 0.1
+    n = 64
+    m = 128
+    levels = 7
+    k = KernelFunction(gamma=gamma, kappa=1.0)
+    # hierarchical sets on (-1, 1); the sets at level `levels` serve as actions
+    hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
+    actions = hierarchical_structure.get_sets_level(levels)
+    dummy = torch.zeros(size=(1, d)).double()
+
+    estimator = BernoulliRateEstimator(
+        hierarchical_structure, m=64, kernel_object=k, s=0.001, uncertainty="ratio"
+    )
+    estimator_link = LinkBernoulliRateEstimator(
+        hierarchical_structure, m=64, kernel_object=k, s=0.001, uncertainty="ratio"
+    )
+    # rate function handed to the simulated process
+    rate = lambda S: np.sin(np.pi * S.return_discretization(n=1) ** 2) * 0.5
+    process = BernoulliPointProcess(
+        hierarchical_structure.get_sets_level(levels), rate=rate
+    )
+
+    N = 100
+    # draw N samples, each on a randomly indexed action
+    data = []
+    for i in range(N):
+        data.append(
+            process.sample(actions[torch.randint(0, len(actions), size=(1, 1))])
+        )
+
+    estimator.load_data(data)
+    estimator_link.load_data(data)
+
+    estimator.fit_gp()
+    estimator_link.fit_gp()
+
+    # plot observations
+    for datapoint in data:
+        S, v, _, _, _ = datapoint
+        x = S.return_discretization(n)
+        if v == 1:
+            plt.plot(x, x * 0, "ko")
+        else:
+            plt.plot(x, x * 0, "ro")
+
+    xtest = hierarchical_structure.top_node.return_discretization(64)
+    plt.plot(
+        xtest,
+        estimator.mean_rate(hierarchical_structure.top_node, 64) * actions[0].volume(),
+        "tab:blue",
+    )
+
+    samples = 0  # the sampling loop below is skipped while this is 0
+    for i in range(samples):
+        estimator.sample(steps=100, verbose=False)
+        plt.plot(
+            xtest,
+            estimator.sample_path(hierarchical_structure.top_node, 64)
+            * actions[0].volume(),
+            "g--",
+        )
+    # presumably required before the ucb/lcb queries below - TODO confirm
+    estimator.construct_confidence()
+    estimator.construct_likelihood_ratio()
+
+    estimator_link.construct_confidence()
+    estimator_link.construct_likelihood_ratio()
+    # plot function
+    for action in actions:
+        val = estimator.mean_set(action)
+        val_link = estimator_link.mean_set(action)
+
+        ucb, lcb = float(estimator.ucb(action)), float(estimator.lcb(action))
+        ucb_link, lcb_link = float(estimator_link.ucb(action, delta=0.5)), float(
+            estimator_link.lcb(action, delta=0.5)
+        )
+        x = action.return_discretization(64)
+        plt.plot(x, x * 0 + rate(action), color="tab:red")
+        x = x.view(-1)
+        # dashed lines: fitted means; shaded bands: [lcb, ucb] per estimator
+        plt.plot(x, x * 0 + val, color="tab:blue", linestyle="--")
+        plt.plot(x, x * 0 + val_link, color="tab:pink", linestyle="--")
+        plt.fill_between(x, x * 0 + lcb, x * 0 + ucb, color="tab:blue", alpha=0.2)
+        plt.fill_between(
+            x, x * 0 + lcb_link, x * 0 + ucb_link, color="tab:pink", alpha=0.2
+        )
+
+    plt.show()
diff --git a/stpy/point_processes/link_fun_rate_estimator.py b/stpy/point_processes/link_fun_rate_estimator.py
index 20cf463..ee51e4e 100644
--- a/stpy/point_processes/link_fun_rate_estimator.py
+++ b/stpy/point_processes/link_fun_rate_estimator.py
@@ -1,3 +1,4 @@
+from typing import List
 import numpy as np
 import torch
 import scipy
@@ -8,460 +9,614 @@
 import matplotlib.pyplot as plt
 from stpy.embeddings.embedding import HermiteEmbedding
 import scipy.integrate as integrate
-from stpy.helpers.ellipsoid_algorithms import maximize_quadratic_on_ellipse, minimize_quadratic_on_ellipse
-from stpy.helpers.ellipsoid_algorithms import maximize_matrix_quadratic_on_ellipse, minimize_matrix_quadratic_on_ellipse
+from stpy.helpers.ellipsoid_algorithms import (
+    maximize_quadratic_on_ellipse,
+    minimize_quadratic_on_ellipse,
+)
+from stpy.helpers.ellipsoid_algorithms import (
+    maximize_matrix_quadratic_on_ellipse,
+    minimize_matrix_quadratic_on_ellipse,
+)
 from stpy.point_processes.poisson import PoissonPointProcess
-from stpy.point_processes.poisson_rate_estimator import PositiveRateEstimator
+from stpy.point_processes.poisson_rate_estimator import PoissonRateEstimator
 from stpy.borel_set import BorelSet, HierarchicalBorelSets
 from stpy.kernels import KernelFunction
 
 ## implement loading data
 
-class PermanentalProcessRateEstimator(PositiveRateEstimator):
 
-	def __init__(self, *args, **kwargs):
-		super().__init__(*args,**kwargs)
-
-		self.integration = "fixed_quad"
-		self.product_integrals = {}
-		self.varLambdas = torch.zeros(size=(len(self.basic_sets), self.get_m(),self.get_m())).double()
-		self.opt = 'cvxpy'
-		if self.feedback == "count-record" and self.estimator=="least-sq":
-			print ("precomputing-integrals:")
-			for index_set, set in enumerate(self.basic_sets):
-				print (index_set,"/",len(self.basic_sets))
-				self.varLambdas[index_set, :] = self.product_integral(set)
-				self.variances[index_set] = set.volume() * self.B
-
-
-	def product_integral(self,S):
-
-		if S in self.product_integrals.keys():
-			return self.product_integrals[S]
-		else:
-
-			if "product_integral" in dir(self.packing):
-				Psi = self.packing.product_integral(S)
-				self.product_integrals[S] = Psi
-				return Psi
-
-			elif self.integration ==  "vec_quad":
-
-				if S.d == 2:
-					#Psi = torch.zeros(size=(self.get_m(), self.get_m())).double()
-					F = lambda x: (self.packing.embed(x).view(-1, 1) @\
-								   self.packing.embed(x).view(1, -1)).view(-1)
-					integrand = lambda x, y: F(torch.Tensor([x, y]).view(1, 2).double()).numpy()
-
-					val = quadvec2(integrand,float(S.bounds[0, 0]), float(S.bounds[0, 1]),
-								   float(S.bounds[1, 0]), float(S.bounds[1, 1]),limit = 10,epsrel = 10e-3, epsabs = 10e-3, quadrature = 'gk15')
-					Psi = torch.from_numpy(val).view((self.get_m(), self.get_m()))
-
-			elif self.integration == "fixed_quad":
-
-				if S.d ==1:
-					weights, nodes = S.return_legendre_discretization(n=128)
-					Z = self.packing.embed(nodes)
-					M = torch.einsum('ij,ik->ijk', Z, Z)
-					Psi = torch.einsum('i,ijk->jk', weights, M)
-
-				if S.d ==2:
-					weights, nodes = S.return_legendre_discretization(n = 50)
-					Z = self.packing.embed(nodes)
-					M = torch.einsum('ij,ik->ijk',Z,Z)
-					Psi = torch.einsum('i,ijk->jk',weights,M)
-
-			else:
-				Psi = torch.zeros(size = (self.get_m(),self.get_m())).double()
-				for i in range(self.get_m()):
-					for j in range(self.get_m()):
-
-						if S.d == 1:
-							F_ij = lambda x: (self.packing.embed(torch.from_numpy(np.array(x)).view(1, -1)).view(-1)[i] *
-											  self.packing.embed(torch.from_numpy(np.array(x)).view(1, -1)).view(-1)[
-												  j]).numpy()
-							val, status = integrate.quad(F_ij,float(S.bounds[0,0]), float(S.bounds[0,1]))
-
-
-						elif S.d == 2:
-							F_ij = lambda x:  self.packing.embed(x).view(-1)[i] *self.packing.embed(x).view(-1)[j]
-							integrand = lambda x, y: F_ij(torch.Tensor([x, y]).view(1, 2).double()).numpy()
-							val,status = integrate.dblquad(integrand, float(S.bounds[0, 0]), float(S.bounds[0, 1]),
-															lambda x: float(S.bounds[1, 0]),
-															lambda x: float(S.bounds[1, 1]),epsabs=1.49e-03, epsrel=1.49e-03)
-						else:
-							raise NotImplementedError("Integration above d>2 not implemented.")
-
-						Psi[i,j] = val
-						print(i, j, val)
-
-			self.product_integrals[S] = Psi
-			return Psi
-
-	def get_constraints(self):
-		s = self.get_m()
-		l = np.full(s, self.b)
-		u = np.full(s, self.B)
-		Lambda = np.identity(s)
-		return (l,Lambda,u)
-
-	def cov(self, inverse=False):
-		s = self.get_m()
-
-		if inverse==False:
-			return torch.zeros(size = (s,s)).double()
-		else:
-			return torch.zeros(size=(s, s)).double(),torch.zeros(size=(s, s)).double()
-
-
-	def sample(self, verbose = False, steps = 10, stepsize = None):
-
-		if self.data is None:
-			self.sampled_theta = torch.zeros(self.get_m()).double().view(-1,1)
-			return None
-
-		if self.observations is not None:
-			observations = self.observations.double()
-			sumLambda = self.sumLambda.double()
-			nabla = lambda theta: -torch.sum(torch.diag(1. /(observations@theta).view(-1)) @ observations)	\
-								  + (sumLambda.T + sumLambda) @ theta + self.s*theta.view(-1,1)
-		else:
-			sumLambda = self.sumLambda.double()
-			nabla = lambda theta: (sumLambda.T + sumLambda) @ theta + self.s*theta.view(-1,1)
-
-		theta = self.rate.view(-1, 1)
-
-		W = self.construct_covariance_matrix_laplace()
-		L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-3))
-		eta = 0.5 / (L + 1)
-
-		for k in range(steps):
-			W = torch.randn(size=(self.get_m(), 1)).double()
-			theta = theta - eta * nabla(theta) + np.sqrt(2 * eta) * W
-			if verbose == True:
-				print("Iter:", k, theta.T)
-
-		self.sampled_theta = theta
-		return None
-
-	def sample_value(self, S):
-		"""
-		Given a pre-sampled value evaluate certain portions of the domain S
-		:param S:
-		:return:
-		"""
-		Z = self.product_integral(S)
-		map = self.sampled_theta.T@ Z @self.sampled_theta
-		return map
-
-
-	def sample_path(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return (self.packing.embed(xtest) @ self.sampled_theta)**2
-
-
-
-
-	def load_data(self,data):
-		super().load_data(data, times = False)
-		self.sumLambda = torch.zeros(size = (self.get_m(),self.get_m()))
-		if len(data) > 1:
-			for sample in data:
-				(S,obs,dt) = sample
-				self.sumLambda += self.product_integral(S) * dt
-
-	def add_data_point(self, new_data):
-		super().add_data_point(new_data, times = False)
-		(S, obs, dt) = new_data
-		self.sumLambda += self.product_integral(S) * dt
-
-	def penalized_likelihood(self, threads = 4):
-		sumLambda = self.sumLambda.numpy()
-		if self.observations is not None:
-			observations = self.observations.numpy()
-			loss = lambda theta: float(-np.sum(np.log(  (observations@theta)**2 ))  + np.dot(theta, sumLambda@theta) + 0.5*self.s*np.sum(theta**2))
-		else:
-			loss = lambda theta: float(np.dot(theta, sumLambda @ theta) + 0.5*self.s * np.sum(theta ** 2))
-
-		theta = np.random.randn(self.get_m())
-		res = minimize(loss, theta, jac=None, method='L-BFGS-B')
-		self.rate = torch.from_numpy(res.x)
-		return self.rate
-
-	def construct_covariance_matrix_laplace(self):
-		W = torch.zeros(size=(self.get_m(), self.get_m())).double()
-
-		if self.feedback == "count-record":
-			if self.observations is not None:
-				for i in range(self.observations.size()[0]):
-					A = self.observations[i, :].view(-1, 1) @ self.observations[i, :].view(1, -1)
-					k = np.maximum(torch.dot(self.observations[i, :],self.rate.view(-1)) ** 2,self.b)
-					W = W + A / k
-			W += 2*self.sumLambda
-		else:
-			raise AssertionError("Not implemented.")
-		return W + torch.eye(self.get_m()).double()*self.s
-
-
-	def map_lcb_ucb_approx_action(self, S, dt=1.,  beta=2.):
-
-		phi = self.packing.integral(S)
-		map = (phi @ self.rate)
-
-		ucb = np.maximum((map + beta*np.sqrt(phi@self.W_inv_approx@phi.T))**2,(map - beta*np.sqrt(phi@self.W_inv_approx@phi.T))**2)
-		ucb = np.minimum(ucb,self.B*S.volume()*dt)
-		lcb = 0.
-
-		return dt*map**2, dt*lcb, dt*ucb
-
-	def mean_std_per_action(self,S,W, dt , beta):
-		Z = self.product_integral(S)
-
-		ucb, _ = maximize_matrix_quadratic_on_ellipse(Z.numpy(), (W).numpy(), self.rate.view(-1).numpy(), beta)
-		lcb, _ = minimize_matrix_quadratic_on_ellipse(Z.numpy(), (W).numpy(), self.rate.view(-1).numpy(), beta)
-
-		map = self.rate.T @ Z @ self.rate
-
-		return dt * map, dt * ucb, -lcb * dt
-
-
-	def mean_rate(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return (self.packing.embed(xtest) @ self.rate)**2
-
-	def mean_rate_latent(self,S,n = 128):
-		xtest = S.return_discretization(n)
-		return self.packing.embed(xtest) @ self.rate
-
-
-	def map_lcb_ucb_approx(self,S,n,beta = 2.0, delta = 0.01):
-		xtest = S.return_discretization(n)
-		if self.data is None:
-			return  0 * xtest[:, 0].view(-1, 1),self.b + 0 * xtest[:, 0].view(-1, 1), self.B + 0 * xtest[:,0].view(-1,xtest.size()[0])
-		self.fit_ellipsoid_approx()
-
-		Phi = self.packing.embed(xtest).double()
-		map = Phi @ self.rate
-		N = Phi.size()[0]
-
-		ucb = torch.zeros(size=(N, 1)).double()
-		lcb = torch.zeros(size=(N, 1)).double()
-
-		for i in range(N):
-			x = Phi[i, :].view(-1,1)
-			maximum = np.maximum((map[i] - beta * np.sqrt(x.T @ self.W_inv_approx @ x))**2, (map[i] + beta * np.sqrt(x.T @ self.W_inv_approx @ x))**2)
-			ucb[i,0] = np.minimum( maximum ,self.B)
-			lcb[i,0] = 0.
-			#lcb[i, 0] = map[i] - np.sqrt(beta) * np.sqrt(x.T @ self.W_inv_approx @ x) ** 2
-		return map**2, lcb, ucb
-
-	def map_lcb_ucb(self, S, n, beta = 2.0, delta = 0.01):
-		"""
-		Calculate exact confidence using laplace approximation on a whole set domain
-		:param S: set
-		:param n: discretization
-		:param beta: beta
-		:return:
-		"""
-
-		xtest = S.return_discretization(n)
-		if self.data is None:
-			return self.b+0*xtest[:,0].view(-1,1),self.b+0*xtest[:,0].view(-1,1),self.B+0*xtest[:,0].view(-1,1)
-
-		N = xtest.size()[0]
-		Phi = self.packing.embed(xtest)
-		map = (Phi @ self.rate)**2
-
-		if self.uncertainty == "laplace":
-			W = self.construct_covariance_matrix_laplace()
-		ucb = torch.zeros(size=(N, 1)).double()
-		lcb = torch.zeros(size=(N, 1)).double()
-
-		for i in range(N):
-			x = Phi[i, :]
-			ucbi, _ = maximize_quadratic_on_ellipse(x.numpy(), (W).numpy(), self.rate.view(-1).numpy(), beta)
-			lcbi, _ = minimize_quadratic_on_ellipse(x.numpy(), (W).numpy(), self.rate.view(-1).numpy(), beta)
-			ucb[i, 0] = ucbi
-			lcb[i, 0] = lcbi
-
-		return map, lcb, ucb
+class PermanentalProcessRateEstimator(PoissonRateEstimator):
+
+    def __init__(self, *args, **kwargs):
+        """Forward all arguments to the parent, then set up product-integral storage."""
+        super().__init__(*args, **kwargs)
+        self.integration = "fixed_quad"
+        self.product_integrals = {}  # cache filled by product_integral, keyed by set
+        shape = (len(self.basic_sets), self.get_m(), self.get_m())
+        self.varLambdas = torch.zeros(size=shape).double()
+        self.opt = "cvxpy"
+        if not (self.feedback == "count-record" and self.estimator == "least-sq"):
+            return
+        print("precomputing-integrals:")
+        for idx, basic_set in enumerate(self.basic_sets):
+            print(idx, "/", len(self.basic_sets))
+            self.varLambdas[idx, :] = self.product_integral(basic_set)
+            self.variances[idx] = basic_set.volume() * self.B
+
+    def product_integral(self, S):  # matrix of integrated embedding products over S, cached per set
+        # reuse the cached matrix when this set was integrated before
+        if S in self.product_integrals.keys():
+            return self.product_integrals[S]
+        else:
+            # prefer an implementation supplied by the embedding object, if present
+            if "product_integral" in dir(self.packing):
+                Psi = self.packing.product_integral(S)
+                self.product_integrals[S] = Psi
+                return Psi
+
+            elif self.integration == "vec_quad":
+
+                if S.d == 2:  # NOTE(review): for S.d != 2 Psi is never assigned and the caching line below raises
+                    # Psi = torch.zeros(size=(self.get_m(), self.get_m())).double()
+                    F = lambda x: (
+                        self.packing.embed(x).view(-1, 1)
+                        @ self.packing.embed(x).view(1, -1)
+                    ).view(-1)
+                    integrand = lambda x, y: F(
+                        torch.tensor([x, y]).view(1, 2).double()
+                    ).numpy()
+
+                    val = quadvec2(
+                        integrand,
+                        float(S.bounds[0, 0]),
+                        float(S.bounds[0, 1]),
+                        float(S.bounds[1, 0]),
+                        float(S.bounds[1, 1]),
+                        limit=10,
+                        epsrel=10e-3,
+                        epsabs=10e-3,
+                        quadrature="gk15",
+                    )
+                    Psi = torch.from_numpy(val).view((self.get_m(), self.get_m()))
+
+            elif self.integration == "fixed_quad":
+                # weighted sum over nodes: Psi = sum_i w_i z_i z_i^T (only S.d in {1, 2} assign Psi)
+                if S.d == 1:
+                    weights, nodes = S.return_legendre_discretization(n=128)
+                    Z = self.packing.embed(nodes)
+                    M = torch.einsum("ij,ik->ijk", Z, Z)
+                    Psi = torch.einsum("i,ijk->jk", weights, M)
+
+                if S.d == 2:
+                    weights, nodes = S.return_legendre_discretization(n=50)
+                    Z = self.packing.embed(nodes)
+                    M = torch.einsum("ij,ik->ijk", Z, Z)
+                    Psi = torch.einsum("i,ijk->jk", weights, M)
+
+            else:
+                Psi = torch.zeros(size=(self.get_m(), self.get_m())).double()
+                for i in range(self.get_m()):
+                    for j in range(self.get_m()):
+                        # fallback: integrate each (i, j) entry separately with scipy
+                        if S.d == 1:
+                            F_ij = lambda x: (
+                                self.packing.embed(
+                                    torch.from_numpy(np.array(x)).view(1, -1)
+                                ).view(-1)[i]
+                                * self.packing.embed(
+                                    torch.from_numpy(np.array(x)).view(1, -1)
+                                ).view(-1)[j]
+                            ).numpy()
+                            val, status = integrate.quad(
+                                F_ij, float(S.bounds[0, 0]), float(S.bounds[0, 1])
+                            )
+
+                        elif S.d == 2:
+                            F_ij = (
+                                lambda x: self.packing.embed(x).view(-1)[i]
+                                * self.packing.embed(x).view(-1)[j]
+                            )
+                            integrand = lambda x, y: F_ij(
+                                torch.tensor([x, y]).view(1, 2).double()
+                            ).numpy()
+                            val, status = integrate.dblquad(
+                                integrand,
+                                float(S.bounds[0, 0]),
+                                float(S.bounds[0, 1]),
+                                lambda x: float(S.bounds[1, 0]),
+                                lambda x: float(S.bounds[1, 1]),
+                                epsabs=1.49e-03,
+                                epsrel=1.49e-03,
+                            )
+                        else:
+                            raise NotImplementedError(
+                                "Integration above d>2 not implemented."
+                            )
+
+                        Psi[i, j] = val
+                        print(i, j, val)
+
+            self.product_integrals[S] = Psi  # store for later calls with the same S
+            return Psi
+
+    def get_constraints(self):
+        """Return (l, Lambda, u): constant bounds b and B with an identity matrix."""
+        m = self.get_m()
+        lower, upper = np.full(m, self.b), np.full(m, self.B)
+        # Lambda is the m x m identity, so the bounds act on each coordinate
+        return (lower, np.identity(m), upper)
+
+    def cov(self, inverse=False):
+        """Return an m x m zero matrix, or a pair of them when inverse is requested."""
+        m = self.get_m()
+        zeros = lambda: torch.zeros(size=(m, m)).double()
+        if inverse == False:  # equality test kept as-is to preserve behavior
+            return zeros()
+        return zeros(), zeros()
+
+    def sample(self, verbose=False, steps=10, stepsize=None):  # Langevin-type updates from self.rate; result kept in self.sampled_theta; stepsize is unused
+        # without data the sampled parameter is the zero vector
+        if self.data is None:
+            self.sampled_theta = torch.zeros(self.get_m()).double().view(-1, 1)
+            return None
+        # nabla(theta): gradient-like term used in the update below
+        if self.observations is not None:
+            observations = self.observations.double()
+            sumLambda = self.sumLambda.double()
+            nabla = (
+                lambda theta: -torch.sum(  # NOTE(review): torch.sum without dim gives a scalar that is broadcast over all coordinates - confirm
+                    torch.diag(1.0 / (observations @ theta).view(-1)) @ observations
+                )
+                + (sumLambda.T + sumLambda) @ theta
+                + self.s * theta.view(-1, 1)
+            )
+        else:
+            sumLambda = self.sumLambda.double()
+            nabla = lambda theta: (
+                sumLambda.T + sumLambda
+            ) @ theta + self.s * theta.view(-1, 1)
+
+        theta = self.rate.view(-1, 1)
+        # step size eta = 0.5 / (L + 1), L from eigsh(k=1, which="LM") on W
+        W = self.construct_covariance_matrix_laplace()
+        L = float(
+            scipy.sparse.linalg.eigsh(
+                W.numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-3
+            )
+        )
+        eta = 0.5 / (L + 1)
+
+        for k in range(steps):
+            W = torch.randn(size=(self.get_m(), 1)).double()  # W is reused here as the Gaussian noise draw
+            theta = theta - eta * nabla(theta) + np.sqrt(2 * eta) * W
+            if verbose == True:
+                print("Iter:", k, theta.T)
+
+        self.sampled_theta = theta
+        return None
+
+    def sample_value(self, S):
+        """
+        Evaluate the quadratic form of the pre-sampled parameter over the set S.
+        :param S: set handed to product_integral
+        :return: sampled_theta^T Z sampled_theta with Z = product_integral(S)
+        """
+        Z = self.product_integral(S)
+        theta = self.sampled_theta
+        return theta.T @ Z @ theta
+
+    def sample_path(self, S, n=128):  # squared embedding of S's n-discretization times sampled_theta
+        latent = self.packing.embed(S.return_discretization(n)) @ self.sampled_theta
+        return latent**2
+
+    def load_data(self, data):  # parent load, then sumLambda = sum over samples of product_integral(S) * dt
+        super().load_data(data, times=False)
+        self.sumLambda = torch.zeros(size=(self.get_m(), self.get_m()))  # torch default dtype, no .double() here
+        if len(data) > 1:  # NOTE(review): a single-sample list skips this accumulation - confirm intended
+            for sample in data:
+                (S, obs, dt) = sample
+                self.sumLambda += self.product_integral(S) * dt
+        else:
+            self.S = data[0][0]  # NOTE(review): raises IndexError when data is empty
+            self.dt = data[0][2]
+            assert isinstance(self.S, BorelSet)
+
+    def add_data_point(self, new_data):  # parent update, then add this sample's term to sumLambda
+        super().add_data_point(new_data, times=False)
+        (S, obs, dt) = new_data  # obs is not used in this method
+        self.sumLambda += self.product_integral(S) * dt
+
+    def penalized_likelihood(self, threads=4):
+        """Fit self.rate with L-BFGS-B from a random start (`threads` is unused)."""
+        sumLambda = self.sumLambda.numpy()
+        if self.observations is None:
+            loss = lambda theta: float(
+                np.dot(theta, sumLambda @ theta) + 0.5 * self.s * np.sum(theta**2)
+            )
+        else:
+            observations = self.observations.numpy()
+            loss = lambda theta: float(
+                -np.sum(np.log((observations @ theta) ** 2))
+                + np.dot(theta, sumLambda @ theta)
+                + 0.5 * self.s * np.sum(theta**2)
+            )
+        start = np.random.randn(self.get_m())
+        res = minimize(loss, start, jac=None, method="L-BFGS-B")
+        self.rate = torch.from_numpy(res.x)
+        return self.rate
+
+    def construct_covariance_matrix_laplace(self):
+        """Assemble W = sum_i x_i x_i^T / max((x_i . rate)^2, b) + 2 sumLambda + s I.
+
+        Only the "count-record" feedback is supported; the x_i are the rows of
+        self.observations (the sum is skipped when there are none).
+        """
+        W = torch.zeros(size=(self.get_m(), self.get_m())).double()
+        if self.feedback != "count-record":
+            raise AssertionError("Not implemented.")
+        if self.observations is not None:
+            for row in self.observations:
+                outer = row.view(-1, 1) @ row.view(1, -1)
+                # the denominator is floored at self.b
+                floor = np.maximum(torch.dot(row, self.rate.view(-1)) ** 2, self.b)
+                W = W + outer / floor
+        W += 2 * self.sumLambda
+        ridge = torch.eye(self.get_m()).double() * self.s
+        return W + ridge
+
+    def map_lcb_ucb_approx_action(self, S, dt=1.0, beta=2.0):
+        """Return (map, lcb, ucb) for the set S, each multiplied by dt.
+
+        The lower bound is always 0; the upper bound is capped at
+        B * volume(S) * dt before the final multiplication by dt.
+        """
+        phi = self.packing.integral(S)
+        latent = phi @ self.rate
+        width = beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
+        upper = np.maximum((latent + width) ** 2, (latent - width) ** 2)
+        upper = np.minimum(upper, self.B * S.volume() * dt)
+        lower = 0.0
+        return dt * latent**2, dt * lower, dt * upper
+
+    def mean_std_per_action(self, S, W, dt, beta):
+        """Return (dt * map, dt * ucb, -dt * lcb) for the quadratic form over S.
+
+        ucb and lcb come from the matrix-quadratic ellipse helpers called with
+        (Z, W, rate, beta), where Z = product_integral(S).
+        """
+        Z = self.product_integral(S)
+        center = self.rate.view(-1).numpy()
+        solver_args = (Z.numpy(), W.numpy(), center, beta)
+        upper, _ = maximize_matrix_quadratic_on_ellipse(*solver_args)
+        lower, _ = minimize_matrix_quadratic_on_ellipse(*solver_args)
+        point = self.rate.T @ Z @ self.rate
+        return dt * point, dt * upper, -lower * dt
+
+    def mean_rate(self, S, n=128):
+        """Squared latent value (embed(x) @ rate) ** 2 on the n-point discretization of S."""
+        return (self.packing.embed(S.return_discretization(n)) @ self.rate) ** 2
+
+    def mean_rate_latent(self, S, n=128):
+        """Latent value embed(x) @ rate (no link applied) on the n-point discretization of S."""
+        return self.packing.embed(S.return_discretization(n)) @ self.rate
+
+    def map_lcb_ucb_approx(self, S, n, beta=2.0, delta=0.01):
+        """Return (map, lcb, ucb) columns on the n-point discretization of S.
+
+        Uses the ellipsoid approximation W_inv_approx; lcb is always zero and
+        ucb is capped at self.B. `delta` is accepted but not used here.
+        """
+        xtest = S.return_discretization(n)
+        if self.data is None:
+            # no data yet: constant (N, 1) columns for all three outputs
+            zero_col = 0 * xtest[:, 0].view(-1, 1)
+            return zero_col, self.b + zero_col, self.B + zero_col
+        self.fit_ellipsoid_approx()
+
+        Phi = self.packing.embed(xtest).double()
+        map = Phi @ self.rate
+        N = Phi.size()[0]
+        ucb = torch.zeros(size=(N, 1)).double()
+        lcb = torch.zeros(size=(N, 1)).double()  # never written to: stays zero
+
+        for i in range(N):
+            x = Phi[i, :].view(-1, 1)
+            # half-width of the interval on the latent value at point i
+            width = beta * np.sqrt(x.T @ self.W_inv_approx @ x)
+            maximum = np.maximum((map[i] - width) ** 2, (map[i] + width) ** 2)
+            ucb[i, 0] = np.minimum(maximum, self.B)
+
+        return map**2, lcb, ucb
+
+    def map_lcb_ucb(self, S, n, beta=2.0, delta=0.01):
+        """
+        Calculate confidence bounds using the Laplace approximation on a whole set domain
+        :param S: set
+        :param n: discretization
+        :param beta: beta
+        :param delta: accepted for interface compatibility; not used here
+        :return: (map, lcb, ucb), each evaluated on the discretization of S
+        :raises NotImplementedError: if self.uncertainty is not "laplace"
+        """
+
+        xtest = S.return_discretization(n)
+        if self.data is None:
+            # no data yet: constant columns built from the bounds b and B
+            zero_col = 0 * xtest[:, 0].view(-1, 1)
+            return self.b + zero_col, self.b + zero_col, self.B + zero_col
+        if self.uncertainty != "laplace":
+            # W is only defined for the Laplace case; fail explicitly otherwise
+            raise NotImplementedError("Only 'laplace' uncertainty is supported.")
+
+        N = xtest.size()[0]
+        Phi = self.packing.embed(xtest)
+        map = (Phi @ self.rate) ** 2
+
+        # loop-invariant numpy views, converted once instead of per point
+        W = self.construct_covariance_matrix_laplace().numpy()
+        rate = self.rate.view(-1).numpy()
+        ucb = torch.zeros(size=(N, 1)).double()
+        lcb = torch.zeros(size=(N, 1)).double()
+
+        for i in range(N):
+            x = Phi[i, :].numpy()
+            ucbi, _ = maximize_quadratic_on_ellipse(x, W, rate, beta)
+            lcbi, _ = minimize_quadratic_on_ellipse(x, W, rate, beta)
+            ucb[i, 0] = ucbi
+            lcb[i, 0] = lcbi
+
+        return map, lcb, ucb
 
 
 class LogisticGaussProcessRateEstimator(PermanentalProcessRateEstimator):
 
-	def penalized_likelihood(self, threads=4):
-		logistic = lambda x: np.log(1 + np.exp(x))
-		weights = self.weights.numpy()
-		nodes = self.nodes.numpy()
-
-		if self.observations is not None:
-			observations = self.observations.numpy()
-			loss = lambda theta: float(-np.sum(np.log(logistic(observations @ theta))) + np.sum(
-				weights * logistic(theta @ nodes.T)) + self.s * np.sum(theta ** 2))
-		else:
-			loss = lambda theta: float(np.sum(weights * logistic(theta @ nodes.T)) + self.s * np.sum(theta ** 2))
-
-		theta = np.random.randn(self.get_m())
-		res = minimize(loss, theta, jac= None, method='L-BFGS-B',options={'maxcor': 20,'iprint':-1,'maxfun':150000,'maxls': 50})
-		self.rate = torch.from_numpy(res.x)
-
-		return self.rate
-
-	def logistic(self, x):
-		return torch.log(1 + torch.exp(x))
-
-	def mean_rate(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return self.logistic(self.packing.embed(xtest) @ self.rate)
+    def penalized_likelihood(self, threads=4):
+        """Fit self.rate by L-BFGS-B on the penalized negative log-likelihood."""
+        logistic = lambda x: np.logaddexp(0, x)  # overflow-safe log(1 + exp(x))
+        weights = self.weights.numpy()
+        nodes = self.nodes.numpy()
+
+        if self.observations is not None:
+            observations = self.observations.numpy()
+            loss = lambda theta: float(
+                -np.sum(np.log(logistic(observations @ theta)))
+                + np.sum(weights * logistic(theta @ nodes.T))
+                + self.s * np.sum(theta**2)
+            )
+        else:
+            loss = lambda theta: float(
+                np.sum(weights * logistic(theta @ nodes.T)) + self.s * np.sum(theta**2)
+            )
+
+        theta = np.random.randn(self.get_m())
+        res = minimize(
+            loss,
+            theta,
+            jac=None,
+            method="L-BFGS-B",
+            options={"maxcor": 20, "iprint": -1, "maxfun": 150000, "maxls": 50},
+        )
+        self.rate = torch.from_numpy(res.x)
+        return self.rate
+
+    def logistic(self, x):
+        return torch.logaddexp(torch.zeros_like(x), x)  # overflow-safe log(1 + exp(x))
+
+    def mean_rate(self, S, n=128):
+        """Apply self.logistic to embed(x) @ rate on the n-point discretization of S."""
+        return self.logistic(self.packing.embed(S.return_discretization(n)) @ self.rate)
 
 
 class ExpGaussProcessRateEstimator(PermanentalProcessRateEstimator):
 
+    def penalized_likelihood(self, threads=4):
+        """Fit self.rate by L-BFGS-B; ONLY WORKS WITH ONE DATASET given by load_data!"""
+        # Get node function values and weights for Gauss-Legendre quadrature
+        weights, nodes = self.S.return_legendre_discretization(n=50)
+        nodes = nodes.to(torch.get_default_device())
+        weights = weights.cpu().numpy()
+        vals = self.packing.embed(nodes).cpu().numpy()
+
+        if self.observations is not None:
+            observations = self.observations.cpu().numpy()
+            loss = lambda theta: float(
+                -np.sum(observations @ theta)
+                + self.dt * np.sum(weights * np.exp(theta @ vals.T))
+                + self.s * 0.5 * np.sum(theta**2)
+            )
+        else:
+            # uses embedded nodes `vals`; NOTE(review): no self.dt / 0.5 factor here -- confirm
+            loss = lambda theta: float(
+                np.sum(weights * np.exp(theta @ vals.T)) + self.s * np.sum(theta**2)
+            )
+
+        theta = np.zeros(self.get_m())
+        res = minimize(
+            loss,
+            theta,
+            jac=None,
+            method="L-BFGS-B",
+            options={
+                "maxcor": 20,
+                "iprint": -1,
+                "maxfun": 150000,
+                "maxls": 100,
+                "ftol": 1e-12,
+                "eps": 1e-12,
+                "gtol": 1e-8,
+            },
+        )
+        self.rate = torch.tensor(res.x)
+        return self.rate
+
+    def mean_rate(self, S, n=128):
+        """Evaluate exp(embed(x) @ rate) on the n-point discretization of S."""
+        return torch.exp(self.packing.embed(S.return_discretization(n)) @ self.rate)
+
+    def rate_value(self, x, dt=1):
+        """Return exp(phi @ rate) as a column, or the constant self.b if not fitted."""
+        # NOTE(review): dt multiplies the embedding, so it acts inside the exponent -- confirm
+        phi = self.packing.embed(x) * dt
+
+        if self.rate is None:
+            print("Rate function not fitted!")
+            return 0 * phi[:, 0].view(-1, 1) + self.b
+
+        return torch.exp(phi @ self.rate.view(-1, 1))
 
-	def penalized_likelihood(self, threads=4):
-		weights = self.weights.numpy()
-		nodes = self.nodes.numpy()
-
-		if self.observations is not None:
-			observations = self.observations.numpy()
-			loss = lambda theta: float(np.sum(observations @ theta) + np.sum(
-				weights * np.exp(-theta @ nodes.T)) + self.s * np.sum(theta ** 2))
-		else:
-			loss = lambda theta: float(np.sum(weights * np.exp(-theta @ nodes.T)) + self.s * np.sum(theta ** 2))
-
-		theta = np.zeros(self.get_m())
-		res = minimize(loss, theta, jac= None, method='L-BFGS-B',options={'maxcor': 20,'iprint':-1,
-																		  'maxfun':150000,'maxls': 100,
-																		  'ftol':1e-12,'eps':1e-12,'gtol':1e-8})
-		self.rate = torch.from_numpy(res.x)
-
-		return self.rate
-
-	def mean_rate(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return torch.exp(-self.packing.embed(xtest) @ self.rate)
 
 if __name__ == "__main__":
-	torch.manual_seed(2)
-	np.random.seed(2)
-	d = 1
-	gamma = 0.1
-	n = 64
-	B = 4.
-	b = 0.1
-
-	process = PoissonPointProcess(d=1, B=B, b=b)
-	Sets = []
-	levels = 4
-	hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
-	Sets = hierarchical_structure.get_all_sets()
-
-	D = BorelSet(1, bounds=torch.Tensor([[-1., 1.]]).double())
-
-	m = 64
-	embedding = HermiteEmbedding(m = m, d = 1, gamma = gamma)
-	k = KernelFunction(gamma = gamma)
-
-	estimator5 = PositiveRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d = d)
-
-	estimator4 = PermanentalProcessRateEstimator(process, hierarchical_structure,kernel_object=k, B=B, m=m, d = d)
-	#estimator = PermanentalProcessRateEstimator(process, hierarchical_structure,
-	#											kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom", approx="ellipsoid")
-	#estimator = LogGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
-	estimator = LogGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B+1, m=m, d=d, embedding=embedding)
-
-	#estimator = LogisticGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
-	estimator2 = LogisticGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding)
-	#estimator = ExpGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
-	estimator3 = ExpGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding)
-
-	estimators = [estimator,estimator2,estimator3,estimator4,estimator5]
-	names = ['sigmoid','logistic','exp','square','no-link']
-	bands = [True,False,False,False,True]
-
-
-	estimators = [estimator,estimator5,estimator4]
-	names = ['sigmoid','no-link','square']
-	bands = [False,False,False]
-
-	min_vol, max_vol = estimator.get_min_max()
-	dt = 10. / (b * min_vol)
-	dt = dt * 2
-
-	print("Suggested dt:", dt)
-	c = ['k', 'r', 'b', 'y', 'g', 'orange', 'brown', 'purple'] + ['k' for i in range(500)]
-
-	no_sets = len(Sets)
-
-
-	# no_samples = 3
-	# data = []
-	# samples = []
-	# repeats = 2
-	#
-	# for i in range(no_samples):
-	# 	j = np.random.randint(0, no_sets, 1)
-	# 	S = Sets[j[0]]
-	# 	for _ in range(repeats):
-	# 		sample = process.sample_discretized(S, dt)
-	# 		samples.append(sample)
-	# 		data.append((S, sample, dt))
-	#
-	# sample_D = process.sample_discretized(D, dt)
-	# samples.append(sample_D)
-	# no_samples = repeats * no_samples + 1
-	# data.append((D, sample_D, dt))
-
-
-	data_single = []
-	basic_sets = hierarchical_structure.get_sets_level(levels)
-	samples = []
-
-	for set in basic_sets:
-		sample = process.sample_discretized(set,dt)
-		data_single.append((set,sample,dt))
-		samples.append(sample)
-	data = data_single
-
-	# sample_D = torch.cat(samples)
-	# data = [(D,sample_D,dt)]
-
-	# data2 = []
-	# samples = []
-	# for set in basic_sets:
-	# 	sample = process.sample_discretized(set,dt*2)
-	# 	data2.append((set,sample,dt*2))
-	# 	samples.append(sample)
-	#
-	# sample_D_2 = torch.cat(samples)
-	# data = [(D, sample_D_2, dt*2)]
-	#
-	# data = data + data2
-
-	for estimator,name,band in zip(estimators,names,bands):
-		estimator.load_data(data)
-
-		xtest = D.return_discretization(n=n)
-
-		# likelihood based
-		estimator.fit_gp()
-		rate_mean = estimator.mean_rate(D,n = n)
-		p = plt.plot(xtest, rate_mean, label='likelihood: '+name)
-
-		if band == True:
-			_, lcb, ucb = estimator.map_lcb_ucb(D, n, beta=2.)
-			plt.fill_between(xtest.numpy().flatten(), lcb.numpy().flatten(), ucb.numpy().flatten(), alpha=0.4,
-							 color=p[0].get_color(),	 label=name)
-
-
-
-	for j in range(len(samples)):
-		if samples[j] is not None:
-			plt.plot(samples[j], samples[j] * 0, 'o', color=c[j])
-
-	# for action in Sets:
-	# 	map, lcb, ucb = estimator.map_lcb_ucb_approx_action(action,beta=2.)
-	# 	x = np.linspace(action.bounds[0,0],action.bounds[0,1],2)
-	# 	plt.plot(x,x*0+float(ucb/action.volume()),'-o', color = "green")
-	process.visualize(D, samples=0, n=n, dt=1.)
-	plt.show()
+    torch.manual_seed(2)
+    np.random.seed(2)
+    d = 1
+    gamma = 0.1
+    n = 64
+    B = 4.0
+    b = 0.1
+
+    process = PoissonPointProcess(d=1, B=B, b=b)
+    Sets = []
+    levels = 4
+    hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
+    Sets = hierarchical_structure.get_all_sets()
+
+    D = BorelSet(1, bounds=torch.tensor([[-1.0, 1.0]]).double())
+
+    m = 64
+    embedding = HermiteEmbedding(m=m, d=1, gamma=gamma)
+    k = KernelFunction(gamma=gamma)
+
+    estimator5 = PositiveRateEstimator(
+        process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d
+    )
+
+    estimator4 = PermanentalProcessRateEstimator(
+        process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d
+    )
+    # estimator = PermanentalProcessRateEstimator(process, hierarchical_structure,
+    # 											kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom", approx="ellipsoid")
+    # estimator = LogGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
+    estimator = LogGaussProcessRateEstimator(
+        process,
+        hierarchical_structure,
+        kernel_object=k,
+        B=B + 1,
+        m=m,
+        d=d,
+        embedding=embedding,
+    )
+
+    # estimator = LogisticGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
+    estimator2 = LogisticGaussProcessRateEstimator(
+        process,
+        hierarchical_structure,
+        kernel_object=k,
+        B=B,
+        m=m,
+        d=d,
+        embedding=embedding,
+    )
+    # estimator = ExpGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
+    estimator3 = ExpGaussProcessRateEstimator(
+        process,
+        hierarchical_structure,
+        kernel_object=k,
+        B=B,
+        m=m,
+        d=d,
+        embedding=embedding,
+    )
+
+    estimators = [estimator, estimator2, estimator3, estimator4, estimator5]
+    names = ["sigmoid", "logistic", "exp", "square", "no-link"]
+    bands = [True, False, False, False, True]
+
+    estimators = [estimator, estimator5, estimator4]
+    names = ["sigmoid", "no-link", "square"]
+    bands = [False, False, False]
+
+    min_vol, max_vol = estimator.get_min_max()
+    dt = 10.0 / (b * min_vol)
+    dt = dt * 2
+
+    print("Suggested dt:", dt)
+    c = ["k", "r", "b", "y", "g", "orange", "brown", "purple"] + [
+        "k" for i in range(500)
+    ]
+
+    no_sets = len(Sets)
+
+    # no_samples = 3
+    # data = []
+    # samples = []
+    # repeats = 2
+    #
+    # for i in range(no_samples):
+    # 	j = np.random.randint(0, no_sets, 1)
+    # 	S = Sets[j[0]]
+    # 	for _ in range(repeats):
+    # 		sample = process.sample_discretized(S, dt)
+    # 		samples.append(sample)
+    # 		data.append((S, sample, dt))
+    #
+    # sample_D = process.sample_discretized(D, dt)
+    # samples.append(sample_D)
+    # no_samples = repeats * no_samples + 1
+    # data.append((D, sample_D, dt))
+
+    # one sample per set returned by get_sets_level(levels); items are (set, sample, dt)
+    basic_sets = hierarchical_structure.get_sets_level(levels)
+    data_single = []
+    samples = []
+    for basic_set in basic_sets:  # not `set`: avoid shadowing the builtin
+        sample = process.sample_discretized(basic_set, dt)
+        data_single.append((basic_set, sample, dt))
+        samples.append(sample)
+    data = data_single
+
+    # sample_D = torch.cat(samples)
+    # data = [(D,sample_D,dt)]
+
+    # data2 = []
+    # samples = []
+    # for set in basic_sets:
+    # 	sample = process.sample_discretized(set,dt*2)
+    # 	data2.append((set,sample,dt*2))
+    # 	samples.append(sample)
+    #
+    # sample_D_2 = torch.cat(samples)
+    # data = [(D, sample_D_2, dt*2)]
+    #
+    # data = data + data2
+
+    for estimator, name, band in zip(estimators, names, bands):
+        estimator.load_data(data)
+
+        xtest = D.return_discretization(n=n)
+
+        # likelihood based
+        estimator.fit_gp()
+        rate_mean = estimator.mean_rate(D, n=n)
+        p = plt.plot(xtest, rate_mean, label="likelihood: " + name)
+
+        if band:  # draw the confidence band only for estimators flagged in `bands`
+            _, lcb, ucb = estimator.map_lcb_ucb(D, n, beta=2.0)
+            plt.fill_between(
+                xtest.numpy().flatten(),
+                lcb.numpy().flatten(),
+                ucb.numpy().flatten(),
+                alpha=0.4,
+                color=p[0].get_color(),
+                label=name,
+            )
+
+    for j in range(len(samples)):
+        if samples[j] is not None:
+            plt.plot(samples[j], samples[j] * 0, "o", color=c[j])
+
+    # for action in Sets:
+    # 	map, lcb, ucb = estimator.map_lcb_ucb_approx_action(action,beta=2.)
+    # 	x = np.linspace(action.bounds[0,0],action.bounds[0,1],2)
+    # 	plt.plot(x,x*0+float(ucb/action.volume()),'-o', color = "green")
+    process.visualize(D, samples=0, n=n, dt=1.0)
+    plt.show()
diff --git a/stpy/point_processes/log_cox_process.py b/stpy/point_processes/log_cox_process.py
new file mode 100644
index 0000000..03e1780
--- /dev/null
+++ b/stpy/point_processes/log_cox_process.py
@@ -0,0 +1,247 @@
+from functools import partial
+from typing import List
+import numpy as np
+import scipy
+from stpy.borel_set import BorelSet
+from stpy.kernels import KernelFunction
+from tqdm import tqdm
+from autograd_minimize import minimize
+import torch
+
+device = torch.get_default_device()
+
+
+def sqrt(matrix: torch.Tensor) -> torch.Tensor:
+    """Real part of the matrix square root of `matrix + 1e-5` (offset added to every entry)."""
+    root = scipy.linalg.sqrtm(matrix.cpu().numpy() + 1e-5)
+    return torch.from_numpy(np.real(root)).to(device)
+
+
+class LogCoxProcess:
+    def __init__(self, kernel_object: KernelFunction, integral_discretization: int):
+        """Store the kernel object, its `kernel` attribute and the quadrature size."""
+        self.kernel_object, self.kernel = kernel_object, kernel_object.kernel
+        self.integral_discretization = integral_discretization
+
+    def load_data(self, data: List):
+        """Store observations, durations and rectangle bounds from (A, x, dt) triples.
+
+        Only works with 2d data: A.bounds[0] / A.bounds[1] are read as the
+        (lower, upper) limits along the first / second axis.
+
+        Sets self.areas, self.observations, self.dt and the per-area limits
+        self.a_x, self.a_y (lower) and self.b_x, self.b_y (upper).
+        """
+        # materialize once so the triples can be traversed several times
+        items = list(data)
+        bounds = [A.bounds for A, _, _ in items]
+        durations = [dt for _, _, dt in items]
+
+        # (area, duration) pairs are reused by fit() for the quadrature
+        self.areas = [(A, dt) for A, _, dt in items]
+        # all observed points concatenated along dim 0
+        self.observations = torch.cat([x for _, x, _ in items], dim=0)
+        self.dt = torch.tensor(durations, dtype=torch.float64)
+
+        # lower (a_*) and upper (b_*) limits, one entry per area
+        self.a_x = torch.tensor([b[0][0] for b in bounds])
+        self.a_y = torch.tensor([b[1][0] for b in bounds])
+        self.b_x = torch.tensor([b[0][1] for b in bounds])
+        self.b_y = torch.tensor([b[1][1] for b in bounds])
+
+    def fit(self):
+        """Fit the MAP coefficients alpha and return the intensity function.
+
+        Builds the kernel blocks between observations and area integrals,
+        minimizes the penalized negative log-likelihood over alpha with
+        L-BFGS-B, stores the optimum in self.alpha_opt and the resulting
+        callable in self.rate_value.
+        """
+        # Get the map by representer theorem
+        k_func = partial(self.kernel, b=self.observations)
+        k_int = self.kernel_object.integral(self.a_x, self.a_y, self.b_x, self.b_y)
+        dt_col = self.dt.unsqueeze(1)
+
+        # kernel blocks evaluated at the observations (each computed once)
+        k_obs_obs = k_func(a=self.observations)
+        k_int_obs = dt_col * k_int(self.observations)  # columns index observations
+        k_obs_int = k_int_obs.T
+        k_obs = torch.cat((k_obs_obs, k_int_obs))
+
+        # one pass over the areas: quadrature data for the likelihood term and
+        # the integral-integral kernel rows share the same nodes and weights
+        k_weights = []
+        k_nodes = []
+        k_factors = []
+        k_int_int = []
+        for A, dt in self.areas:
+            weights, nodes = A.return_legendre_discretization(
+                self.integral_discretization
+            )
+            nodes = nodes.to(device)
+            weights = weights.to(device)
+            k_int_nodes = dt_col * k_int(nodes)
+            k_weights.append(weights)
+            k_nodes.append(torch.cat((k_func(a=nodes), k_int_nodes)))
+            k_factors.append(dt)
+            # sum over nodes
+            k_int_int.append(dt * torch.sum(weights * k_int_nodes, dim=1))
+        k_int_int = torch.stack(k_int_int)
+
+        # Create one big kernel matrix out of the above four matrices
+        k_top = torch.cat((k_obs_obs, k_obs_int), dim=1)
+        k_bottom = torch.cat((k_int_obs, k_int_int), dim=1)
+        k_big = torch.cat((k_top, k_bottom), dim=0)
+
+        # Check if k_big is non-negative everywhere
+        assert torch.all(k_big >= 0), "Kernel matrix should be non-negative everywhere"
+
+        # Check if k_big is approximately symmetric
+        assert torch.allclose(
+            k_big, k_big.T, atol=1e-4
+        ), "Kernel matrix should be approximately symmetric"
+
+        def objective(alpha):
+            lkl_term_1 = (alpha @ k_obs).sum()  # Should be a single number now
+            # weighted sum of exp(alpha @ kn) over the Gauss-Legendre nodes of each area
+            lkl_term_2 = torch.sum(
+                torch.stack(
+                    [
+                        dt * torch.sum(w * torch.exp(alpha @ kn))
+                        for w, kn, dt in zip(k_weights, k_nodes, k_factors)
+                    ]
+                )
+            )
+
+            regularizer = alpha.T @ k_big @ alpha
+            return -lkl_term_1 + lkl_term_2 + regularizer * 0.5
+
+        # one coefficient per observation plus one per area
+        alpha_0 = torch.zeros([len(self.observations) + len(self.a_x)])
+        res = minimize(
+            objective,
+            alpha_0.cpu().numpy(),
+            backend="torch",
+            method="L-BFGS-B",
+            precision="float64",
+            tol=1e-8,
+            torch_device=str(device),
+            options={
+                "ftol": 1e-08,
+                "gtol": 1e-08,
+                "eps": 1e-08,
+                "maxfun": 15000,
+                "maxiter": 15000,
+                "maxls": 20,
+            },
+        )
+        print("optimum found")
+
+        # keep a local copy so the returned closure stays tied to this fit
+        alpha_opt = torch.tensor(res.x)
+        self.alpha_opt = alpha_opt
+
+        # closure over the fitted coefficients; dt scales the returned intensity
+        def intensity(x: torch.tensor, dt=1):
+            k_x = torch.cat((k_func(x), dt_col * k_int(x)))
+            return dt * torch.exp(alpha_opt @ k_x).unsqueeze(1)
+
+        self.rate_value = intensity
+
+        return intensity
+
+    def get_gamma_MAP(self, n, x, a, dt, lr=0.01, max_it=10000, eps=1e-6):
+        """Maximize the log posterior over gamma by SGD; returns the detached optimum.
+
+        Stops after max_it steps or when the gradient norm is below eps; sets self.Q.
+        """
+        mean = 0
+        cov_Y = self.kernel(x, x)
+        Q = sqrt(cov_Y)
+        self.Q = Q
+
+        def f(arg):
+            y = arg @ Q + mean
+            return (-0.5) * arg.pow(2).sum() + (y * n - torch.exp(y) * a * dt).sum()
+
+        gamma = torch.zeros(len(x), dtype=torch.float64, requires_grad=True)
+        optimizer = torch.optim.SGD([gamma], lr=lr)
+
+        for _ in tqdm(range(max_it), desc="Optimizing gamma"):
+            optimizer.zero_grad()
+            loss = -f(gamma)  # we minimize -f because we want to maximize f
+            loss.backward()
+            # If gradient is smaller than eps, return
+            if torch.norm(gamma.grad) < eps:
+                print("Solved to eps")
+                break
+            optimizer.step()
+
+        # sanity check against a random point drawn around gamma with covariance
+        # 50 * I; the covariance is float64 so its dtype matches gamma
+        far_cov = torch.eye(len(gamma), dtype=torch.float64) * 50
+        far = torch.distributions.MultivariateNormal(
+            loc=gamma, covariance_matrix=far_cov
+        ).sample()
+        assert f(gamma) > f(far)
+        return gamma.detach()
+
+    def sample_mala(self, n, x, a, dt, h, num_steps, burn_in_steps, initial_gamma=None):
+        """Generate MALA samples, yielding exp(gamma @ Q + mean) after burn-in.
+
+        param n is 1d tensor with the counts of points in the cells
+        param x is the discretization of the area we're interested in
+        param a is either a 2d tensor with the areas of the discretization
+        or a float that gives all areas
+        param h is the step size; dt multiplies exp(y) * a in the likelihood term
+        If initial_gamma is None the chain starts at get_gamma_MAP(n, x, a, dt).
+        """
+        gamma = self.get_gamma_MAP(n, x, a, dt) if initial_gamma is None else initial_gamma
+        mean = 0  # prior mean I think?
+        cov_Y = self.kernel(x, x)
+        Q = sqrt(cov_Y)
+        self.Q = Q
+        accept_prob_sum = 0
+
+        # The log posterior over gamma given the data
+        def log_f(arg):
+            y = arg @ Q + mean
+            return (-0.5) * arg.pow(2).sum() + (y * n - torch.exp(y) * a * dt).sum()
+
+        # Gradient of the energy
+        def grad(arg):
+            y = arg @ Q + mean
+            return -arg + (n - torch.exp(y) * a * dt) @ Q.T
+
+        # mean of the proposal distribution, named \xi in paper
+        def r_mean_given_arg(arg):
+            return arg + (h / 2.0) * grad(arg)
+
+        # log density (up to a constant) of proposing `target` from `source`
+        def log_q(target, source):
+            return -(target - r_mean_given_arg(source)).pow(2).sum() / (2 * h)
+
+        for i in range(num_steps):
+            # Proposal
+            proposal = torch.distributions.MultivariateNormal(
+                loc=r_mean_given_arg(gamma),
+                covariance_matrix=h * torch.eye(len(gamma), dtype=torch.float64),
+            ).sample()
+
+            # Metropolis-Hastings ratio formed in log space: exponentiating the
+            # two terms separately under/overflows and gives nan (0/0 or inf/inf)
+            log_ratio = (log_f(proposal) + log_q(gamma, proposal)) - (
+                log_f(gamma) + log_q(proposal, gamma)
+            )
+            accept_prob = torch.exp(log_ratio)
+
+            if np.random.rand() < accept_prob:
+                gamma = proposal
+
+            accept_prob_sum += min(accept_prob.item(), 1.0)
+
+            if i > burn_in_steps:
+                yield torch.exp(gamma @ Q + mean)
+
+        mean_accept_prob = accept_prob_sum / num_steps
+        print(mean_accept_prob)
diff --git a/stpy/point_processes/log_link_rate_estimator.py b/stpy/point_processes/log_link_rate_estimator.py
index 006470c..9ee1086 100644
--- a/stpy/point_processes/log_link_rate_estimator.py
+++ b/stpy/point_processes/log_link_rate_estimator.py
@@ -3,215 +3,267 @@
 import torch
 from scipy.optimize import minimize
 
-from stpy.point_processes.poisson.link_fun_rate_estimator import PermanentalProcessRateEstimator
+from stpy.point_processes.poisson.link_fun_rate_estimator import (
+    PermanentalProcessRateEstimator,
+)
 
 
 class LogGaussProcessRateEstimator(PermanentalProcessRateEstimator):
 
-	def __init__(self, *args, **kwargs):
-		super().__init__(*args, **kwargs)
-		self.discretization = 64
-
-		self.nodes = None
-		self.weights = None
-
-	def load_data(self, data):
-		super().load_data(data)
-
-		if len(data) > 1:
-			weights_arr = []
-			nodes_arr = []
-			for sample in data:
-				(S, obs, dt) = sample
-				weights, nodes = S.return_legendre_discretization(self.discretization)
-				nodes_arr.append(nodes)
-				weights_arr.append(weights * dt)
-
-			self.nodes = self.packing.embed(torch.cat(nodes_arr))
-			self.weights = torch.cat(weights_arr)
-
-	def add_data_point(self, new_data):
-		super().add_data_point(new_data)
-
-		S, obs, dt = new_data
-		weights, nodes = S.return_legendre_discretization(self.discretization)
-
-		if self.nodes is None:
-			self.nodes = self.packing.embed(nodes)
-			self.weights = weights * dt
-		else:
-			self.nodes = torch.cat((self.nodes, self.packing.embed(nodes)))
-			self.weights = torch.cat((self.weights, weights * dt))
-
-	def sample(self, verbose=False, steps=100, stepsize=None):
-
-		sigmoid_der_1 = lambda x: torch.exp(-x) / (torch.exp(-x) + 1) ** 2
-
-		if self.data is None:
-			self.sampled_theta = torch.zeros(self.get_m()).double().view(-1, 1)
-			return None
-
-		if self.observations is not None:
-			weights = self.weights
-			nodes = self.nodes
-
-			nabla = lambda theta: -torch.sum(
-				torch.diag(
-					sigmoid_der_1(self.observations @ theta).view(-1) / self.sigmoid(self.observations @ theta).view(
-						-1)) @ self.observations, dim=0).view(-1, 1) \
-								  + self.B * torch.sum(
-				torch.diag(weights.view(-1) * sigmoid_der_1(nodes @ theta).view(-1)) @ nodes, dim=0).view(-1,
-																										  1) + self.s * theta.view(
-				-1, 1)
-		else:
-			weights = self.weights
-			nodes = self.nodes
-			nabla = lambda theta: self.B * torch.sum(
-				torch.diag(weights.view(-1) * sigmoid_der_1(nodes @ theta).view(-1)) @ nodes, dim=0).view(-1,
-																										  1) + self.s * theta.view(
-				-1, 1)
-
-		# theta = self.rate.view(-1, 1)*np.nan
-
-		#	while torch.sum(torch.isnan(theta))>0:
-
-		theta = self.rate.view(-1, 1)
-		for k in range(steps):
-
-			W = self.construct_covariance_matrix_laplace(theta.view(-1))
-			L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-4))
-			eta = 0.5 / (L + 1)
-			print(eta)
-			s = torch.randn(size=(self.get_m(), 1)).double()
-			theta = theta - eta * nabla(theta) + np.sqrt(2 * eta) * s
-
-			if verbose == True:
-				print("Iter:", k, theta.T)
-
-		self.sampled_theta = theta
-		return None
-
-	def sample_value(self, S):
-		"""
-		Given a pre-sampled value evaluate certain portions of the domain S
-		:param S:
-		:return:
-		"""
-		weights, nodes = S.return_legendre_discretization(64)
-		Phi = self.packing.embed(nodes)
-		map_vals = torch.sum(weights * self.B * self.sigmoid(Phi @ self.sampled_theta))
-		return map_vals
-
-	def sample_path(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return self.sigmoid(self.packing.embed(xtest) @ self.sampled_theta) * self.B
-
-	def penalized_likelihood(self, threads=4):
-		sigmoid = lambda x: 1. / (1. + np.exp(-x))
-		weights = self.weights.numpy()
-		nodes = self.nodes.numpy()
-		# times = self.times.numpy()
-		# times = self.times.numpy()
-
-		if self.observations is not None:
-			observations = self.observations.numpy()
-			# loss = lambda theta: float(-np.sum(np.log(self.B * sigmoid(observations @ theta))) \
-			# + self.B * np.einsum('i,i',(weights ,sigmoid(nodes @ theta))) + self.s * np.sum(theta ** 2))
-			loss = lambda theta: float(-np.sum(np.log(self.B * sigmoid(observations @ theta))) \
-									   + self.B * np.sum(
-				weights * sigmoid(nodes @ theta).reshape(-1)) + 0.5 * self.s * np.sum(theta ** 2))
-
-		else:
-			loss = lambda theta: float(
-				+self.B * np.sum(weights * sigmoid(theta @ nodes.T)) + self.s * np.sum(theta ** 2))
-
-		theta = np.zeros(self.get_m())
-		res = minimize(loss, theta, jac=None, method='L-BFGS-B', options={'maxcor': 20, 'iprint': -1,
-																		  'maxfun': 150000, 'maxls': 50, 'ftol': 1e-12,
-																		  'eps': 1e-12, 'gtol': 1e-8})
-
-		self.rate = torch.from_numpy(res.x)
-
-		return self.rate
-
-	def construct_covariance_matrix_laplace(self, theta=None):
-		sigmoid_der_1 = lambda x: np.exp(-x) / (np.exp(-x) + 1) ** 2
-		sigmoid_der_2 = lambda x: 2 * np.exp(-2 * x) / (np.exp(-x) + 1) ** 3 - np.exp(-x) / (np.exp(-x) + 1) ** 2
-		sigmoid = lambda x: 1. / (1. + np.exp(-x))
-
-		W = torch.zeros(size=(self.get_m(), self.get_m())).double()
-
-		if self.feedback == "count-record":
-			if self.observations is not None:
-				if theta is None:
-					input = (self.observations @ self.rate).view(-1)
-				else:
-					input = (self.observations @ theta).view(-1)
-				scales = (sigmoid_der_1(input) ** 2 + sigmoid_der_2(input) * sigmoid(input)) / (sigmoid(input) ** 2)
-				W = torch.einsum('ij,i,ik->jk', self.observations, scales, self.observations)
-
-			if self.nodes is not None:
-				if theta is None:
-					scales = self.B * sigmoid_der_2(self.nodes @ self.rate) * self.weights
-				else:
-					scales = self.B * sigmoid_der_2(self.nodes @ theta) * self.weights
-				Z = torch.einsum('ij,i,ik->jk', self.nodes, scales, self.nodes)
-				W = W + Z
-
-		else:
-			raise AssertionError("Not implemented.")
-		return W + torch.eye(self.get_m()).double() * self.s
-
-	def mean_var_laplace_set(self, S, dt, beta=2.):
-		if self.approx_fit == False:
-			self.W = self.construct_covariance_matrix_laplace()
-			self.approx_fit = True
-			self.W_inv_approx = torch.pinverse(self.W)
-		return self.mean_std_per_action(S, self.W, dt, beta)
-
-	def mean_std_per_action(self, S, W, dt, beta):
-		weights, nodes = S.return_legendre_discretization(64)
-		Phi = self.packing.embed(nodes)
-		vars = torch.einsum('ij,jk,ki->i', Phi, self.W_inv_approx, Phi.T)
-
-		vars = (vars + np.abs(vars)) / 2
-		map_vals = weights * self.B * self.sigmoid(Phi @ self.rate)
-		lcb_vals = weights * self.B * self.sigmoid(Phi @ self.rate - beta * np.sqrt(vars))
-		ucb_vals = weights * self.B * self.sigmoid(Phi @ self.rate + beta * np.sqrt(vars))
-
-		return dt * torch.sum(map_vals), dt * torch.sum(ucb_vals), torch, sum(lcb_vals) * dt
-
-	def map_lcb_ucb(self, S, n, beta=2.0, delta=0.01):
-		"""
-		Calculate exact confidence using laplace approximation on a whole set domain
-		:param S: set
-		:param n: discretization
-		:param beta: beta
-		:return:
-		"""
-
-		xtest = S.return_discretization(n)
-		if self.data is None:
-			return self.b + 0 * xtest[:, 0].view(-1, 1), self.b + 0 * xtest[:, 0].view(-1, 1), self.B + 0 * xtest[:,
-																											0].view(-1,
-																													1)
-
-		Phi = self.packing.embed(xtest)
-		map = self.B * self.sigmoid(Phi @ self.rate)
-
-		if self.uncertainty == "laplace":
-			W = self.construct_covariance_matrix_laplace()
-		W_inv = torch.pinverse(W)
-
-		vars = torch.einsum('ij,jk,ki->i', Phi, W_inv, Phi.T)
-		lcb = self.B * self.sigmoid(Phi @ self.rate - beta * np.sqrt(vars))
-		ucb = self.B * self.sigmoid(Phi @ self.rate + beta * np.sqrt(vars))
-
-		return map, lcb, ucb
-
-	def sigmoid(self, x):
-		return 1. / (1. + torch.exp(-x))
-
-	def mean_rate(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return self.sigmoid(self.packing.embed(xtest) @ self.rate) * self.B
+    def __init__(self, *args, **kwargs):  # forwards every argument to the parent class
+        super().__init__(*args, **kwargs)
+        self.discretization = 64  # argument given to S.return_legendre_discretization below
+
+        self.nodes = None  # embedded nodes; filled by load_data / add_data_point
+        self.weights = None  # weights multiplied by dt; filled at the same places
+
+    def load_data(self, data):  # data: sequence of (S, obs, dt) triples
+        super().load_data(data)
+
+        if len(data) > 1:  # NOTE(review): one-element data leaves nodes/weights untouched - confirm intended
+            weights_arr = []
+            nodes_arr = []
+            for sample in data:
+                (S, obs, dt) = sample  # obs is not used in this method
+                weights, nodes = S.return_legendre_discretization(self.discretization)
+                nodes_arr.append(nodes)
+                weights_arr.append(weights * dt)  # weights of each set are scaled by its dt
+
+            self.nodes = self.packing.embed(torch.cat(nodes_arr))  # embed all nodes in one call
+            self.weights = torch.cat(weights_arr)
+
+    def add_data_point(self, new_data):  # new_data: a single (S, obs, dt) triple
+        super().add_data_point(new_data)
+
+        S, obs, dt = new_data  # obs is not used in this method
+        weights, nodes = S.return_legendre_discretization(self.discretization)
+
+        if self.nodes is None:  # nothing stored yet: start the buffers
+            self.nodes = self.packing.embed(nodes)
+            self.weights = weights * dt
+        else:  # append the new embedded nodes and dt-scaled weights
+            self.nodes = torch.cat((self.nodes, self.packing.embed(nodes)))
+            self.weights = torch.cat((self.weights, weights * dt))
+
+    def sample(self, verbose=False, steps=100, stepsize=None):  # stepsize is not read; result goes to self.sampled_theta
+
+        sigmoid_der_1 = lambda x: torch.exp(-x) / (torch.exp(-x) + 1) ** 2  # first derivative of 1/(1+exp(-x))
+
+        if self.data is None:  # no data: store a zero column of length get_m() and stop
+            self.sampled_theta = torch.zeros(self.get_m()).double().view(-1, 1)
+            return None
+
+        if self.observations is not None:
+            weights = self.weights
+            nodes = self.nodes
+
+            nabla = (  # presumably the gradient of the penalized objective - TODO confirm
+                lambda theta: -torch.sum(  # observation term: rows scaled by sigmoid' / sigmoid
+                    torch.diag(
+                        sigmoid_der_1(self.observations @ theta).view(-1)
+                        / self.sigmoid(self.observations @ theta).view(-1)
+                    )
+                    @ self.observations,
+                    dim=0,
+                ).view(-1, 1)
+                + self.B
+                * torch.sum(  # node term: rows scaled by weights * sigmoid'
+                    torch.diag(weights.view(-1) * sigmoid_der_1(nodes @ theta).view(-1))
+                    @ nodes,
+                    dim=0,
+                ).view(-1, 1)
+                + self.s * theta.view(-1, 1)  # term linear in theta, scaled by self.s
+            )
+        else:  # no observations: only the node term and the self.s term remain
+            weights = self.weights
+            nodes = self.nodes
+            nabla = lambda theta: self.B * torch.sum(
+                torch.diag(weights.view(-1) * sigmoid_der_1(nodes @ theta).view(-1))
+                @ nodes,
+                dim=0,
+            ).view(-1, 1) + self.s * theta.view(-1, 1)
+
+        # theta = self.rate.view(-1, 1)*np.nan
+
+        # while torch.sum(torch.isnan(theta))>0:
+
+        theta = self.rate.view(-1, 1)  # iteration starts from the current self.rate
+        for k in range(steps):
+
+            W = self.construct_covariance_matrix_laplace(theta.view(-1))
+            L = float(  # largest-magnitude eigenvalue of W (k=1, which="LM")
+                scipy.sparse.linalg.eigsh(
+                    W.numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-4
+                )
+            )
+            eta = 0.5 / (L + 1)  # step size recomputed every iteration
+            print(eta)  # printed regardless of verbose
+            s = torch.randn(size=(self.get_m(), 1)).double()  # standard normal noise
+            theta = theta - eta * nabla(theta) + np.sqrt(2 * eta) * s  # gradient step plus scaled noise
+
+            if verbose == True:
+                print("Iter:", k, theta.T)
+
+        self.sampled_theta = theta  # read by sample_value and sample_path
+        return None
+
+    def sample_value(self, S):
+        """
+        Given a pre-sampled value evaluate certain portions of the domain S
+        :param S: set providing return_legendre_discretization
+        :return: sum over nodes of weights * B * sigmoid(Phi @ self.sampled_theta)
+        """
+        weights, nodes = S.return_legendre_discretization(64)  # 64 is hard-coded here, not self.discretization
+        Phi = self.packing.embed(nodes)
+        map_vals = torch.sum(weights * self.B * self.sigmoid(Phi @ self.sampled_theta))
+        return map_vals
+
+    def sample_path(self, S, n=128):  # evaluates the stored sample on S.return_discretization(n)
+        xtest = S.return_discretization(n)
+        return self.sigmoid(self.packing.embed(xtest) @ self.sampled_theta) * self.B  # B * sigmoid(Phi @ sampled_theta)
+
+    def penalized_likelihood(self, threads=4):  # threads is not read; fits and returns self.rate
+        sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))  # numpy version, used inside the loss
+        weights = self.weights.numpy()
+        nodes = self.nodes.numpy()
+        # times = self.times.numpy()
+        # times = self.times.numpy()
+
+        if self.observations is not None:
+            observations = self.observations.numpy()
+            # loss = lambda theta: float(-np.sum(np.log(self.B * sigmoid(observations @ theta))) \
+            # + self.B * np.einsum('i,i',(weights ,sigmoid(nodes @ theta))) + self.s * np.sum(theta ** 2))
+            loss = lambda theta: float(
+                -np.sum(np.log(self.B * sigmoid(observations @ theta)))  # minus log of B*sigmoid at observations
+                + self.B * np.sum(weights * sigmoid(nodes @ theta).reshape(-1))  # weighted sum over nodes
+                + 0.5 * self.s * np.sum(theta**2)  # squared-norm penalty with factor 0.5 * s
+            )
+
+        else:
+            loss = lambda theta: float(
+                +self.B * np.sum(weights * sigmoid(theta @ nodes.T))
+                + self.s * np.sum(theta**2)  # NOTE(review): factor is s here but 0.5 * s above - confirm
+            )
+
+        theta = np.zeros(self.get_m())  # optimisation starts at the zero vector
+        res = minimize(
+            loss,
+            theta,
+            jac=None,  # no analytic gradient is supplied to the optimiser
+            method="L-BFGS-B",
+            options={
+                "maxcor": 20,
+                "iprint": -1,
+                "maxfun": 150000,
+                "maxls": 50,
+                "ftol": 1e-12,
+                "eps": 1e-12,
+                "gtol": 1e-8,
+            },
+        )
+
+        self.rate = torch.from_numpy(res.x)  # the minimiser is stored as a torch tensor
+
+        return self.rate
+
+    def construct_covariance_matrix_laplace(self, theta=None):  # theta=None means: evaluate at self.rate
+        sigmoid_der_1 = lambda x: np.exp(-x) / (np.exp(-x) + 1) ** 2  # first derivative of the sigmoid
+        sigmoid_der_2 = (  # second derivative of the sigmoid
+            lambda x: 2 * np.exp(-2 * x) / (np.exp(-x) + 1) ** 3
+            - np.exp(-x) / (np.exp(-x) + 1) ** 2
+        )
+        sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
+
+        W = torch.zeros(size=(self.get_m(), self.get_m())).double()  # m x m accumulator
+
+        if self.feedback == "count-record":
+            if self.observations is not None:
+                if theta is None:
+                    input = (self.observations @ self.rate).view(-1)
+                else:
+                    input = (self.observations @ theta).view(-1)
+                scales = (  # NOTE(review): d2/dx2 of -log(sigmoid) has a minus between the two terms - confirm sign
+                    sigmoid_der_1(input) ** 2 + sigmoid_der_2(input) * sigmoid(input)
+                ) / (sigmoid(input) ** 2)
+                W = torch.einsum(  # sum_i scales[i] * obs[i]^T obs[i]
+                    "ij,i,ik->jk", self.observations, scales, self.observations
+                )
+
+            if self.nodes is not None:
+                if theta is None:
+                    scales = (
+                        self.B * sigmoid_der_2(self.nodes @ self.rate) * self.weights
+                    )
+                else:
+                    scales = self.B * sigmoid_der_2(self.nodes @ theta) * self.weights
+                Z = torch.einsum("ij,i,ik->jk", self.nodes, scales, self.nodes)  # same weighted outer-product sum over nodes
+                W = W + Z
+
+        else:  # any feedback other than "count-record" is rejected
+            raise AssertionError("Not implemented.")
+        return W + torch.eye(self.get_m()).double() * self.s  # add s times the identity
+
+    def mean_var_laplace_set(self, S, dt, beta=2.0):  # returns whatever mean_std_per_action returns
+        if self.approx_fit == False:  # build W and its pseudo-inverse once, then reuse
+            self.W = self.construct_covariance_matrix_laplace()
+            self.approx_fit = True
+            self.W_inv_approx = torch.pinverse(self.W)  # read inside mean_std_per_action
+        return self.mean_std_per_action(S, self.W, dt, beta)
+
+    def mean_std_per_action(self, S, W, dt, beta):
+        """Return (map, ucb, lcb) summed over the nodes of S and multiplied by dt.
+
+        The W argument is not read; the cached self.W_inv_approx is used instead.
+        """
+        weights, nodes = S.return_legendre_discretization(64)
+        Phi = self.packing.embed(nodes)
+        vars = torch.einsum("ij,jk,ki->i", Phi, self.W_inv_approx, Phi.T)
+
+        vars = (vars + np.abs(vars)) / 2  # equals max(vars, 0): drops negative values
+        mean = Phi @ self.rate
+        width = beta * np.sqrt(vars)
+        map_vals = weights * self.B * self.sigmoid(mean)
+        lcb_vals = weights * self.B * self.sigmoid(mean - width)
+        ucb_vals = weights * self.B * self.sigmoid(mean + width)
+
+        # The last entry used to read `torch, sum(lcb_vals) * dt`: a stray comma
+        # that returned the torch module itself inside a 4-tuple.
+        lcb = dt * torch.sum(lcb_vals)
+        return dt * torch.sum(map_vals), dt * torch.sum(ucb_vals), lcb
+
+    def map_lcb_ucb(self, S, n, beta=2.0, delta=0.01):
+        """
+        Calculate confidence bounds using laplace approximation on a whole set domain
+        :param S: set, discretized with S.return_discretization(n)
+        :param n: discretization
+        :param beta: multiplier of the standard deviation inside the sigmoid
+        :param delta: not read by this method
+        :return: tuple (map, lcb, ucb)
+        :raises NotImplementedError: if self.uncertainty is not "laplace"
+        """
+        xtest = S.return_discretization(n)
+        if self.data is None:
+            # without data return constant columns: min_intensity, min_intensity, B
+            zero = 0 * xtest[:, 0].view(-1, 1)
+            return self.min_intensity + zero, self.min_intensity + zero, self.B + zero
+
+        if self.uncertainty != "laplace":
+            # previously W stayed unbound here and pinverse(W) raised UnboundLocalError
+            raise NotImplementedError("Only laplace uncertainty is implemented.")
+
+        Phi = self.packing.embed(xtest)
+        map = self.B * self.sigmoid(Phi @ self.rate)
+        W_inv = torch.pinverse(self.construct_covariance_matrix_laplace())
+
+        vars = torch.einsum("ij,jk,ki->i", Phi, W_inv, Phi.T)
+        lcb = self.B * self.sigmoid(Phi @ self.rate - beta * np.sqrt(vars))
+        ucb = self.B * self.sigmoid(Phi @ self.rate + beta * np.sqrt(vars))
+
+        return map, lcb, ucb
+
+    def sigmoid(self, x):  # elementwise logistic function on a torch tensor
+        return 1.0 / (1.0 + torch.exp(-x))
+
+    def mean_rate(self, S, n=128):  # evaluates the fitted rate on S.return_discretization(n)
+        xtest = S.return_discretization(n)
+        return self.sigmoid(self.packing.embed(xtest) @ self.rate) * self.B  # B * sigmoid(Phi @ rate)
diff --git a/stpy/point_processes/loglinear_estimator.py b/stpy/point_processes/loglinear_estimator.py
index 221880a..179dc79 100644
--- a/stpy/point_processes/loglinear_estimator.py
+++ b/stpy/point_processes/loglinear_estimator.py
@@ -8,179 +8,207 @@
 import matplotlib.pyplot as plt
 from stpy.embeddings.embedding import HermiteEmbedding
 import scipy.integrate as integrate
-from stpy.helpers.ellipsoid_algorithms import maximize_quadratic_on_ellipse, minimize_quadratic_on_ellipse
+from stpy.helpers.ellipsoid_algorithms import (
+    maximize_quadratic_on_ellipse,
+    minimize_quadratic_on_ellipse,
+)
 from stpy.point_processes.poisson import PoissonPointProcess
 from stpy.point_processes.poisson_rate_estimator import PositiveRateEstimator
 from stpy.borel_set import BorelSet, HierarchicalBorelSets
 from stpy.kernels import KernelFunction
 
-class LogLinearRateEstimator(PositiveRateEstimator):
-
-	def __init__(self,*args,**kwargs):
-		super().__init__(*args,**kwargs)
-
-	def least_squares_weighted(self, threads=0):
-		theta = cp.Variable(self.get_m())
-
-		mask = self.bucketized_counts.clone().numpy() > 0
-
-		observations = self.total_bucketized_obs[mask].clone().numpy()
-		phis = self.varphis[mask, :].clone().numpy()
-		tau = self.total_bucketized_time.clone().numpy()
-
-		variances = self.variances.view(-1).clone().numpy()
-
-		for i in range(variances.shape[0]):
-			if mask[i] > 0:
-				variances[i] = variances[i] * tau[i] * self.variance_correction(variances[i] * tau[i])
-
-		selected_variances = variances[mask]
-		print (np.log(observations))
-		print (selected_variances)
-		objective = cp.Minimize(
-			cp.sum_squares((phis @ theta) - np.log(observations)/tau[mask])	)#+ self.s * cp.norm2(theta))
-
-		prob = cp.Problem(objective)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=True,
-				   mosek_params={mosek.iparam.num_threads: threads})
-
-		self.rate = torch.from_numpy(theta.value)
-		print (self.rate)
-		return self.rate
-
 
-	def mean_var_reg_set(self, S, dt=1., beta=2.):
-		if self.approx_fit == False:
-			self.W = self.construct_covariance_matrix_regression()
-			self.approx_fit = True
-
-		map = 0
-		lcb = 0
-		ucb = 0
-		for set in self.basic_sets:
-			if S.inside(set):
-				x = self.packing.integral(set).view(-1,1)
-				lcb = lcb +torch.exp(dt*(x@self.rate - beta*np.sqrt(x.T@self.W_inv@x) ))
-				ucb = ucb + torch.exp(dt*(x@self.rate + beta*np.sqrt(x.T@self.W_inv@x)))
-				map = map + torch.exp(dt*x@self.rate)
-		return map,ucb, lcb
-
-	def fit_ellipsoid_approx(self):
-		self.W =self.construct_covariance_matrix_regression()
-		self.W_inv = torch.pinverse(self.W)
-
-	# def map_lcb_ucb_approx_action(self, S, dt=1., beta=2.):
-	# 	phi = self.packing.integral(S) * dt
-	# 	map = phi @ self.rate
-	# 	ucb = map + beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
-	# 	ucb = np.minimum(ucb, self.B * S.volume() * dt)
-	#
-	# 	lcb = map - beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
-	# 	lcb = np.maximum(lcb, self.b * S.volume() * dt)
-	# 	return map, lcb, ucb
-
-
-	def construct_covariance_matrix_regression(self):
-
-		W = torch.zeros(size=(self.get_m(), self.get_m())).double()
-
-		if self.data is not None:
-			variances = self.variances
-
-			if self.feedback == "count-record":
-				mask = self.bucketized_counts > 0
-				tau = self.total_bucketized_time
-				for index_o, o in enumerate(self.bucketized_obs):
-					n = mask[index_o]
-					if n > 0:
-						A = self.varphis[index_o, :].view(-1, 1) @ self.varphis[index_o, :].view(1, -1) * tau[index_o]
-						W = W + A / (variances[index_o])
-
-			elif self.feedback == "histogram":
-
-				for datapoint in self.data:
-					(S, obs, dt) = datapoint
-					varphi = self.packing.integral(S) * dt
-					variance = varphi@self.rate
-					variance = variance
-					A = varphi.view(-1, 1) @ varphi.view(1, -1)
-					W = W + A / variance
-
-		return W + torch.eye(self.get_m()).double() * self.s
+class LogLinearRateEstimator(PositiveRateEstimator):
 
-	def mean_set(self, S, dt=1.):
-		mu = 0
-		for set in self.basic_sets:
-			if S.inside(set):
-				mu = mu + torch.exp(dt*self.packing.integral(set)@self.rate)
-		return mu
+    def __init__(self, *args, **kwargs):  # adds no state; forwards everything to the parent
+        super().__init__(*args, **kwargs)
+
+    def least_squares_weighted(self, threads=0):  # fits self.rate with cvxpy + MOSEK and returns it
+        theta = cp.Variable(self.get_m())
+
+        mask = self.bucketized_counts.clone().numpy() > 0  # buckets with a positive count
+
+        observations = self.total_bucketized_obs[mask].clone().numpy()
+        phis = self.varphis[mask, :].clone().numpy()
+        tau = self.total_bucketized_time.clone().numpy()
+
+        variances = self.variances.view(-1).clone().numpy()  # working copy; self.variances is not modified
+
+        for i in range(variances.shape[0]):
+            if mask[i] > 0:  # rescale only the selected buckets
+                variances[i] = (
+                    variances[i]
+                    * tau[i]
+                    * self.variance_correction(variances[i] * tau[i])
+                )
+
+        selected_variances = variances[mask]  # only printed below; not part of the objective
+        print(np.log(observations))
+        print(selected_variances)
+        objective = cp.Minimize(  # unweighted squared error against log(observations) / tau
+            cp.sum_squares((phis @ theta) - np.log(observations) / tau[mask])
+        )  # + self.s * cp.norm2(theta))
+
+        prob = cp.Problem(objective)  # no constraints
+
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=True,
+            mosek_params={mosek.iparam.num_threads: threads},  # threads is handed to MOSEK
+        )
+
+        self.rate = torch.from_numpy(theta.value)
+        print(self.rate)
+        return self.rate
+
+    def mean_var_reg_set(self, S, dt=1.0, beta=2.0):
+        """Return (map, ucb, lcb) summed over the basic sets selected by S.inside."""
+        if self.approx_fit == False:
+            self.W = self.construct_covariance_matrix_regression()
+            # W_inv is read below but was only ever set by fit_ellipsoid_approx
+            self.W_inv = torch.pinverse(self.W)
+            self.approx_fit = True
+
+        map = lcb = ucb = 0
+        for set in self.basic_sets:
+            if S.inside(set):  # skip basic sets for which S.inside(set) is false
+                x = self.packing.integral(set).view(-1, 1)
+                mean = x @ self.rate
+                width = beta * np.sqrt(x.T @ self.W_inv @ x)
+                lcb = lcb + torch.exp(dt * (mean - width))
+                ucb = ucb + torch.exp(dt * (mean + width))
+                map = map + torch.exp(dt * x @ self.rate)
+        # note the order: the upper bound comes before the lower bound
+        return map, ucb, lcb
+
+    def fit_ellipsoid_approx(self):  # stores the regression matrix W and its pseudo-inverse
+        self.W = self.construct_covariance_matrix_regression()
+        self.W_inv = torch.pinverse(self.W)
+
+    # def map_lcb_ucb_approx_action(self, S, dt=1., beta=2.):
+    # 	phi = self.packing.integral(S) * dt
+    # 	map = phi @ self.rate
+    # 	ucb = map + beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
+    # 	ucb = np.minimum(ucb, self.B * S.volume() * dt)
+    #
+    # 	lcb = map - beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
+    # 	lcb = np.maximum(lcb, self.b * S.volume() * dt)
+    # 	return map, lcb, ucb
+
+    def construct_covariance_matrix_regression(self):  # returns an m x m matrix, m = self.get_m()
+
+        W = torch.zeros(size=(self.get_m(), self.get_m())).double()
+
+        if self.data is not None:  # with no data only s * identity is returned
+            variances = self.variances
+
+            if self.feedback == "count-record":
+                mask = self.bucketized_counts > 0
+                tau = self.total_bucketized_time
+                for index_o, o in enumerate(self.bucketized_obs):  # o itself is unused, only the index
+                    n = mask[index_o]
+                    if n > 0:  # bucket has a positive count
+                        A = (  # (varphi varphi^T) * tau: @ and * share precedence, evaluated left to right
+                            self.varphis[index_o, :].view(-1, 1)
+                            @ self.varphis[index_o, :].view(1, -1)
+                            * tau[index_o]
+                        )
+                        W = W + A / (variances[index_o])
+
+            elif self.feedback == "histogram":
+
+                for datapoint in self.data:
+                    (S, obs, dt) = datapoint  # obs is not used here
+                    varphi = self.packing.integral(S) * dt
+                    variance = varphi @ self.rate  # uses the current estimate self.rate
+                    variance = variance  # no-op self-assignment
+                    A = varphi.view(-1, 1) @ varphi.view(1, -1)  # outer product of varphi with itself
+                    W = W + A / variance
+
+        return W + torch.eye(self.get_m()).double() * self.s  # add s times the identity
+
+    def mean_set(self, S, dt=1.0):  # sums exp(dt * integral(set) @ rate) over selected basic sets
+        mu = 0  # stays the int 0 if no basic set passes the test below
+        for set in self.basic_sets:
+            if S.inside(set):
+                mu = mu + torch.exp(dt * self.packing.integral(set) @ self.rate)
+        return mu
 
 
 if __name__ == "__main__":
-	torch.manual_seed(2)
-	np.random.seed(2)
-	d = 1
-	gamma = 0.1
-	n = 64
-	B = 4.
-	b = 0.1
-
-	process = PoissonPointProcess(d=1, B=B, b=b)
-	Sets = []
-	levels = 5
-	hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
-	Sets = hierarchical_structure.get_all_sets()
-
-	D = BorelSet(1, bounds=torch.Tensor([[-1., 1.]]).double())
-
-	m = 128
-	k = KernelFunction(gamma = gamma)
-	estimator = LogLinearRateEstimator(process, hierarchical_structure,
-									   kernel_object=k, B=B, m=m, d=d, estimator='least-sq')
-
-	min_vol, max_vol = estimator.get_min_max()
-
-	dt = 1. / (b * min_vol)
-	dt = dt * 2
-
-	print("Suggested dt:", dt)
-	c = ['k', 'r', 'b', 'y', 'g', 'orange', 'brown', 'purple'] + ['k' for i in range(500)]
-
-	no_sets = len(Sets)
-	no_samples = 0
-	data = []
-	samples = []
-	repeats = 2
-
-	for i in range(no_samples):
-		j = np.random.randint(0, no_sets, 1)
-		S = Sets[j[0]]
-		for _ in range(repeats):
-			sample = process.sample_discretized(S, dt)
-			samples.append(sample)
-			data.append((S, sample, dt))
-
-	sample_D = process.sample_discretized(D, dt)
-	samples.append(sample_D)
-	no_samples = repeats * no_samples + 1
-	data.append((D, sample_D, dt))
-
-	estimator.load_data(data)
-
-	xtest = D.return_discretization(n=n)
-
-	# likelihood based
-	estimator.fit_gp()
-
-	for set in estimator.basic_sets:
-		x = np.linspace(set.bounds[0,0],set.bounds[0,1],2)
-		val = estimator.mean_set(set)
-		plt.plot(x,x*0+float(val),'b-o')
-		vol = process.rate_volume(set)
-		plt.plot(x, x * 0 + float(vol), '-o',color = 'orange')
-	for j in range(no_samples):
-		if samples[j] is not None:
-			plt.plot(samples[j], samples[j] * 0, 'o', color=c[j])
-
-	process.visualize(D, samples=0, n=n, dt=1.)
\ No newline at end of file
+    torch.manual_seed(2)  # fixed seeds for torch and numpy
+    np.random.seed(2)
+    d = 1
+    gamma = 0.1
+    n = 64
+    B = 4.0
+    b = 0.1
+
+    process = PoissonPointProcess(d=1, B=B, b=b)
+    Sets = []  # replaced two statements below by get_all_sets()
+    levels = 5
+    hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
+    Sets = hierarchical_structure.get_all_sets()
+
+    D = BorelSet(1, bounds=torch.tensor([[-1.0, 1.0]]).double())  # the whole interval [-1, 1]
+
+    m = 128
+    k = KernelFunction(gamma=gamma)
+    estimator = LogLinearRateEstimator(
+        process,
+        hierarchical_structure,
+        kernel_object=k,
+        B=B,
+        m=m,
+        d=d,
+        estimator="least-sq",
+    )
+
+    min_vol, max_vol = estimator.get_min_max()
+
+    dt = 1.0 / (b * min_vol)
+    dt = dt * 2  # twice 1 / (b * min_vol)
+
+    print("Suggested dt:", dt)
+    c = ["k", "r", "b", "y", "g", "orange", "brown", "purple"] + [  # plot colours, indexed by sample number
+        "k" for i in range(500)
+    ]
+
+    no_sets = len(Sets)
+    no_samples = 0  # with 0 the sampling loop below does not run
+    data = []
+    samples = []
+    repeats = 2
+
+    for i in range(no_samples):
+        j = np.random.randint(0, no_sets, 1)  # pick one set index at random
+        S = Sets[j[0]]
+        for _ in range(repeats):
+            sample = process.sample_discretized(S, dt)
+            samples.append(sample)
+            data.append((S, sample, dt))
+
+    sample_D = process.sample_discretized(D, dt)  # one sample over the whole domain D
+    samples.append(sample_D)
+    no_samples = repeats * no_samples + 1  # now counts the entries of samples
+    data.append((D, sample_D, dt))
+
+    estimator.load_data(data)
+
+    xtest = D.return_discretization(n=n)  # not used afterwards in this script
+
+    # likelihood based
+    estimator.fit_gp()
+
+    for set in estimator.basic_sets:  # per basic set: estimator.mean_set vs process.rate_volume
+        x = np.linspace(set.bounds[0, 0], set.bounds[0, 1], 2)  # the two endpoints of the set
+        val = estimator.mean_set(set)
+        plt.plot(x, x * 0 + float(val), "b-o")  # estimate as a horizontal blue segment
+        vol = process.rate_volume(set)
+        plt.plot(x, x * 0 + float(vol), "-o", color="orange")  # process.rate_volume in orange
+    for j in range(no_samples):
+        if samples[j] is not None:
+            plt.plot(samples[j], samples[j] * 0, "o", color=c[j])  # sampled points drawn at height 0
+
+    process.visualize(D, samples=0, n=n, dt=1.0)
diff --git a/stpy/point_processes/mbr_positive_estimator.py b/stpy/point_processes/mbr_positive_estimator.py
index ac753c5..ef0d29d 100644
--- a/stpy/point_processes/mbr_positive_estimator.py
+++ b/stpy/point_processes/mbr_positive_estimator.py
@@ -9,357 +9,428 @@
 import numpy as np
 import mosek
 
-class MBRPositiveEstimator(PermanentalProcessRateEstimator):
-
-	def __init__(self, *args, **kwargs):
-		super().__init__(*args,**kwargs)
-
-		if self.feedback == "count-record":
-			self.varLambdas_vec = torch.zeros( size = (self.varLambdas.size()[0],self.varLambdas.size()[1]*self.varLambdas.size()[2])).double()
-			for i in range(self.varLambdas.size()[0]):
-				self.varLambdas_vec[i,:] = self.varLambdas[i,:,:].reshape(-1)
-
-		self.approx_solver = True
-
-	def fit_gp(self, threads=4):
-		if self.data is not None:
-			super().fit_gp(threads=threads)
-		else:
-			self.rate = None
-
-	def mean_rate(self, S, n=128):
-		xtest = S.return_discretization(n)
-		emb = self.packing.embed(xtest)
-		mu = torch.einsum('ij,jk,ik->i',emb,self.rate,emb).view(-1,1)
-		return mu
 
-	def rate_value(self, x, dt=1):
-		emb = self.packing.embed(x)*dt
-		mu = torch.einsum('ij,jk,ik->i',emb,self.rate,emb).view(-1,1)
-		return mu
-
-	def mean_set(self,S,dt =1.):
-		if self.data is not None:
-			emb = self.product_integral(S) * dt
-			mu = torch.trace(emb@self.rate).view(1,1)
-		else:
-			mu = self.b*S.volume()
-		return mu
-
-	def penalized_likelihood(self, threads=4):
-		sumLambda = self.sumLambda.numpy()
-		Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
-
-		if self.observations is not None:
-			observations = self.observations.numpy()
-			# cost = cp.sum_squares(cp.diag(emb @ A @ emb.T) - y.view(-1).numpy()) / (self.s ** 2) + (self.lam) * cp.norm(A, "fro")
-			objective = -cp.sum(cp.log(observations @ Theta @ observations.T)) + \
-						cp.trace(sumLambda @ Theta) + self.s * cp.sum_squares(cp.vec(Theta))
-		else:
-			objective = cp.trace(sumLambda @ Theta) + self.s * cp.sum_squares(cp.vec(Theta))
-
-		# if self.get_m() == 2:
-		# 	# use Lorentz-cone special result
-		# 	constraints = [cp.SOC(Theta[0,0]+Theta[1,1],Theta[1,1]    )]
-		# else:
-		# 	constraints = [Theta >> 0]
-		constraints = []
-		prob = cp.Problem(cp.Minimize(objective), constraints)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: threads,
-					mosek.iparam.intpnt_solve_form:mosek.solveform.dual,
-								 mosek.dparam.intpnt_co_tol_pfeas:1e-3,
-								 mosek.dparam.intpnt_co_tol_dfeas:1e-3,
-								 mosek.dparam.intpnt_co_tol_rel_gap:1e-3})
-		self.rate = torch.from_numpy(Theta.value)
-		return self.rate
-
-
-
-	def penalized_likelihood_bins(self, threads=4):
-		Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
-
-
-		mask = self.bucketized_counts.clone().numpy() > 0
-		observations = self.total_bucketized_obs[mask].clone().numpy()
-		tau = self.total_bucketized_time[mask].clone().numpy()
-		varLambdas_vec = self.varLambdas_vec[mask,:].clone().numpy()
-
-
-		objective = -cp.sum(observations @ cp.log(cp.multiply(tau,varLambdas_vec @ cp.vec(Theta)) ) ) + \
-						cp.sum(cp.multiply(tau,varLambdas_vec @ cp.vec(Theta))) + self.s * cp.sum_squares(cp.vec(Theta))
-
-		constraints = [Theta >> 0]
-		prob = cp.Problem(cp.Minimize(objective), constraints)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: threads,
-					mosek.iparam.intpnt_solve_form:mosek.solveform.dual,
-								 mosek.dparam.intpnt_co_tol_pfeas:1e-3,
-								 mosek.dparam.intpnt_co_tol_dfeas:1e-3,
-								 mosek.dparam.intpnt_co_tol_rel_gap:1e-3})
-		self.rate = torch.from_numpy(Theta.value)
-		return self.rate
-
-
-	def least_squares_weighted(self,threads = 4 ):
+class MBRPositiveEstimator(PermanentalProcessRateEstimator):
 
-		if self.approx_fit == False:
-			self.bucketization()
-
-		Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
-
-		mask = self.bucketized_counts.clone().numpy() > 0
-		observations = self.total_bucketized_obs[mask].clone().numpy()
-		tau = self.total_bucketized_time.clone().numpy()
-
-		# varsumLambdas
-		varLambdas_vec = self.varLambdas_vec[mask,:].clone().numpy()
-
-		variances = self.variances.view(-1).clone().numpy()
-
-		for i in range(variances.shape[0]):
-			if mask[i] > 0:
-				variances[i] = variances[i] * tau[i]* self.variance_correction(variances[i] * tau[i])
-
-		selected_variances = variances[mask]
-
-
-		objective =	cp.sum_squares( (varLambdas_vec@cp.vec(Theta) +
-										- observations)/np.sqrt(selected_variances) )+ self.s*cp.sum_squares(cp.vec(Theta))/2
-		constraints = [Theta >> 0]
-		prob = cp.Problem(cp.Minimize(objective), constraints)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: threads,
-					mosek.iparam.intpnt_solve_form:mosek.solveform.dual,
-								 mosek.dparam.intpnt_co_tol_pfeas:1e-3,
-								 mosek.dparam.intpnt_co_tol_dfeas:1e-3,
-								 mosek.dparam.intpnt_co_tol_rel_gap:1e-3})
-
-		self.rate = torch.from_numpy(Theta.value)
-		return self.rate
-
-	def construct_covariance_matrix(self):
-		if self.estimator == "bins":
-			self.construct_covariance_matrix_bins()
-		elif self.estimator =="least-sq":
-			self.construct_covariance_matrix_regression()
-		else:
-			raise NotImplementedError("Covariance not implemented")
-
-	def construct_covariance_matrix_regression(self):
-		varLambdas = self.varLambdas_vec.clone()
-		variances = self.variances
-		mask = self.bucketized_counts > 0
-		tau = self.total_bucketized_time
-		W = torch.zeros(size=(self.get_m()**2, self.get_m()**2)).double()
-		I = torch.eye(self.get_m() ** 2).double()
-		W_inv = self.s * torch.eye(self.get_m() ** 2).double()
-
-		for index_o, o in enumerate(self.bucketized_obs):
-			n = mask[index_o]
-			if n > 0:
-				k = self.variance_correction(tau[index_o] * variances[index_o])
-				v = tau[index_o] / (variances[index_o] * k)
-
-				vec = varLambdas[index_o, :].view(-1, 1)
-				A =  vec @ vec.T
-				W = W + A * v
-				denom = 1. + v*vec.T@W_inv@vec
-				W_inv = W_inv @ (I - v* vec@(vec.T@W_inv)/denom )
-
-		self.W = W +  self.s * torch.eye(self.get_m() ** 2).double()
-		self.W_inv = W_inv
-		#self.W_cholesky = torch.cholesky(self.W, upper=True)
-		return self.W
-
-
-	def construct_covariance_matrix_bins(self):
-		self.construct_covariance_matrix_regression()
-
-	def mean_var_reg_set(self,S, dt=1., beta=2., lcb_compute = False):
-
-		if self.data is None:
-			return S.volume()*self.b,S.volume()*self.B,S.volume()*self.b
-
-		if self.approx_fit == False:
-			self.W = self.construct_covariance_matrix()
-			self.approx_fit = True
-
-		map = None
-		lcb = None
-
-		if 	self.approx_solver == True:
-			ucb = self.band_no_opt(S, beta=beta, dt=dt, maximization=True)
-			if lcb_compute == True:
-				lcb = self.band_no_opt(S, beta=beta, dt=dt, maximization=False)
-		else:
-			ucb = self.band(S, beta=beta,dt=dt, maximization=True)
-			if lcb_compute == True:
-				lcb = self.band(S, beta=beta,dt=dt, maximization=False)
-
-		return map, ucb, lcb
-
-	def mean_var_bins_set(self,S, dt=1., beta=2., lcb_compute = False):
-		return self.mean_var_reg_set(S, dt=dt, beta=beta, lcb_compute = lcb_compute)
-
-	def band(self, S, beta=2.,dt=1., maximization=True):
-		emb = self.product_integral(S) * dt
-		A = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
-		cost = cp.trace(A @ emb)
-		Z = self.W_cholesky.clone()
-		zero = np.zeros(self.get_m() ** 2)
-		constraints = [cp.SOC(zero.T @ cp.vec(A) + self.s * beta**2, Z @ (cp.vec(A) - cp.vec(self.rate.numpy())))]
-		constraints += [A >> 0]
-
-		if maximization == True:
-			prob = cp.Problem(cp.Maximize(cost), constraints)
-		else:
-			prob = cp.Problem(cp.Minimize(cost), constraints)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: 4,
-					mosek.iparam.intpnt_solve_form:mosek.solveform.dual,
-								 mosek.dparam.intpnt_co_tol_pfeas:1e-3,
-								 mosek.dparam.intpnt_co_tol_dfeas:1e-3,
-								 mosek.dparam.intpnt_co_tol_rel_gap:1e-3})
-		ucb = torch.trace(torch.from_numpy(A.value) @ emb)
-		return ucb
-
-	def band_no_opt(self, S, beta=2.,dt=1., maximization=True):
-
-		if self.rate is None:
-			if maximization == True:
-				return S.volume()*dt*self.B
-			else:
-				return S.volume() * dt * self.b
-		else:
-			emb = self.product_integral(S)
-			cost = torch.trace(self.rate @ emb)
-			if maximization == True:
-				out = cost + beta* emb.view(1,-1)@self.W_inv@emb.view(-1,1)
-			else:
-				out = np.maximum(cost - beta* emb.view(1,-1)@self.W_inv@emb.view(-1,1),0.)
-			return out*dt
-
-	def gap(self, S, actions, w, dt, beta=2.):
-		"""
-		Estimates the gap of an action S,
-		:param S:
-		:param dt:
-		:return:
-		"""
-
-		if self.data is None:
-			return (self.B-self.b)*S.volume()/w(S)
-
-		if self.ucb_identified == False:
-			print("Recomputing UCB.....")
-			self.ucb_identified = True
-			self.max_ucb = -1000
-			self.ucb_action = None
-			for action in actions:
-				_, ucb,__ = self.mean_var_reg_set(action, dt=dt, beta=self.beta(0))
-				ucb = ucb / w(action)
-				if ucb > self.max_ucb:
-					self.max_ucb = ucb
-				self.ucb_action = action
-		map, ucb, lcb = self.mean_var_reg_set(S, dt=dt, beta=self.beta(0), lcb_compute = True)
-		gap = w(S) * self.max_ucb - lcb
-		return gap
-
-	def information(self, S, dt, precomputed = None):
-
-		if self.data is None:
-			return 1.
-
-		if self.W is None:
-			self.construct_covariance_matrix()
-
-		if self.feedback == "count-record":
-			varphi_UCB = self.product_integral(self.ucb_action).view(1,-1)*dt
-
-			ind = []
-			for index, set in enumerate(self.basic_sets):
-				if S.inside(set):
-					ind.append(index)
-			Upsilon = self.varLambdas_vec[ind, :]*dt
-
-			I = torch.eye(Upsilon.size()[0]).double()
-			G = self.W_inv - self.W_inv@Upsilon.T@torch.inverse(I + Upsilon @ Upsilon.T)@Upsilon@self.W_inv
-			return 10e-4 + torch.logdet( varphi_UCB @self.W_inv  @ varphi_UCB.T) - torch.logdet( varphi_UCB @ G  @ varphi_UCB.T)
-
-		elif self.feedback  =="histogram":
-			raise NotImplementedError("Not implemented.")
+    def __init__(self, *args, **kwargs):  # all arguments go to the parent estimator
+        super().__init__(*args, **kwargs)
+
+        if self.feedback == "count-record":  # flattened copy is built only in this mode
+            self.varLambdas_vec = torch.zeros(
+                size=(
+                    self.varLambdas.size()[0],
+                    self.varLambdas.size()[1] * self.varLambdas.size()[2],
+                )
+            ).double()
+            for i in range(self.varLambdas.size()[0]):  # row i = varLambdas[i] flattened
+                self.varLambdas_vec[i, :] = self.varLambdas[i, :, :].reshape(-1)
+
+        self.approx_solver = True  # mean_var_reg_set then uses band_no_opt, not band
+
+    def fit_gp(self, threads=4):  # delegate to the parent fit when data is loaded
+        if self.data is not None:
+            super().fit_gp(threads=threads)
+        else:
+            self.rate = None  # band_no_opt checks for this and uses volume bounds
+
+    def mean_rate(self, S, n=128):  # fitted rate on the points of S.return_discretization(n)
+        xtest = S.return_discretization(n)
+        emb = self.packing.embed(xtest)
+        mu = torch.einsum("ij,jk,ik->i", emb, self.rate, emb).view(-1, 1)  # emb_i^T rate emb_i
+        return mu  # column with one value per row of emb
+
+    def rate_value(self, x, dt=1):  # fitted rate at the points x, scaled linearly by dt
+        emb = self.packing.embed(x)  # dt is applied to the result below, not to emb
+        mu = torch.einsum("ij,jk,ik->i", emb, self.rate, emb).view(-1, 1)  # emb_i^T rate emb_i
+        return mu * dt  # emb enters the form twice, so scaling emb would give dt**2
+
+    def mean_set(self, S, dt=1.0):  # trace(product_integral(S) @ rate) scaled by dt
+        if self.data is not None:
+            emb = self.product_integral(S) * dt
+            mu = torch.trace(emb @ self.rate).view(1, 1)  # returned as a (1, 1) tensor
+        else:
+            mu = self.b * S.volume()  # NOTE(review): no-data fallback ignores dt - confirm
+        return mu
+
+    def penalized_likelihood(self, threads=4):  # fit self.rate with CVXPY + MOSEK
+        sumLambda = self.sumLambda.numpy()
+        Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
+
+        if self.observations is not None:
+            observations = self.observations.numpy()
+            # -sum(log(obs @ Theta @ obs.T)) + tr(sumLambda @ Theta) + s * sum(Theta ** 2)
+            objective = (
+                -cp.sum(cp.log(observations @ Theta @ observations.T))
+                + cp.trace(sumLambda @ Theta)
+                + self.s * cp.sum_squares(cp.vec(Theta))
+            )
+        else:
+            objective = cp.trace(sumLambda @ Theta) + self.s * cp.sum_squares(
+                cp.vec(Theta)
+            )
+
+        # No constraint is imposed on Theta in this method (the list below is
+        # empty), whereas penalized_likelihood_bins and least_squares_weighted
+        # both require Theta >> 0.
+        # NOTE(review): the log above is taken over every entry of the product,
+        # not only over its diagonal - confirm that this is intended.
+        constraints = []
+        prob = cp.Problem(cp.Minimize(objective), constraints)
+
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-3,
+            },
+        )
+        self.rate = torch.from_numpy(Theta.value)  # minimizer is stored and returned
+        return self.rate
+
+    def penalized_likelihood_bins(self, threads=4):  # fit self.rate from bucketized data
+        Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
+
+        mask = self.bucketized_counts.clone().numpy() > 0  # keep buckets with a positive count
+        observations = self.total_bucketized_obs[mask].clone().numpy()
+        tau = self.total_bucketized_time[mask].clone().numpy()
+        varLambdas_vec = self.varLambdas_vec[mask, :].clone().numpy()
+
+        objective = (
+            -cp.sum(
+                observations @ cp.log(cp.multiply(tau, varLambdas_vec @ cp.vec(Theta)))
+            )
+            + cp.sum(cp.multiply(tau, varLambdas_vec @ cp.vec(Theta)))
+            + self.s * cp.sum_squares(cp.vec(Theta))
+        )
+
+        constraints = [Theta >> 0]  # Theta is required to be positive semidefinite
+        prob = cp.Problem(cp.Minimize(objective), constraints)
+
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-3,
+            },
+        )
+        self.rate = torch.from_numpy(Theta.value)  # minimizer is stored and returned
+        return self.rate
+
+    def least_squares_weighted(self, threads=4):
+        """Fit self.rate by variance-weighted, ridge-penalized least squares with Theta >> 0."""
+        if self.approx_fit == False:
+            self.bucketization()
+
+        Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
+
+        mask = self.bucketized_counts.clone().numpy() > 0  # keep buckets with a positive count
+        observations = self.total_bucketized_obs[mask].clone().numpy()
+        tau = self.total_bucketized_time.clone().numpy()  # left unmasked: indexed by i below
+
+        # masked rows of the flattened varLambdas built in __init__
+        varLambdas_vec = self.varLambdas_vec[mask, :].clone().numpy()
+
+        variances = self.variances.view(-1).clone().numpy()
+
+        for i in range(variances.shape[0]):  # rescale variances of the selected buckets in place
+            if mask[i] > 0:
+                variances[i] = (
+                    variances[i]
+                    * tau[i]
+                    * self.variance_correction(variances[i] * tau[i])
+                )
+
+        selected_variances = variances[mask]
+
+        objective = (
+            cp.sum_squares(
+                (varLambdas_vec @ cp.vec(Theta) + -observations)
+                / np.sqrt(selected_variances)
+            )
+            + self.s * cp.sum_squares(cp.vec(Theta)) / 2
+        )
+        constraints = [Theta >> 0]  # Theta is required to be positive semidefinite
+        prob = cp.Problem(cp.Minimize(objective), constraints)
+
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-3,
+            },
+        )
+
+        self.rate = torch.from_numpy(Theta.value)  # minimizer is stored and returned
+        return self.rate
+
+    def construct_covariance_matrix(self):  # dispatch on self.estimator; returns None
+        if self.estimator == "bins":
+            self.construct_covariance_matrix_bins()
+        elif self.estimator == "least-sq":
+            self.construct_covariance_matrix_regression()  # stores self.W and self.W_inv
+        else:
+            raise NotImplementedError("Covariance not implemented")
+
+    def construct_covariance_matrix_regression(self):  # builds self.W and self.W_inv
+        varLambdas = self.varLambdas_vec.clone()
+        variances = self.variances
+        mask = self.bucketized_counts > 0
+        tau = self.total_bucketized_time
+        W = torch.zeros(size=(self.get_m() ** 2, self.get_m() ** 2)).double()
+        I = torch.eye(self.get_m() ** 2).double()
+        W_inv = I / self.s  # inverse of the s * I ridge term that is added to W below
+
+        for index_o, _ in enumerate(self.bucketized_obs):
+            n = mask[index_o]
+            if n > 0:  # only buckets with a positive count contribute
+                k = self.variance_correction(tau[index_o] * variances[index_o])
+                v = tau[index_o] / (variances[index_o] * k)
+                # rank-one term v * vec vec^T; W_inv follows by Sherman-Morrison
+                vec = varLambdas[index_o, :].view(-1, 1)
+                A = vec @ vec.T
+                W = W + A * v
+                denom = 1.0 + v * vec.T @ W_inv @ vec
+                W_inv = W_inv @ (I - v * vec @ (vec.T @ W_inv) / denom)
+
+        self.W = W + self.s * torch.eye(self.get_m() ** 2).double()
+        self.W_inv = W_inv
+        # self.W_cholesky is not computed here, although band() reads it
+        return self.W
+
+    def construct_covariance_matrix_bins(self):  # "bins" reuses the regression construction
+        self.construct_covariance_matrix_regression()
+
+    def mean_var_reg_set(self, S, dt=1.0, beta=2.0, lcb_compute=False):
+        """Return (None, ucb, lcb) for S; lcb stays None unless lcb_compute is True."""
+        if self.data is None:  # no data: volume-based bounds using self.b and self.B
+            return S.volume() * self.b, S.volume() * self.B, S.volume() * self.b
+
+        if self.approx_fit == False:
+            self.construct_covariance_matrix()  # stores self.W itself and returns None
+            self.approx_fit = True
+
+        map_estimate = None  # the point estimate is not computed by this method
+        lcb = None
+
+        if self.approx_solver == True:
+            ucb = self.band_no_opt(S, beta=beta, dt=dt, maximization=True)
+            if lcb_compute == True:
+                lcb = self.band_no_opt(S, beta=beta, dt=dt, maximization=False)
+        else:
+            ucb = self.band(S, beta=beta, dt=dt, maximization=True)
+            if lcb_compute == True:
+                lcb = self.band(S, beta=beta, dt=dt, maximization=False)
+
+        return map_estimate, ucb, lcb
+
+    def mean_var_bins_set(self, S, dt=1.0, beta=2.0, lcb_compute=False):  # alias of mean_var_reg_set
+        return self.mean_var_reg_set(S, dt=dt, beta=beta, lcb_compute=lcb_compute)
+
+    def band(self, S, beta=2.0, dt=1.0, maximization=True):  # bound on trace(A @ emb) via MOSEK
+        emb = self.product_integral(S) * dt
+        A = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
+        cost = cp.trace(A @ emb)
+        Z = self.W_cholesky.clone()  # NOTE(review): W_cholesky is never assigned in this class
+        zero = np.zeros(self.get_m() ** 2)
+        constraints = [
+            cp.SOC(  # ||Z @ (vec(A) - vec(rate))|| <= s * beta**2
+                zero.T @ cp.vec(A) + self.s * beta**2,
+                Z @ (cp.vec(A) - cp.vec(self.rate.numpy())),
+            )
+        ]
+        constraints += [A >> 0]  # A is required to be positive semidefinite
+
+        if maximization == True:  # maximize for the upper bound, minimize for the lower
+            prob = cp.Problem(cp.Maximize(cost), constraints)
+        else:
+            prob = cp.Problem(cp.Minimize(cost), constraints)
+
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: 4,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-3,
+            },
+        )
+        ucb = torch.trace(torch.from_numpy(A.value) @ emb)  # named ucb, also holds the lower bound
+        return ucb
+
+    def band_no_opt(self, S, beta=2.0, dt=1.0, maximization=True):
+        """Closed-form upper (maximization=True) or lower bound on S, scaled by dt."""
+        if self.rate is None:  # nothing fitted yet: volume-based bounds from B and b
+            if maximization == True:
+                return S.volume() * dt * self.B
+            else:
+                return S.volume() * dt * self.b
+        else:
+            emb = self.product_integral(S)
+            cost = torch.trace(self.rate @ emb)
+            if maximization == True:  # width term is beta * emb^T W_inv emb
+                out = cost + beta * emb.view(1, -1) @ self.W_inv @ emb.view(-1, 1)
+            else:
+                out = np.maximum(  # lower bound is clipped at 0
+                    cost - beta * emb.view(1, -1) @ self.W_inv @ emb.view(-1, 1), 0.0
+                )
+            return out * dt
+
+    def gap(self, S, actions, w, dt, beta=2.0):
+        """
+        Estimate the gap of action S: w(S) * max_a(UCB(a) / w(a)) - LCB(S).
+        :param S: action to score; actions: candidates scanned for the largest weighted UCB
+        :param dt: duration passed to mean_var_reg_set; w: callable weighting an action
+        :return: the gap, or (B - b) * S.volume() / w(S) when no data is loaded
+        """
+        # the beta argument is not used; self.beta(0) sets the width in both calls
+        if self.data is None:
+            return (self.B - self.b) * S.volume() / w(S)
+
+        if self.ucb_identified == False:  # the maximizer is cached until the flag is reset
+            print("Recomputing UCB.....")
+            self.ucb_identified = True
+            self.max_ucb = -1000
+            self.ucb_action = None
+            for action in actions:
+                _, ucb, __ = self.mean_var_reg_set(action, dt=dt, beta=self.beta(0))
+                ucb = ucb / w(action)
+                if ucb > self.max_ucb:
+                    self.max_ucb = ucb
+                    self.ucb_action = action  # record the maximizer, not the last action
+        _, _, lcb = self.mean_var_reg_set(
+            S, dt=dt, beta=self.beta(0), lcb_compute=True
+        )
+        gap = w(S) * self.max_ucb - lcb
+        return gap
+
+    def information(self, S, dt, precomputed=None):
+        """Log-det difference at ucb_action before/after adding the basic sets selected via S.inside."""
+        if self.data is None:
+            return 1.0
+
+        if self.W is None:
+            self.construct_covariance_matrix()
+        # `precomputed` is not used; an unhandled feedback value returns None implicitly
+        if self.feedback == "count-record":
+            varphi_UCB = self.product_integral(self.ucb_action).view(1, -1) * dt
+            # indices of the basic sets for which S.inside(set) holds
+            ind = []
+            for index, set in enumerate(self.basic_sets):
+                if S.inside(set):
+                    ind.append(index)
+            Upsilon = self.varLambdas_vec[ind, :] * dt
+            # NOTE(review): Woodbury would invert I + Upsilon @ W_inv @ Upsilon.T - confirm
+            I = torch.eye(Upsilon.size()[0]).double()
+            G = (
+                self.W_inv
+                - self.W_inv
+                @ Upsilon.T
+                @ torch.inverse(I + Upsilon @ Upsilon.T)
+                @ Upsilon
+                @ self.W_inv
+            )
+            return (
+                10e-4  # equals 1e-3
+                + torch.logdet(varphi_UCB @ self.W_inv @ varphi_UCB.T)
+                - torch.logdet(varphi_UCB @ G @ varphi_UCB.T)
+            )
+
+        elif self.feedback == "histogram":
+            raise NotImplementedError("Not implemented.")
 
 
 if __name__ == "__main__":
-	torch.manual_seed(2)
-	np.random.seed(2)
-	d = 1
-	gamma = 0.2
-	n = 64
-	B = 4.
-	b = 0.5
-
-	process = PoissonPointProcess(d=1, B=B, b=b)
-	Sets = []
-	levels = 3
-	hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
-	Sets = hierarchical_structure.get_all_sets()
-
-	D = BorelSet(1, bounds=torch.Tensor([[-1., 1.]]).double())
-
-	m = 32
-	embedding = HermiteEmbedding(m = m, d = 1, gamma = gamma)
-	k = KernelFunction(gamma = gamma)
-	estimator = MBRPositiveEstimator(process, hierarchical_structure, kernel_object=k,
-									 B=B, m=m, d=d, embedding=embedding, basis = "custom")
-	min_vol, max_vol = estimator.get_min_max()
-
-	dt = 10. / (b * min_vol)
-	dt = dt * 2
-
-	print("Suggested dt:", dt)
-	c = ['k', 'r', 'b', 'y', 'g', 'orange', 'brown', 'purple'] + ['k' for i in range(500)]
-
-	no_sets = len(Sets)
-	no_samples = 0
-	data = []
-	samples = []
-	repeats = 2
-
-	for i in range(no_samples):
-		j = np.random.randint(0, no_sets, 1)
-		S = Sets[j[0]]
-		for _ in range(repeats):
-			sample = process.sample_discretized(S, dt)
-			samples.append(sample)
-			data.append((S, sample, dt))
-
-	sample_D = process.sample_discretized(D, dt)
-	samples.append(sample_D)
-	no_samples = repeats * no_samples + 1
-	data.append((D, sample_D, dt))
-
-	estimator.load_data(data)
-
-	xtest = D.return_discretization(n=n)
-
-	# likelihood based
-	estimator.penalized_likelihood()
-	rate_mean = estimator.mean_rate(D,n = n)
-
-	#_, lcb, ucb = estimator.map_lcb_ucb(D, n, beta=2.)
-
-
-	for j in range(no_samples):
-		if samples[j] is not None:
-			plt.plot(samples[j], samples[j] * 0, 'o', color=c[j])
-
-	plt.plot(xtest, rate_mean, label='likelihood - locations known')
-	#plt.fill_between(xtest.numpy().flatten(), lcb.numpy().flatten(), ucb.numpy().flatten(), alpha=0.4,
-	#				 color='blue', label='triangle')
-	process.visualize(D, samples=0, n=n, dt=1.)
+    torch.manual_seed(2)  # fixed seeds for torch and numpy
+    np.random.seed(2)
+    d = 1
+    gamma = 0.2
+    n = 64
+    B = 4.0
+    b = 0.5
+
+    process = PoissonPointProcess(d=1, B=B, b=b)
+    Sets = []  # placeholder; reassigned from the hierarchy below
+    levels = 3
+    hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
+    Sets = hierarchical_structure.get_all_sets()
+
+    D = BorelSet(1, bounds=torch.tensor([[-1.0, 1.0]]).double())
+
+    m = 32
+    embedding = HermiteEmbedding(m=m, d=1, gamma=gamma)
+    k = KernelFunction(gamma=gamma)
+    estimator = MBRPositiveEstimator(
+        process,
+        hierarchical_structure,
+        kernel_object=k,
+        B=B,
+        m=m,
+        d=d,
+        embedding=embedding,
+        basis="custom",
+    )
+    min_vol, max_vol = estimator.get_min_max()
+
+    dt = 10.0 / (b * min_vol)
+    dt = dt * 2
+
+    print("Suggested dt:", dt)
+    c = ["k", "r", "b", "y", "g", "orange", "brown", "purple"] + [
+        "k" for i in range(500)
+    ]
+
+    no_sets = len(Sets)
+    no_samples = 0  # with 0 the sampling loop below does not run
+    data = []
+    samples = []
+    repeats = 2
+
+    for i in range(no_samples):
+        j = np.random.randint(0, no_sets, 1)
+        S = Sets[j[0]]
+        for _ in range(repeats):
+            sample = process.sample_discretized(S, dt)
+            samples.append(sample)
+            data.append((S, sample, dt))
+
+    sample_D = process.sample_discretized(D, dt)  # one extra sample on the whole set D
+    samples.append(sample_D)
+    no_samples = repeats * no_samples + 1  # now the total number of stored samples
+    data.append((D, sample_D, dt))
+
+    estimator.load_data(data)
+
+    xtest = D.return_discretization(n=n)
+
+    # fit by penalized likelihood, then evaluate the fitted rate on D
+    estimator.penalized_likelihood()
+    rate_mean = estimator.mean_rate(D, n=n)
+
+    # confidence bounds (estimator.map_lcb_ucb) are currently not computed here
+
+    for j in range(no_samples):
+        if samples[j] is not None:
+            plt.plot(samples[j], samples[j] * 0, "o", color=c[j])
+
+    plt.plot(xtest, rate_mean, label="likelihood - locations known")
+    # no confidence band is drawn (plt.fill_between over lcb/ucb is disabled);
+    # only the samples and rate_mean are plotted before process.visualize runs
+    process.visualize(D, samples=0, n=n, dt=1.0)
diff --git a/stpy/point_processes/poisson.py b/stpy/point_processes/poisson.py
index 6776f2d..8a1ca1f 100644
--- a/stpy/point_processes/poisson.py
+++ b/stpy/point_processes/poisson.py
@@ -4,157 +4,194 @@
 from stpy.borel_set import BorelSet
 
 
+class PoissonPointProcess:
+    """
+    parametrized by log linear model
+
+    """
+
+    def __init__(self, d=1, B=1, b=0.2, rate=None, rate_volume=None):
+        """Store the model constants and choose the intensity callables.
+
+        ``rate`` defaults to the bound ``rate_default``; ``rate_volume`` is kept
+        as ``rate_volume_f`` (None means the volume is integrated numerically).
+        """
+        self.d, self.B, self.b = d, B, b
+        # use the built-in intensity when the caller supplies none
+        self.rate = self.rate_default if rate is None else rate
+        self.rate_volume_f = rate_volume
+        self.exact = True
+
+    def rate_default(self, x, dt=1.0):
+        """Default intensity B * sum_k exp(-(x_k + 1)) sin^2(2 pi x_k) + b, times dt.
+
+        The sum runs over dim 1 of ``x``; the result is viewed as a column.
+        """
+        bumps = torch.exp(-(x + 1)) * torch.sin(2 * x * np.pi) ** 2
+        column = torch.sum(bumps, dim=1).view(-1, 1)
+        return (self.B * column + self.b) * dt
+
+    def rate_volume(self, S, dt=1, rate=None):
+        """Integrate the intensity over the box S and scale the result by dt.
+
+        A ``rate_volume`` callable given to the constructor takes precedence;
+        otherwise ``rate`` (default ``self.rate``) is integrated with scipy over
+        ``S.bounds`` for d == 1 or d == 2.
+        NOTE(review): for any other d nothing is integrated and 0 is returned.
+        """
+        if self.rate_volume_f is not None:
+            return self.rate_volume_f(S) * dt
+        import scipy.integrate as integrate
+
+        if rate is None:
+            rate = self.rate
+        def point_rate(*coords):
+            # scipy wants a scalar; evaluate in double for both dimensions
+            return float(rate(torch.tensor(coords, dtype=torch.float64).view(1, -1)))
+        integral = 0
+        if self.d == 1:
+            integral, _ = integrate.quad(
+                point_rate, float(S.bounds[0, 0]), float(S.bounds[0, 1])
+            )
+        elif self.d == 2:
+            # dblquad calls func(y, x) (inner variable first): swap back
+            integral, _ = integrate.dblquad(
+                lambda y, x: point_rate(x, y),
+                float(S.bounds[0, 0]),
+                float(S.bounds[0, 1]),
+                lambda x: float(S.bounds[1, 0]),
+                lambda x: float(S.bounds[1, 1]),
+            )
+        return integral * dt
+
+    def sample_discretized(self, S, dt, n=50):
+        """Draw one realisation on the grid ``S.return_discretization(n)``.
+
+        The point count is Poisson with mean ``rate_volume(S, dt)`` clipped at
+        zero; grid rows are then drawn in proportion to the rate clipped at
+        zero.  Returns the selected rows, or None when the count is zero.
+        """
+        mean_count = np.maximum(float(self.rate_volume(S, dt)), 0)
+        count = np.random.poisson(lam=mean_count)
+        if count <= 0:
+            return None
+        grid = S.return_discretization(n)
+        weights = self.rate(grid) * dt
+        weights = torch.maximum(weights, weights * 0)  # clip negatives to zero
+        probs = (weights / torch.sum(weights)).numpy().reshape(-1)
+        picks = np.random.choice(np.arange(0, grid.size()[0], 1), size=count, p=probs)
+        return grid[torch.from_numpy(picks), :]
+
+    def sample_discretized_direct(self, x, val):
+        """Sample rows of ``x`` with probabilities proportional to ``|val|``.
+
+        The number of draws is Poisson with a fixed mean of 1000; returns the
+        selected rows of ``x``, or None when the drawn count is zero.
+        """
+        lam = 1000
+        count = np.random.poisson(lam=np.maximum(0, lam))
+        if count <= 0:
+            return None
+        magnitude = torch.abs(val)
+        probs = (magnitude / torch.sum(magnitude)).numpy().reshape(-1)
+        candidates = np.arange(0, x.size()[0], 1)
+        picks = np.random.choice(candidates, size=count, p=probs)
+        return x[torch.from_numpy(picks), :]
+
+    def sample(self, S, dt=1.0, verbose=False, rate=None):
+        """
+        Draw one realisation of the process on S over duration dt.
+
+        With ``self.exact`` set this delegates to ``sample_discretized``.
+        Otherwise a Poisson count with mean ``rate_volume(S, dt)`` is drawn and
+        uniform proposals from S are accepted until that many points are kept.
+
+        :param S: set where it should be sampled
+        :param dt: duration passed on to the rate volume
+        :param verbose: unused
+        :param rate: unused
+        :return: tensor with one point per row, or None when no point is drawn
+        """
+        if self.exact:
+            return self.sample_discretized(S, dt=dt)
+
+        lam = self.rate_volume(S, dt)
+        n = np.random.poisson(lam=lam)
+        if n == 0:
+            # empty draw; returning here also avoids 1 / lam when lam == 0
+            return None
+
+        alpha = 1.0 / lam
+        new_sample = []
+        while len(new_sample) < n:
+            # uniform proposal g(s) = 1/vol(S)
+            sample = S.uniform_sample(1)
+            # NOTE(review): alpha * lam == 1, so the raw rate value acts as the
+            # acceptance threshold -- confirm the intended envelope constant
+            t = self.rate(sample) / (alpha * lam)
+            if np.random.uniform(0, 1) < t:
+                new_sample.append(sample.view(1, -1))
+        # n >= 1 here, so even a single accepted point is returned
+        return torch.cat(new_sample, dim=0)
+
+    def rate_sets(self, Sets, dt=1):
+        """Return the list of ``rate_volume(S, dt=dt)`` for every S in ``Sets``."""
+        return [
+            self.rate_volume(region, dt=dt) for region in Sets
+        ]
+
+    def visualize(self, S, samples=2, n=10, dt=1.0, show=True):
+        """Plot the rate on ``S.return_discretization(n)`` together with draws.
+
+        d == 1: the rate as a curve and each of ``samples`` draws as dots at
+        height 0.  d == 2: the rate interpolated onto a 100 x 100 grid as filled
+        contours with the draws overlaid.  A legend is always added and
+        ``plt.show()`` is called when ``show == True``.
+        """
+        xtest = S.return_discretization(n)
+        rate = self.rate(xtest)
+
+        if self.d == 1:
+            plt.plot(xtest, rate, label="rate", lw=3)
+            for _ in range(samples):
+                points = self.sample(S, dt=dt)
+                if points is None:
+                    continue
+                count = points.size()[0]
+                plt.plot(points, points * 0, "o", label="sample n=" + str(count))
+
+        elif self.d == 2:
+            from scipy.interpolate import griddata
+
+            xx = xtest[:, 0].detach().numpy()
+            yy = xtest[:, 1].detach().numpy()
+            gx, gy = np.mgrid[min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j]
+            values = rate[:, 0].detach().numpy()
+            surface = griddata((xx, yy), values, (gx, gy), method="linear")
+            fig, ax = plt.subplots(figsize=(15, 7))
+            cs = ax.contourf(gx, gy, surface, label="rate")
+            ax.contour(cs, colors="k")
+
+            for _ in range(samples):
+                points = self.sample(S, dt=dt)
+                if points is None:
+                    continue
+                px = points[:, 0].detach().numpy()
+                py = points[:, 1].detach().numpy()
+                ax.plot(px, py, "o", ms=10, alpha=0.5, label="sample")
+            ax.grid(c="k", ls="-", alpha=0.1)
+            plt.colorbar(cs)
+
+        plt.legend()
+        if show == True:
+            plt.show()
 
 
-class PoissonPointProcess():
-	"""
-	parametrized by log linear model
-
-	"""
-	def __init__(self, d = 1, B = 1, b= 0.2, rate = None, rate_volume = None):
-		self.B = B
-		self.d = d
-		self.b = b
-		if rate is None:
-			self.rate = self.rate_default
-		else:
-			self.rate = rate
-
-
-		self.rate_volume_f = rate_volume
-		self.exact = True
-
-	def rate_default(self,x, dt = 1.):
-		return (self.B*torch.sum(torch.exp(-(x+1))*torch.sin(2*x*np.pi)**2  ,dim =1).view(-1,1)+ self.b) *dt
-
-	def rate_volume(self,S, dt = 1, rate = None):
-		if self.rate_volume_f is None:
-			# integrate rate numerically over S
-			import scipy.integrate as integrate
-			if rate is None:
-				rate = self.rate
-			else:
-				rate = rate
-			integral = 0
-			if self.d == 1:
-				#integrate = S.volume()* self.rate(torch.from_numpy(S.bounds[0,1]).view(1))
-				integral,_ = integrate.quad(lambda x : rate(torch.Tensor([x]).view(1,1)).numpy(), float(S.bounds[0,0]), float(S.bounds[0,1]) )
-			elif self.d ==2:
-				integrand = lambda x, y: rate(torch.Tensor([x, y]).view(1, 2).double()).numpy()
-				integral, _ = integrate.dblquad(integrand, float(S.bounds[0, 0]), float(S.bounds[0, 1]),
-												lambda x: float(S.bounds[1, 0]), lambda x: float(S.bounds[1, 1]))
-
-			return integral*dt
-		else:
-			return self.rate_volume_f(S)*dt
-
-	def sample_discretized(self, S, dt, n = 50):
-		lam = np.maximum(float(self.rate_volume(S, dt)),0)
-		count = np.random.poisson(lam=lam)
-		if count > 0:
-			x = S.return_discretization(n)
-			r = self.rate(x)*dt
-			r = torch.maximum(r,r*0)
-			sample = torch.from_numpy(np.random.choice(np.arange(0,x.size()[0],1), size = count, p=(r/torch.sum(r)).numpy().reshape(-1) ))
-			return x[sample,:]
-		else:
-			return None
-
-	def sample_discretized_direct(self, x,val):
-		lam = 1000
-		count = np.random.poisson(lam=np.maximum(0,lam))
-		if count > 0:
-			val = torch.abs(val)
-			sample = torch.from_numpy(np.random.choice(np.arange(0,x.size()[0],1),
-													   size = count, p=(val/torch.sum(val)).numpy().reshape(-1) ))
-			return x[sample,:]
-		else:
-			return None
-
-	def sample(self, S, dt = 1., verbose = False, rate = None):
-		"""
-
-		:param S: set where it should be sampled
-		:return:
-		"""
-		if self.exact == True:
-
-			return self.sample_discretized(S, dt = dt)
-
-		else:
-
-			lam = self.rate_volume(S, dt)
-			n = np.random.poisson(lam = lam)
-			new_sample = []
-			vol = S.volume()
-			size = 0
-
-			alpha = 1./lam
-
-			while size<n:
-				# uniform sample g(s) = 1/vol(S)
-				sample = S.uniform_sample(1)
-
-				t = self.rate(sample)/(alpha*lam)
-				p = np.random.uniform(0,1)
-				if p<t:
-					new_sample.append(sample.view(1,-1))
-					size = size + 1
-
-			if len(new_sample)>1:
-				x = torch.cat(new_sample, dim = 0)
-			else:
-				return None
-			return x
-
-	def rate_sets(self,Sets, dt = 1):
-		res = []
-		for S in Sets:
-			res.append(self.rate_volume(S,dt=dt))
-		return res
-
-	def visualize(self,S,samples = 2, n = 10, dt = 1., show = True):
-		xtest = S.return_discretization(n)
-		rate = self.rate(xtest)
-
-		if self.d == 1:
-			plt.plot(xtest, rate, label='rate', lw = 3)
-			for i in range(samples):
-
-				x = self.sample(S, dt= dt)
-				if x is not None:
-					n = x.size()[0]
-					plt.plot(x,x*0,'o', label = 'sample n='+str(n))
-
-		elif self.d == 2:
-			from scipy.interpolate import griddata
-			xx = xtest[:, 0].detach().numpy()
-			yy = xtest[:, 1].detach().numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), rate[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-			fig, ax = plt.subplots(figsize=(15, 7))
-			cs = ax.contourf(grid_x, grid_y, grid_z_mu, label = 'rate')
-			ax.contour(cs, colors='k')
-
-			for i in range(samples):
-				x = self.sample(S, dt = dt)
-				if x is not None:
-					ax.plot(x[:, 0].detach().numpy(), x[:, 1].detach().numpy(), 'o', ms=10, alpha = 0.5, label = 'sample')
-			ax.grid(c='k', ls='-', alpha=0.1)
-			plt.colorbar(cs)
-
-		plt.legend()
-		if show == True:
-			plt.show()
-
 if __name__ == "__main__":
-	d = 2
-	n = 100
-	bounds = torch.Tensor([[-1,1],[-1,1]]).double()
-	D = BorelSet(d, bounds)
-
-	process = PoissonPointProcess(d = d, B = 2)
-	process.visualize(D, samples = 10, n = n, dt = 10)
-
-
+    d = 2  # demo dimension
+    n = 100  # discretization argument passed to visualize
+    bounds = torch.tensor([[-1, 1], [-1, 1]]).double()  # one [low, high] row per dimension
+    D = BorelSet(d, bounds)  # domain handed to visualize below
 
+    process = PoissonPointProcess(d=d, B=2)
+    process.visualize(D, samples=10, n=n, dt=10)
diff --git a/stpy/point_processes/poisson/__init__.py b/stpy/point_processes/poisson/__init__.py
index e69de29..32fcc1a 100644
--- a/stpy/point_processes/poisson/__init__.py
+++ b/stpy/point_processes/poisson/__init__.py
@@ -0,0 +1 @@
+from .poisson import PoissonPointProcess
diff --git a/stpy/point_processes/poisson/link_fun_rate_estimator.py b/stpy/point_processes/poisson/link_fun_rate_estimator.py
index d4e50d5..02e46fb 100644
--- a/stpy/point_processes/poisson/link_fun_rate_estimator.py
+++ b/stpy/point_processes/poisson/link_fun_rate_estimator.py
@@ -7,8 +7,14 @@
 
 from stpy.borel_set import BorelSet, HierarchicalBorelSets
 from stpy.embeddings.embedding import HermiteEmbedding
-from stpy.helpers.ellipsoid_algorithms import maximize_matrix_quadratic_on_ellipse, minimize_matrix_quadratic_on_ellipse
-from stpy.helpers.ellipsoid_algorithms import maximize_quadratic_on_ellipse, minimize_quadratic_on_ellipse
+from stpy.helpers.ellipsoid_algorithms import (
+    maximize_matrix_quadratic_on_ellipse,
+    minimize_matrix_quadratic_on_ellipse,
+)
+from stpy.helpers.ellipsoid_algorithms import (
+    maximize_quadratic_on_ellipse,
+    minimize_quadratic_on_ellipse,
+)
 from stpy.helpers.quadrature_helper import quadvec2
 from stpy.kernels import KernelFunction
 from stpy.point_processes.poisson import PoissonPointProcess
@@ -17,452 +23,584 @@
 
 ## implement loading data
 
+
 class PermanentalProcessRateEstimator(PoissonRateEstimator):
 
-	def __init__(self, *args, **kwargs):
-		super().__init__(*args, **kwargs)
-
-		self.integration = "fixed_quad"
-		self.product_integrals = {}
-		self.varLambdas = torch.zeros(size=(len(self.basic_sets), self.get_m(), self.get_m())).double()
-		self.opt = 'cvxpy'
-		if self.feedback == "count-record" and self.estimator == "least-sq":
-			print("precomputing-integrals:")
-			for index_set, set in enumerate(self.basic_sets):
-				print(index_set, "/", len(self.basic_sets))
-				self.varLambdas[index_set, :] = self.product_integral(set)
-				self.variances[index_set] = set.volume() * self.B
-
-	def product_integral(self, S):
-
-		if S in self.product_integrals.keys():
-			return self.product_integrals[S]
-		else:
-
-			if "product_integral" in dir(self.packing):
-				Psi = self.packing.product_integral(S)
-				self.product_integrals[S] = Psi
-				return Psi
-
-			elif self.integration == "vec_quad":
-
-				if S.d == 2:
-					# Psi = torch.zeros(size=(self.get_m(), self.get_m())).double()
-					F = lambda x: (self.packing.embed(x).view(-1, 1) @ \
-								   self.packing.embed(x).view(1, -1)).view(-1)
-					integrand = lambda x, y: F(torch.Tensor([x, y]).view(1, 2).double()).numpy()
-
-					val = quadvec2(integrand, float(S.bounds[0, 0]), float(S.bounds[0, 1]),
-								   float(S.bounds[1, 0]), float(S.bounds[1, 1]), limit=10, epsrel=10e-3, epsabs=10e-3,
-								   quadrature='gk15')
-					Psi = torch.from_numpy(val).view((self.get_m(), self.get_m()))
-
-			elif self.integration == "fixed_quad":
-
-				if S.d == 1:
-					weights, nodes = S.return_legendre_discretization(n=128)
-					Z = self.packing.embed(nodes)
-					M = torch.einsum('ij,ik->ijk', Z, Z)
-					Psi = torch.einsum('i,ijk->jk', weights, M)
-
-				if S.d == 2:
-					weights, nodes = S.return_legendre_discretization(n=50)
-					Z = self.packing.embed(nodes)
-					M = torch.einsum('ij,ik->ijk', Z, Z)
-					Psi = torch.einsum('i,ijk->jk', weights, M)
-
-			else:
-				Psi = torch.zeros(size=(self.get_m(), self.get_m())).double()
-				for i in range(self.get_m()):
-					for j in range(self.get_m()):
-
-						if S.d == 1:
-							F_ij = lambda x: (
-										self.packing.embed(torch.from_numpy(np.array(x)).view(1, -1)).view(-1)[i] *
-										self.packing.embed(torch.from_numpy(np.array(x)).view(1, -1)).view(-1)[
-											j]).numpy()
-							val, status = integrate.quad(F_ij, float(S.bounds[0, 0]), float(S.bounds[0, 1]))
-
-
-						elif S.d == 2:
-							F_ij = lambda x: self.packing.embed(x).view(-1)[i] * self.packing.embed(x).view(-1)[j]
-							integrand = lambda x, y: F_ij(torch.Tensor([x, y]).view(1, 2).double()).numpy()
-							val, status = integrate.dblquad(integrand, float(S.bounds[0, 0]), float(S.bounds[0, 1]),
-															lambda x: float(S.bounds[1, 0]),
-															lambda x: float(S.bounds[1, 1]), epsabs=1.49e-03,
-															epsrel=1.49e-03)
-						else:
-							raise NotImplementedError("Integration above d>2 not implemented.")
-
-						Psi[i, j] = val
-						print(i, j, val)
-
-			self.product_integrals[S] = Psi
-			return Psi
-
-	def get_constraints(self):
-		s = self.get_m()
-		l = np.full(s, self.b)
-		u = np.full(s, self.B)
-		Lambda = np.identity(s)
-		return (l, Lambda, u)
-
-	def cov(self, inverse=False):
-		s = self.get_m()
-
-		if inverse == False:
-			return torch.zeros(size=(s, s)).double()
-		else:
-			return torch.zeros(size=(s, s)).double(), torch.zeros(size=(s, s)).double()
-
-	def sample(self, verbose=False, steps=10, stepsize=None):
-
-		if self.data is None:
-			self.sampled_theta = torch.zeros(self.get_m()).double().view(-1, 1)
-			return None
-
-		if self.observations is not None:
-			observations = self.observations.double()
-			sumLambda = self.sumLambda.double()
-			nabla = lambda theta: -torch.sum(torch.diag(1. / (observations @ theta).view(-1)) @ observations) \
-								  + (sumLambda.T + sumLambda) @ theta + self.s * theta.view(-1, 1)
-		else:
-			sumLambda = self.sumLambda.double()
-			nabla = lambda theta: (sumLambda.T + sumLambda) @ theta + self.s * theta.view(-1, 1)
-
-		theta = self.rate.view(-1, 1)
-
-		W = self.construct_covariance_matrix_laplace()
-		L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-3))
-		eta = 0.5 / (L + 1)
-
-		for k in range(steps):
-			W = torch.randn(size=(self.get_m(), 1)).double()
-			theta = theta - eta * nabla(theta) + np.sqrt(2 * eta) * W
-			if verbose == True:
-				print("Iter:", k, theta.T)
-
-		self.sampled_theta = theta
-		return None
-
-	def sample_value(self, S):
-		"""
-		Given a pre-sampled value evaluate certain portions of the domain S
-		:param S:
-		:return:
-		"""
-		Z = self.product_integral(S)
-		map = self.sampled_theta.T @ Z @ self.sampled_theta
-		return map
-
-	def sample_path(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return (self.packing.embed(xtest) @ self.sampled_theta) ** 2
-
-	def load_data(self, data):
-		super().load_data(data, times=False)
-		self.sumLambda = torch.zeros(size=(self.get_m(), self.get_m()))
-		if len(data) > 1:
-			for sample in data:
-				(S, obs, dt) = sample
-				self.sumLambda += self.product_integral(S) * dt
-
-	def add_data_point(self, new_data):
-		super().add_data_point(new_data, times=False)
-		(S, obs, dt) = new_data
-		self.sumLambda += self.product_integral(S) * dt
-
-	def penalized_likelihood(self, threads=4):
-		sumLambda = self.sumLambda.numpy()
-		if self.observations is not None:
-			observations = self.observations.numpy()
-			loss = lambda theta: float(
-				-np.sum(np.log((observations @ theta) ** 2)) + np.dot(theta, sumLambda @ theta) + 0.5 * self.s * np.sum(
-					theta ** 2))
-		else:
-			loss = lambda theta: float(np.dot(theta, sumLambda @ theta) + 0.5 * self.s * np.sum(theta ** 2))
-
-		theta = np.random.randn(self.get_m())
-		res = minimize(loss, theta, jac=None, method='L-BFGS-B')
-		self.rate = torch.from_numpy(res.x)
-		return self.rate
-
-	def construct_covariance_matrix_laplace(self):
-		W = torch.zeros(size=(self.get_m(), self.get_m())).double()
-
-		if self.feedback == "count-record":
-			if self.observations is not None:
-				for i in range(self.observations.size()[0]):
-					A = self.observations[i, :].view(-1, 1) @ self.observations[i, :].view(1, -1)
-					k = np.maximum(torch.dot(self.observations[i, :], self.rate.view(-1)) ** 2, self.b)
-					W = W + A / k
-			W += 2 * self.sumLambda
-		else:
-			raise AssertionError("Not implemented.")
-		return W + torch.eye(self.get_m()).double() * self.s
-
-	def map_lcb_ucb_approx_action(self, S, dt=1., beta=2.):
-
-		phi = self.packing.integral(S)
-		map = (phi @ self.rate)
-
-		ucb = np.maximum((map + beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)) ** 2,
-						 (map - beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)) ** 2)
-		ucb = np.minimum(ucb, self.B * S.volume() * dt)
-		lcb = 0.
-
-		return dt * map ** 2, dt * lcb, dt * ucb
-
-	def mean_std_per_action(self, S, W, dt, beta):
-		Z = self.product_integral(S)
-
-		ucb, _ = maximize_matrix_quadratic_on_ellipse(Z.numpy(), (W).numpy(), self.rate.view(-1).numpy(), beta)
-		lcb, _ = minimize_matrix_quadratic_on_ellipse(Z.numpy(), (W).numpy(), self.rate.view(-1).numpy(), beta)
-
-		map = self.rate.T @ Z @ self.rate
-
-		return dt * map, dt * ucb, -lcb * dt
-
-	def mean_rate(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return (self.packing.embed(xtest) @ self.rate) ** 2
-
-	def mean_rate_latent(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return self.packing.embed(xtest) @ self.rate
-
-	def map_lcb_ucb_approx(self, S, n, beta=2.0, delta=0.01):
-		xtest = S.return_discretization(n)
-		if self.data is None:
-			return 0 * xtest[:, 0].view(-1, 1), self.b + 0 * xtest[:, 0].view(-1, 1), self.B + 0 * xtest[:, 0].view(-1,
-																													xtest.size()[
-																														0])
-		self.fit_ellipsoid_approx()
-
-		Phi = self.packing.embed(xtest).double()
-		map = Phi @ self.rate
-		N = Phi.size()[0]
-
-		ucb = torch.zeros(size=(N, 1)).double()
-		lcb = torch.zeros(size=(N, 1)).double()
-
-		for i in range(N):
-			x = Phi[i, :].view(-1, 1)
-			maximum = np.maximum((map[i] - beta * np.sqrt(x.T @ self.W_inv_approx @ x)) ** 2,
-								 (map[i] + beta * np.sqrt(x.T @ self.W_inv_approx @ x)) ** 2)
-			ucb[i, 0] = np.minimum(maximum, self.B)
-			lcb[i, 0] = 0.
-		# lcb[i, 0] = map[i] - np.sqrt(beta) * np.sqrt(x.T @ self.W_inv_approx @ x) ** 2
-		return map ** 2, lcb, ucb
-
-	def map_lcb_ucb(self, S, n, beta=2.0, delta=0.01):
-		"""
-		Calculate exact confidence using laplace approximation on a whole set domain
-		:param S: set
-		:param n: discretization
-		:param beta: beta
-		:return:
-		"""
-
-		xtest = S.return_discretization(n)
-		if self.data is None:
-			return self.b + 0 * xtest[:, 0].view(-1, 1), self.b + 0 * xtest[:, 0].view(-1, 1), self.B + 0 * xtest[:,
-																											0].view(-1,
-																													1)
-
-		N = xtest.size()[0]
-		Phi = self.packing.embed(xtest)
-		map = (Phi @ self.rate) ** 2
-
-		if self.uncertainty == "laplace":
-			W = self.construct_covariance_matrix_laplace()
-		ucb = torch.zeros(size=(N, 1)).double()
-		lcb = torch.zeros(size=(N, 1)).double()
-
-		for i in range(N):
-			x = Phi[i, :]
-			ucbi, _ = maximize_quadratic_on_ellipse(x.numpy(), (W).numpy(), self.rate.view(-1).numpy(), beta)
-			lcbi, _ = minimize_quadratic_on_ellipse(x.numpy(), (W).numpy(), self.rate.view(-1).numpy(), beta)
-			ucb[i, 0] = ucbi
-			lcb[i, 0] = lcbi
-
-		return map, lcb, ucb
+    def __init__(self, *args, **kwargs):
+        """Forward all arguments to the parent, then set up the integral caches."""
+        super().__init__(*args, **kwargs)
+        self.integration = "fixed_quad"
+        self.product_integrals = {}
+        shape = (len(self.basic_sets), self.get_m(), self.get_m())
+        self.varLambdas = torch.zeros(size=shape).double()
+        self.opt = "cvxpy"
+        # least-squares on count records: precompute every basic set's integral
+        if self.feedback == "count-record" and self.estimator == "least-sq":
+            print("precomputing-integrals:")
+            for index, basic_set in enumerate(self.basic_sets):
+                print(index, "/", len(self.basic_sets))
+                self.varLambdas[index, :] = self.product_integral(basic_set)
+                self.variances[index] = basic_set.volume() * self.B
+
+    def product_integral(self, S):
+        """Return the matrix of integrals of embed(x) embed(x)^T over S.
+
+        Results are cached in ``self.product_integrals`` keyed by S.  The
+        packing's own ``product_integral`` is preferred when it has one;
+        otherwise ``self.integration`` selects "vec_quad" (d == 2 only),
+        "fixed_quad" (d == 1 or 2) or, for any other value, entry-wise scipy
+        quadrature.
+
+        :param S: set to integrate over
+        :return: tensor holding the integrated feature products
+        :raises NotImplementedError: when the rule does not cover ``S.d``
+        """
+        if S in self.product_integrals:
+            return self.product_integrals[S]
+
+        if "product_integral" in dir(self.packing):
+            Psi = self.packing.product_integral(S)
+        elif self.integration == "vec_quad":
+            if S.d != 2:
+                # this case used to fall through with Psi unbound
+                raise NotImplementedError("vec_quad integration requires d == 2.")
+            F = lambda x: (
+                self.packing.embed(x).view(-1, 1)
+                @ self.packing.embed(x).view(1, -1)
+            ).view(-1)
+            integrand = lambda x, y: F(
+                torch.tensor([x, y]).view(1, 2).double()
+            ).numpy()
+            val = quadvec2(
+                integrand,
+                float(S.bounds[0, 0]),
+                float(S.bounds[0, 1]),
+                float(S.bounds[1, 0]),
+                float(S.bounds[1, 1]),
+                limit=10,
+                epsrel=10e-3,
+                epsabs=10e-3,
+                quadrature="gk15",
+            )
+            Psi = torch.from_numpy(val).view((self.get_m(), self.get_m()))
+        elif self.integration == "fixed_quad":
+            if S.d not in (1, 2):
+                # this case used to fall through with Psi unbound
+                raise NotImplementedError("Integration above d>2 not implemented.")
+            # node counts unchanged: n=128 for d == 1, n=50 for d == 2
+            n_nodes = 128 if S.d == 1 else 50
+            weights, nodes = S.return_legendre_discretization(n=n_nodes)
+            Z = self.packing.embed(nodes)
+            M = torch.einsum("ij,ik->ijk", Z, Z)
+            Psi = torch.einsum("i,ijk->jk", weights, M)
+        else:
+            Psi = torch.zeros(size=(self.get_m(), self.get_m())).double()
+            for i in range(self.get_m()):
+                for j in range(self.get_m()):
+                    if S.d == 1:
+                        F_ij = lambda x: (
+                            self.packing.embed(
+                                torch.from_numpy(np.array(x)).view(1, -1)
+                            ).view(-1)[i]
+                            * self.packing.embed(
+                                torch.from_numpy(np.array(x)).view(1, -1)
+                            ).view(-1)[j]
+                        ).numpy()
+                        val, status = integrate.quad(
+                            F_ij, float(S.bounds[0, 0]), float(S.bounds[0, 1])
+                        )
+                    elif S.d == 2:
+                        F_ij = (
+                            lambda x: self.packing.embed(x).view(-1)[i]
+                            * self.packing.embed(x).view(-1)[j]
+                        )
+                        # dblquad calls func(y, x): inner variable comes first
+                        integrand = lambda y, x: F_ij(
+                            torch.tensor([x, y]).view(1, 2).double()
+                        ).numpy()
+                        val, status = integrate.dblquad(
+                            integrand,
+                            float(S.bounds[0, 0]),
+                            float(S.bounds[0, 1]),
+                            lambda x: float(S.bounds[1, 0]),
+                            lambda x: float(S.bounds[1, 1]),
+                            epsabs=1.49e-03,
+                            epsrel=1.49e-03,
+                        )
+                    else:
+                        raise NotImplementedError(
+                            "Integration above d>2 not implemented."
+                        )
+
+                    Psi[i, j] = val
+                    print(i, j, val)
+
+        self.product_integrals[S] = Psi
+        return Psi
+
+    def get_constraints(self):
+        """Return (lower, Lambda, upper): constant bound vectors and an identity."""
+        m = self.get_m()
+        lower = np.full(m, self.min_intensity)
+        upper = np.full(m, self.B)
+        return lower, np.identity(m), upper
+
+    def cov(self, inverse=False):
+        """Return one (m, m) zero matrix when inverse == False, otherwise a pair."""
+        m = self.get_m()
+        zeros = lambda: torch.zeros(size=(m, m)).double()
+        if inverse == False:
+            return zeros()
+        return zeros(), zeros()
+
+    def sample(self, verbose=False, steps=10, stepsize=None):
+        """Run ``steps`` noisy gradient steps from self.rate into self.sampled_theta."""
+        if self.data is None:
+            self.sampled_theta = torch.zeros(self.get_m()).double().view(-1, 1)
+            return None
+
+        if self.observations is not None:
+            observations = self.observations.double()
+            sumLambda = self.sumLambda.double()
+            nabla = (
+                lambda theta: -torch.sum(
+                    torch.diag(1.0 / (observations @ theta).view(-1)) @ observations
+                )  # NOTE(review): no dim given, so this term is a scalar -- confirm
+                + (sumLambda.T + sumLambda) @ theta
+                + self.s * theta.view(-1, 1)
+            )
+        else:
+            sumLambda = self.sumLambda.double()
+            nabla = lambda theta: (
+                sumLambda.T + sumLambda
+            ) @ theta + self.s * theta.view(-1, 1)
+
+        theta = self.rate.view(-1, 1)  # start from the current estimate
+
+        W = self.construct_covariance_matrix_laplace()
+        L = float(  # single eigenvalue of W requested with which="LM", k=1
+            scipy.sparse.linalg.eigsh(
+                W.numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-3
+            )
+        )
+        eta = 0.5 / (L + 1)  # step size; the ``stepsize`` argument is not used
+
+        for k in range(steps):
+            W = torch.randn(size=(self.get_m(), 1)).double()  # W now holds the noise
+            theta = theta - eta * nabla(theta) + np.sqrt(2 * eta) * W
+            if verbose == True:
+                print("Iter:", k, theta.T)
+
+        self.sampled_theta = theta
+        return None
+
+    def sample_value(self, S):
+        """
+        Evaluate the pre-sampled parameter on S as theta^T Psi(S) theta.
+
+        :param S: set whose product integral Psi(S) is used
+        :return: the quadratic form of ``self.sampled_theta`` with Psi(S)
+        """
+        theta = self.sampled_theta
+        return theta.T @ self.product_integral(S) @ theta
+
+    def sample_path(self, S, n=128):
+        features = self.packing.embed(S.return_discretization(n))
+        return (features @ self.sampled_theta) ** 2  # squared latent of the stored draw
+
+    def load_data(self, data):
+        """Load (S, obs, dt) triples and accumulate product_integral(S) * dt."""
+        super().load_data(data, times=False)
+        self.sumLambda = torch.zeros(size=(self.get_m(), self.get_m())).double()
+        # every triple counts: the old ``len(data) > 1`` guard skipped a lone one
+        for S, _obs, dt in data:
+            self.sumLambda += self.product_integral(S) * dt
+
+    def add_data_point(self, new_data):  # new_data is one (S, obs, dt) triple
+        super().add_data_point(new_data, times=False)
+        region, _obs, duration = new_data
+        self.sumLambda += self.product_integral(region) * duration
+
+    def penalized_likelihood(self, threads=4):
+        """Fit theta by minimising the penalised objective and store it in self.rate.
+
+        Objective: -sum log((obs @ theta)^2) + theta^T sumLambda theta
+        + 0.5 * s * |theta|^2; the log term is dropped without observations.
+        """
+        sumLambda = self.sumLambda.numpy()
+        obs = None if self.observations is None else self.observations.numpy()
+        def loss(theta):
+            value = np.dot(theta, sumLambda @ theta)
+            if obs is not None:
+                value = -np.sum(np.log((obs @ theta) ** 2)) + value
+            return float(value + 0.5 * self.s * np.sum(theta**2))
+
+        start = np.random.randn(self.get_m())
+        res = minimize(loss, start, jac=None, method="L-BFGS-B")
+        self.rate = torch.from_numpy(res.x)
+        return self.rate
+
+    def construct_covariance_matrix_laplace(self):
+        """Assemble W = sum_i x_i x_i^T / k_i + 2 * sumLambda + s * I.
+
+        Here k_i = max((x_i . rate)^2, min_intensity) over observation rows
+        x_i.  Only "count-record" feedback is handled; anything else raises.
+        """
+        if self.feedback != "count-record":
+            raise AssertionError("Not implemented.")
+
+        W = torch.zeros(size=(self.get_m(), self.get_m())).double()
+        if self.observations is not None:
+            for row in self.observations:
+                outer = row.view(-1, 1) @ row.view(1, -1)
+                response = torch.dot(row, self.rate.view(-1)) ** 2
+                W = W + outer / np.maximum(response, self.min_intensity)
+        W += 2 * self.sumLambda
+
+        return W + torch.eye(self.get_m()).double() * self.s
+
+    def map_lcb_ucb_approx_action(self, S, dt=1.0, beta=2.0):
+        """Approximate (map, lcb, ucb) for the squared integrated latent on S, times dt.
+
+        The upper bound is the larger squared endpoint of latent +/- beta * width,
+        capped at B * volume(S) * dt; the lower bound is fixed at 0.
+        """
+        phi = self.packing.integral(S)
+        latent = phi @ self.rate
+        width = beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
+        ucb = np.maximum((latent + width) ** 2, (latent - width) ** 2)
+        ucb = np.minimum(ucb, self.B * S.volume() * dt)
+        lcb = 0.0
+        return dt * latent**2, dt * lcb, dt * ucb
+
+    def mean_std_per_action(self, S, W, dt, beta):
+        """Return dt-scaled (map, ucb, -lcb) for the form rate^T Psi(S) rate.
+
+        ucb and lcb are the first outputs of the maximise / minimise ellipse
+        helpers called with Psi(S), W, the current rate and beta.
+        """
+        Z = self.product_integral(S)
+        args = (Z.numpy(), W.numpy(), self.rate.view(-1).numpy(), beta)
+        ucb, _ = maximize_matrix_quadratic_on_ellipse(*args)
+        lcb, _ = minimize_matrix_quadratic_on_ellipse(*args)
+        estimate = self.rate.T @ Z @ self.rate
+
+        return dt * estimate, dt * ucb, -lcb * dt
+
+    def mean_rate(self, S, n=128):
+        latent = self.packing.embed(S.return_discretization(n)) @ self.rate
+        return latent**2  # squared latent values on S.return_discretization(n)
+
+    def mean_rate_latent(self, S, n=128):
+        features = self.packing.embed(S.return_discretization(n))
+        return features @ self.rate  # latent values, not squared
+
+    def map_lcb_ucb_approx(self, S, n, beta=2.0, delta=0.01):
+        """Pointwise (map, lcb, ucb) of the squared latent on a grid of S.
+
+        Without data: constant (N, 1) columns 0, min_intensity and B.  Otherwise
+        ucb is the larger squared endpoint of latent +/- beta * sqrt(x^T W_inv x),
+        capped at B, and lcb stays 0.  ``delta`` is unused.
+        """
+        xtest = S.return_discretization(n)
+        if self.data is None:
+            # every output is (N, 1); the B column used to come back as (1, N)
+            column = 0 * xtest[:, 0].view(-1, 1)
+            return column, self.min_intensity + column, self.B + column
+        self.fit_ellipsoid_approx()
+
+        Phi = self.packing.embed(xtest).double()
+        latent = Phi @ self.rate
+        N = Phi.size()[0]
+        ucb = torch.zeros(size=(N, 1)).double()
+        lcb = torch.zeros(size=(N, 1)).double()  # lower bound is fixed at zero
+
+        for i in range(N):
+            x = Phi[i, :].view(-1, 1)
+            width = beta * np.sqrt(x.T @ self.W_inv_approx @ x)
+            upper = np.maximum((latent[i] - width) ** 2, (latent[i] + width) ** 2)
+            ucb[i, 0] = np.minimum(upper, self.B)
+
+        return latent**2, lcb, ucb
+
+    def map_lcb_ucb(self, S, n, beta=2.0, delta=0.01):
+        """
+        Confidence bounds of the rate on a discretization of S (Laplace covariance).
+        :param S: set, discretized via S.return_discretization(n)
+        :param n: discretization argument forwarded to S.return_discretization
+        :param beta: ellipse size passed to the quadratic solvers
+        :param delta: unused
+        :return: (map, lcb, ucb), each with one row per discretization point
+        :raises NotImplementedError: if self.uncertainty is not "laplace"
+        """
+        xtest = S.return_discretization(n)
+        if self.data is None:
+            # No data: min_intensity for map and lcb, B for ucb, at every point.
+            zeros = 0 * xtest[:, 0].view(-1, 1)
+            lo = self.min_intensity
+            return lo + zeros, lo + zeros, self.B + zeros
+
+        if self.uncertainty != "laplace":
+            # W below exists only for the Laplace covariance; fail explicitly.
+            raise NotImplementedError("Only uncertainty='laplace' is supported.")
+
+        N = xtest.size()[0]
+        Phi = self.packing.embed(xtest)
+        map_rate = (Phi @ self.rate) ** 2
+
+        # Loop-invariant numpy arrays handed to the ellipse solvers.
+        W = self.construct_covariance_matrix_laplace().numpy()
+        theta = self.rate.view(-1).numpy()
+
+        ucb = torch.zeros(size=(N, 1)).double()
+        lcb = torch.zeros(size=(N, 1)).double()
+        for i in range(N):
+            x = Phi[i, :].numpy()
+            # Only the first element of each solver's result is kept.
+            ucb[i, 0], _ = maximize_quadratic_on_ellipse(x, W, theta, beta)
+            lcb[i, 0], _ = minimize_quadratic_on_ellipse(x, W, theta, beta)
+
+        return map_rate, lcb, ucb
 
 
 class LogisticGaussProcessRateEstimator(PermanentalProcessRateEstimator):
 
-	def penalized_likelihood(self, threads=4):
-		logistic = lambda x: np.log(1 + np.exp(x))
-		weights = self.weights.numpy()
-		nodes = self.nodes.numpy()
-
-		if self.observations is not None:
-			observations = self.observations.numpy()
-			loss = lambda theta: float(-np.sum(np.log(logistic(observations @ theta))) + np.sum(
-				weights * logistic(theta @ nodes.T)) + self.s * np.sum(theta ** 2))
-		else:
-			loss = lambda theta: float(np.sum(weights * logistic(theta @ nodes.T)) + self.s * np.sum(theta ** 2))
-
-		theta = np.random.randn(self.get_m())
-		res = minimize(loss, theta, jac=None, method='L-BFGS-B',
-					   options={'maxcor': 20, 'iprint': -1, 'maxfun': 150000, 'maxls': 50})
-		self.rate = torch.from_numpy(res.x)
-
-		return self.rate
-
-	def logistic(self, x):
-		return torch.log(1 + torch.exp(x))
-
-	def mean_rate(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return self.logistic(self.packing.embed(xtest) @ self.rate)
+    def penalized_likelihood(self, threads=4):
+        """Fit self.rate by L-BFGS-B from a random start; threads is unused."""
+        # logaddexp(0, x) equals log(1 + exp(x)) but does not overflow for large x.
+        logistic = lambda x: np.logaddexp(0, x)
+        weights = self.weights.numpy()
+        nodes = self.nodes.numpy()
+
+        if self.observations is not None:
+            observations = self.observations.numpy()
+            loss = lambda theta: float(
+                -np.sum(np.log(logistic(observations @ theta)))
+                + np.sum(weights * logistic(theta @ nodes.T))
+                + self.s * np.sum(theta**2)
+            )
+        else:
+            loss = lambda theta: float(
+                np.sum(weights * logistic(theta @ nodes.T)) + self.s * np.sum(theta**2)
+            )
+
+        # Random initial point, so the result depends on the numpy RNG state.
+        theta = np.random.randn(self.get_m())
+        solver_options = {"maxcor": 20, "iprint": -1, "maxfun": 150000, "maxls": 50}
+        res = minimize(
+            loss, theta, jac=None, method="L-BFGS-B", options=solver_options
+        )
+        self.rate = torch.from_numpy(res.x)
+
+        return self.rate
+
+    def logistic(self, x):
+        return torch.logaddexp(torch.zeros_like(x), x)  # stable log(1 + exp(x))
+
+    def mean_rate(self, S, n=128):
+        """Return self.logistic of the latent on S.return_discretization(n)."""
+        return self.logistic(self.packing.embed(S.return_discretization(n)) @ self.rate)
 
 
 class ExpGaussProcessRateEstimator(PermanentalProcessRateEstimator):
 
-	def penalized_likelihood(self, threads=4):
-		weights = self.weights.numpy()
-		nodes = self.nodes.numpy()
-
-		if self.observations is not None:
-			observations = self.observations.numpy()
-			loss = lambda theta: float(np.sum(observations @ theta) + np.sum(
-				weights * np.exp(-theta @ nodes.T)) + self.s * np.sum(theta ** 2))
-		else:
-			loss = lambda theta: float(np.sum(weights * np.exp(-theta @ nodes.T)) + self.s * np.sum(theta ** 2))
-
-		theta = np.zeros(self.get_m())
-		res = minimize(loss, theta, jac=None, method='L-BFGS-B', options={'maxcor': 20, 'iprint': -1,
-																		  'maxfun': 150000, 'maxls': 100,
-																		  'ftol': 1e-12, 'eps': 1e-12, 'gtol': 1e-8})
-		self.rate = torch.from_numpy(res.x)
-
-		return self.rate
-
-	def mean_rate(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return torch.exp(-self.packing.embed(xtest) @ self.rate)
+    def penalized_likelihood(self, threads=4):
+        """Fit self.rate by L-BFGS-B from a zero start; threads is unused.
+
+        The loss uses exp(-theta @ nodes.T) weighted by self.weights plus an
+        s-scaled squared-norm penalty, and a linear data term when observed.
+        """
+        weights_np = self.weights.numpy()
+        nodes_np = self.nodes.numpy()
+
+        if self.observations is None:
+            loss = lambda theta: float(
+                np.sum(weights_np * np.exp(-theta @ nodes_np.T))
+                + self.s * np.sum(theta**2)
+            )
+        else:
+            events = self.observations.numpy()
+            loss = lambda theta: float(
+                np.sum(events @ theta)
+                + np.sum(weights_np * np.exp(-theta @ nodes_np.T))
+                + self.s * np.sum(theta**2)
+            )
+
+        solver_options = {
+            "maxcor": 20,
+            "iprint": -1,
+            "maxfun": 150000,
+            "maxls": 100,
+            "ftol": 1e-12,
+            "eps": 1e-12,
+            "gtol": 1e-8,
+        }
+        start = np.zeros(self.get_m())
+        res = minimize(loss, start, jac=None, method="L-BFGS-B", options=solver_options)
+        self.rate = torch.from_numpy(res.x)
+        return self.rate
+
+    def mean_rate(self, S, n=128):
+        """Return exp(-embed(x) @ rate) for x = S.return_discretization(n)."""
+        return torch.exp(-self.packing.embed(S.return_discretization(n)) @ self.rate)
 
 
 if __name__ == "__main__":
-	torch.manual_seed(2)
-	np.random.seed(2)
-	d = 1
-	gamma = 0.1
-	n = 64
-	B = 4.
-	b = 0.1
-
-	process = PoissonPointProcess(d=1, B=B, b=b)
-	Sets = []
-	levels = 4
-	hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
-	Sets = hierarchical_structure.get_all_sets()
-
-	D = BorelSet(1, bounds=torch.Tensor([[-1., 1.]]).double())
-
-	m = 64
-	embedding = HermiteEmbedding(m=m, d=1, gamma=gamma)
-	k = KernelFunction(gamma=gamma)
-
-	estimator5 = PoissonRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d)
-
-	estimator4 = PermanentalProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d)
-	# estimator = PermanentalProcessRateEstimator(process, hierarchical_structure,
-	#											kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom", approx="ellipsoid")
-	# estimator = LogGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
-	estimator = LogGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B + 1, m=m, d=d,
-											 embedding=embedding)
-
-	# estimator = LogisticGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
-	estimator2 = LogisticGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d,
-												   embedding=embedding)
-	# estimator = ExpGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
-	estimator3 = ExpGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d,
-											  embedding=embedding)
-
-	estimators = [estimator, estimator2, estimator3, estimator4, estimator5]
-	names = ['sigmoid', 'logistic', 'exp', 'square', 'no-link']
-	bands = [True, False, False, False, True]
-
-	estimators = [estimator, estimator5, estimator4]
-	names = ['sigmoid', 'no-link', 'square']
-	bands = [False, False, False]
-
-	min_vol, max_vol = estimator.get_min_max()
-	dt = 10. / (b * min_vol)
-	dt = dt * 2
-
-	print("Suggested dt:", dt)
-	c = ['k', 'r', 'b', 'y', 'g', 'orange', 'brown', 'purple'] + ['k' for i in range(500)]
-
-	no_sets = len(Sets)
-
-	# no_samples = 3
-	# data = []
-	# samples = []
-	# repeats = 2
-	#
-	# for i in range(no_samples):
-	# 	j = np.random.randint(0, no_sets, 1)
-	# 	S = Sets[j[0]]
-	# 	for _ in range(repeats):
-	# 		sample = process.sample_discretized(S, dt)
-	# 		samples.append(sample)
-	# 		data.append((S, sample, dt))
-	#
-	# sample_D = process.sample_discretized(D, dt)
-	# samples.append(sample_D)
-	# no_samples = repeats * no_samples + 1
-	# data.append((D, sample_D, dt))
-
-	data_single = []
-	basic_sets = hierarchical_structure.get_sets_level(levels)
-	samples = []
-
-	for set in basic_sets:
-		sample = process.sample_discretized(set, dt)
-		data_single.append((set, sample, dt))
-		samples.append(sample)
-	data = data_single
-
-	# sample_D = torch.cat(samples)
-	# data = [(D,sample_D,dt)]
-
-	# data2 = []
-	# samples = []
-	# for set in basic_sets:
-	# 	sample = process.sample_discretized(set,dt*2)
-	# 	data2.append((set,sample,dt*2))
-	# 	samples.append(sample)
-	#
-	# sample_D_2 = torch.cat(samples)
-	# data = [(D, sample_D_2, dt*2)]
-	#
-	# data = data + data2
-
-	for estimator, name, band in zip(estimators, names, bands):
-		estimator.load_data(data)
-
-		xtest = D.return_discretization(n=n)
-
-		# likelihood based
-		estimator.fit_gp()
-		rate_mean = estimator.mean_rate(D, n=n)
-		p = plt.plot(xtest, rate_mean, label='likelihood: ' + name)
-
-		if band == True:
-			_, lcb, ucb = estimator.map_lcb_ucb(D, n, beta=2.)
-			plt.fill_between(xtest.numpy().flatten(), lcb.numpy().flatten(), ucb.numpy().flatten(), alpha=0.4,
-							 color=p[0].get_color(), label=name)
-
-	for j in range(len(samples)):
-		if samples[j] is not None:
-			plt.plot(samples[j], samples[j] * 0, 'o', color=c[j])
-
-	# for action in Sets:
-	# 	map, lcb, ucb = estimator.map_lcb_ucb_approx_action(action,beta=2.)
-	# 	x = np.linspace(action.bounds[0,0],action.bounds[0,1],2)
-	# 	plt.plot(x,x*0+float(ucb/action.volume()),'-o', color = "green")
-	process.visualize(D, samples=0, n=n, dt=1.)
-	plt.show()
+    torch.manual_seed(2)
+    np.random.seed(2)
+    d = 1
+    gamma = 0.1
+    n = 64
+    B = 4.0
+    b = 0.1
+
+    process = PoissonPointProcess(d=1, B=B, b=b)
+    Sets = []
+    levels = 4
+    hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
+    Sets = hierarchical_structure.get_all_sets()
+
+    D = BorelSet(1, bounds=torch.tensor([[-1.0, 1.0]]).double())
+
+    m = 64
+    embedding = HermiteEmbedding(m=m, d=1, gamma=gamma)
+    k = KernelFunction(gamma=gamma)
+
+    estimator5 = PoissonRateEstimator(
+        hierarchical_structure,
+        kernel=k,
+        max_intensity=B,
+        basis_size_per_dim=m,
+        d=d,
+    )
+
+    estimator4 = PermanentalProcessRateEstimator(
+        process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d
+    )
+    # estimator = PermanentalProcessRateEstimator(process, hierarchical_structure,
+    # 											kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom", approx="ellipsoid")
+    # estimator = LogGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
+    estimator = LogGaussProcessRateEstimator(
+        process,
+        hierarchical_structure,
+        kernel_object=k,
+        B=B + 1,
+        m=m,
+        d=d,
+        embedding=embedding,
+    )
+
+    # estimator = LogisticGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
+    estimator2 = LogisticGaussProcessRateEstimator(
+        process,
+        hierarchical_structure,
+        kernel_object=k,
+        B=B,
+        m=m,
+        d=d,
+        embedding=embedding,
+    )
+    # estimator = ExpGaussProcessRateEstimator(process, hierarchical_structure, kernel_object=k, B=B, m=m, d=d, embedding=embedding, basis = "custom")
+    estimator3 = ExpGaussProcessRateEstimator(
+        process,
+        hierarchical_structure,
+        kernel_object=k,
+        B=B,
+        m=m,
+        d=d,
+        embedding=embedding,
+    )
+
+    estimators = [estimator, estimator2, estimator3, estimator4, estimator5]
+    names = ["sigmoid", "logistic", "exp", "square", "no-link"]
+    bands = [True, False, False, False, True]
+
+    estimators = [estimator, estimator5, estimator4]
+    names = ["sigmoid", "no-link", "square"]
+    bands = [False, False, False]
+
+    min_vol, max_vol = estimator.get_min_max()
+    dt = 10.0 / (b * min_vol)
+    dt = dt * 2
+
+    print("Suggested dt:", dt)
+    c = ["k", "r", "b", "y", "g", "orange", "brown", "purple"] + [
+        "k" for i in range(500)
+    ]
+
+    no_sets = len(Sets)
+
+    # no_samples = 3
+    # data = []
+    # samples = []
+    # repeats = 2
+    #
+    # for i in range(no_samples):
+    # 	j = np.random.randint(0, no_sets, 1)
+    # 	S = Sets[j[0]]
+    # 	for _ in range(repeats):
+    # 		sample = process.sample_discretized(S, dt)
+    # 		samples.append(sample)
+    # 		data.append((S, sample, dt))
+    #
+    # sample_D = process.sample_discretized(D, dt)
+    # samples.append(sample_D)
+    # no_samples = repeats * no_samples + 1
+    # data.append((D, sample_D, dt))
+
+    data_single = []
+    basic_sets = hierarchical_structure.get_sets_level(levels)
+    samples = []
+
+    for basic_set in basic_sets:  # one discretized sample per set
+        sample = process.sample_discretized(basic_set, dt)
+        data_single.append((basic_set, sample, dt))
+        samples.append(sample)
+    data = data_single
+
+    # sample_D = torch.cat(samples)
+    # data = [(D,sample_D,dt)]
+
+    # data2 = []
+    # samples = []
+    # for set in basic_sets:
+    # 	sample = process.sample_discretized(set,dt*2)
+    # 	data2.append((set,sample,dt*2))
+    # 	samples.append(sample)
+    #
+    # sample_D_2 = torch.cat(samples)
+    # data = [(D, sample_D_2, dt*2)]
+    #
+    # data = data + data2
+
+    for estimator, name, band in zip(estimators, names, bands):
+        # Fit every estimator on the same data and plot its mean rate over D.
+        estimator.load_data(data)
+        xtest = D.return_discretization(n=n)
+
+        # likelihood based
+        estimator.fit_gp()
+        rate_mean = estimator.mean_rate(D, n=n)
+        lines = plt.plot(xtest, rate_mean, label="likelihood: " + name)
+
+        if not band:
+            continue
+        # Shade the confidence band in the colour of the mean curve.
+        _, lcb, ucb = estimator.map_lcb_ucb(D, n, beta=2.0)
+        grid = xtest.numpy().flatten()
+        lower = lcb.numpy().flatten()
+        upper = ucb.numpy().flatten()
+        plt.fill_between(
+            grid, lower, upper, alpha=0.4, color=lines[0].get_color(), label=name
+        )
+
+    for j, sample in enumerate(samples):
+        if sample is not None:
+            plt.plot(sample, sample * 0, "o", color=c[j])
+
+    # for action in Sets:
+    # 	map, lcb, ucb = estimator.map_lcb_ucb_approx_action(action,beta=2.)
+    # 	x = np.linspace(action.bounds[0,0],action.bounds[0,1],2)
+    # 	plt.plot(x,x*0+float(ucb/action.volume()),'-o', color = "green")
+    process.visualize(D, samples=0, n=n, dt=1.0)
+    plt.show()
diff --git a/stpy/point_processes/poisson/loglinear_estimator.py b/stpy/point_processes/poisson/loglinear_estimator.py
index 4956fb4..d9ba3de 100644
--- a/stpy/point_processes/poisson/loglinear_estimator.py
+++ b/stpy/point_processes/poisson/loglinear_estimator.py
@@ -12,169 +12,195 @@
 
 class LogLinearRateEstimator(PoissonRateEstimator):
 
-	def __init__(self, *args, **kwargs):
-		super().__init__(*args, **kwargs)
-
-	def least_squares_weighted(self, threads=0):
-		theta = cp.Variable(self.get_m())
-
-		mask = self.bucketized_counts.clone().numpy() > 0
-
-		observations = self.total_bucketized_obs[mask].clone().numpy()
-		phis = self.varphis[mask, :].clone().numpy()
-		tau = self.total_bucketized_time.clone().numpy()
-
-		variances = self.variances.view(-1).clone().numpy()
-
-		for i in range(variances.shape[0]):
-			if mask[i] > 0:
-				variances[i] = variances[i] * tau[i] * self.variance_correction(variances[i] * tau[i])
-
-		selected_variances = variances[mask]
-		print(np.log(observations))
-		print(selected_variances)
-		objective = cp.Minimize(
-			cp.sum_squares((phis @ theta) - np.log(observations) / tau[mask]))  # + self.s * cp.norm2(theta))
-
-		prob = cp.Problem(objective)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=True,
-				   mosek_params={mosek.iparam.num_threads: threads})
-
-		self.rate = torch.from_numpy(theta.value)
-		print(self.rate)
-		return self.rate
-
-	def mean_var_reg_set(self, S, dt=1., beta=2.):
-		if self.approx_fit == False:
-			self.W = self.construct_covariance_matrix_regression()
-			self.approx_fit = True
-
-		map = 0
-		lcb = 0
-		ucb = 0
-		for set in self.basic_sets:
-			if S.inside(set):
-				x = self.packing.integral(set).view(-1, 1)
-				lcb = lcb + torch.exp(dt * (x @ self.rate - beta * np.sqrt(x.T @ self.W_inv @ x)))
-				ucb = ucb + torch.exp(dt * (x @ self.rate + beta * np.sqrt(x.T @ self.W_inv @ x)))
-				map = map + torch.exp(dt * x @ self.rate)
-		return map, ucb, lcb
-
-	def fit_ellipsoid_approx(self):
-		self.W = self.construct_covariance_matrix_regression()
-		self.W_inv = torch.pinverse(self.W)
-
-	# def map_lcb_ucb_approx_action(self, S, dt=1., beta=2.):
-	# 	phi = self.packing.integral(S) * dt
-	# 	map = phi @ self.rate
-	# 	ucb = map + beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
-	# 	ucb = np.minimum(ucb, self.B * S.volume() * dt)
-	#
-	# 	lcb = map - beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
-	# 	lcb = np.maximum(lcb, self.b * S.volume() * dt)
-	# 	return map, lcb, ucb
-
-	def construct_covariance_matrix_regression(self):
-
-		W = torch.zeros(size=(self.get_m(), self.get_m())).double()
-
-		if self.data is not None:
-			variances = self.variances
-
-			if self.feedback == "count-record":
-				mask = self.bucketized_counts > 0
-				tau = self.total_bucketized_time
-				for index_o, o in enumerate(self.bucketized_obs):
-					n = mask[index_o]
-					if n > 0:
-						A = self.varphis[index_o, :].view(-1, 1) @ self.varphis[index_o, :].view(1, -1) * tau[index_o]
-						W = W + A / (variances[index_o])
-
-			elif self.feedback == "histogram":
-
-				for datapoint in self.data:
-					(S, obs, dt) = datapoint
-					varphi = self.packing.integral(S) * dt
-					variance = varphi @ self.rate
-					variance = variance
-					A = varphi.view(-1, 1) @ varphi.view(1, -1)
-					W = W + A / variance
-
-		return W + torch.eye(self.get_m()).double() * self.s
-
-	def mean_set(self, S, dt=1.):
-		mu = 0
-		for set in self.basic_sets:
-			if S.inside(set):
-				mu = mu + torch.exp(dt * self.packing.integral(set) @ self.rate)
-		return mu
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)  # no extra state: defer to PoissonRateEstimator
+
+    def least_squares_weighted(self, threads=0):
+        theta = cp.Variable(self.get_m())
+        # Least-squares fit of phis @ theta to log(observations) / tau via MOSEK.
+        mask = self.bucketized_counts.clone().numpy() > 0
+        # Restrict observations and features to buckets with a positive count.
+        observations = self.total_bucketized_obs[mask].clone().numpy()
+        phis = self.varphis[mask, :].clone().numpy()
+        tau = self.total_bucketized_time.clone().numpy()
+
+        variances = self.variances.view(-1).clone().numpy()
+        # Rescale the (copied) variances of the masked buckets in place.
+        for i in range(variances.shape[0]):
+            if mask[i] > 0:
+                variances[i] = (
+                    variances[i]
+                    * tau[i]
+                    * self.variance_correction(variances[i] * tau[i])
+                )
+        # NOTE(review): selected_variances is only printed, never used in the fit.
+        selected_variances = variances[mask]
+        print(np.log(observations))
+        print(selected_variances)
+        objective = cp.Minimize(
+            cp.sum_squares((phis @ theta) - np.log(observations) / tau[mask])
+        )  # + self.s * cp.norm2(theta))
+
+        prob = cp.Problem(objective)
+        # threads is forwarded to MOSEK as its num_threads parameter.
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=True,
+            mosek_params={mosek.iparam.num_threads: threads},
+        )
+        # Store the solution as a torch tensor and return it.
+        self.rate = torch.from_numpy(theta.value)
+        print(self.rate)
+        return self.rate
+
+    def mean_var_reg_set(self, S, dt=1.0, beta=2.0):
+        """Return (map, ucb, lcb) summed over the basic sets b with S.inside(b)."""
+        if not self.approx_fit:
+            # Refresh W together with W_inv, which the bounds below read.
+            self.fit_ellipsoid_approx()
+            self.approx_fit = True
+
+        map_sum, lcb, ucb = 0, 0, 0
+        for basic_set in self.basic_sets:
+            if not S.inside(basic_set):
+                continue
+            x = self.packing.integral(basic_set).view(-1, 1)
+            mean = x @ self.rate
+            # Half-width of the confidence band in the exponent.
+            width = beta * np.sqrt(x.T @ self.W_inv @ x)
+            lcb = lcb + torch.exp(dt * (mean - width))
+            ucb = ucb + torch.exp(dt * (mean + width))
+            map_sum = map_sum + torch.exp(dt * x @ self.rate)
+        return map_sum, ucb, lcb
+
+    def fit_ellipsoid_approx(self):
+        self.W = self.construct_covariance_matrix_regression()  # includes the s * I term
+        self.W_inv = torch.pinverse(self.W)  # pseudo-inverse read by mean_var_reg_set
+
+    # def map_lcb_ucb_approx_action(self, S, dt=1., beta=2.):
+    # 	phi = self.packing.integral(S) * dt
+    # 	map = phi @ self.rate
+    # 	ucb = map + beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
+    # 	ucb = np.minimum(ucb, self.B * S.volume() * dt)
+    #
+    # 	lcb = map - beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
+    # 	lcb = np.maximum(lcb, self.b * S.volume() * dt)
+    # 	return map, lcb, ucb
+
+    def construct_covariance_matrix_regression(self):
+        """Accumulate weighted feature outer products and add self.s * identity.
+
+        Returns just self.s * identity when self.data is None.
+        """
+        m = self.get_m()
+        W = torch.zeros(size=(m, m)).double()
+
+        if self.data is not None:
+            variances = self.variances
+
+            if self.feedback == "count-record":
+                has_counts = self.bucketized_counts > 0
+                tau = self.total_bucketized_time
+                # Only buckets with a positive count contribute.
+                for idx, _ in enumerate(self.bucketized_obs):
+                    if has_counts[idx] > 0:
+                        row = self.varphis[idx, :]
+                        outer = row.view(-1, 1) @ row.view(1, -1) * tau[idx]
+                        W = W + outer / (variances[idx])
+
+            elif self.feedback == "histogram":
+                # Each datapoint unpacks as (S, obs, dt); obs is not used here.
+                # The divisor is the current prediction varphi @ self.rate.
+                for region, _, duration in self.data:
+                    varphi = self.packing.integral(region) * duration
+                    variance = varphi @ self.rate
+                    outer = varphi.view(-1, 1) @ varphi.view(1, -1)
+                    W = W + outer / variance
+
+        return W + torch.eye(m).double() * self.s
+
+    def mean_set(self, S, dt=1.0):
+        """Sum exp(dt * integral(b) @ rate) over basic sets b with S.inside(b)."""
+        inside = (b for b in self.basic_sets if S.inside(b))
+        return sum(
+            torch.exp(dt * self.packing.integral(b) @ self.rate) for b in inside
+        )
 
 
 if __name__ == "__main__":
-	torch.manual_seed(2)
-	np.random.seed(2)
-	d = 1
-	gamma = 0.1
-	n = 64
-	B = 4.
-	b = 0.1
-
-	process = PoissonPointProcess(d=1, B=B, b=b)
-	Sets = []
-	levels = 5
-	hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
-	Sets = hierarchical_structure.get_all_sets()
-
-	D = BorelSet(1, bounds=torch.Tensor([[-1., 1.]]).double())
-
-	m = 128
-	k = KernelFunction(gamma=gamma)
-	estimator = LogLinearRateEstimator(process, hierarchical_structure,
-									   kernel_object=k, B=B, m=m, d=d, estimator='least-sq')
-
-	min_vol, max_vol = estimator.get_min_max()
-
-	dt = 1. / (b * min_vol)
-	dt = dt * 2
-
-	print("Suggested dt:", dt)
-	c = ['k', 'r', 'b', 'y', 'g', 'orange', 'brown', 'purple'] + ['k' for i in range(500)]
-
-	no_sets = len(Sets)
-	no_samples = 0
-	data = []
-	samples = []
-	repeats = 2
-
-	for i in range(no_samples):
-		j = np.random.randint(0, no_sets, 1)
-		S = Sets[j[0]]
-		for _ in range(repeats):
-			sample = process.sample_discretized(S, dt)
-			samples.append(sample)
-			data.append((S, sample, dt))
-
-	sample_D = process.sample_discretized(D, dt)
-	samples.append(sample_D)
-	no_samples = repeats * no_samples + 1
-	data.append((D, sample_D, dt))
-
-	estimator.load_data(data)
-
-	xtest = D.return_discretization(n=n)
-
-	# likelihood based
-	estimator.fit_gp()
-
-	for set in estimator.basic_sets:
-		x = np.linspace(set.bounds[0, 0], set.bounds[0, 1], 2)
-		val = estimator.mean_set(set)
-		plt.plot(x, x * 0 + float(val), 'b-o')
-		vol = process.rate_volume(set)
-		plt.plot(x, x * 0 + float(vol), '-o', color='orange')
-	for j in range(no_samples):
-		if samples[j] is not None:
-			plt.plot(samples[j], samples[j] * 0, 'o', color=c[j])
-
-	process.visualize(D, samples=0, n=n, dt=1.)
+    torch.manual_seed(2)
+    np.random.seed(2)
+    d = 1
+    gamma = 0.1
+    n = 64
+    B = 4.0
+    b = 0.1
+
+    process = PoissonPointProcess(d=1, B=B, b=b)
+    Sets = []
+    levels = 5
+    hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
+    Sets = hierarchical_structure.get_all_sets()
+
+    D = BorelSet(1, bounds=torch.tensor([[-1.0, 1.0]]).double())
+
+    m = 128
+    k = KernelFunction(gamma=gamma)
+    estimator = LogLinearRateEstimator(
+        process,
+        hierarchical_structure,
+        kernel_object=k,
+        B=B,
+        m=m,
+        d=d,
+        estimator="least-sq",
+    )
+
+    min_vol, max_vol = estimator.get_min_max()
+
+    dt = 1.0 / (b * min_vol)
+    dt = dt * 2
+
+    print("Suggested dt:", dt)
+    c = ["k", "r", "b", "y", "g", "orange", "brown", "purple"] + [
+        "k" for i in range(500)
+    ]
+
+    no_sets = len(Sets)
+    no_samples = 0
+    data = []
+    samples = []
+    repeats = 2
+
+    for i in range(no_samples):
+        j = np.random.randint(0, no_sets, 1)
+        S = Sets[j[0]]
+        for _ in range(repeats):
+            sample = process.sample_discretized(S, dt)
+            samples.append(sample)
+            data.append((S, sample, dt))
+
+    sample_D = process.sample_discretized(D, dt)
+    samples.append(sample_D)
+    no_samples = repeats * no_samples + 1
+    data.append((D, sample_D, dt))
+
+    estimator.load_data(data)
+
+    xtest = D.return_discretization(n=n)
+
+    # likelihood based
+    estimator.fit_gp()
+
+    for basic_set in estimator.basic_sets:
+        x = np.linspace(basic_set.bounds[0, 0], basic_set.bounds[0, 1], 2)
+        val = estimator.mean_set(basic_set)
+        plt.plot(x, x * 0 + float(val), "b-o")  # estimator.mean_set of the set
+        vol = process.rate_volume(basic_set)
+        plt.plot(x, x * 0 + float(vol), "-o", color="orange")  # process.rate_volume
+    for j in range(no_samples):
+        if samples[j] is not None:
+            plt.plot(samples[j], samples[j] * 0, "o", color=c[j])
+
+    process.visualize(D, samples=0, n=n, dt=1.0)
diff --git a/stpy/point_processes/poisson/mbr_positive_estimator.py b/stpy/point_processes/poisson/mbr_positive_estimator.py
index 5924d22..8993cb7 100644
--- a/stpy/point_processes/poisson/mbr_positive_estimator.py
+++ b/stpy/point_processes/poisson/mbr_positive_estimator.py
@@ -8,355 +8,436 @@
 from stpy.embeddings.embedding import HermiteEmbedding
 from stpy.kernels import KernelFunction
 from stpy.point_processes.poisson import PoissonPointProcess
-from stpy.point_processes.poisson.link_fun_rate_estimator import PermanentalProcessRateEstimator
+from stpy.point_processes.poisson.link_fun_rate_estimator import (
+    PermanentalProcessRateEstimator,
+)
 
 
 class MBRPositiveEstimator(PermanentalProcessRateEstimator):
 
-	def __init__(self, *args, **kwargs):
-		super().__init__(*args, **kwargs)
-
-		if self.feedback == "count-record":
-			self.varLambdas_vec = torch.zeros(
-				size=(self.varLambdas.size()[0], self.varLambdas.size()[1] * self.varLambdas.size()[2])).double()
-			for i in range(self.varLambdas.size()[0]):
-				self.varLambdas_vec[i, :] = self.varLambdas[i, :, :].reshape(-1)
-
-		self.approx_solver = True
-
-	def fit_gp(self, threads=4):
-		if self.data is not None:
-			super().fit_gp(threads=threads)
-		else:
-			self.rate = None
-
-	def mean_rate(self, S, n=128):
-		xtest = S.return_discretization(n)
-		emb = self.packing.embed(xtest)
-		mu = torch.einsum('ij,jk,ik->i', emb, self.rate, emb).view(-1, 1)
-		return mu
-
-	def rate_value(self, x, dt=1):
-		emb = self.packing.embed(x) * dt
-		mu = torch.einsum('ij,jk,ik->i', emb, self.rate, emb).view(-1, 1)
-		return mu
-
-	def mean_set(self, S, dt=1.):
-		if self.data is not None:
-			emb = self.product_integral(S) * dt
-			mu = torch.trace(emb @ self.rate).view(1, 1)
-		else:
-			mu = self.b * S.volume()
-		return mu
-
-	def penalized_likelihood(self, threads=4):
-		sumLambda = self.sumLambda.numpy()
-		Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
-
-		if self.observations is not None:
-			observations = self.observations.numpy()
-			# cost = cp.sum_squares(cp.diag(emb @ A @ emb.T) - y.view(-1).numpy()) / (self.s ** 2) + (self.lam) * cp.norm(A, "fro")
-			objective = -cp.sum(cp.log(observations @ Theta @ observations.T)) + \
-						cp.trace(sumLambda @ Theta) + self.s * cp.sum_squares(cp.vec(Theta))
-		else:
-			objective = cp.trace(sumLambda @ Theta) + self.s * cp.sum_squares(cp.vec(Theta))
-
-		# if self.get_m() == 2:
-		# 	# use Lorentz-cone special result
-		# 	constraints = [cp.SOC(Theta[0,0]+Theta[1,1],Theta[1,1]    )]
-		# else:
-		# 	constraints = [Theta >> 0]
-		constraints = []
-		prob = cp.Problem(cp.Minimize(objective), constraints)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: threads,
-								 mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-								 mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
-								 mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
-								 mosek.dparam.intpnt_co_tol_rel_gap: 1e-3})
-		self.rate = torch.from_numpy(Theta.value)
-		return self.rate
-
-	def penalized_likelihood_bins(self, threads=4):
-		Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
-
-		mask = self.bucketized_counts.clone().numpy() > 0
-		observations = self.total_bucketized_obs[mask].clone().numpy()
-		tau = self.total_bucketized_time[mask].clone().numpy()
-		varLambdas_vec = self.varLambdas_vec[mask, :].clone().numpy()
-
-		objective = -cp.sum(observations @ cp.log(cp.multiply(tau, varLambdas_vec @ cp.vec(Theta)))) + \
-					cp.sum(cp.multiply(tau, varLambdas_vec @ cp.vec(Theta))) + self.s * cp.sum_squares(cp.vec(Theta))
-
-		constraints = [Theta >> 0]
-		prob = cp.Problem(cp.Minimize(objective), constraints)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: threads,
-								 mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-								 mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
-								 mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
-								 mosek.dparam.intpnt_co_tol_rel_gap: 1e-3})
-		self.rate = torch.from_numpy(Theta.value)
-		return self.rate
-
-	def least_squares_weighted(self, threads=4):
-
-		if self.approx_fit == False:
-			self.bucketization()
-
-		Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
-
-		mask = self.bucketized_counts.clone().numpy() > 0
-		observations = self.total_bucketized_obs[mask].clone().numpy()
-		tau = self.total_bucketized_time.clone().numpy()
-
-		# varsumLambdas
-		varLambdas_vec = self.varLambdas_vec[mask, :].clone().numpy()
-
-		variances = self.variances.view(-1).clone().numpy()
-
-		for i in range(variances.shape[0]):
-			if mask[i] > 0:
-				variances[i] = variances[i] * tau[i] * self.variance_correction(variances[i] * tau[i])
-
-		selected_variances = variances[mask]
-
-		objective = cp.sum_squares((varLambdas_vec @ cp.vec(Theta) +
-									- observations) / np.sqrt(selected_variances)) + self.s * cp.sum_squares(
-			cp.vec(Theta)) / 2
-		constraints = [Theta >> 0]
-		prob = cp.Problem(cp.Minimize(objective), constraints)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: threads,
-								 mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-								 mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
-								 mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
-								 mosek.dparam.intpnt_co_tol_rel_gap: 1e-3})
-
-		self.rate = torch.from_numpy(Theta.value)
-		return self.rate
-
-	def construct_covariance_matrix(self):
-		if self.estimator == "bins":
-			self.construct_covariance_matrix_bins()
-		elif self.estimator == "least-sq":
-			self.construct_covariance_matrix_regression()
-		else:
-			raise NotImplementedError("Covariance not implemented")
-
-	def construct_covariance_matrix_regression(self):
-		varLambdas = self.varLambdas_vec.clone()
-		variances = self.variances
-		mask = self.bucketized_counts > 0
-		tau = self.total_bucketized_time
-		W = torch.zeros(size=(self.get_m() ** 2, self.get_m() ** 2)).double()
-		I = torch.eye(self.get_m() ** 2).double()
-		W_inv = self.s * torch.eye(self.get_m() ** 2).double()
-
-		for index_o, o in enumerate(self.bucketized_obs):
-			n = mask[index_o]
-			if n > 0:
-				k = self.variance_correction(tau[index_o] * variances[index_o])
-				v = tau[index_o] / (variances[index_o] * k)
-
-				vec = varLambdas[index_o, :].view(-1, 1)
-				A = vec @ vec.T
-				W = W + A * v
-				denom = 1. + v * vec.T @ W_inv @ vec
-				W_inv = W_inv @ (I - v * vec @ (vec.T @ W_inv) / denom)
-
-		self.W = W + self.s * torch.eye(self.get_m() ** 2).double()
-		self.W_inv = W_inv
-		# self.W_cholesky = torch.cholesky(self.W, upper=True)
-		return self.W
-
-	def construct_covariance_matrix_bins(self):
-		self.construct_covariance_matrix_regression()
-
-	def mean_var_reg_set(self, S, dt=1., beta=2., lcb_compute=False):
-
-		if self.data is None:
-			return S.volume() * self.b, S.volume() * self.B, S.volume() * self.b
-
-		if self.approx_fit == False:
-			self.W = self.construct_covariance_matrix()
-			self.approx_fit = True
-
-		map = None
-		lcb = None
-
-		if self.approx_solver == True:
-			ucb = self.band_no_opt(S, beta=beta, dt=dt, maximization=True)
-			if lcb_compute == True:
-				lcb = self.band_no_opt(S, beta=beta, dt=dt, maximization=False)
-		else:
-			ucb = self.band(S, beta=beta, dt=dt, maximization=True)
-			if lcb_compute == True:
-				lcb = self.band(S, beta=beta, dt=dt, maximization=False)
-
-		return map, ucb, lcb
-
-	def mean_var_bins_set(self, S, dt=1., beta=2., lcb_compute=False):
-		return self.mean_var_reg_set(S, dt=dt, beta=beta, lcb_compute=lcb_compute)
-
-	def band(self, S, beta=2., dt=1., maximization=True):
-		emb = self.product_integral(S) * dt
-		A = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
-		cost = cp.trace(A @ emb)
-		Z = self.W_cholesky.clone()
-		zero = np.zeros(self.get_m() ** 2)
-		constraints = [cp.SOC(zero.T @ cp.vec(A) + self.s * beta ** 2, Z @ (cp.vec(A) - cp.vec(self.rate.numpy())))]
-		constraints += [A >> 0]
-
-		if maximization == True:
-			prob = cp.Problem(cp.Maximize(cost), constraints)
-		else:
-			prob = cp.Problem(cp.Minimize(cost), constraints)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: 4,
-								 mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-								 mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
-								 mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
-								 mosek.dparam.intpnt_co_tol_rel_gap: 1e-3})
-		ucb = torch.trace(torch.from_numpy(A.value) @ emb)
-		return ucb
-
-	def band_no_opt(self, S, beta=2., dt=1., maximization=True):
-
-		if self.rate is None:
-			if maximization == True:
-				return S.volume() * dt * self.B
-			else:
-				return S.volume() * dt * self.b
-		else:
-			emb = self.product_integral(S)
-			cost = torch.trace(self.rate @ emb)
-			if maximization == True:
-				out = cost + beta * emb.view(1, -1) @ self.W_inv @ emb.view(-1, 1)
-			else:
-				out = np.maximum(cost - beta * emb.view(1, -1) @ self.W_inv @ emb.view(-1, 1), 0.)
-			return out * dt
-
-	def gap(self, S, actions, w, dt, beta=2.):
-		"""
-		Estimates the gap of an action S,
-		:param S:
-		:param dt:
-		:return:
-		"""
-
-		if self.data is None:
-			return (self.B - self.b) * S.volume() / w(S)
-
-		if self.ucb_identified == False:
-			print("Recomputing UCB.....")
-			self.ucb_identified = True
-			self.max_ucb = -1000
-			self.ucb_action = None
-			for action in actions:
-				_, ucb, __ = self.mean_var_reg_set(action, dt=dt, beta=self.beta(0))
-				ucb = ucb / w(action)
-				if ucb > self.max_ucb:
-					self.max_ucb = ucb
-				self.ucb_action = action
-		map, ucb, lcb = self.mean_var_reg_set(S, dt=dt, beta=self.beta(0), lcb_compute=True)
-		gap = w(S) * self.max_ucb - lcb
-		return gap
-
-	def information(self, S, dt, precomputed=None):
-
-		if self.data is None:
-			return 1.
-
-		if self.W is None:
-			self.construct_covariance_matrix()
-
-		if self.feedback == "count-record":
-			varphi_UCB = self.product_integral(self.ucb_action).view(1, -1) * dt
-
-			ind = []
-			for index, set in enumerate(self.basic_sets):
-				if S.inside(set):
-					ind.append(index)
-			Upsilon = self.varLambdas_vec[ind, :] * dt
-
-			I = torch.eye(Upsilon.size()[0]).double()
-			G = self.W_inv - self.W_inv @ Upsilon.T @ torch.inverse(I + Upsilon @ Upsilon.T) @ Upsilon @ self.W_inv
-			return 10e-4 + torch.logdet(varphi_UCB @ self.W_inv @ varphi_UCB.T) - torch.logdet(
-				varphi_UCB @ G @ varphi_UCB.T)
-
-		elif self.feedback == "histogram":
-			raise NotImplementedError("Not implemented.")
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if self.feedback == "count-record":
+            self.varLambdas_vec = torch.zeros(
+                size=(
+                    self.varLambdas.size()[0],
+                    self.varLambdas.size()[1] * self.varLambdas.size()[2],
+                )
+            ).double()
+            for i in range(self.varLambdas.size()[0]):
+                self.varLambdas_vec[i, :] = self.varLambdas[i, :, :].reshape(-1)
+
+        self.approx_solver = True
+
+    def fit_gp(self, threads=4):
+        if self.data is not None:
+            super().fit_gp(threads=threads)
+        else:
+            self.rate = None
+
+    def mean_rate(self, S, n=128):
+        xtest = S.return_discretization(n)
+        emb = self.packing.embed(xtest)
+        mu = torch.einsum("ij,jk,ik->i", emb, self.rate, emb).view(-1, 1)
+        return mu
+
+    def rate_value(self, x, dt=1):
+        """Rate phi(x)^T Theta phi(x) at each row of x, scaled linearly by dt."""
+        emb = self.packing.embed(x)  # dt is applied once, below (not squared)
+        return torch.einsum("ij,jk,ik->i", emb, self.rate, emb).view(-1, 1) * dt
+
+    def mean_set(self, S, dt=1.0):
+        if self.data is not None:
+            emb = self.product_integral(S) * dt
+            mu = torch.trace(emb @ self.rate).view(1, 1)
+        else:
+            mu = self.min_intensity * S.volume()
+        return mu
+
+    def penalized_likelihood(self, threads=4):
+        sumLambda = self.sumLambda.numpy()
+        Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
+
+        if self.observations is not None:
+            observations = self.observations.numpy()
+            # cost = cp.sum_squares(cp.diag(emb @ A @ emb.T) - y.view(-1).numpy()) / (self.s ** 2) + (self.lam) * cp.norm(A, "fro")
+            objective = (
+                -cp.sum(cp.log(observations @ Theta @ observations.T))
+                + cp.trace(sumLambda @ Theta)
+                + self.s * cp.sum_squares(cp.vec(Theta))
+            )
+        else:
+            objective = cp.trace(sumLambda @ Theta) + self.s * cp.sum_squares(
+                cp.vec(Theta)
+            )
+
+        # if self.get_m() == 2:
+        # 	# use Lorentz-cone special result
+        # 	constraints = [cp.SOC(Theta[0,0]+Theta[1,1],Theta[1,1]    )]
+        # else:
+        # 	constraints = [Theta >> 0]
+        constraints = []
+        prob = cp.Problem(cp.Minimize(objective), constraints)
+
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-3,
+            },
+        )
+        self.rate = torch.from_numpy(Theta.value)
+        return self.rate
+
+    def penalized_likelihood_bins(self, threads=4):
+        Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
+
+        mask = self.bucketized_counts.clone().numpy() > 0
+        observations = self.total_bucketized_obs[mask].clone().numpy()
+        tau = self.total_bucketized_time[mask].clone().numpy()
+        varLambdas_vec = self.varLambdas_vec[mask, :].clone().numpy()
+
+        objective = (
+            -cp.sum(
+                observations @ cp.log(cp.multiply(tau, varLambdas_vec @ cp.vec(Theta)))
+            )
+            + cp.sum(cp.multiply(tau, varLambdas_vec @ cp.vec(Theta)))
+            + self.s * cp.sum_squares(cp.vec(Theta))
+        )
+
+        constraints = [Theta >> 0]
+        prob = cp.Problem(cp.Minimize(objective), constraints)
+
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-3,
+            },
+        )
+        self.rate = torch.from_numpy(Theta.value)
+        return self.rate
+
+    def least_squares_weighted(self, threads=4):
+
+        if self.approx_fit == False:
+            self.bucketization()
+
+        Theta = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
+
+        mask = self.bucketized_counts.clone().numpy() > 0
+        observations = self.total_bucketized_obs[mask].clone().numpy()
+        tau = self.total_bucketized_time.clone().numpy()
+
+        # varsumLambdas
+        varLambdas_vec = self.varLambdas_vec[mask, :].clone().numpy()
+
+        variances = self.variances.view(-1).clone().numpy()
+
+        for i in range(variances.shape[0]):
+            if mask[i] > 0:
+                variances[i] = (
+                    variances[i]
+                    * tau[i]
+                    * self.variance_correction(variances[i] * tau[i])
+                )
+
+        selected_variances = variances[mask]
+
+        objective = (
+            cp.sum_squares(
+                (varLambdas_vec @ cp.vec(Theta) + -observations)
+                / np.sqrt(selected_variances)
+            )
+            + self.s * cp.sum_squares(cp.vec(Theta)) / 2
+        )
+        constraints = [Theta >> 0]
+        prob = cp.Problem(cp.Minimize(objective), constraints)
+
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-3,
+            },
+        )
+
+        self.rate = torch.from_numpy(Theta.value)
+        return self.rate
+
+    def construct_covariance_matrix(self):
+        """Build the covariance for self.estimator and return the helper's result."""
+        if self.estimator == "bins":
+            return self.construct_covariance_matrix_bins()
+        if self.estimator == "least-sq":
+            return self.construct_covariance_matrix_regression()
+        raise NotImplementedError("Covariance not implemented")
+
+    def construct_covariance_matrix_regression(self):
+        """Accumulate W = s*I + sum_i v_i * vec_i vec_i^T and its inverse; return W."""
+        varLambdas = self.varLambdas_vec.clone()
+        variances = self.variances
+        mask = self.bucketized_counts > 0
+        tau = self.total_bucketized_time
+        W = torch.zeros(size=(self.get_m() ** 2, self.get_m() ** 2)).double()
+        I = torch.eye(self.get_m() ** 2).double()
+        # W ends as sum + s*I, so the Sherman-Morrison inverse starts at (s*I)^-1 = I/s.
+        W_inv = I / self.s
+
+        for index_o, _ in enumerate(self.bucketized_obs):
+            if mask[index_o] > 0:
+                k = self.variance_correction(tau[index_o] * variances[index_o])
+                v = tau[index_o] / (variances[index_o] * k)
+
+                vec = varLambdas[index_o, :].view(-1, 1)
+                W = W + (vec @ vec.T) * v
+                denom = 1.0 + v * vec.T @ W_inv @ vec
+                W_inv = W_inv @ (I - v * vec @ (vec.T @ W_inv) / denom)
+
+        self.W = W + self.s * I
+        self.W_inv = W_inv
+        # NOTE: self.W_cholesky (read by band) is not computed here.
+        return self.W
+
+    def construct_covariance_matrix_bins(self):
+        return self.construct_covariance_matrix_regression()  # hand W back to the caller
+
+    def mean_var_reg_set(self, S, dt=1.0, beta=2.0, lcb_compute=False):
+        """Return (map, ucb, lcb) for S; map is always None once data is loaded."""
+        if self.data is None:
+            return (
+                S.volume() * self.min_intensity,
+                S.volume() * self.B,
+                S.volume() * self.min_intensity,
+            )
+
+        if self.approx_fit == False:
+            # The regression builder stores self.W itself; do not overwrite it here.
+            self.construct_covariance_matrix()
+            self.approx_fit = True
+
+        map_estimate = None
+        lcb = None
+
+        if self.approx_solver == True:
+            ucb = self.band_no_opt(S, beta=beta, dt=dt, maximization=True)
+            if lcb_compute == True:
+                lcb = self.band_no_opt(S, beta=beta, dt=dt, maximization=False)
+        else:
+            ucb = self.band(S, beta=beta, dt=dt, maximization=True)
+            if lcb_compute == True:
+                lcb = self.band(S, beta=beta, dt=dt, maximization=False)
+        return map_estimate, ucb, lcb
+
+    def mean_var_bins_set(self, S, dt=1.0, beta=2.0, lcb_compute=False):
+        return self.mean_var_reg_set(S, dt=dt, beta=beta, lcb_compute=lcb_compute)
+
+    def band(self, S, beta=2.0, dt=1.0, maximization=True):
+        emb = self.product_integral(S) * dt
+        A = cp.Variable((self.get_m(), self.get_m()), symmetric=True)
+        cost = cp.trace(A @ emb)
+        Z = self.W_cholesky.clone()
+        zero = np.zeros(self.get_m() ** 2)
+        constraints = [
+            cp.SOC(
+                zero.T @ cp.vec(A) + self.s * beta**2,
+                Z @ (cp.vec(A) - cp.vec(self.rate.numpy())),
+            )
+        ]
+        constraints += [A >> 0]
+
+        if maximization == True:
+            prob = cp.Problem(cp.Maximize(cost), constraints)
+        else:
+            prob = cp.Problem(cp.Minimize(cost), constraints)
+
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: 4,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-3,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-3,
+            },
+        )
+        ucb = torch.trace(torch.from_numpy(A.value) @ emb)
+        return ucb
+
+    def band_no_opt(self, S, beta=2.0, dt=1.0, maximization=True):
+        """Closed-form upper (maximization=True) or lower band over S, times dt."""
+        if self.rate is None:
+            # No fitted rate: fall back to the intensity bounds times the volume of S.
+            bound = self.B if maximization == True else self.min_intensity
+            return S.volume() * dt * bound
+
+        emb = self.product_integral(S)
+        cost = torch.trace(self.rate @ emb)
+        width = beta * emb.view(1, -1) @ self.W_inv @ emb.view(-1, 1)
+        if maximization == True:
+            out = cost + width
+        else:
+            # torch.clamp floors the lower band at 0 without a round trip through numpy.
+            out = torch.clamp(cost - width, min=0.0)
+
+        return out * dt
+
+    def gap(self, S, actions, w, dt, beta=2.0):
+        """
+        Estimates the gap of an action S as w(S) * max weighted UCB - LCB(S).
+        :param S: action whose gap is returned (actions: candidates, w: weights)
+        :param dt: forwarded to mean_var_reg_set; beta is unused (self.beta(0) is)
+        :return: w(S) * self.max_ucb - lcb
+        """
+
+        if self.data is None:
+            return (self.B - self.min_intensity) * S.volume() / w(S)
+
+        if self.ucb_identified == False:
+            print("Recomputing UCB.....")
+            self.ucb_identified = True
+            self.max_ucb = -1000
+            self.ucb_action = None
+            for action in actions:
+                _, ucb, __ = self.mean_var_reg_set(action, dt=dt, beta=self.beta(0))
+                ucb = ucb / w(action)
+                if ucb > self.max_ucb:
+                    self.max_ucb = ucb
+                    self.ucb_action = action  # track the maximizer, not the last action
+        _, _, lcb = self.mean_var_reg_set(
+            S, dt=dt, beta=self.beta(0), lcb_compute=True
+        )
+        gap = w(S) * self.max_ucb - lcb
+        return gap
+
+    def information(self, S, dt, precomputed=None):
+
+        if self.data is None:
+            return 1.0
+
+        if self.W is None:
+            self.construct_covariance_matrix()
+
+        if self.feedback == "count-record":
+            varphi_UCB = self.product_integral(self.ucb_action).view(1, -1) * dt
+
+            ind = []
+            for index, set in enumerate(self.basic_sets):
+                if S.inside(set):
+                    ind.append(index)
+            Upsilon = self.varLambdas_vec[ind, :] * dt
+
+            I = torch.eye(Upsilon.size()[0]).double()
+            G = (
+                self.W_inv
+                - self.W_inv
+                @ Upsilon.T
+                @ torch.inverse(I + Upsilon @ Upsilon.T)
+                @ Upsilon
+                @ self.W_inv
+            )
+            return (
+                10e-4
+                + torch.logdet(varphi_UCB @ self.W_inv @ varphi_UCB.T)
+                - torch.logdet(varphi_UCB @ G @ varphi_UCB.T)
+            )
+
+        elif self.feedback == "histogram":
+            raise NotImplementedError("Not implemented.")
 
 
 if __name__ == "__main__":
-	torch.manual_seed(2)
-	np.random.seed(2)
-	d = 1
-	gamma = 0.2
-	n = 64
-	B = 4.
-	b = 0.5
-
-	process = PoissonPointProcess(d=1, B=B, b=b)
-	Sets = []
-	levels = 3
-	hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
-	Sets = hierarchical_structure.get_all_sets()
-
-	D = BorelSet(1, bounds=torch.Tensor([[-1., 1.]]).double())
-
-	m = 32
-	embedding = HermiteEmbedding(m=m, d=1, gamma=gamma)
-	k = KernelFunction(gamma=gamma)
-	estimator = MBRPositiveEstimator(process, hierarchical_structure, kernel_object=k,
-									 B=B, m=m, d=d, embedding=embedding, basis="custom")
-	min_vol, max_vol = estimator.get_min_max()
-
-	dt = 10. / (b * min_vol)
-	dt = dt * 2
-
-	print("Suggested dt:", dt)
-	c = ['k', 'r', 'b', 'y', 'g', 'orange', 'brown', 'purple'] + ['k' for i in range(500)]
-
-	no_sets = len(Sets)
-	no_samples = 0
-	data = []
-	samples = []
-	repeats = 2
-
-	for i in range(no_samples):
-		j = np.random.randint(0, no_sets, 1)
-		S = Sets[j[0]]
-		for _ in range(repeats):
-			sample = process.sample_discretized(S, dt)
-			samples.append(sample)
-			data.append((S, sample, dt))
-
-	sample_D = process.sample_discretized(D, dt)
-	samples.append(sample_D)
-	no_samples = repeats * no_samples + 1
-	data.append((D, sample_D, dt))
-
-	estimator.load_data(data)
-
-	xtest = D.return_discretization(n=n)
-
-	# likelihood based
-	estimator.penalized_likelihood()
-	rate_mean = estimator.mean_rate(D, n=n)
-
-	# _, lcb, ucb = estimator.map_lcb_ucb(D, n, beta=2.)
-
-	for j in range(no_samples):
-		if samples[j] is not None:
-			plt.plot(samples[j], samples[j] * 0, 'o', color=c[j])
-
-	plt.plot(xtest, rate_mean, label='likelihood - locations known')
-	# plt.fill_between(xtest.numpy().flatten(), lcb.numpy().flatten(), ucb.numpy().flatten(), alpha=0.4,
-	#				 color='blue', label='triangle')
-	process.visualize(D, samples=0, n=n, dt=1.)
+    torch.manual_seed(2)
+    np.random.seed(2)
+    d = 1
+    gamma = 0.2
+    n = 64
+    B = 4.0
+    b = 0.5
+
+    process = PoissonPointProcess(d=1, B=B, b=b)
+    Sets = []
+    levels = 3
+    hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
+    Sets = hierarchical_structure.get_all_sets()
+
+    D = BorelSet(1, bounds=torch.tensor([[-1.0, 1.0]]).double())
+
+    m = 32
+    embedding = HermiteEmbedding(m=m, d=1, gamma=gamma)
+    k = KernelFunction(gamma=gamma)
+    estimator = MBRPositiveEstimator(
+        process,
+        hierarchical_structure,
+        kernel_object=k,
+        B=B,
+        m=m,
+        d=d,
+        embedding=embedding,
+        basis="custom",
+    )
+    min_vol, max_vol = estimator.get_min_max()
+
+    dt = 10.0 / (b * min_vol)
+    dt = dt * 2
+
+    print("Suggested dt:", dt)
+    c = ["k", "r", "b", "y", "g", "orange", "brown", "purple"] + [
+        "k" for i in range(500)
+    ]
+
+    no_sets = len(Sets)
+    no_samples = 0
+    data = []
+    samples = []
+    repeats = 2
+
+    for i in range(no_samples):
+        j = np.random.randint(0, no_sets, 1)
+        S = Sets[j[0]]
+        for _ in range(repeats):
+            sample = process.sample_discretized(S, dt)
+            samples.append(sample)
+            data.append((S, sample, dt))
+
+    sample_D = process.sample_discretized(D, dt)
+    samples.append(sample_D)
+    no_samples = repeats * no_samples + 1
+    data.append((D, sample_D, dt))
+
+    estimator.load_data(data)
+
+    xtest = D.return_discretization(n=n)
+
+    # likelihood based
+    estimator.penalized_likelihood()
+    rate_mean = estimator.mean_rate(D, n=n)
+
+    # _, lcb, ucb = estimator.map_lcb_ucb(D, n, beta=2.)
+
+    for j in range(no_samples):
+        if samples[j] is not None:
+            plt.plot(samples[j], samples[j] * 0, "o", color=c[j])
+
+    plt.plot(xtest, rate_mean, label="likelihood - locations known")
+    # plt.fill_between(xtest.numpy().flatten(), lcb.numpy().flatten(), ucb.numpy().flatten(), alpha=0.4,
+    # 				 color='blue', label='triangle')
+    process.visualize(D, samples=0, n=n, dt=1.0)
diff --git a/stpy/point_processes/poisson/poisson.py b/stpy/point_processes/poisson/poisson.py
index 4228b1f..524f0c0 100644
--- a/stpy/point_processes/poisson/poisson.py
+++ b/stpy/point_processes/poisson/poisson.py
@@ -5,152 +5,199 @@
 from stpy.borel_set import BorelSet
 
 
-class PoissonPointProcess():
-	"""
-	parametrized by log linear model
-
-	"""
-
-	def __init__(self, d=1, B=1, b=0.2, rate=None, rate_volume=None):
-		self.B = B
-		self.d = d
-		self.b = b
-		if rate is None:
-			self.rate = self.rate_default
-		else:
-			self.rate = rate
-
-		self.rate_volume_f = rate_volume
-		self.exact = True
-
-	def rate_default(self, x, dt=1.):
-		return (self.B * torch.sum(torch.exp(-(x + 1)) * torch.sin(2 * x * np.pi) ** 2, dim=1).view(-1,
-																									1) + self.b) * dt
-
-	def rate_volume(self, S, dt=1, rate=None):
-		if self.rate_volume_f is None:
-			# integrate rate numerically over S
-			import scipy.integrate as integrate
-			if rate is None:
-				rate = self.rate
-			else:
-				rate = rate
-			integral = 0
-			if self.d == 1:
-				# integrate = S.volume()* self.rate(torch.from_numpy(S.bounds[0,1]).view(1))
-				integral, _ = integrate.quad(lambda x: rate(torch.Tensor([x]).view(1, 1)).numpy(),
-											 float(S.bounds[0, 0]), float(S.bounds[0, 1]))
-			elif self.d == 2:
-				integrand = lambda x, y: rate(torch.Tensor([x, y]).view(1, 2).double()).numpy()
-				integral, _ = integrate.dblquad(integrand, float(S.bounds[0, 0]), float(S.bounds[0, 1]),
-												lambda x: float(S.bounds[1, 0]), lambda x: float(S.bounds[1, 1]))
-
-			return integral * dt
-		else:
-			return self.rate_volume_f(S) * dt
-
-	def sample_discretized(self, S, dt, n=100):
-		lam = np.maximum(float(self.rate_volume(S, dt)), 0)
-		count = np.random.poisson(lam=lam)
-		if count > 0:
-			x = S.return_discretization(n)
-			r = self.rate(x) * dt
-			r = torch.maximum(r, r * 0)
-			sample = torch.from_numpy(
-				np.random.choice(np.arange(0, x.size()[0], 1), size=count, p=(r / torch.sum(r)).numpy().reshape(-1)))
-			return x[sample, :]
-		else:
-			return None
-
-	def sample_discretized_direct(self, x, val):
-		lam = 1000
-		count = np.random.poisson(lam=np.maximum(0, lam))
-		if count > 0:
-			val = torch.abs(val)
-			sample = torch.from_numpy(np.random.choice(np.arange(0, x.size()[0], 1),
-													   size=count, p=(val / torch.sum(val)).numpy().reshape(-1)))
-			return x[sample, :]
-		else:
-			return None
-
-	def sample(self, S, dt=1., verbose=False, rate=None):
-		"""
-
-		:param S: set where it should be sampled
-		:return:
-		"""
-		if self.exact == True:
-			return self.sample_discretized(S, dt=dt)
-		else:
-
-			lam = self.rate_volume(S, dt)
-			n = np.random.poisson(lam=lam)
-			print("Number of events:", n)
-			alpha = 1.
-
-			new_sample = []
-			size = 0
-			while size < n:
-				# uniform sample g(s) = 1/vol(S)
-				sample = S.uniform_sample(1)
-				t = self.rate(sample) / (alpha)
-				p = np.random.uniform(0, 1)
-				if p < t:
-					new_sample.append(sample.view(1, -1))
-					size = size + 1
-
-			if len(new_sample) > 1:
-				x = torch.cat(new_sample, dim=0)
-			else:
-				return None
-			return x
-
-	def rate_sets(self, Sets, dt=1):
-		res = []
-		for S in Sets:
-			res.append(self.rate_volume(S, dt=dt))
-		return res
-
-	def visualize(self, S, samples=2, n=10, dt=1., show=True):
-		xtest = S.return_discretization(n)
-		rate = self.rate(xtest)
-
-		if self.d == 1:
-			plt.plot(xtest, rate, label='rate', lw=3)
-			for i in range(samples):
-
-				x = self.sample(S, dt=dt)
-				if x is not None:
-					n = x.size()[0]
-					plt.plot(x, x * 0, 'o', label='sample n=' + str(n))
-
-		elif self.d == 2:
-			from scipy.interpolate import griddata
-			xx = xtest[:, 0].detach().numpy()
-			yy = xtest[:, 1].detach().numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), rate[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-			fig, ax = plt.subplots(figsize=(15, 7))
-			cs = ax.contourf(grid_x, grid_y, grid_z_mu, label='rate')
-			ax.contour(cs, colors='k')
-
-			for i in range(samples):
-				x = self.sample(S, dt=dt)
-				if x is not None:
-					ax.plot(x[:, 0].detach().numpy(), x[:, 1].detach().numpy(), 'o', ms=10, alpha=0.5, label='sample')
-			ax.grid(c='k', ls='-', alpha=0.1)
-			plt.colorbar(cs)
-
-		plt.legend()
-		if show == True:
-			plt.show()
+class PoissonPointProcess:
+    """
+    parametrized by log linear model
+
+    """
+
+    def __init__(
+        self, d=1.0, B=1.0, b=0.2, rate=None, rate_volume=None, naive_integral=False
+    ):
+        self.B = B  # multiplies the sum term in rate_default
+        self.d = d  # dimension; rate_volume integrates numerically only for d == 1 or d == 2
+        self.b = b  # constant offset added in rate_default
+        if rate is None:
+            self.rate = self.rate_default
+        else:
+            self.rate = rate
+
+        self.rate_volume_f = rate_volume  # optional integral of the rate, called as f(S)
+        self.exact = True  # while True, sample() delegates to sample_discretized
+        self.naive_integral = naive_integral  # True: sample_discretized uses a grid sum as the Poisson mean
+
+    def rate_default(self, x, dt=1.0):
+        """Return (B * sum_j exp(-(x_j + 1)) * sin(2*pi*x_j)**2 + b) * dt as a column."""
+        decay = torch.exp(-(x + 1))
+        oscillation = torch.sin(2 * x * np.pi) ** 2
+        # sum over dim 1 of x, then reshape to one value per row
+        summed = torch.sum(decay * oscillation, dim=1).view(-1, 1)
+        scaled = self.B * summed + self.b
+        return scaled * dt
+
+    def rate_volume(self, S, dt=1, rate=None):
+        """Integral of the rate over S times dt: rate_volume_f(S) if given, else quadrature (d = 1 or 2)."""
+        if self.rate_volume_f is None:
+            # integrate rate numerically over S
+            import scipy.integrate as integrate
+
+            rate = self.rate if rate is None else rate
+            integral = 0  # stays 0 when self.d is neither 1 nor 2
+            if self.d == 1:
+                # float(...) hands QUADPACK a plain scalar instead of a (1, 1) array
+                integral, _ = integrate.quad(
+                    lambda x: float(rate(torch.tensor([x]).view(1, 1))),
+                    float(S.bounds[0, 0]),
+                    float(S.bounds[0, 1]),
+                )
+            elif self.d == 2:
+                # dblquad calls func(y, x): x spans bounds[0] (outer limits), y spans
+                # bounds[1] (inner limits), so the point handed to rate is (x, y)
+                integrand = lambda y, x: float(
+                    rate(torch.tensor([x, y]).view(1, 2).double())
+                )
+                integral, _ = integrate.dblquad(
+                    integrand,
+                    float(S.bounds[0, 0]),
+                    float(S.bounds[0, 1]),
+                    lambda x: float(S.bounds[1, 0]),
+                    lambda x: float(S.bounds[1, 1]),
+                )
+
+            return integral * dt
+        else:
+            return self.rate_volume_f(S) * dt
+
+    def sample_discretized(self, S: BorelSet, dt, n=100):
+        """Draw a Poisson count for S, then pick that many grid points with p ~ rate; None if the count is 0."""
+        x = S.return_discretization(n).to(device=torch.get_default_device())
+        r = self.rate(x) * dt
+        if self.naive_integral:
+            total_area = 1.0
+            for bound in S.bounds:
+                total_area *= bound[1] - bound[0]
+            lam = r.sum() * (total_area / len(x))  # grid-sum estimate of the integral
+        else:
+            lam = self.rate_volume(S, dt)  # quadrature or user-supplied integral
+        count = np.random.poisson(lam=max(float(lam), 0.0))  # float() accepts tensor or scalar; clamp at 0
+        if count > 0:
+            r = torch.maximum(r, r * 0)
+            sample = torch.from_numpy(
+                np.random.choice(
+                    np.arange(0, x.size()[0], 1),
+                    size=count,
+                    p=(r / torch.sum(r)).cpu().numpy().reshape(-1),
+                )
+            )
+            return x[sample, :]
+        return None
+
+    def sample_discretized_direct(self, x, val):
+        lam = 1000  # fixed Poisson mean; not derived from val
+        count = np.random.poisson(lam=np.maximum(0, lam))
+        if count > 0:
+            val = torch.abs(val)  # weights are made non-negative before normalizing
+            sample = torch.from_numpy(
+                np.random.choice(
+                    np.arange(0, x.size()[0], 1),
+                    size=count,
+                    p=(val / torch.sum(val)).cpu().numpy().reshape(-1),  # .cpu() as in sample_discretized
+                )
+            )
+            return x[sample, :]  # rows of x at the drawn indices (repeats possible)
+        else:
+            return None
+
+    def sample(self, S, dt=1.0, verbose=False, rate=None):
+        """
+        Sample event locations on S (grid-based when self.exact, else rejection).
+        :param S: set where it should be sampled
+        :return: tensor with one row per event, or None when no event is drawn
+        """
+        if self.exact == True:
+            return self.sample_discretized(S, dt=dt)
+        else:
+            # rejection sampling; `verbose` and `rate` are currently unused
+            lam = self.rate_volume(S, dt)
+            n = np.random.poisson(lam=lam)
+            print("Number of events:", n)
+            alpha = 1.0
+
+            new_sample = []
+            size = 0
+            while size < n:
+                # uniform sample g(s) = 1/vol(S)
+                sample = S.uniform_sample(1)
+                t = self.rate(sample) / (alpha)
+                p = np.random.uniform(0, 1)
+                if p < t:
+                    new_sample.append(sample.view(1, -1))
+                    size = size + 1
+
+            if len(new_sample) > 0:  # a single accepted point is still a valid sample
+                x = torch.cat(new_sample, dim=0)
+            else:
+                return None
+            return x
+
+    def rate_sets(self, Sets, dt=1):
+        # Integrated rate (via rate_volume) for every set, in input order.
+        return [
+            self.rate_volume(region, dt=dt) for region in Sets
+        ]
+
+    def visualize(self, S, samples=2, n=10, dt=1.0, show=True):  # plot the rate on S plus `samples` draws
+        xtest = S.return_discretization(n)  # evaluation grid on S
+        rate = self.rate(xtest)  # evaluated without dt; dt only affects the draws below
+
+        if self.d == 1:
+            plt.plot(xtest, rate, label="rate", lw=3)
+            for i in range(samples):
+                # each iteration draws a fresh sample via self.sample
+                x = self.sample(S, dt=dt)
+                if x is not None:
+                    n = x.size()[0]  # rebinds the parameter n; used only for the legend label
+                    plt.plot(x, x * 0, "o", label="sample n=" + str(n))  # events drawn at height 0
+
+        elif self.d == 2:
+            from scipy.interpolate import griddata
+
+            xx = xtest[:, 0].detach().numpy()
+            yy = xtest[:, 1].detach().numpy()
+            grid_x, grid_y = np.mgrid[  # 100j: 100 points per axis (np.mgrid complex-step form)
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+            grid_z_mu = griddata(  # linear interpolation of the rate onto the regular grid
+                (xx, yy), rate[:, 0].detach().numpy(), (grid_x, grid_y), method="linear"
+            )
+            fig, ax = plt.subplots(figsize=(15, 7))
+            cs = ax.contourf(grid_x, grid_y, grid_z_mu, label="rate")
+            ax.contour(cs, colors="k")  # black contour lines over the filled levels
+
+            for i in range(samples):
+                x = self.sample(S, dt=dt)
+                if x is not None:
+                    ax.plot(
+                        x[:, 0].detach().numpy(),
+                        x[:, 1].detach().numpy(),
+                        "o",
+                        ms=10,
+                        alpha=0.5,
+                        label="sample",
+                    )
+            ax.grid(c="k", ls="-", alpha=0.1)
+            plt.colorbar(cs)
+
+        plt.legend()
+        if show == True:  # plt.show() is skipped for any other value of show
+            plt.show()
 
 
 if __name__ == "__main__":
-	d = 2
-	n = 100
-	bounds = torch.Tensor([[-1, 1], [-1, 1]]).double()
-	D = BorelSet(d, bounds)
+    d = 2
+    n = 100  # passed to visualize as the discretization argument
+    bounds = torch.tensor([[-1, 1], [-1, 1]]).double()  # one [lower, upper] row per dimension, as read by rate_volume
+    D = BorelSet(d, bounds)
 
-	process = PoissonPointProcess(d=d, B=2)
-	process.visualize(D, samples=10, n=n, dt=10)
+    process = PoissonPointProcess(d=d, B=2)  # no rate given, so rate_default is used
+    process.visualize(D, samples=10, n=n, dt=10)  # plots the rate and up to 10 sampled point sets
diff --git a/stpy/point_processes/poisson_rate_estimator.py b/stpy/point_processes/poisson_rate_estimator.py
index d91658e..51ae795 100644
--- a/stpy/point_processes/poisson_rate_estimator.py
+++ b/stpy/point_processes/poisson_rate_estimator.py
@@ -1,15 +1,27 @@
+import os
+from typing import Optional
 import cvxpy as cp
 import mosek
 import numpy as np
 import scipy
+from stpy.borel_set import BorelSet, HierarchicalBorelSets
+from stpy.embeddings.embedding import Embedding
+from stpy.kernels import KernelFunction
 import torch
 from autograd_minimize import minimize
 from quadprog import solve_qp
 from torchmin import minimize as minimize_torch
 
-from stpy.embeddings.bernstein_embedding import BernsteinEmbedding, BernsteinSplinesEmbedding, \
-	BernsteinSplinesOverlapping
-from stpy.embeddings.bump_bases import PositiveNystromEmbeddingBump, TriangleEmbedding, FaberSchauderEmbedding
+from stpy.embeddings.bernstein_embedding import (
+    BernsteinEmbedding,
+    BernsteinSplinesEmbedding,
+    BernsteinSplinesOverlapping,
+)
+from stpy.embeddings.bump_bases import (
+    PositiveNystromEmbeddingBump,
+    TriangleEmbedding,
+    FaberSchauderEmbedding,
+)
 from stpy.embeddings.optimal_positive_basis import OptimalPositiveBasis
 from stpy.helpers.ellipsoid_algorithms import maximize_on_elliptical_slice
 from stpy.point_processes.rate_estimator import RateEstimator
@@ -17,1947 +29,2578 @@
 
 class PoissonRateEstimator(RateEstimator):
 
-	def __init__(self, process, hierarchy, d=1, m=100, kernel_object=None, B=1., s=1., jitter=10e-8, b=0.,
-				 basis='triangle', estimator='likelihood', feedback='count-record', offset=0.1, uncertainty='laplace',
-				 approx=None, stepsize=None, embedding=None, beta=2., sampling='proximal+prox', peeking=True,
-				 constraints=True, var_cor_on=True,
-				 samples_nystrom=15000, inverted_constraint=False, steps=None, dual=True, no_anchor_points=1024, U=1.,
-				 opt='torch'):
-
-		self.process = process
-		self.d = d
-		self.s = s
-		self.b = b
-		self.B = B
-		self.U = U
-		self.stepsize = stepsize
-		self.sampling = sampling
-		self.steps = steps
-		self.opt = opt
-		self.kernel_object = kernel_object
-		# set hierarchy
-		self.constraints = constraints
-		self.hierarchy = hierarchy
-		self.ucb_identified = False
-		self.inverted_constraint = inverted_constraint
-		# approximation
-		self.loglikelihood = 0.
-		self.dual = dual
-		self.peeking = peeking
-		self.no_anchor_points = no_anchor_points
-		if beta < 0.:
-			self.beta = lambda t: self.beta_theory()
-		else:
-			self.beta = lambda t: beta
-		self.var_cor_on = var_cor_on
-
-		if basis == 'triangle':
-			self.packing = TriangleEmbedding(d, m, kernel_object=kernel_object, B=B, b=b, offset=offset,
-											 s=np.sqrt(jitter))
-		elif basis == 'bernstein':
-			self.packing = BernsteinEmbedding(d, m, kernel_object=kernel_object, B=B, b=b, offset=offset,
-											  s=np.sqrt(jitter))
-		elif basis == 'splines':
-			self.packing = BernsteinSplinesEmbedding(d, m, kernel_object=kernel_object, B=B, b=b, offset=offset,
-													 s=np.sqrt(jitter))
-		elif basis == 'nystrom':
-			self.packing = PositiveNystromEmbeddingBump(d, m, kernel_object=kernel_object, B=B, b=b, offset=offset,
-														s=np.sqrt(jitter), samples=samples_nystrom)
-		elif basis == 'overlap-splines':
-			self.packing = BernsteinSplinesOverlapping(d, m, kernel_object=kernel_object, B=B, b=b, offset=offset,
-													   s=np.sqrt(jitter))
-		elif basis == 'faber':
-			self.packing = FaberSchauderEmbedding(d, m, kernel_object=kernel_object, B=B, b=b, offset=offset,
-												  s=np.sqrt(jitter))
-		elif basis == "optimal-positive":
-			self.packing = OptimalPositiveBasis(d, m, kernel_object=kernel_object, B=B, b=b, offset=offset,
-												s=np.sqrt(jitter), samples=samples_nystrom)
-		elif basis == "custom":
-			self.packing = embedding
-		else:
-			raise NotImplementedError("The request positive basis is not implemented.")
-		self.m = m
-		self.data = None
-		self.covariance = False
-
-		# stabilizing the matrix inversion
-		self.jitter = jitter
-
-		# for variance stabilization
-		self.stabilization = None
-		self.approx_fit = False
-
-		# properties of rate estimator
-		self.estimator = estimator
-		self.feedback = feedback
-		self.uncertainty = uncertainty
-		self.approx = approx
-
-		# precompute information
-		self.basic_sets = self.hierarchy.get_sets_level(self.hierarchy.levels)
-
-		self.varphis = torch.zeros(size=(len(self.basic_sets), self.get_m())).double()
-		self.variances = torch.ones(size=(len(self.basic_sets), 1)).double().view(-1)
-		self.variances_histogram = []
-		self.observations = None
-		self.rate = None
-		self.W = (s) * torch.eye(self.get_m()).double()
-		self.W_inv_approx = (1. / s) * torch.eye(self.get_m()).double()
-		self.beta_value = 2.
-		self.sampled_theta = None
-
-		if self.dual == True:
-			if self.d == 1:
-				anchor = no_anchor_points
-				self.anchor_points = self.hierarchy.top_node.return_discretization(anchor)
-				self.anchor_weights = torch.zeros(size=(anchor, 1)).double().view(-1)
-			elif self.d == 2:
-				anchor = no_anchor_points
-				self.anchor_points = self.hierarchy.top_node.return_discretization(int(np.sqrt(anchor)))
-				self.anchor_weights = torch.zeros(size=(anchor, 1)).double().view(-1)
-			self.global_dt = 0.
-			self.anchor_points_emb = self.packing.embed(self.anchor_points)
-
-		if feedback == "count-record" and basis != "custom":
-			print("Precomputing phis.")
-			for index_set, set in enumerate(self.basic_sets):
-				self.varphis[index_set, :] = self.packing.integral(set)
-				self.variances[index_set] = set.volume() * self.B
-		else:
-			pass
-
-		print("Precomputation finished.")
-
-	def add_data_point(self, new_data, times=True):
-
-		super().add_data_point(new_data, times=times)
-
-		if self.rate is not None:
-			rate = self.rate
-		else:
-			l, _, u = self.get_constraints()
-			Gamma_half = self.cov()
-			rate = Gamma_half @ u
-
-		if self.feedback == 'histogram':
-			val = self.packing.integral(new_data[0]) @ rate * new_data[2]
-			v = - np.log(val) + val
-
-		elif self.feedback == 'count-record':
-			v = self.packing.integral(new_data[0]) @ rate * new_data[2]
-			if new_data[1] is not None:
-				val2 = self.packing.embed(new_data[1]) @ rate * new_data[2]
-				v = v - torch.sum(np.log(val2))
-
-		self.loglikelihood += v
-
-	def beta_theory(self):
-		if self.approx_fit == False:
-			l, Lambda, u = self.get_constraints()
-			Gamma_half, invGamma_half = self.cov(inverse=True)
-
-			## norm
-			norm = self.s
-
-			## constraints
-			eps = 10e-3
-			res = Gamma_half @ self.rate.view(-1, 1) - torch.from_numpy(l).view(-1, 1)
-			xi = res.clone()
-			xi[res > eps] = 0.
-
-			constraint = xi.T @ Gamma_half @ self.W_inv_approx @ Gamma_half.T @ xi
-
-			## concentration
-			vol = 4 * np.log(1. / 0.1) + torch.logdet(self.W) - self.get_m() * np.log(self.s)
-			self.beta_value = np.sqrt(norm + vol + constraint)
-			print('-------------------')
-			print("New beta:", self.beta_value)
-			print("norm:", norm)
-			print("constraint:", constraint)
-			print("vol:", vol)
-			print("-------------------")
-		else:
-			pass
-		return self.beta_value
-
-	def get_constraints(self):
-		return self.packing.get_constraints()
-
-	def cov(self, inverse=False):
-		return self.packing.cov(inverse=inverse)
-
-	def fit_gp(self, threads=4):
-
-		if self.data is not None:
-			if self.feedback == "count-record":
-
-				if self.estimator == "likelihood":
-					if self.opt == 'cvxpy':
-						self.penalized_likelihood(threads=threads)
-					elif self.opt == 'torch':
-						self.penalized_likelihood_fast(threads=threads)
-					else:
-						raise NotImplementedError("The optimization method does not exist")
-
-				elif self.estimator == "least-sq":
-					self.least_squares_weighted()
-
-				elif self.estimator == "bins":
-					self.penalized_likelihood_bins()
-
-				else:
-					raise AssertionError("wrong name.")
-
-
-			elif self.feedback == 'histogram':
-
-				if self.estimator == "likelihood":
-					self.penalized_likelihood_integral()
-
-				elif self.estimator == "least-sq":
-					self.least_squares_weighted_integral()
-
-				elif self.estimator == "bins":
-					self.penalized_likelihood_integral_bins()
-
-				else:
-					raise AssertionError("wrong name.")
-			else:
-				raise AssertionError("wrong name.")
-		else:
-			l, Lambda, u = self.get_constraints()
-			Gamma_half = self.cov()
-			self.rate = l
-
-	def sample_mirror_langevin(self, steps=500, verbose=False):
-
-		l, Lambda, u = self.get_constraints()
-		Gamma_half, invGamma_half = self.cov(inverse=True)
-
-		v = torch.from_numpy((u + l) / 2.).view(-1, 1)
-		S = torch.diag(torch.from_numpy(u - l).view(-1) / 2.).double()
-
-		phis = self.phis.clone() @ invGamma_half
-
-		if self.observations is not None:
-			obs = self.observations @ invGamma_half
-		else:
-			obs = None
-
-		invGamma = invGamma_half.T @ invGamma_half
-		transform = lambda y: S @ torch.tanh(y) + v
-
-		if self.feedback == "count-record" and self.dual == False:
-			if obs is not None:
-				func = lambda y: -torch.sum(torch.log(obs @ transform(y)).view(-1)) \
-								 + torch.sum(phis @ transform(y)) \
-								 + self.s * transform(y).T @ invGamma @ transform(y) + torch.sum(
-					torch.log(1. / (1. - transform(y) ** 2)))
-			else:
-				func = lambda y: torch.sum(phis @ transform(y)) \
-								 + self.s * transform(y).T @ invGamma @ transform(y) + torch.sum(
-					torch.log(1. / (1. - transform(y) ** 2)))  # torch.sum(torch.log(0.5*(1.+torch.cosh(2*y))))
-
-
-		elif self.feedback == "count-record" and self.dual == True:
-			mask = self.bucketized_counts > 0
-			phis = self.varphis[mask, :] @ invGamma_half
-			tau = self.total_bucketized_time[mask]
-
-			if obs is not None:
-				obs = self.anchor_points_emb @ invGamma_half
-				weights = self.anchor_weights
-				mask = weights > 0.
-
-				func = lambda y: -torch.sum(weights[mask].view(-1, 1) * torch.log(obs[mask, :] @ transform(y))) \
-								 + torch.sum(tau.view(-1, 1) * (phis @ transform(y))) \
-								 + self.s * transform(y).T @ invGamma @ transform(y) + torch.sum(
-					torch.log(1. / (1. - (transform(y) ** 2))))  # + torch.sum(torch.log(0.5*(1.+torch.cosh(2*y))))
-			else:
-				func = lambda y: torch.sum(tau.view(-1, 1) * (phis @ transform(y))) \
-								 + self.s * transform(y).T @ invGamma @ transform(y) + torch.sum(
-					torch.log(1. / (1. - transform(y) ** 2)))  # + torch.sum(torch.log(0.5*(1.+torch.cosh(2*y))))
-
-		elif self.feedback == "histogram":
-			func = lambda y: - torch.sum(
-				self.counts.clone().view(-1) * torch.log(phis @ (S @ torch.tanh(y) + v)).view(-1)) \
-							 + torch.sum(phis @ (S @ torch.tanh(y) + v)) \
-							 + self.s * (S @ torch.tanh(y) + v).T @ invGamma @ (S @ torch.tanh(y) + v)
-
-		y = torch.rand(size=(self.get_m(), 1), dtype=torch.float64, requires_grad=True)
-
-		# initiallize with map sqeezed more
-		y.data = Gamma_half @ self.rate.view(-1, 1)  # u < theta < l
-
-		u_new = u + 0.01
-		l_new = l - 0.01
-		v2 = torch.from_numpy((u_new + l_new) / 2.).view(-1, 1)
-		S2 = torch.diag(torch.from_numpy(u_new - l_new).view(-1) / 2.).double()
-		#
-		y.data = torch.inverse(S2) @ (y.data - v2)
-		y.data = torch.atanh(y.data)
-
-		W = S.T @ invGamma_half.T @ self.construct_covariance_matrix_laplace() @ invGamma_half @ S
-		L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-8))
-		eta = 0.05 / (L + 1)
-
-		print("Eta:", eta)
-
-		for k in range(steps):
-
-			w = torch.randn(size=(self.get_m(), 1)).double()
-			nabla_y = torch.autograd.functional.jacobian(func, y).data[0, 0, :, :]
-			y.data = y.data - eta * nabla_y + np.sqrt(2 * eta) * w
-			theta = torch.tanh(y).detach()
-
-			if verbose == True:
-				print("Iter:", k, (S @ theta + v).T)
-				print(y.T)
-
-		self.sampled_theta = invGamma_half @ transform(y.data)
-
-	def sample_projected_langevin(self, steps=300, verbose=False, stepsize=None):
-		"""
-		:param burn_in:
-		:return:
-		"""
-
-		Gamma_half = self.packing.cov()
-
-		def prox(x):
-			z = x.numpy()
-			theta = cp.Variable((self.get_m(), 1))
-			objective = cp.Minimize(cp.sum_squares(z - theta))
-			constraints = []
-			l, Lambda, u = self.get_constraints()
-			Lambda = Lambda @ Gamma_half.numpy()
-			constraints.append(Lambda @ theta >= l.reshape(-1, 1))
-			prob = cp.Problem(objective, constraints)
-			prob.solve(solver=cp.OSQP, warm_start=False, verbose=False, eps_abs=1e-3, eps_rel=1e-3)
-			return torch.from_numpy(theta.value)
-
-		if self.feedback == "count-record" and self.dual == False:
-			if self.observations is not None:
-				nabla = lambda y: -torch.einsum('i,ij->j', 1. / (self.observations @ y).view(-1),
-												self.observations).view(-1, 1) + \
-								  torch.sum(self.phis, dim=0).view(-1, 1) \
-								  + self.s * y.view(-1, 1)
-			else:
-				nabla = lambda theta: torch.sum(self.phis, dim=0).view(-1, 1) + self.s * theta.view(-1, 1)
-
-		elif self.feedback == "count-record" and self.dual == True:
-			mask = self.bucketized_counts > 0
-			phis = self.varphis[mask, :]
-			tau = self.total_bucketized_time[mask]
-
-			if self.observations is not None:
-				obs = self.anchor_points_emb
-				weights = self.anchor_weights
-				mask = weights > 0.
-				nabla = lambda y: -torch.einsum('i,ij->j', weights[mask] / ((obs[mask, :] @ y).view(-1)),
-												obs[mask]).view(-1, 1) + \
-								  torch.einsum('i,ij->j', tau, phis).view(-1, 1) \
-								  + self.s * y.view(-1, 1)
-			else:
-				nabla = lambda y: torch.einsum('i,ij->j', tau, phis).view(-1, 1) \
-								  + self.s * y.view(-1, 1)
-
-
-		elif self.feedback == "histogram":
-			nabla = lambda theta: -torch.sum(torch.diag((1. / (self.phis @ theta).view(-1)) * self.counts) @ self.phis,
-											 dim=0).view(-1, 1) \
-								  + torch.sum(self.phis, dim=0).view(-1, 1) + self.s * theta.view(-1, 1)
-
-		theta = self.rate.view(-1, 1)
-		W = self.construct_covariance_matrix_laplace(minimal=True)
-		L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-5))
-
-		if stepsize is None:
-			eta = 0.5 / (L + 1)
-		else:
-			eta = np.minimum(1, stepsize * 0.5 / L)
-
-		print(eta)
-		for k in range(steps):
-			w = torch.randn(size=(self.get_m(), 1)).double()
-			theta = prox(theta - eta * nabla(theta) + np.sqrt(2 * eta) * w)
-
-			if verbose == True:
-				print("Iter:", k, theta.T)
-
-		self.sampled_theta = theta
-
-	def sample_proximal_langevin_prox(self, steps=300, verbose=False, stepsize=None):
-		"""
-		:param burn_in:
-		:return:
-		"""
-
-		Gamma_half, invGamma_half = self.packing.cov(inverse=True)
-		# invGamma = invGamma_half.T @ invGamma_half
-		l, Lambda, u = self.get_constraints()
-		Lambda = Lambda @ Gamma_half.numpy()
-
-		def prox(x):
-			res = solve_qp(np.eye(self.get_m()), x.numpy().reshape(-1), C=Gamma_half.numpy(), b=l.numpy(),
-						   factorized=True)
-			return torch.from_numpy(res[0]).view(-1, 1)
-
-		# theta_n = cp.Variable((self.get_m(), 1))
-		# x = cp.Parameter((self.get_m(), 1))
-		# objective = cp.Minimize(cp.sum_squares(x - theta_n))
-		#
-		# constraints = []
-		# l, Lambda, u = self.get_constraints()
-		# Lambda = Lambda @ Gamma_half.numpy()
-		# constraints.append(Lambda @ theta_n >= l.reshape(-1, 1))
-		# constraints.append(Lambda @ theta_n <= u.reshape(-1, 1))
-		#
-		# prob = cp.Problem(objective, constraints)
-
-		# def prox(x):
-		# 	return Gamma_half @ torch.from_numpy(scipy.optimize.nnls(invGamma.numpy(), (invGamma_half@x).numpy().reshape(-1), maxiter = 1000)[0]).view(-1,1)
-
-		if self.data is not None:
-			if self.feedback == "count-record" and self.dual == False:
-				if self.observations is not None:
-					nabla = lambda y: -torch.einsum('i,ij->j', 1. / (self.observations @ y).view(-1),
-													self.observations).view(-1, 1) + \
-									  torch.sum(self.phis, dim=0).view(-1, 1) \
-									  + self.s * y.view(-1, 1)
-				else:
-					nabla = lambda theta: torch.sum(self.phis, dim=0).view(-1, 1) + self.s * theta.view(-1, 1)
-
-			elif self.feedback == "count-record" and self.dual == True:
-				mask = self.bucketized_counts > 0
-				phis = self.varphis[mask, :]
-				tau = self.total_bucketized_time[mask]
-
-				if self.observations is not None:
-					obs = self.anchor_points_emb
-					weights = self.anchor_weights
-					mask = weights > 0.
-					nabla = lambda y: -torch.einsum('i,ij->j', weights[mask] / ((obs[mask, :] @ y).view(-1)),
-													obs[mask]).view(-1, 1) + \
-									  torch.einsum('i,ij->j', tau, phis).view(-1, 1) \
-									  + self.s * y.view(-1, 1)
-				else:
-					nabla = lambda y: torch.einsum('i,ij->j', tau, phis).view(-1, 1) \
-									  + self.s * y.view(-1, 1)
-
-
-			elif self.feedback == "histogram":
-				nabla = lambda theta: -torch.sum(
-					torch.diag((1. / (self.phis @ theta).view(-1)) * self.counts) @ self.phis,
-					dim=0).view(-1, 1) \
-									  + torch.sum(self.phis, dim=0).view(-1, 1) + self.s * theta.view(-1, 1)
-		else:
-			nabla = lambda theta: self.s * theta.view(-1, 1)
-
-		if self.rate is not None:
-			theta = self.rate.view(-1, 1)
-		else:
-			theta = self.b + 0.05 * torch.rand(size=(self.get_m(), 1), dtype=torch.float64, requires_grad=False).view(
-				-1, 1) ** 2
-
-		for k in range(steps):
-			w = torch.randn(size=(self.get_m(), 1)).double()
-
-			# calculate proper step-size
-			W = self.construct_covariance_matrix_laplace(theta=theta)
-			L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-3))
-			if stepsize is not None:
-				eta = 0.5 * stepsize / L
-			else:
-				eta = 0.5 / L
-
-			# prox calculate
-			# x.value = theta.numpy()
-			# prob.solve(solver=cp.OSQP, warm_start=True, verbose=False, eps_abs=1e-3, eps_rel=1e-3)
-			# proximal_theta = torch.from_numpy(theta_n.value)
-
-			# update step
-			#			theta = 0.5 * theta - eta * nabla(theta) + 0.5 * proximal_theta + np.sqrt(2 * eta) * w
-
-			# update step
-			theta = 0.5 * theta - eta * nabla(theta) + 0.5 * prox(theta) + np.sqrt(2 * eta) * w
-			if verbose == True:
-				print("Iter:", k, theta.T)
-
-		self.sampled_theta = prox(theta)
-
-	def sample_proximal_langevin_simple_prox(self, steps=300, verbose=False):
-
-		Gamma_half, invGamma_half = self.packing.cov(inverse=True)
-		l, Lambda, u = self.get_constraints()
-		prox_simple = lambda x: torch.minimum(torch.maximum(x.view(-1), torch.from_numpy(l).view(-1)) \
-											  , torch.from_numpy(u).view(-1)).view(-1, 1)
-
-		def prox(x):
-			return invGamma_half @ prox_simple(Gamma_half @ x)
-
-		phis = self.phis
-		if self.feedback == "count-record" and self.dual == False:
-			if self.observations is not None:
-				obs = self.observations
-
-				func = lambda y: -torch.sum(torch.log(obs @ y)) \
-								 + torch.sum((phis @ y)) \
-								 + self.s * y.T @ y
-
-				nabla = lambda y: -torch.einsum('i,ij->j', 1. / (obs @ y).view(-1), obs).view(-1, 1) + \
-								  torch.sum(phis, dim=0).view(-1, 1) \
-								  + self.s * y.view(-1, 1)
-			else:
-				func = lambda y: torch.sum(phis @ y).view(-1, 1) \
-								 + self.s * y.T @ y
-
-				nabla = lambda y: torch.sum(phis, dim=0).view(-1, 1) + self.s * y.view(-1, 1)
-
-
-
-
-
-		elif self.feedback == "count-record" and self.dual == True:
-			mask = self.bucketized_counts > 0
-			phis = self.varphis[mask, :]
-			tau = self.total_bucketized_time[mask]
-
-			if self.observations is not None:
-				obs = self.anchor_points_emb
-				weights = self.anchor_weights
-				mask = weights > 0.
-				func = lambda y: -torch.sum(weights[mask].view(-1, 1) * torch.log(obs[mask, :] @ y)) \
-								 + torch.sum(tau.view(-1, 1) * (phis @ y)) \
-								 + self.s * y.T @ y
-
-				nabla = lambda y: -torch.einsum('i,ij->j', weights[mask] / ((obs[mask, :] @ y).view(-1)),
-												obs[mask]).view(-1, 1) + \
-								  torch.einsum('i,ij->j', tau, phis).view(-1, 1) \
-								  + self.s * y.view(-1, 1)
-			else:
-				func = lambda y: torch.sum(tau.view(-1, 1) * (phis @ y)) \
-								 + self.s * y.T @ y
-
-				nabla = lambda y: torch.einsum('i,ij->j', tau, phis).view(-1, 1) \
-								  + self.s * y.view(-1, 1)
-
-		elif self.feedback == "histogram":
-			func = lambda y: - torch.sum(self.counts.view(-1) * torch.log(phis @ y).view(-1)) + \
-							 torch.sum(phis @ y) \
-							 + self.s * y.T @ y
-			nabla = lambda y: -torch.einsum('i,ij->j', self.counts.view(-1) / (phis @ y).view(-1), phis).view(-1, 1) + \
-							  torch.sum(phis, dim=0).view(-1, 1) + self.s * y
-
-		# hessian = lambda y: self.construct_covariance_matrix_laplace()
-
-		y = prox(torch.randn(size=(self.get_m(), 1), dtype=torch.float64, requires_grad=True))
-		y.data = self.rate.view(-1, 1)
-
-		W = self.construct_covariance_matrix_laplace()
-		L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-5))
-
-		eta = 0.5 / (L + 1)
-
-		for k in range(steps):
-			W = torch.randn(size=(self.get_m(), 1)).double()
-			nabla_y = nabla(y.data)
-			y.data = (1 - eta) * y.data - eta * nabla_y + eta * prox(y.data) + np.sqrt(2 * eta) * W
-			if verbose == True:
-				print("Iter:", k, y.T)
-				print("grad:", y.grad.T)
-
-		self.sampled_theta = prox(y.detach())
-
-	def sample_hessian_positive_langevin(self, steps=500, verbose=False, stepsize=None):
-
-		if self.data is not None:
-			if self.feedback == "count-record" and self.dual == False:
-				if self.observations is not None:
-					nabla = lambda y: -torch.einsum('i,ij->j', 1. / (self.observations @ y).view(-1),
-													self.observations).view(-1, 1) + \
-									  torch.sum(self.phis, dim=0).view(-1, 1) \
-									  + self.s * y.view(-1, 1)
-				else:
-					nabla = lambda theta: torch.sum(self.phis, dim=0).view(-1, 1) + self.s * theta.view(-1, 1)
-
-			elif self.feedback == "count-record" and self.dual == True:
-
-				mask = self.bucketized_counts > 0
-				phis = self.varphis[mask, :]
-				tau = self.total_bucketized_time[mask]
-
-				if self.observations is not None:
-					obs = self.anchor_points_emb
-					weights = self.anchor_weights
-					mask = weights > 0.
-					nabla = lambda y: -torch.einsum('i,ij->j', weights[mask] / ((obs[mask, :] @ y).view(-1)),
-													obs[mask]).view(-1, 1) + \
-									  torch.einsum('i,ij->j', tau, phis).view(-1, 1) \
-									  + self.s * y.view(-1, 1)
-				else:
-					nabla = lambda y: torch.einsum('i,ij->j', tau, phis).view(-1, 1) \
-									  + self.s * y.view(-1, 1)
-
-
-			elif self.feedback == "histogram":
-				nabla = lambda theta: -torch.sum(
-					torch.diag((1. / (self.phis @ theta).view(-1)) * self.counts) @ self.phis,
-					dim=0).view(-1, 1) \
-									  + torch.sum(self.phis, dim=0).view(-1, 1) + self.s * theta.view(-1, 1)
-		else:
-			nabla = lambda theta: self.s * theta.view(-1, 1)
-
-		Gamma_half = self.packing.cov()
-		lz, Lambda, u = self.get_constraints()
-
-		Lambda = torch.from_numpy(Lambda) @ Gamma_half
-		y = self.b + 0.05 * torch.rand(size=(self.get_m(), 1), dtype=torch.float64, requires_grad=True).view(-1) ** 2
-
-		if self.rate is not None:
-			y.data = self.rate.data + Gamma_half @ y.data
-		else:
-			y.data = Gamma_half @ y.data
-
-		if verbose == True:
-			print("initial point")
-			print(y.data)
-
-		W = self.construct_covariance_matrix_laplace()
-		L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-5))
-
-		if stepsize is None:
-			eta = 1. / (L + 1)
-		else:
-			eta = stepsize / (L + 1)
-
-		D = lambda x: torch.diag(1. / torch.abs(Lambda @ x).view(-1))
-		sqrt_hessian = lambda x: Lambda @ D(x)
-
-		phi = lambda x: -torch.sum(torch.log(Lambda @ x))
-		nabla_phi = lambda x: -torch.einsum('i,ij->j', 1. / (Lambda @ x).view(-1), Lambda)
-		hessian_phi = lambda x: Lambda.T @ torch.diag(1. / (Lambda @ x).view(-1) ** 2) @ Lambda
-
-		for k in range(steps):
-			w = torch.randn(size=(self.get_m(), 1)).double()
-			nabla_val = nabla(y)
-			H = sqrt_hessian(y.data)
-			z = nabla_phi(y.data).view(-1, 1) - eta * nabla_val + np.sqrt(2 * eta) * H @ w
-
-			# y.data = newton_solve(lambda s: nabla_phi(s).reshape(-1)-z.data.reshape(-1),y.reshape(-1),
-			#  					  verbose = verbose, grad = hessian_phi).view(-1,1)
-
-			# # minimization appraoch
-			def objective(s):
-				return torch.sum((nabla_phi(s).reshape(-1) - z.reshape(-1)) ** 2)
-
-			# #
-
-			# x0 = y.reshape(-1).clone().detach().numpy()
-			# res = minimize(objective, x0, backend='torch', method='Newton-CG', precision='float64', tol=1e-5, hvp_type='vhp')
-			# y.data = torch.from_numpy(res.x)
-
-			x0 = y.reshape(-1).clone()
-			res = minimize_torch(objective, x0, method='newton-cg', tol=1e-5)
-			y.data = res.x
-
-			if verbose:
-				print("Iter:", k)
-				print(y.T)
-
-		self.sampled_theta = y.data
-
-	def sample_mla_prime(self, steps=100, verbose=False, stepsize=None):
-		Gamma_half, invGamma_half = self.packing.cov(inverse=True)
-		invGamma = invGamma_half.T @ invGamma_half
-		l, Lambda, u = self.get_constraints()
-		Lambda = torch.from_numpy(Lambda) @ Gamma_half
-
-		if self.data is not None:
-			if self.feedback == "count-record" and self.dual == False:
-				if self.observations is not None:
-					observations = self.observations @ invGamma_half
-					phis = self.phis @ invGamma_half
-					nabla = lambda y: -torch.einsum('i,ij->j', 1. / (observations @ y).view(-1),
-													observations).view(-1, 1) + \
-									  torch.sum(phis, dim=0).view(-1, 1) \
-									  + self.s * invGamma @ y.view(-1, 1)
-				else:
-					nabla = lambda theta: torch.sum(phis, dim=0).view(-1, 1) + self.s * invGamma @ theta.view(-1, 1)
-
-		else:
-			nabla = lambda theta: self.s * invGamma @ theta.view(-1, 1)
-
-		y = self.b + 0.05 * torch.rand(size=(self.get_m(), 1), dtype=torch.float64, requires_grad=True).reshape(-1,
-																												1) ** 2
-		# if self.rate is not None:
-		# 	y.data = Gamma_half @ self.rate.data.view(-1,1) + y.data
-		# else:
-		y.data = y.data
-
-		if verbose == True:
-			print("initial point")
-			print(y.data)
-
-		W = invGamma_half.T @ self.construct_covariance_matrix_laplace() @ invGamma_half
-		L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-5))
-
-		if stepsize is None:
-			eta = 1. / (L + 1)
-		else:
-			eta = stepsize / (L + 1)
-
-		from stpy.approx_inference.sampling_helper import get_increment
-		for k in range(steps):
-
-			nabla_val = nabla(y)
-
-			# cvxpy minimization
-			# x = cp.Variable((self.get_m(), 1))
-			# objective = cp.Minimize( eta * nabla_val.detach().numpy().T @ x - cp.sum(cp.log(x)) -(-1./y.data).T@x)
-			# constraints = [x >= 0.]
-			#
-			# prob = cp.Problem(objective, constraints)
-			# prob.solve(solver = cp.MOSEK)
-
-			w0 = (eta * nabla_val.data + 1. / y.data)
-			# initial point for the solve
-			# w0 = -1./( torch.from_numpy(x.value))
-
-			# simulate
-			f = lambda w, n: n / torch.abs(w)
-			w = get_increment(eta, 1000, f, w0, path=False)
-
-			# back mirror map
-			y.data = (-1. / w)
-
-			if verbose:
-				print("Iter:", k)
-				print(y.T)
-
-		self.sampled_theta = invGamma_half @ y.data
-
-	def sample_hessian_positive_langevin_2(self, steps=500, verbose=False, stepsize=None, preconditioner=True):
-
-		Gamma_half, invGamma_half = self.packing.cov(inverse=True)
-		invGamma = invGamma_half @ invGamma_half
-		if self.data is not None:
-
-			if self.feedback == "count-record" and self.dual == False:
-
-				observations = self.observations @ invGamma_half
-				phis = self.phis @ invGamma_half
-
-				if self.observations is not None:
-					nabla = lambda y: -torch.einsum('i,ij->j', 1. / (observations @ y).view(-1),
-													observations).view(-1, 1) + \
-									  torch.sum(phis, dim=0).view(-1, 1) \
-									  + self.s * invGamma @ y.view(-1, 1)
-				else:
-					nabla = lambda theta: torch.sum(phis, dim=0).view(-1, 1) + self.s * invGamma @ theta.view(-1, 1)
-
-		else:
-			nabla = lambda theta: self.s * invGamma @ theta.view(-1, 1)
-
-		y = torch.rand(size=(self.get_m(), 1), dtype=torch.float64, requires_grad=True).view(-1) ** 2
-		# if self.rate is not None:
-		#	y.data = Gamma_half @ self.rate.data + y.data
-
-		if verbose == True:
-			print("initial point")
-			print(y.data)
-
-		W = self.construct_covariance_matrix_laplace(minimal=True)
-		L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-5))
-
-		if stepsize is None:
-			eta = 1. / (L + 1)
-		else:
-			eta = stepsize / (L + 1)
-
-		for k in range(steps):
-			w = torch.randn(size=(self.get_m(), 1)).double() / torch.abs(y.data).view(-1, 1)
-			nabla_val = nabla(y)
-			z = -1. / y.data.view(-1, 1) + self.b - eta * Gamma_half @ nabla_val + np.sqrt(2 * eta) * Gamma_half @ w
-			y.data = -1. / z + self.b
-
-			if verbose:
-				print("Iter:", k)
-				print(y.T)
-
-		self.sampled_theta = invGamma_half @ y.data
-
-	def sample_newton_langevin(self, steps=1000, stepsize=None, verbose=False):
-		Gamma_half, invGamma_half = self.packing.cov(inverse=True)
-		invGamma = invGamma_half @ invGamma_half
-		if self.data is not None:
-
-			if self.feedback == "count-record" and self.dual == False:
-
-				observations = self.observations @ invGamma_half
-				phis = self.phis @ invGamma_half
-
-				if self.observations is not None:
-					nabla = lambda y, bar: -torch.einsum('i,ij->j', 1. / (observations @ y).view(-1),
-														 observations).view(-1, 1) + \
-										   torch.sum(phis, dim=0).view(-1, 1) \
-										   + self.s * invGamma @ y.view(-1, 1) - bar * 1. / y
-				else:
-					nabla = lambda theta, bar: torch.sum(phis, dim=0).view(-1, 1) + self.s * invGamma @ theta.view(
-						-1, 1) - bar * 1. / theta
-
-		else:
-			nabla = lambda theta, bar: self.s * invGamma @ theta.view(-1, 1) - bar * 1. / theta
-
-		y = 0.05 * torch.rand(size=(self.get_m(), 1), dtype=torch.float64, requires_grad=True).view(-1, 1) ** 2
-
-		barrier = 10.
-		# hessian = lambda theta,bar: torch.einsum('ik,k,kj->ij',observations.T,(observations@theta).view(-1),observations) + invGamma + bar/theta**2
-		hessian = lambda theta, bar: observations.T @ torch.diag(
-			1 / (observations @ theta).view(-1) ** 2) @ observations + invGamma + torch.diag(bar / theta.view(-1) ** 2)
-		hessian_sqrt = lambda theta, bar: torch.cholesky(hessian(theta, bar))
-		eta = 1.
-
-		for k in range(steps):
-			w = torch.randn(size=(self.get_m(), 1)).double()
-			nabla_val = nabla(y, barrier)
-			y.data = y.data - torch.linalg.solve(hessian(y.data, barrier), nabla_val) + np.sqrt(
-				2 * eta) * torch.linalg.solve(hessian_sqrt(y.data, barrier), w)
-
-			if verbose:
-				print("Iter:", k)
-				print(y.T)
-
-		self.sampled_theta = invGamma_half @ y.data
-
-	# self.sampled_theta = y.data
-
-	def sample_hmc(self, steps=1000, stepsize=None, verbose=False):
-		import hamiltorch
-		phis = self.phis
-		if self.feedback == "count-record" and self.dual == False:
-			if self.observations is not None:
-				obs = self.observations
-				func = lambda y: torch.sum(torch.log(obs @ y)) \
-								 - torch.sum((phis @ y)) \
-								 - self.s * y.T @ y
-			else:
-				func = lambda y: - torch.sum(phis @ y).view(-1, 1) \
-								 - self.s * y.T @ y
-
-		num_samples = 1
-		num_steps_per_sample = steps
-		if stepsize is None:
-			step_size = 1e-8
-		else:
-			step_size = stepsize
-
-		params_init = self.rate
-		self.sample_theta = hamiltorch.sample(log_prob_func=func,
-											  params_init=params_init,
-											  num_samples=num_samples,
-											  step_size=step_size,
-											  num_steps_per_sample=num_steps_per_sample)
-		print(self.sampled_theta)
-
-	def sample_variational(self, xtest, accuracy=1e-4, verbose=False, samples=1):
-		from stpy.approx_inference.variational_mf import VMF_SGCP
-		cov_params = [self.kernel_object.kappa, self.kernel_object.gamma]
-		S_borders = np.array([[-1., 1.]])
-		num_inducing_points = self.m
-		num_integration_points = 256
-		X = self.x
-
-		var_mf_sgcp = VMF_SGCP(S_borders, X, cov_params, num_inducing_points,
-							   num_integration_points=num_integration_points,
-							   update_hyperparams=False, output=0, conv_crit=accuracy)
-		var_mf_sgcp.run()
-		sample_paths = var_mf_sgcp.sample_posterior(xtest, num_samples=1.)
-		return sample_paths
-
-	def sample(self, verbose=False, steps=1000, domain=None):
-		"""
-		:return:
-		"""
-		if self.steps is not None:
-			steps = self.steps
-
-		if self.stepsize is not None:
-			stepsize = self.stepsize
-		else:
-			stepsize = None
-
-		l, Lambda, u = self.get_constraints()
-		print("Sampling started.")
-		if self.rate is None:
-			self.fit_gp()
-
-		if self.sampling == 'mirror':
-			self.sample_mirror_langevin(steps=steps, verbose=verbose)
-		elif self.sampling == 'proximal+prox':
-			self.sample_proximal_langevin_prox(steps=steps, verbose=verbose)
-		elif self.sampling == "proximal+simple_prox":
-			self.sample_proximal_langevin_simple_prox(steps=steps, verbose=verbose)
-		elif self.sampling == "hessian":
-			self.sample_hessian_positive_langevin(steps=steps, verbose=verbose, stepsize=stepsize)
-		elif self.sampling == "hessian2":
-			self.sample_hessian_positive_langevin_2(steps=steps, verbose=verbose, stepsize=stepsize)
-		elif self.sampling == "mla_prime":
-			self.sample_mla_prime(steps=steps, verbose=verbose, stepsize=stepsize)
-		elif self.sampling == 'hmc':
-			self.sample_hmc(steps=steps, verbose=verbose, stepsize=stepsize)
-		elif self.sampling == 'polyia_variational':
-			self.sample_variational(accuracy=1. / steps, verbose=verbose)
-		else:
-			raise NotImplementedError("Sampling of such is not supported.")
-
-		print("Sampling finished.")
-
-	def sampled_lcb_ucb(self, xtest, samples=100, delta=0.1):
-		paths = []
-		for i in range(samples):
-			self.sample()
-			path = self.sample_path_points(xtest).view(1, -1)
-			paths.append(path)
-
-		paths = torch.cat(paths, dim=0)
-		lcb = torch.quantile(paths, delta, dim=0)
-		ucb = torch.quantile(paths, 1 - delta, dim=0)
-		return lcb, ucb
-
-	def penalized_likelihood_fast(self, threads=4):
-		l, Lambda, u = self.get_constraints()
-		Gamma_half, invGamma_half = self.cov(inverse=True)
-
-		if self.dual == False:
-			# using all points without anchor points
-			if self.observations is not None:
-				def objective(theta):
-					return -torch.sum(torch.log(self.observations @ invGamma_half @ theta)) + torch.sum(
-						self.phis @ invGamma_half @ theta) + self.s * 0.5 * torch.sum((invGamma_half @ theta) ** 2)
-			else:
-				def objective(theta):
-					return torch.sum(self.phis @ invGamma_half @ theta) + self.s * 0.5 * torch.sum(
-						(invGamma_half @ theta) ** 2)
-		else:
-			# using anchor points
-			mask = self.bucketized_counts > 0
-			phis = self.varphis[mask, :]
-			tau = self.total_bucketized_time[mask]
-
-			if self.observations is not None:
-				observations = self.anchor_points_emb
-				weights = self.anchor_weights
-				mask = weights > 0.
-
-				def objective(theta):
-					return -torch.einsum('i,i', weights[mask],
-										 torch.log(observations[mask, :] @ invGamma_half @ theta)) + torch.einsum('i,i',
-																												  tau,
-																												  phis @ invGamma_half @ theta) + self.s * 0.5 * torch.sum(
-						(invGamma_half @ theta) ** 2)
-			else:
-				def objective(theta):
-					return torch.einsum('i,i', tau, phis @ invGamma_half @ theta) + self.s * 0.5 * torch.sum(
-						(invGamma_half @ theta) ** 2)
-
-		if self.rate is not None:
-			theta0 = torch.zeros(size=(self.get_m(), 1)).view(-1).double()
-			theta0.data = self.rate.data
-		else:
-			theta0 = torch.zeros(size=(self.get_m(), 1)).view(-1).double()
-
-		eps = 1e-4
-		res = minimize(objective, theta0.numpy(), backend='torch', method='L-BFGS-B',
-					   bounds=(l[0] + eps, u[0]), precision='float64', tol=1e-8,
-					   options={'ftol': 1e-08,
-								'gtol': 1e-08, 'eps': 1e-08,
-								'maxfun': 15000, 'maxiter': 15000,
-								'maxls': 20})
-
-		self.rate = invGamma_half @ torch.from_numpy(res.x)
-		print(res.message)
-		return self.rate
-
-	def penalized_likelihood(self, threads=4):
-
-		theta = cp.Variable(self.get_m())
-		l, Lambda, u = self.get_constraints()
-
-		Gamma_half = self.cov(inverse=False)
-
-		if self.dual == False:
-
-			# using all points without anchor points
-			phis = self.phis.numpy()
-			if self.observations is not None:
-				observations = self.observations.numpy()
-				objective = cp.Minimize(-cp.sum(cp.log(observations @ theta)) +
-										cp.sum(phis @ theta) + self.s * 0.5 * cp.sum_squares(theta))
-			else:
-				objective = cp.Minimize(cp.sum(phis @ theta) + self.s * 0.5 * cp.sum_squares(theta))
-
-		else:
-
-			# using anchor points
-			mask = self.bucketized_counts.clone().numpy() > 0
-			phis = self.varphis[mask, :].clone().numpy()
-			tau = self.total_bucketized_time[mask].clone().numpy()
-
-			if self.observations is not None:
-				observations = self.anchor_points_emb.numpy()
-				weights = self.anchor_weights.numpy()
-				mask = weights > 0.
-				objective = cp.Minimize(-cp.sum(cp.multiply(weights[mask], cp.log(observations[mask, :] @ theta))) +
-										cp.sum(cp.multiply(tau, phis @ theta)) + self.s * 0.5 * cp.sum_squares(theta))
-			else:
-				objective = cp.Minimize(cp.sum(cp.multiply(tau, phis @ theta)) + self.s * 0.5 * cp.sum_squares(theta))
-
-		constraints = []
-
-		Lambda = Lambda @ Gamma_half.numpy()
-
-		constraints.append(Lambda @ theta >= l)
-		constraints.append(Lambda @ theta <= u)
-
-		prob = cp.Problem(objective, constraints)
-
-		if self.rate is not None:
-			theta.value = self.rate.numpy()
-
-		try:
-			prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-					   mosek_params={mosek.iparam.num_threads: threads,
-									 mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-									 mosek.dparam.intpnt_co_tol_pfeas: 1e-4,
-									 mosek.dparam.intpnt_co_tol_dfeas: 1e-4,
-									 mosek.dparam.intpnt_co_tol_rel_gap: 1e-4})
-
-			self.rate = torch.from_numpy(theta.value)
-			return self.rate
-		except:
-			print("Optimization failed. Using the old value.")
-			print(prob.status)
-			return self.rate
-
-	def penalized_likelihood_integral(self, threads=4):
-
-		phis = self.phis.numpy()
-		counts = self.counts.numpy()
-
-		theta = cp.Variable(self.get_m())
-		l, Lambda, u = self.get_constraints()
-		Gamma_half = self.cov().numpy()
-		objective = cp.Minimize(-cp.sum(counts @ cp.log(phis @ theta)) + cp.sum(phis @ theta)
-								+ self.s * 0.5 * cp.sum_squares(theta))
-
-		constraints = []
-		Lambda = Lambda @ Gamma_half
-		constraints.append(Lambda @ theta >= l)
-		constraints.append(Lambda @ theta <= u)
-
-		# if self.rate is not None:
-		#	theta.value = self.rate.numpy()
-		try:
-			prob = cp.Problem(objective, constraints)
-			prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-					   mosek_params={mosek.iparam.num_threads: threads,
-									 mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-									 mosek.dparam.intpnt_co_tol_pfeas: 1e-4,
-									 mosek.dparam.intpnt_co_tol_dfeas: 1e-4,
-									 mosek.dparam.intpnt_co_tol_rel_gap: 1e-4})
-			self.rate = torch.from_numpy(theta.value)
-		except:
-			print("Optimization failed. Using the old value.")
-			print(prob.status)
-
-		return self.rate
-
-	def bucketization(self):
-
-		phis = []
-		observations = []
-
-		# project sets to smallest forms, and then sum on those only
-		basic_sets = self.basic_sets
-
-		data_basic = [[] for _ in range(len(basic_sets))]
-		sensing_times = [[] for _ in range(len(basic_sets))]
-		counts = torch.zeros(len(basic_sets)).int()
-		total_data = 0.
-		self.total_bucketized_obs = torch.zeros(size=(len(basic_sets), 1)).double().view(-1)
-		self.total_bucketized_time = torch.zeros(size=(len(basic_sets), 1)).double().view(-1)
-
-		for sample in self.data:
-			S, obs, dt = sample
-			if obs is not None:
-				total_data = total_data + obs.size()[0]  # total counts
-				for index, elementary in enumerate(basic_sets):  # iterate over basic sets
-					mask = elementary.is_inside(obs)  # mask which belong to the elementary
-					if S.inside(elementary) == True:
-						data_basic[index].append(obs[mask])
-						counts[index] += 1
-						sensing_times[index].append(dt)
-			else:
-				for index, elementary in enumerate(basic_sets):
-					if S.inside(elementary) == True:
-						data_basic[index].append(torch.Tensor([]))
-						counts[index] += 1
-						sensing_times[index].append(dt)
-
-		for index, elementary in enumerate(basic_sets):
-			arr = np.array([int(elem.size()[0]) for elem in data_basic[index]])  # counts over sensing rounds
-			phi = self.packing.integral(elementary)  # * counts[index]
-
-			self.total_bucketized_obs[index] = float(np.sum(arr))
-			self.total_bucketized_time[index] = float(np.sum(sensing_times[index]))
-
-			observations.append(arr)
-			phis.append(phi.view(1, -1))  # construct varphi_B
-
-		self.bucketized_obs = observations.copy()  # these are number of counts associated with sensings
-		self.bucketized_time = sensing_times.copy()  # these are times each basic set has been sensed
-		self.bucketized_counts = counts  # these are count each basic set has been sensed
-
-	def variance_correction(self, variance):
-
-		if self.var_cor_on == 1:
-
-			g = lambda B, k, mu: -0.5 * (B ** 2) / ((mu ** 2) * k) - B / (mu * k) + (np.exp(B / (k * mu)) - 1)
-			gn = lambda k: g(self.U, k, variance)
-
-			from scipy import optimize
-			k = optimize.bisect(gn, 1, 10000000)
-
-			return k
-		else:
-			return 1.
-
-	def least_squares_weighted(self, threads=4):
-
-		# if self.approx_fit == False:
-		# 	self.bucketization()
-
-		theta = cp.Variable(self.get_m())
-		l, Lambda, u = self.get_constraints()
-		Gamma_half = self.cov().numpy()
-
-		mask = self.bucketized_counts.clone().numpy() > 0
-		observations = self.total_bucketized_obs[mask].clone().numpy()
-		phis = self.varphis[mask, :].clone().numpy()
-		tau = self.total_bucketized_time.clone().numpy()
-
-		variances = self.variances.view(-1).clone().numpy()
-
-		for i in range(variances.shape[0]):
-			if mask[i] > 0:
-				variances[i] = variances[i] * tau[i] * self.variance_correction(variances[i] * tau[i])
-
-		selected_variances = variances[mask]
-		objective = cp.Minimize(
-			cp.sum_squares((cp.multiply((phis @ theta), tau[mask]) - observations) / (np.sqrt(selected_variances)))
-			+ 0.5 * self.s * cp.norm2(theta) ** 2)
-
-		constraints = []
-		Lambda = Lambda @ Gamma_half
-		# constraints.append(Lambda @ theta >= l)
-		constraints.append(Lambda @ theta <= u)
-
-		prob = cp.Problem(objective, constraints)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: threads,
-								 mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-								 mosek.dparam.intpnt_co_tol_pfeas: 1e-4,
-								 mosek.dparam.intpnt_co_tol_dfeas: 1e-4,
-								 mosek.dparam.intpnt_co_tol_rel_gap: 1e-4})
-		print(prob.status)
-		self.rate = torch.from_numpy(theta.value)
-		return self.rate
-
-	def least_sqaures_weighted_fast(self, threads=4):
-
-		l, Lambda, u = self.get_constraints()
-		Gamma_half, invGamma_half = self.cov(inverse=True)
-
-		mask = self.bucketized_counts > 0
-		observations = self.total_bucketized_obs[mask]
-		phis = self.varphis[mask, :]
-		tau = self.total_bucketized_time
-
-		variances = self.variances.view(-1)
-		for i in range(variances.size()[0]):
-			if mask[i] > 0:
-				variances[i] = variances[i] * tau[i] * self.variance_correction(variances[i] * tau[i])
-		selected_variances = variances[mask]
-
-		def objective(theta):
-			return torch.sum(
-				((tau[mask] * (phis @ invGamma_half @ theta) - observations) / (np.sqrt(selected_variances))) ** 2) \
-				   + self.s * 0.5 * torch.sum((invGamma_half @ theta) ** 2)
-
-		if self.rate is not None:
-			theta0 = torch.zeros(size=(self.get_m(), 1)).view(-1).double()
-			theta0.data = Gamma_half @ self.rate.data
-		else:
-			theta0 = torch.zeros(size=(self.get_m(), 1)).view(-1).double()
-
-		eps = 1e-4
-		res = minimize(objective, theta0.numpy(), backend='torch', method='L-BFGS-B',
-					   bounds=(l[0] + eps, u[0]), precision='float64', tol=1e-8,
-					   options={'ftol': 1e-06,
-								'gtol': 1e-06, 'eps': 1e-08,
-								'maxfun': 15000, 'maxiter': 15000,
-								'maxls': 20})
-		self.rate = invGamma_half @ torch.from_numpy(res.x)
-
-		return self.rate
-
-	def least_squares_weighted_integral(self, threads=4):
-
-		# if self.approx_fit == False:
-		# 	self.bucketization()
-
-		theta = cp.Variable(self.get_m())
-		l, Lambda, u = self.get_constraints()
-		Gamma_half = self.cov().numpy()
-
-		phis = self.phis.clone().numpy()  # integrated actions
-		if self.rate is None:
-			rate = torch.pinverse(torch.from_numpy(Gamma_half)) @ torch.from_numpy(u)
-		else:
-			rate = self.rate.clone()
-
-		if len(self.variances_histogram) > 0:
-			variances = self.variances_histogram.numpy()
-
-			for i in range(variances.shape[0]):
-				variances[i] = variances[i] * self.variance_correction(variances[i])
-		else:
-			variances = np.zeros(len(self.data))
-			i = 0
-			for S, obs, dt in self.data:
-				variances[i] = S.volume() * self.B
-				variances[i] = variances[i] * self.variance_correction(variances[i])
-				i = i + 1
-
-		observations = self.counts.clone().numpy()
-
-		objective = cp.Minimize(cp.sum_squares((phis @ theta - observations) / np.sqrt(variances))
-								+ self.s * cp.sum_squares(theta))
-		constraints = []
-		Lambda = Lambda @ Gamma_half
-		constraints.append(Lambda @ theta >= l)
-		constraints.append(Lambda @ theta <= u)
-		prob = cp.Problem(objective, constraints)
-
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-				   mosek_params={mosek.iparam.num_threads: threads,
-								 mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-								 mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
-								 mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
-								 mosek.dparam.intpnt_co_tol_rel_gap: 1e-6})
-
-		self.rate = torch.from_numpy(theta.value)
-
-		return self.rate
-
-	def penalized_likelihood_bins(self, threads=4):
-		theta = cp.Variable(self.get_m())
-		l, Lambda, u = self.get_constraints()
-		Gamma_half = self.cov().numpy()
-
-		mask = self.bucketized_counts.clone().numpy() > 0
-		observations = self.total_bucketized_obs[mask].clone().numpy()
-		phis = self.varphis[mask, :].clone().numpy()
-		tau = self.total_bucketized_time[mask].clone().numpy()
-
-		constraints = []
-		Lambda = Lambda @ Gamma_half
-		constraints.append(Lambda @ theta >= l)
-		constraints.append(Lambda @ theta <= u)
-
-		objective = cp.Minimize(
-			-cp.sum(observations @ cp.log(cp.multiply(tau, phis @ theta))) + cp.sum(cp.multiply(phis @ theta, tau))
-			+ self.s * 0.5 * cp.sum_squares(theta))
-		prob = cp.Problem(objective, constraints)
-		try:
-			prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-					   mosek_params={mosek.iparam.num_threads: threads,
-									 mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
-									 mosek.dparam.intpnt_co_tol_pfeas: 1e-8,
-									 mosek.dparam.intpnt_co_tol_dfeas: 1e-8,
-									 mosek.dparam.intpnt_co_tol_rel_gap: 1e-8})
-
-			self.rate = torch.from_numpy(theta.value)
-		except:
-			print("optimization failed.")
-		return self.rate
-
-	def penalized_likelihood_integral_bins(self, threads=4):
-		phis = self.phis.numpy()
-		counts = self.counts.numpy()
-
-		theta = cp.Variable(self.get_m())
-		l, Lambda, u = self.get_constraints()
-		Gamma_half = self.cov().numpy()
-		objective = cp.Minimize(-cp.sum(counts @ cp.log(phis @ theta)) + cp.sum(phis @ theta)
-								+ self.s * 0.5 * cp.sum_squares(theta))
-
-		constraints = []
-		Lambda = Lambda @ Gamma_half
-		constraints.append(Lambda @ theta >= l)
-		constraints.append(Lambda @ theta <= u)
-
-		try:
-			if constraints:
-				prob = cp.Problem(objective, constraints)
-			else:
-				prob = cp.Problem(objective)
-			prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False,
-					   mosek_params={mosek.iparam.num_threads: threads,
-									 mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-									 mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
-									 mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
-									 mosek.dparam.intpnt_co_tol_rel_gap: 1e-6})
-			self.rate = torch.from_numpy(theta.value)
-		except:
-			print("Optimization failed. Using the old value.")
-
-		return self.rate
-
-	def update_variances(self, value=False, force=False):
-		self.approx_fit = True
-		if (self.feedback == "count-record" and self.estimator=="least-sq") or force == True:
-			print("updating variance")
-			for index, set in enumerate(self.basic_sets):
-				if value == False:
-					ucb = self.ucb(set)
-					self.variances[index] = np.minimum(ucb, self.variances[index])
-				else:
-					self.variances[index] = self.mean_set(set)
-		else:
-			if self.data is not None:
-				if self.peeking == True:
-					new_var = []
-					for S, _, dt in self.data:
-						new_var.append(float(self.ucb(S)) * dt)
-					self.variances_histogram = torch.Tensor(new_var.copy()).double()
-				else:
-					last = self.data[-1]
-					new_var = torch.Tensor([self.ucb(last[0]) * last[2]]).double()
-					if len(self.variances_histogram) > 0:
-						self.variances_histogram = torch.cat((self.variances_histogram, new_var))
-					else:
-						self.variances_histogram = new_var
-		self.approx_fit = False
-
-	def ucb(self, S, dt=1., delta=0.5):
-
-		if self.data is None or self.rate is None:
-			return self.B * S.volume() * dt
-
-		if self.approx == None:
-
-			if self.uncertainty == "laplace":
-				return self.mean_var_laplace_set(S, dt=dt, beta=self.beta(0))[1]
-
-			elif self.uncertainty == "least-sq":
-				return self.mean_var_reg_set(S, dt=dt, beta=self.beta(0))[1]
-
-			elif self.uncertainty == "bins":
-				return self.mean_var_bins_set(S, dt=dt, beta=self.beta(0))[1]
-
-			elif self.uncertainty == "likelihood-ratio":
-				return self.mean_var_ratio_set(S, dt=dt, beta=self.beta(0))[1]
-
-			elif self.uncertainty == "conformal":
-				return self.mean_var_conformal_set(S, dt=dt, delta=delta)[2]
-
-			else:
-				raise AssertionError("Not Implemented.")
-
-		elif self.approx == "ellipsoid":
-
-			if self.approx_fit == False:
-				self.fit_ellipsoid_approx()
-				self.beta(0)
-				print("Fitting Approximation.")
-				self.approx_fit = True
-			return self.map_lcb_ucb_approx_action(S, dt=dt, beta=self.beta(0))[2]
-		else:
-			raise AssertionError("Not implemented.")
-
-	def mean_std_per_action(self, S, W, dt, beta):
-
-		phi = self.packing.integral(S) * dt
-		Gamma_half = self.cov().numpy()
-
-		l, Lambda, u = self.get_constraints()
-
-		Lambda = Lambda @ Gamma_half
-		ucb, _ = maximize_on_elliptical_slice(phi.numpy(), (W).numpy(), self.rate.view(-1).numpy(), beta, l, Lambda, u)
-		lcb, _ = maximize_on_elliptical_slice(-phi.numpy(), (W).numpy(), self.rate.view(-1).numpy(), beta, l, Lambda, u)
-		map = phi @ self.rate
-
-		return map, float(ucb), -float(lcb)
-
-	def mean_var_laplace_set(self, S, dt, beta=2.):
-		if self.approx_fit == False:
-			self.W = self.construct_covariance_matrix_laplace()
-			self.approx_fit = True
-		return self.mean_std_per_action(S, self.W, dt, beta)
-
-	def mean_var_reg_set(self, S, dt, beta=2.):
-		if self.approx_fit == False:
-			self.W = self.construct_covariance_matrix_regression()
-			self.approx_fit = True
-		return self.mean_std_per_action(S, self.W, dt, beta)
-
-	def mean_var_bins_set(self, S, dt, beta=2.):
-		if self.approx_fit == False:
-			self.W = self.construct_covariance_matrix_bins()
-			self.approx_fit = True
-		return self.mean_std_per_action(S, self.W, dt, beta)
-
-	def mean_var_ratio_set(self, S, dt, beta=2.):
-		x = self.packing.integral(S) * dt
-		map = x @ self.rate
-		# v = np.log(1. / 0.1) - torch.sum(self.counts.double() @ torch.log(self.phis.double() @ self.rate)) \
-		#	+ torch.sum(self.phis.double() @ self.rate) + 0.5 * self.s * torch.norm(self.rate) ** 2
-		v = np.log(1. / 0.1) + self.likelihood + 0.5 * self.s * torch.norm(self.rate) ** 2
-
-		phis = self.phis.numpy()
-		counts = self.counts.numpy()
-		theta = cp.Variable(self.get_m())
-		l, Lambda, u = self.get_constraints()
-		Gamma_half = self.cov().numpy()
-
-		objective_min = cp.Minimize(x @ theta)
-		objective_max = cp.Maximize(x @ theta)
-
-		constraints = []
-		Lambda = Lambda @ Gamma_half
-		constraints.append(Lambda @ theta >= l)
-		constraints.append(Lambda @ theta <= u)
-
-		constraints.append(
-			-cp.sum(counts @ cp.log(phis @ theta)) + cp.sum(phis @ theta) + self.s * 0.5 * cp.sum_squares(
-				theta) <= v)
-
-		prob = cp.Problem(objective_min, constraints)
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False)
-		lcb = np.dot(theta.value, x)
-		prob = cp.Problem(objective_max, constraints)
-		prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False)
-		ucb = np.dot(theta.value, x)
-
-		return map, ucb, lcb
-
-	def map_lcb_ucb_approx_action(self, S, dt=1., beta=2.):
-		phi = self.packing.integral(S)
-		map = dt * phi @ self.rate
-
-		ucb = map + beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
-		# ucb = np.minimum(dt * ucb, self.B * S.volume() * dt)
-
-		lcb = map - beta * np.sqrt(phi @ self.W_inv_approx @ phi.T)
-		# lcb = np.maximum(dt * lcb, self.b * S.volume() * dt)
-		return map, lcb, ucb
-
-	def fit_ellipsoid_approx(self):
-
-		if self.uncertainty == "laplace":
-			self.W = self.construct_covariance_matrix_laplace()
-		elif self.uncertainty == 'least-sq':
-			self.W = self.construct_covariance_matrix_regression()
-		elif self.uncertainty == 'bins':
-			self.W = self.construct_covariance_matrix_bins()
-		else:
-			raise AssertionError("Not implemented.")
-
-		self.W_inv_approx = torch.pinverse(self.W)
-
-	def construct_covariance_matrix(self):
-		if self.estimator == "likelihood":
-			self.W = self.construct_covariance_matrix_laplace()
-		elif self.estimator == "least-sq":
-			self.W = self.construct_covariance_matrix_regression()
-		elif self.estimator == "bins":
-			self.W = self.construct_covariance_matrix_bins()
-		else:
-			raise NotImplementedError("This estimator is not implemented.")
-		return self.W
-
-	def construct_covariance_matrix_laplace(self, theta=None):
-		W = torch.zeros(size=(self.get_m(), self.get_m())).double()
-
-		if self.feedback == "count-record":
-
-			if self.observations is not None:
-
-				if theta is None:
-					D = torch.diag(1. / ((self.observations @ self.rate).view(-1) ** 2))
-					W = self.observations.T @ D @ self.observations
-				else:
-					D = torch.diag(1. / ((self.observations @ theta).view(-1) ** 2))
-					W = self.observations.T @ D @ self.observations
-
-		elif self.feedback == "histogram":
-			# D = torch.diag(self.counts / (self.phis @ self.rate).view(-1) ** 2)
-			if len(self.variances_histogram) > 0:
-				variances = self.variances_histogram.view(-1).clone()
-
-				for i in range(variances.shape[0]):
-					variances[i] = variances[i] * self.variance_correction(variances[i])
-
-				D = torch.diag(self.counts / variances ** 2)
-
-			W = self.phis.T @ D @ self.phis
-		else:
-			raise AssertionError("Not implemented.")
-
-		return W + torch.eye(self.get_m()).double() * self.s
-
-	def construct_covariance_matrix_regression(self):
-		W = torch.zeros(size=(self.get_m(), self.get_m())).double()
-		if self.data is not None:
-			variances = self.variances
-			if self.feedback == "count-record":
-				mask = self.bucketized_counts > 0
-				tau = self.total_bucketized_time
-				for index_o, o in enumerate(self.bucketized_obs):
-					n = mask[index_o]
-					if n > 0:
-						A = self.varphis[index_o, :].view(-1, 1) @ self.varphis[index_o, :].view(1, -1) * tau[index_o]
-						k = self.variance_correction(tau[index_o] * variances[index_o])
-						W = W + A / (variances[index_o] * k)
-
-			elif self.feedback == "histogram":
-
-				if len(self.variances_histogram) > 0:
-					variances = self.variances_histogram.view(-1).clone()
-
-					for i in range(variances.shape[0]):
-						variances[i] = variances[i] * self.variance_correction(variances[i])
-
-					D = torch.diag(1. / variances)
-					W = self.phis.T @ D @ self.phis
-
-		return W + torch.eye(self.get_m()).double() * self.s
-
-	def construct_covariance_matrix_bins(self):
-		W = torch.zeros(size=(self.get_m(), self.get_m())).double()
-
-		if self.feedback == "count-record":
-
-			mask = self.bucketized_counts > 0
-			tau = self.total_bucketized_time
-			varphis = self.varphis[mask, :]
-			variances = self.variances.view(-1).clone()
-
-			for i in range(variances.size()[0]):
-				if mask[i] > 0:
-					variances[i] = variances[i] * self.variance_correction(variances[i] * tau[i])
-
-			variances = variances[mask]
-			tau = tau[mask]
-
-			if self.observations is not None:
-				D = torch.diag(tau / variances)
-				W = varphis.T @ D @ varphis
-
-		elif self.feedback == "histogram":
-
-			if len(self.variances_histogram) > 0:
-				variances = self.variances_histogram.view(-1).clone()
-
-				for i in range(variances.shape[0]):
-					variances[i] = variances[i] * self.variance_correction(variances[i])
-
-				D = torch.diag(1. / variances)
-				W = self.phis.T @ D @ self.phis
-		else:
-			raise AssertionError("Not implemented.")
-
-		return W + torch.eye(self.get_m()).double() * self.s
-
-	def gap(self, S, actions, w, dt, beta=2.):
-		"""
-		Estimates the gap of an action S,
-		:param S:
-		:param dt:
-		:return:
-		"""
-		phi = self.packing.integral(S) * dt
-		Gamma_half = self.packing.cov().numpy()
-
-		if self.approx is None:
-			l, Lambda, u = self.get_constraints()
-			Lambda = Lambda @ Gamma_half
-			ucbs = []
-			for action in actions:
-				phi_a = self.packing.integral(action) * dt
-				# ucb, _ = maximize_on_elliptical_slice(phi_a.numpy()-phi.numpy(), self.W.numpy(), self.rate.view(-1).numpy(), beta, l, Lambda, u)
-				ucb, _ = maximize_on_elliptical_slice(phi.numpy(), self.W.numpy(),
-													  self.rate.view(-1).numpy(), beta, l, Lambda, u)
-				ucbs.append(float(ucb))
-			gap = torch.max(torch.Tensor(ucbs))
-
-		else:
-			if self.data is None:
-				return (self.B - self.b) * S.volume()
-
-			if self.ucb_identified == False:
-				print("Recomputing UCB.....")
-				self.ucb_identified = True
-				self.fit_ellipsoid_approx()
-				self.max_ucb = -1000
-				self.ucb_action = None
-
-				for action in actions:
-					_, __, ucb = self.map_lcb_ucb_approx_action(action, dt=dt, beta=self.beta(0))
-					ucb = ucb / w(action)
-
-					if ucb > self.max_ucb:
-						self.max_ucb = ucb
-						self.ucb_action = action
-
-			map, lcb, ucb = self.map_lcb_ucb_approx_action(S, dt=dt, beta=self.beta(0))
-			gap = w(S) * self.max_ucb - lcb
-		return gap
-
-	def information(self, S, dt, precomputed=None):
-
-		if self.data is None:
-			return 1.
-
-		if self.W is None:
-			self.construct_covariance_matrix()
-
-		if self.feedback == "count-record":
-			varphi_UCB = self.packing.integral(self.ucb_action).view(1, -1) * dt
-
-			if precomputed is not None:
-				Upsilon = precomputed[S] * dt
-			else:
-				ind = []
-				for index, set in enumerate(self.basic_sets):
-					if S.inside(set):
-						ind.append(index)
-				Upsilon = self.varphis[ind, :] * dt
-
-			I = torch.eye(Upsilon.size()[0]).double()
-			G = self.W_inv_approx - self.W_inv_approx @ Upsilon.T @ torch.inverse(
-				I + Upsilon @ Upsilon.T) @ Upsilon @ self.W_inv_approx
-			return 10e-4 + torch.logdet(varphi_UCB @ self.W_inv_approx @ varphi_UCB.T) - torch.logdet(
-				varphi_UCB @ G @ varphi_UCB.T)
-
-		elif self.feedback == "histogram":
-
-			return torch.log(1 + self.packing.integral(S) @ self.W_inv_approx @ self.packing.integral(S) * dt ** 2)
-
-	def map_lcb_ucb_approx(self, S, n, beta=2.0, delta=0.01):
-		xtest = S.return_discretization(n)
-		if self.data is None:
-			return self.b + 0 * xtest[:, 0].view(-1, 1), \
-				   self.b + 0 * xtest[:, 0].view(-1, 1), \
-				   self.B + 0 * xtest[:, 0].view(-1, 1)
-
-		self.fit_ellipsoid_approx()
-		self.fit_ellipsoid_approx()
-
-		Phi = self.packing.embed(xtest).double()
-		map = Phi @ self.rate
-		N = Phi.size()[0]
-
-		ucb = torch.zeros(size=(N, 1)).double()
-		lcb = torch.zeros(size=(N, 1)).double()
-
-		for i in range(N):
-			x = Phi[i, :].view(-1, 1)
-			ucb[i, 0] = np.minimum(map[i] + beta * np.sqrt(x.T @ self.W_inv_approx @ x), self.B)
-			lcb[i, 0] = np.maximum(map[i] - beta * np.sqrt(x.T @ self.W_inv_approx @ x), self.b)
-		return map, lcb, ucb
-
-	def map_lcb_ucb(self, S, n, beta=2.0):
-		"""
-		Calculate exact confidence using laplace approximation on a whole set domain
-		:param S: set
-		:param n: discretization
-		:param beta: beta
-		:return:
-		"""
-
-		xtest = S.return_discretization(n)
-		if self.data is None:
-			return self.b + 0 * xtest[:, 0].view(-1, 1), \
-				   self.b + 0 * xtest[:, 0].view(-1, 1), \
-				   self.B + 0 * xtest[:, 0].view(-1, 1)
-
-		N = xtest.size()[0]
-		Phi = self.packing.embed(xtest)
-		map = Phi @ self.rate
-
-		if self.uncertainty == "laplace":
-			W = self.construct_covariance_matrix_laplace()
-		elif self.uncertainty == "least-sq":
-			W = self.construct_covariance_matrix_regression()
-		elif self.uncertainty == "bins":
-			W = self.construct_covariance_matrix_bins()
-		else:
-			raise AssertionError("Not implemented ")
-
-		Gamma_half = self.cov().numpy()
-		l, Lambda, u = self.get_constraints()
-		Lambda = Lambda @ Gamma_half
-		ucb = torch.zeros(size=(N, 1)).double()
-		lcb = torch.zeros(size=(N, 1)).double()
-
-		for i in range(N):
-			x = Phi[i, :]
-			ucbi, _ = maximize_on_elliptical_slice(x.numpy(), (W).numpy(), self.rate.view(-1).numpy(), np.sqrt(beta), l,
-												   Lambda,
-												   u)
-			lcbi, _ = maximize_on_elliptical_slice(-x.numpy(), (W).numpy(), self.rate.view(-1).numpy(), np.sqrt(beta),
-												   l, Lambda,
-												   u)
-			ucb[i, 0] = ucbi
-			lcb[i, 0] = -lcbi
-
-		return map, lcb, ucb
-
-	def map_lcb_ucb_likelihood_ratio(self, S, n, delta=0.1, current=False):
-		xtest = S.return_discretization(n)
-
-		if self.data is None:
-			return self.b + 0 * xtest[:, 0].view(-1, 1), \
-				   self.b + 0 * xtest[:, 0].view(-1, 1), \
-				   self.B + 0 * xtest[:, 0].view(-1, 1)
-
-		N = xtest.size()[0]
-		Phi = self.packing.embed(xtest)
-		map = Phi @ self.rate
-
-		ucb = torch.zeros(size=(N, 1)).double()
-		lcb = torch.zeros(size=(N, 1)).double()
-
-		phis = self.phis.numpy()
-
-		if current:
-			if self.observations is not None:
-				v = np.log(1. / delta) - torch.sum(torch.log(self.observations @ self.rate)) + torch.sum(
-					self.phis @ self.rate) + self.s * 0.5 * torch.sum(self.rate ** 2)
-			else:
-				v = np.log(1. / delta) + torch.sum(
-					self.phis @ self.rate) + self.s * 0.5 * torch.sum(self.rate ** 2)
-		else:
-			if self.feedback == 'count-record':
-				v = np.log(1. / delta) + self.loglikelihood + 0.5 * self.s * torch.sum(self.rate ** 2)
-			elif self.feedback == 'histogram':
-				v = np.log(1. / delta) + self.loglikelihood + 0.5 * self.s * torch.sum(self.rate ** 2)
-			else:
-				raise NotImplementedError("Not compatible with given feedback model ")
-
-		l, Lambda, u = self.get_constraints()
-		Gamma_half = self.cov().numpy()
-		Lambda = Lambda @ Gamma_half
-
-		for i in range(N):
-			x = Phi[i, :].numpy()
-
-			theta = cp.Variable(self.get_m())
-
-			objective_min = cp.Minimize(x @ theta)
-			objective_max = cp.Maximize(x @ theta)
-
-			constraints = []
-			constraints.append(Lambda @ theta >= l)
-			constraints.append(Lambda @ theta <= u)
-
-			if self.feedback == 'count-record':
-				if self.observations is not None:
-					observations = self.observations.numpy()
-
-					constraints.append(
-						-cp.sum(cp.log(observations @ theta)) +
-						cp.sum(phis @ theta) + self.s * 0.5 * cp.sum_squares(theta)
-						<= v)
-				else:
-					constraints.append(cp.sum(phis @ theta) + self.s * 0.5 * cp.sum_squares(theta)
-									   <= v)
-
-			elif self.feedback == 'histogram':
-				constraints.append(
-					-cp.sum(cp.log(phis @ theta)) +
-					cp.sum(phis @ theta) + self.s * 0.5 * cp.sum_squares(theta)
-					<= v)
-			else:
-				raise NotImplementedError("Does not exist.")
-
-			prob = cp.Problem(objective_min, constraints)
-			prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False)
-			lcb[i, 0] = float(np.dot(theta.value, x))
-
-			prob = cp.Problem(objective_max, constraints)
-			prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False)
-			ucb[i, 0] = float(np.dot(theta.value, x))
-
-		return map, lcb, ucb
-
-	def mean_var_conformal_set(self, S, dt, beta=2., max_val=None, delta=0.05):
-		# self.bucketize_prepare()
-		if max_val is None:
-			max_val = int(self.B * self.basic_sets[0].volume() * dt) + 1
-		map, lcb, ucb = self.conformal_confidence_set(S, delta=delta, max_val=max_val, dt=dt)
-		return map, lcb, ucb
-
-	def conformal_score_func(self, theta, new, index):
-
-		if new[1] is None:
-			n_new = 0
-		else:
-			n_new = new[1].size()[0]
-
-		varphi = self.packing.integral(new[0]) * new[2]
-		err_new = abs(float(n_new) - float(varphi @ theta))
-		n = len(self.bucketized_obs[index])
-
-		if n > 0:
-
-			phis = self.varphis[index].repeat(n, 1)
-			res = torch.Tensor(self.bucketized_obs[index]).double()
-
-			err = torch.abs(res - (phis @ theta.view(-1, 1)).view(-1))
-
-			return torch.sum(err < err_new).double() / float(n + 1.) + 1. / (float(n) + 1.)
-
-		else:
-			return 0.
-
-	def conformal_confidence(self, delta=0.05, max_val=20, dt=1, step=1):
-		lcb = []
-		ucb = []
-		map = []
-
-		if self.data is not None:
-			self.bucketization(time=True)
-
-		for S in self.basic_sets:
-			m, u, l = self.conformal_confidence_set(S, delta=delta, max_val=max_val, dt=dt, step=step)
-
-			map.append(m)
-			ucb.append(u)
-			lcb.append(l)
-
-		return torch.Tensor(map).double(), torch.Tensor(ucb).double(), torch.Tensor(lcb).double()
-
-	def conformal_confidence_set(self, S, delta=0.05, max_val=20, dt=1., step=1):
-		"""
-		:return: (lcb,ucb)
-		"""
-
-		if self.data is not None:
-			if self.feedback == "count-record":
-				self.penalized_likelihood()
-			elif self.feedback == "histogram":
-				self.penalized_likelihood_integral()
-
-			# identify the set in basic sets
-			index = 0
-			for set in self.basic_sets:
-				if set.inside(S):
-					break
-				index += 1
-
-			# calculate map estimate
-			map = float(self.rate @ self.packing.integral(S))
-		else:
-			map = self.b
-			return map, self.B, self.b
-
-		scores = []
-		j = 0
-		score = 1.
-		lowest = 0
-		n = float(len(self.bucketized_obs[index]))
-
-		while score > np.ceil((1 - delta) * (n + 1)) / (n + 1) and j <= max_val:
-			lowest = j
-			if j > 0:
-				obs = torch.zeros(size=(j, self.d)).double()
-				for i in range(self.d):
-					obs[:, i] = torch.from_numpy(np.random.uniform(S.bounds[i, 0], S.bounds[i, 1], size=j))
-			else:
-				obs = None
-
-			# new observation
-			new = (S, obs, dt)
-
-			old_phis, old_observations, old_counts = self.add_data_point_and_remove(new)
-
-			if self.feedback == "count-record":
-				theta_new = self.penalized_likelihood()
-			elif self.feedback == "histogram":
-				theta_new = self.penalized_likelihood_integral()
-
-			# restore back the data
-			self.phis = old_phis
-			self.observations = old_observations
-			self.counts = old_counts
-
-			# calculate the score
-			score = self.conformal_score_func(theta_new, new, index)
-			n = float(len(self.bucketized_obs[index]))
-
-			print(j, "/", max_val, score, np.ceil((1 - delta) * (n + 1)) / (n + 1))
-			j = j + 1
-
-		j = max_val
-		score = 1.
-		largest = max_val
-
-		while score > np.ceil((1 - delta) * (n + 1)) / (n + 1) and j > lowest:
-			largest = j
-			if j > 0:
-				obs = torch.zeros(size=(j, self.d)).double()
-				for i in range(self.d):
-					obs[:, i] = torch.from_numpy(np.random.uniform(S.bounds[i, 0], S.bounds[i, 1], size=j))
-			else:
-				obs = None
-
-			# new observation
-			new = (S, obs, dt)
-
-			old_phis, old_observations, old_counts = self.add_data_point_and_remove(new)
-
-			if self.feedback == "count-record":
-				theta_new = self.penalized_likelihood()
-			elif self.feedback == "histogram":
-				theta_new = self.penalized_likelihood_integral()
-
-			# restore back the data
-			self.phis = old_phis
-			self.observations = old_observations
-			self.counts = old_counts
-
-			# calculate the score
-			score = self.conformal_score_func(theta_new, new, index)
-			n = float(len(self.bucketized_obs[index]))
-
-			print(j, "/", max_val, score, np.ceil((1 - delta) * (n + 1)) / (n + 1))
-			j = j - 1
-		# scores = np.array(scores)
-		# mask = scores < np.ceil((1-delta)*(n+1))/(n+1)
-
-		# if np.sum(mask) == 0:
-		# 	lowest = 0
-		# 	largest = max_val
-		# else:
-		# 	lowest = np.min(np.arange(0,max_val,step)[mask])
-		# 	largest = np.max(np.arange(0, max_val, step)[mask])
-
-		lcb = lowest / dt / S.volume()
-		ucb = largest / dt / S.volume()
-
-		return (map, ucb, lcb)
+    def __init__(
+        self,
+        anchor_hierarchy: HierarchicalBorelSets,
+        d: int = 1,
+        basis_size_per_dim: int = 100,
+        kernel: Optional[KernelFunction] = None,
+        max_intensity: float = 1.0,
+        s=1.0,
+        jitter=10e-8,
+        min_intensity: float = 0.0,
+        basis: str = "triangle",
+        estimator: str = "likelihood",
+        feedback_type: str = "count-record",
+        offset=0.1,
+        uncertainty="laplace",
+        approx=None,
+        sampling_stepsize=None,
+        embedding: Optional[Embedding] = None,
+        beta=2.0,
+        sampling="proximal+prox",
+        peeking=True,
+        constraints=True,
+        var_cor_on=True,
+        samples_nystrom=15000,
+        inverted_constraint=False,
+        langevine_sampling_steps=None,
+        use_anchors=True,
+        no_anchor_points=1024,
+        U=1.0,
+        optimization_library="torch",
+        roi: torch.Tensor | BorelSet | None = None,
+        roi_discretization: int = 30,
+        memory_limit=None,
+        device=torch.get_default_device(),
+        dtype=torch.get_default_dtype(),
+    ):
+        self.d = d
+        """ Dimension of the data """
+        self.s = s
+        self.b = min_intensity
+        """ Minimal value of the intensity function """
+        self.B = max_intensity
+        """ Maximal value of the intensity function """
+        self.U = U
+        self.stepsize = sampling_stepsize
+        self.sampling = sampling
+        self.steps = langevine_sampling_steps
+        self.optimization_library = optimization_library
+        self.kernel = kernel
+        # set hierarchy
+        self.constraints = constraints
+        self.hierarchy = anchor_hierarchy
+        self.ucb_identified = False
+        self.inverted_constraint = inverted_constraint
+        # approximation
+        self.loglikelihood = 0.0
+        self.dual = use_anchors
+        self.peeking = peeking
+        self.no_anchor_points = no_anchor_points
+        if beta < 0.0:
+            self.beta = lambda t: self.beta_theory()
+        else:
+            self.beta = lambda t: beta
+        self.var_cor_on = var_cor_on
+        self.device = device
+        self.dtype = dtype
+
+        if basis == "triangle":
+            self.packing = TriangleEmbedding(
+                d,
+                basis_size_per_dim,
+                kernel_object=kernel,
+                B=max_intensity,
+                b=min_intensity,
+                offset=offset,
+                s=np.sqrt(jitter),
+            )
+        elif basis == "bernstein":
+            self.packing = BernsteinEmbedding(
+                d,
+                basis_size_per_dim,
+                kernel_object=kernel,
+                B=max_intensity,
+                b=min_intensity,
+                offset=offset,
+                s=np.sqrt(jitter),
+            )
+        elif basis == "splines":
+            self.packing = BernsteinSplinesEmbedding(
+                d,
+                basis_size_per_dim,
+                kernel_object=kernel,
+                B=max_intensity,
+                b=min_intensity,
+                offset=offset,
+                s=np.sqrt(jitter),
+            )
+        elif basis == "nystrom":
+            self.packing = PositiveNystromEmbeddingBump(
+                d,
+                basis_size_per_dim,
+                kernel_object=kernel,
+                B=max_intensity,
+                b=min_intensity,
+                offset=offset,
+                s=np.sqrt(jitter),
+                samples=samples_nystrom,
+            )
+        elif basis == "overlap-splines":
+            self.packing = BernsteinSplinesOverlapping(
+                d,
+                basis_size_per_dim,
+                kernel_object=kernel,
+                B=max_intensity,
+                b=min_intensity,
+                offset=offset,
+                s=np.sqrt(jitter),
+            )
+        elif basis == "faber":
+            self.packing = FaberSchauderEmbedding(
+                d,
+                basis_size_per_dim,
+                kernel_object=kernel,
+                B=max_intensity,
+                b=min_intensity,
+                offset=offset,
+                s=np.sqrt(jitter),
+            )
+        elif basis == "optimal-positive":
+            self.packing = OptimalPositiveBasis(
+                d,
+                basis_size_per_dim,
+                kernel_object=kernel,
+                B=max_intensity,
+                b=min_intensity,
+                offset=offset,
+                s=np.sqrt(jitter),
+                samples=samples_nystrom,
+                data=roi,
+                discretization_size=roi_discretization,
+                memory_limit=memory_limit,
+            )
+        elif basis == "custom":
+            assert embedding is not None
+            self.packing = embedding
+        else:
+            raise NotImplementedError("The request positive basis is not implemented.")
+        self.m = basis_size_per_dim
+        """ Number of basis functions per dimension """
+        self.data = None
+        self.covariance = False
+
+        # stabilizing the matrix inversion
+        self.jitter = jitter
+
+        # for variance stabilization
+        self.stabilization = None
+        self.approx_fit = False
+
+        # properties of rate estimator
+        self.estimator = estimator
+        self.feedback = feedback_type
+        self.uncertainty = uncertainty
+        self.approx = approx
+
+        # precompute information
+        self.basic_sets = self.hierarchy.get_sets_level(self.hierarchy.levels)
+
+        self.varphis = torch.zeros(size=(len(self.basic_sets), self.get_m())).double()
+        self.variances = torch.ones(size=(len(self.basic_sets), 1)).double().view(-1)
+        self.variances_histogram = []
+        self.observations = None
+        self.rate = None
+        r""" $\hat \theta$ in the paper"""
+        self.W = (s) * torch.eye(self.get_m()).double()
+        self.W_inv_approx = (1.0 / s) * torch.eye(self.get_m()).double()
+        self.beta_value = 2.0
+        self.sampled_theta = None
+        if self.dual == True:
+            if self.d == 1:
+                anchor = no_anchor_points
+                self.anchor_points = self.hierarchy.top_node.return_discretization(
+                    anchor
+                )
+                self.anchor_weights = torch.zeros(size=(anchor, 1)).double().view(-1)
+            elif self.d == 2:
+                anchor = no_anchor_points
+                self.anchor_points = self.hierarchy.top_node.return_discretization(
+                    int(np.sqrt(anchor))
+                )
+                self.anchor_weights = torch.zeros(size=(anchor, 1)).double().view(-1)
+            self.global_dt = 0.0
+            self.anchor_points_emb = self.packing.embed(self.anchor_points)
+
+        if feedback_type == "count-record" and self.dual:
+            print("Precomputing phis.")
+            for index_set, set in enumerate(self.basic_sets):
+                self.varphis[index_set, :] = self.packing.integral(set)
+                self.variances[index_set] = set.volume() * self.B
+
+        print("Precomputation finished.")
+
+    def add_data_point(self, new_data, times=True):
+        r"""
+        Takes data in the format (area: BorelSet, data_points: Tensor, time_delta: float)
+        where data_points is a 2d tensor, with number of columns equal to d
+        and number of rows equal to the number of point observations
+
+        It adds
+
+        - the integral over the sensing area minus the log of that integral (both scaled by time_delta) if the data is of type histogram
+        - the integral over the sensing area minus the sum of the log of the rate function at the datapoints (both scaled by time_delta) if the data is of type count-record
+
+        to `self.loglikelihood`
+        """
+
+        super().add_data_point(new_data, times=times)
+
+        if self.rate is not None:
+            rate = self.rate
+        else:
+            l, _, u = self.get_constraints()
+            Gamma_half = self.cov()
+            rate = Gamma_half @ u
+
+        if self.feedback == "histogram":
+            val = self.packing.integral(new_data[0]) @ rate * new_data[2]
+            v = -np.log(val) + val
+
+        elif self.feedback == "count-record":
+            v = self.packing.integral(new_data[0]) @ rate * new_data[2]
+            if new_data[1] is not None:
+                val2 = self.packing.embed(new_data[1]) @ rate * new_data[2]
+                v = v - torch.sum(np.log(val2))
+
+        self.loglikelihood += v
+
+    def beta_theory(self):
+        if self.approx_fit == False:
+            l, Lambda, u = self.get_constraints()
+            Gamma_half, invGamma_half = self.cov(inverse=True)
+
+            ## norm
+            norm = self.s
+
+            ## constraints
+            eps = 10e-3
+            res = Gamma_half @ self.rate.view(-1, 1) - torch.tensor(l).view(-1, 1)
+            xi = res.clone()
+            xi[res > eps] = 0.0
+
+            constraint = xi.T @ Gamma_half @ self.W_inv_approx @ Gamma_half.T @ xi
+
+            ## concentration
+            vol = (
+                4 * np.log(1.0 / 0.1)
+                + torch.logdet(self.W)
+                - self.get_m() * np.log(self.s)
+            )
+            self.beta_value = np.sqrt(norm + vol + constraint)
+            print("-------------------")
+            print("New beta:", self.beta_value)
+            print("norm:", norm)
+            print("constraint:", constraint)
+            print("vol:", vol)
+            print("-------------------")
+        else:
+            pass
+        return self.beta_value
+
+    def get_constraints(self):
+        old_elements = self.packing.get_constraints()
+        new_elements = []
+        for element in old_elements:
+            if isinstance(element, np.ndarray):
+                element = torch.tensor(element)
+            new_elements.append(element)
+        return tuple(new_elements)
+
+    def cov(self, inverse=False):
+        return self.packing.cov(inverse=inverse)
+
+    def fit(self):
+        self.fit_gp()
+
+    def fit_gp(
+        self,
+        threads=4,
+        optimization_library=None,
+    ):
+        optimization_library = (
+            optimization_library
+            if optimization_library is not None
+            else self.optimization_library
+        )
+
+        if self.data is not None:
+            if self.feedback == "count-record":
+
+                if self.estimator == "likelihood":
+                    if optimization_library == "cvxpy":
+                        self.penalized_likelihood(threads=threads)
+                    elif optimization_library == "torch":
+                        self.penalized_likelihood_fast()
+                    else:
+                        raise NotImplementedError(
+                            "The optimization method does not exist"
+                        )
+
+                elif self.estimator == "least-sq":
+                    self.least_squares_weighted()
+
+                elif self.estimator == "bins":
+                    self.penalized_likelihood_bins()
+
+                else:
+                    raise AssertionError("wrong name.")
+
+            elif self.feedback == "histogram":
+
+                if self.estimator == "likelihood":
+                    self.penalized_likelihood_integral()
+
+                elif self.estimator == "least-sq":
+                    self.least_squares_weighted_integral()
+
+                elif self.estimator == "bins":
+                    self.penalized_likelihood_integral_bins()
+
+                else:
+                    raise AssertionError("wrong name.")
+            else:
+                raise AssertionError("wrong name.")
+        else:
+            l, Lambda, u = self.get_constraints()
+            Gamma_half = self.cov()
+            self.rate = l
+
+    def sample_mirror_langevin(self, steps=500, verbose=False):
+
+        l, Lambda, u = self.get_constraints()
+        Gamma_half, invGamma_half = self.cov(inverse=True)
+
+        v = torch.tensor((u + l) / 2.0).view(-1, 1)
+        S = torch.diag(torch.tensor(u - l).view(-1) / 2.0).double()
+
+        phis = self.phis.clone() @ invGamma_half
+
+        if self.observations is not None:
+            obs = self.observations @ invGamma_half
+        else:
+            obs = None
+
+        invGamma = invGamma_half.T @ invGamma_half
+        transform = lambda y: S @ torch.tanh(y) + v
+
+        if self.feedback == "count-record" and self.dual == False:
+            if obs is not None:
+                func = (
+                    lambda y: -torch.sum(torch.log(obs @ transform(y)).view(-1))
+                    + torch.sum(phis @ transform(y))
+                    + self.s * transform(y).T @ invGamma @ transform(y)
+                    + torch.sum(torch.log(1.0 / (1.0 - transform(y) ** 2)))
+                )
+            else:
+                func = (
+                    lambda y: torch.sum(phis @ transform(y))
+                    + self.s * transform(y).T @ invGamma @ transform(y)
+                    + torch.sum(torch.log(1.0 / (1.0 - transform(y) ** 2)))
+                )  # torch.sum(torch.log(0.5*(1.+torch.cosh(2*y))))
+
+        elif self.feedback == "count-record" and self.dual == True:
+            mask = self.bucketized_counts > 0
+            phis = self.varphis[mask, :] @ invGamma_half
+            tau = self.total_bucketized_time[mask]
+
+            if obs is not None:
+                obs = self.anchor_points_emb @ invGamma_half
+                weights = self.anchor_weights
+                mask = weights > 0.0
+
+                func = (
+                    lambda y: -torch.sum(
+                        weights[mask].view(-1, 1)
+                        * torch.log(obs[mask, :] @ transform(y))
+                    )
+                    + torch.sum(tau.view(-1, 1) * (phis @ transform(y)))
+                    + self.s * transform(y).T @ invGamma @ transform(y)
+                    + torch.sum(torch.log(1.0 / (1.0 - (transform(y) ** 2))))
+                )  # + torch.sum(torch.log(0.5*(1.+torch.cosh(2*y))))
+            else:
+                func = (
+                    lambda y: torch.sum(tau.view(-1, 1) * (phis @ transform(y)))
+                    + self.s * transform(y).T @ invGamma @ transform(y)
+                    + torch.sum(torch.log(1.0 / (1.0 - transform(y) ** 2)))
+                )  # + torch.sum(torch.log(0.5*(1.+torch.cosh(2*y))))
+
+        elif self.feedback == "histogram":
+            func = (
+                lambda y: -torch.sum(
+                    self.counts.clone().view(-1)
+                    * torch.log(phis @ (S @ torch.tanh(y) + v)).view(-1)
+                )
+                + torch.sum(phis @ (S @ torch.tanh(y) + v))
+                + self.s
+                * (S @ torch.tanh(y) + v).T
+                @ invGamma
+                @ (S @ torch.tanh(y) + v)
+            )
+
+        y = torch.rand(size=(self.get_m(), 1), dtype=self.dtype, requires_grad=True)
+
+        # initialize with map squeezed more
+        y.data = Gamma_half @ self.rate.view(-1, 1)  # l < theta < u
+
+        u_new = u + 0.01
+        l_new = l - 0.01
+        v2 = torch.tensor((u_new + l_new) / 2.0).view(-1, 1)
+        S2 = torch.diag(torch.tensor(u_new - l_new).view(-1) / 2.0).double()
+        #
+        y.data = torch.inverse(S2) @ (y.data - v2)
+        y.data = torch.atanh(y.data)
+
+        W = (
+            S.T
+            @ invGamma_half.T
+            @ self.construct_covariance_matrix_laplace()
+            @ invGamma_half
+            @ S
+        )
+        L = float(
+            scipy.sparse.linalg.eigsh(
+                W.cpu().numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-8
+            )
+        )
+        eta = 0.05 / (L + 1)
+
+        print("Eta:", eta)
+
+        for k in range(steps):
+
+            w = torch.randn(size=(self.get_m(), 1)).double()
+            nabla_y = torch.autograd.functional.jacobian(func, y).data[0, 0, :, :]
+            y.data = y.data - eta * nabla_y + np.sqrt(2 * eta) * w
+            theta = torch.tanh(y).detach()
+
+            if verbose == True:
+                print("Iter:", k, (S @ theta + v).T)
+                print(y.T)
+
+        self.sampled_theta = invGamma_half @ transform(y.data)
+
+    def sample_projected_langevin(self, steps=300, verbose=False, stepsize=None):
+        """
+        Run projected Langevin iterations starting from ``self.rate``.
+
+        Every iteration takes a gradient step with added Gaussian noise and
+        projects the result onto ``{theta : Lambda @ Gamma_half @ theta >= l}``
+        (least-squares projection solved with OSQP through cvxpy).
+
+        :param steps: number of iterations
+        :param verbose: if truthy, print the iterate after each step
+        :param stepsize: optional step multiplier; when given the step is
+            ``min(1, stepsize * 0.5 / L)``, otherwise ``0.5 / (L + 1)``, where
+            ``L`` is the largest-magnitude eigenvalue of
+            ``construct_covariance_matrix_laplace(minimal=True)``
+        :return: None; the last iterate is stored in ``self.sampled_theta``
+        :raises NotImplementedError: if no gradient is defined for the
+            configured ``feedback`` / ``dual`` combination
+        """
+
+        Gamma_half = self.packing.cov()
+
+        # The constraint set is the same in every iteration, so it is built once
+        # here instead of inside ``prox`` (which runs once per step).
+        l, Lambda, _ = self.get_constraints()
+        Lambda = Lambda @ Gamma_half.cpu().numpy()
+        lower = l.reshape(-1, 1)
+
+        def prox(x):
+            # Projection of x onto {theta : Lambda @ theta >= lower}.
+            z = x.cpu().numpy()
+            theta = cp.Variable((self.get_m(), 1))
+            objective = cp.Minimize(cp.sum_squares(z - theta))
+            constraints = [Lambda @ theta >= lower]
+            prob = cp.Problem(objective, constraints)
+            prob.solve(
+                solver=cp.OSQP,
+                warm_start=False,
+                verbose=False,
+                eps_abs=1e-3,
+                eps_rel=1e-3,
+            )
+            return torch.tensor(theta.value)
+
+        if self.feedback == "count-record" and self.dual == False:
+            if self.observations is not None:
+                nabla = (
+                    lambda y: -torch.einsum(
+                        "i,ij->j",
+                        1.0 / (self.observations @ y).view(-1),
+                        self.observations,
+                    ).view(-1, 1)
+                    + torch.sum(self.phis, dim=0).view(-1, 1)
+                    + self.s * y.view(-1, 1)
+                )
+            else:
+                nabla = lambda theta: torch.sum(self.phis, dim=0).view(
+                    -1, 1
+                ) + self.s * theta.view(-1, 1)
+
+        elif self.feedback == "count-record" and self.dual == True:
+            mask = self.bucketized_counts > 0
+            phis = self.varphis[mask, :]
+            tau = self.total_bucketized_time[mask]
+
+            if self.observations is not None:
+                obs = self.anchor_points_emb
+                weights = self.anchor_weights
+                # ``mask`` is rebound on purpose: phis/tau above are already
+                # sliced, the lambda below only needs the anchor mask.
+                mask = weights > 0.0
+                nabla = (
+                    lambda y: -torch.einsum(
+                        "i,ij->j",
+                        weights[mask] / ((obs[mask, :] @ y).view(-1)),
+                        obs[mask],
+                    ).view(-1, 1)
+                    + torch.einsum("i,ij->j", tau, phis).view(-1, 1)
+                    + self.s * y.view(-1, 1)
+                )
+            else:
+                nabla = lambda y: torch.einsum("i,ij->j", tau, phis).view(
+                    -1, 1
+                ) + self.s * y.view(-1, 1)
+
+        elif self.feedback == "histogram":
+            nabla = (
+                lambda theta: -torch.sum(
+                    torch.diag((1.0 / (self.phis @ theta).view(-1)) * self.counts)
+                    @ self.phis,
+                    dim=0,
+                ).view(-1, 1)
+                + torch.sum(self.phis, dim=0).view(-1, 1)
+                + self.s * theta.view(-1, 1)
+            )
+
+        else:
+            # Previously this fell through and failed later with an unbound
+            # ``nabla``; fail early with an explicit error instead.
+            raise NotImplementedError(
+                "Projected Langevin sampling is not supported for this feedback."
+            )
+
+        theta = self.rate.view(-1, 1)
+        W = self.construct_covariance_matrix_laplace(minimal=True)
+        L = float(
+            scipy.sparse.linalg.eigsh(
+                W.cpu().numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-5
+            )
+        )
+
+        if stepsize is None:
+            eta = 0.5 / (L + 1)
+        else:
+            eta = np.minimum(1, stepsize * 0.5 / L)
+
+        print(eta)
+        for k in range(steps):
+            w = torch.randn(size=(self.get_m(), 1)).double()
+            theta = prox(theta - eta * nabla(theta) + np.sqrt(2 * eta) * w)
+
+            if verbose:
+                print("Iter:", k, theta.T)
+
+        self.sampled_theta = theta
+
+    def sample_proximal_langevin_prox(self, steps=300, verbose=False, stepsize=None):
+        """
+        Proximal Langevin sampler whose proximal map is computed with a QP solve.
+
+        Each iteration recomputes the step size from the largest-magnitude
+        eigenvalue ``L`` of ``construct_covariance_matrix_laplace(theta=theta)``
+        and applies
+        ``theta <- 0.5 * theta - eta * nabla(theta) + 0.5 * prox(theta) + sqrt(2 * eta) * w``
+        with standard normal ``w``.
+
+        :param steps: number of iterations
+        :param verbose: if True, print the iterate after each step
+        :param stepsize: optional multiplier; ``eta = 0.5 * stepsize / L`` when
+            given, ``eta = 0.5 / L`` otherwise
+        :return: list with ``prox(theta)`` for every iteration; ``prox`` of the
+            last iterate is also stored in ``self.sampled_theta``
+        """
+
+        Gamma_half, invGamma_half = self.packing.cov(inverse=True)
+        # invGamma = invGamma_half.T @ invGamma_half
+        l, Lambda, u = self.get_constraints()
+        # NOTE(review): this product is not used below; ``prox`` passes
+        # Gamma_half (not Lambda) as the constraint matrix -- confirm intent.
+        Lambda = Lambda @ Gamma_half.cpu().numpy()
+
+        def prox(x):
+            # QP with identity quadratic term and linear term x, constrained by
+            # Gamma_half and l; the first entry of the result is used as the
+            # solution (presumably quadprog's solve_qp -- verify the import).
+            res = solve_qp(
+                np.eye(self.get_m()),
+                x.cpu().numpy().reshape(-1),
+                C=Gamma_half.cpu().numpy(),
+                b=np.array(l),
+                factorized=True,
+            )
+            return torch.tensor(res[0]).view(-1, 1)
+
+        # Alternative prox via cvxpy, kept for reference:
+        # theta_n = cp.Variable((self.get_m(), 1))
+        # x = cp.Parameter((self.get_m(), 1))
+        # objective = cp.Minimize(cp.sum_squares(x - theta_n))
+        #
+        # constraints = []
+        # l, Lambda, u = self.get_constraints()
+        # Lambda = Lambda @ Gamma_half.cpu().numpy()
+        # constraints.append(Lambda @ theta_n >= l.reshape(-1, 1))
+        # constraints.append(Lambda @ theta_n <= u.reshape(-1, 1))
+        #
+        # prob = cp.Problem(objective, constraints)
+
+        # Alternative prox via non-negative least squares, kept for reference:
+        # def prox(x):
+        # 	return Gamma_half @ torch.tensor(scipy.optimize.nnls(invGamma.cpu().numpy(), (invGamma_half@x).numpy().reshape(-1), maxiter = 1000)[0]).view(-1,1)
+
+        samples = []
+
+        # Select the gradient ``nabla`` matching the feedback model.
+        # NOTE(review): with data present and an unrecognised feedback value,
+        # ``nabla`` stays undefined and the loop below fails.
+        if self.data is not None:
+            if self.feedback == "count-record" and self.dual == False:
+                if self.observations is not None:
+                    nabla = (
+                        lambda y: -torch.einsum(
+                            "i,ij->j",
+                            1.0 / (self.observations @ y).view(-1),
+                            self.observations,
+                        ).view(-1, 1)
+                        + torch.sum(self.phis, dim=0).view(-1, 1)
+                        + self.s * y.view(-1, 1)
+                    )
+                else:
+                    nabla = lambda theta: torch.sum(self.phis, dim=0).view(
+                        -1, 1
+                    ) + self.s * theta.view(-1, 1)
+
+            elif self.feedback == "count-record" and self.dual == True:
+                mask = self.bucketized_counts > 0
+                phis = self.varphis[mask, :]
+                tau = self.total_bucketized_time[mask]
+
+                if self.observations is not None:
+                    obs = self.anchor_points_emb
+                    weights = self.anchor_weights
+                    # ``mask`` is rebound; phis/tau above are already sliced.
+                    mask = weights > 0.0
+                    nabla = (
+                        lambda y: -torch.einsum(
+                            "i,ij->j",
+                            weights[mask] / ((obs[mask, :] @ y).view(-1)),
+                            obs[mask],
+                        ).view(-1, 1)
+                        + torch.einsum("i,ij->j", tau, phis).view(-1, 1)
+                        + self.s * y.view(-1, 1)
+                    )
+                else:
+                    nabla = lambda y: torch.einsum("i,ij->j", tau, phis).view(
+                        -1, 1
+                    ) + self.s * y.view(-1, 1)
+
+            elif self.feedback == "histogram":
+                nabla = (
+                    lambda theta: -torch.sum(
+                        torch.diag((1.0 / (self.phis @ theta).view(-1)) * self.counts)
+                        @ self.phis,
+                        dim=0,
+                    ).view(-1, 1)
+                    + torch.sum(self.phis, dim=0).view(-1, 1)
+                    + self.s * theta.view(-1, 1)
+                )
+        else:
+            # No data: only the regularization term contributes.
+            nabla = lambda theta: self.s * theta.view(-1, 1)
+
+        # Start from the fitted rate when available, otherwise from a small
+        # random offset added to self.b.
+        if self.rate is not None:
+            theta = self.rate.view(-1, 1)
+        else:
+            theta = (
+                self.b
+                + 0.05
+                * torch.rand(
+                    size=(self.get_m(), 1), dtype=self.dtype, requires_grad=False
+                ).view(-1, 1)
+                ** 2
+            )
+
+        for k in range(steps):
+            w = torch.randn(size=(self.get_m(), 1)).double()
+
+            # calculate proper step-size (recomputed at the current iterate)
+            W = self.construct_covariance_matrix_laplace(theta=theta)
+            L = float(
+                scipy.sparse.linalg.eigsh(
+                    W.cpu().numpy(),
+                    k=1,
+                    which="LM",
+                    return_eigenvectors=False,
+                    tol=1e-3,
+                )
+            )
+            if stepsize is not None:
+                eta = 0.5 * stepsize / L
+            else:
+                eta = 0.5 / L
+
+            # prox calculate
+            # x.value = theta.cpu().numpy()
+            # prob.solve(solver=cp.OSQP, warm_start=True, verbose=False, eps_abs=1e-3, eps_rel=1e-3)
+            # proximal_theta = torch.tensor(theta_n.value)
+
+            # update step
+            # 			theta = 0.5 * theta - eta * nabla(theta) + 0.5 * proximal_theta + np.sqrt(2 * eta) * w
+
+            # update step
+            theta = (
+                0.5 * theta
+                - eta * nabla(theta)
+                + 0.5 * prox(theta)
+                + np.sqrt(2 * eta) * w
+            )
+            if verbose == True:
+                print("Iter:", k, theta.T)
+
+            # The stored sample is the prox of the raw iterate.
+            samples.append(prox(theta))
+
+        self.sampled_theta = prox(theta)
+
+        return samples
+
+    def sample_proximal_langevin_simple_prox(self, steps=300, verbose=False):
+        """
+        Proximal Langevin sampler that uses box clipping as the proximal map.
+
+        The iterate starts at ``self.rate`` and is updated as
+        ``y <- (1 - eta) * y - eta * nabla(y) + eta * prox(y) + sqrt(2 * eta) * w``
+        with standard normal ``w`` and ``eta = 0.5 / (L + 1)``, where ``L`` is the
+        largest-magnitude eigenvalue of ``construct_covariance_matrix_laplace()``.
+        ``prox`` maps its argument through ``Gamma_half``, clips it elementwise
+        to ``[l, u]`` and maps back through ``invGamma_half``.
+
+        :param steps: number of iterations
+        :param verbose: if truthy, print the iterate and the gradient used in
+            every step
+        :return: None; ``prox`` of the last iterate is stored in
+            ``self.sampled_theta``
+        :raises NotImplementedError: if no gradient is defined for the
+            configured ``feedback`` / ``dual`` combination
+        """
+
+        Gamma_half, invGamma_half = self.packing.cov(inverse=True)
+        l, _, u = self.get_constraints()
+        lower = torch.tensor(l).view(-1)
+        upper = torch.tensor(u).view(-1)
+
+        def prox_simple(x):
+            # Elementwise clipping to the box [lower, upper].
+            return torch.minimum(torch.maximum(x.view(-1), lower), upper).view(-1, 1)
+
+        def prox(x):
+            return invGamma_half @ prox_simple(Gamma_half @ x)
+
+        phis = self.phis
+        if self.feedback == "count-record" and self.dual == False:
+            if self.observations is not None:
+                obs = self.observations
+                nabla = (
+                    lambda y: -torch.einsum(
+                        "i,ij->j", 1.0 / (obs @ y).view(-1), obs
+                    ).view(-1, 1)
+                    + torch.sum(phis, dim=0).view(-1, 1)
+                    + self.s * y.view(-1, 1)
+                )
+            else:
+                nabla = lambda y: torch.sum(phis, dim=0).view(-1, 1) + self.s * y.view(
+                    -1, 1
+                )
+
+        elif self.feedback == "count-record" and self.dual == True:
+            mask = self.bucketized_counts > 0
+            phis = self.varphis[mask, :]
+            tau = self.total_bucketized_time[mask]
+
+            if self.observations is not None:
+                obs = self.anchor_points_emb
+                weights = self.anchor_weights
+                # ``mask`` is rebound; phis/tau above are already sliced.
+                mask = weights > 0.0
+                nabla = (
+                    lambda y: -torch.einsum(
+                        "i,ij->j",
+                        weights[mask] / ((obs[mask, :] @ y).view(-1)),
+                        obs[mask],
+                    ).view(-1, 1)
+                    + torch.einsum("i,ij->j", tau, phis).view(-1, 1)
+                    + self.s * y.view(-1, 1)
+                )
+            else:
+                nabla = lambda y: torch.einsum("i,ij->j", tau, phis).view(
+                    -1, 1
+                ) + self.s * y.view(-1, 1)
+
+        elif self.feedback == "histogram":
+            nabla = (
+                lambda y: -torch.einsum(
+                    "i,ij->j", self.counts.view(-1) / (phis @ y).view(-1), phis
+                ).view(-1, 1)
+                + torch.sum(phis, dim=0).view(-1, 1)
+                + self.s * y
+            )
+
+        else:
+            # Previously this fell through and failed later with an unbound
+            # ``nabla``; fail early with an explicit error instead.
+            raise NotImplementedError(
+                "Proximal Langevin sampling is not supported for this feedback."
+            )
+
+        # The random draw is overwritten by self.rate on the next line; it is
+        # kept so the random-number stream seen by callers does not change.
+        y = prox(
+            torch.randn(size=(self.get_m(), 1), dtype=self.dtype, requires_grad=True)
+        )
+        y.data = self.rate.view(-1, 1)
+
+        W = self.construct_covariance_matrix_laplace()
+        L = float(
+            scipy.sparse.linalg.eigsh(
+                W.cpu().numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-5
+            )
+        )
+
+        eta = 0.5 / (L + 1)
+
+        for k in range(steps):
+            # Noise gets its own name so it no longer shadows the matrix W.
+            w = torch.randn(size=(self.get_m(), 1)).double()
+            nabla_y = nabla(y.data)
+            y.data = (
+                (1 - eta) * y.data
+                - eta * nabla_y
+                + eta * prox(y.data)
+                + np.sqrt(2 * eta) * w
+            )
+            if verbose:
+                print("Iter:", k, y.T)
+                # No backward pass is ever run, so ``y.grad`` is None; print the
+                # analytic gradient that was actually used for this step.
+                print("grad:", nabla_y.T)
+
+        self.sampled_theta = prox(y.detach())
+
+    def sample_hessian_positive_langevin(self, steps=500, verbose=False, stepsize=None):
+        """
+        Langevin-type sampler that works through the gradient of a log barrier.
+
+        With the barrier ``phi(x) = -sum(log(Lambda @ x))`` each iteration forms
+        ``z = nabla_phi(y) - eta * nabla(y) + sqrt(2 * eta) * H @ w`` and then
+        recovers the next iterate by minimizing
+        ``sum((nabla_phi(s) - z) ** 2)`` with ``minimize_torch`` (newton-cg).
+
+        :param steps: number of iterations
+        :param verbose: if True, print the initial point and every iterate
+        :param stepsize: optional multiplier; ``eta = stepsize / (L + 1)`` when
+            given, ``eta = 1 / (L + 1)`` otherwise, with ``L`` the
+            largest-magnitude eigenvalue of
+            ``construct_covariance_matrix_laplace()``
+        :return: None; the last iterate is stored in ``self.sampled_theta``
+        """
+
+        # Select the gradient ``nabla`` matching the feedback model.
+        # NOTE(review): with data present and an unrecognised feedback value,
+        # ``nabla`` stays undefined and the loop below fails.
+        if self.data is not None:
+            if self.feedback == "count-record" and self.dual == False:
+                if self.observations is not None:
+                    nabla = (
+                        lambda y: -torch.einsum(
+                            "i,ij->j",
+                            1.0 / (self.observations @ y).view(-1),
+                            self.observations,
+                        ).view(-1, 1)
+                        + torch.sum(self.phis, dim=0).view(-1, 1)
+                        + self.s * y.view(-1, 1)
+                    )
+                else:
+                    nabla = lambda theta: torch.sum(self.phis, dim=0).view(
+                        -1, 1
+                    ) + self.s * theta.view(-1, 1)
+
+            elif self.feedback == "count-record" and self.dual == True:
+
+                mask = self.bucketized_counts > 0
+                phis = self.varphis[mask, :]
+                tau = self.total_bucketized_time[mask]
+
+                if self.observations is not None:
+                    obs = self.anchor_points_emb
+                    weights = self.anchor_weights
+                    # ``mask`` is rebound; phis/tau above are already sliced.
+                    mask = weights > 0.0
+                    nabla = (
+                        lambda y: -torch.einsum(
+                            "i,ij->j",
+                            weights[mask] / ((obs[mask, :] @ y).view(-1)),
+                            obs[mask],
+                        ).view(-1, 1)
+                        + torch.einsum("i,ij->j", tau, phis).view(-1, 1)
+                        + self.s * y.view(-1, 1)
+                    )
+                else:
+                    nabla = lambda y: torch.einsum("i,ij->j", tau, phis).view(
+                        -1, 1
+                    ) + self.s * y.view(-1, 1)
+
+            elif self.feedback == "histogram":
+                nabla = (
+                    lambda theta: -torch.sum(
+                        torch.diag((1.0 / (self.phis @ theta).view(-1)) * self.counts)
+                        @ self.phis,
+                        dim=0,
+                    ).view(-1, 1)
+                    + torch.sum(self.phis, dim=0).view(-1, 1)
+                    + self.s * theta.view(-1, 1)
+                )
+        else:
+            # No data: only the regularization term contributes.
+            nabla = lambda theta: self.s * theta.view(-1, 1)
+
+        Gamma_half = self.packing.cov()
+        # Only Lambda is used below; lz and u are not referenced again.
+        lz, Lambda, u = self.get_constraints()
+
+        Lambda = torch.tensor(Lambda) @ Gamma_half
+        # Small random offset added to self.b, flattened to one dimension.
+        y = (
+            self.b
+            + 0.05
+            * torch.rand(
+                size=(self.get_m(), 1), dtype=self.dtype, requires_grad=True
+            ).view(-1)
+            ** 2
+        )
+
+        # Map the offset through Gamma_half and, when a fitted rate exists,
+        # start from the rate plus that offset.
+        if self.rate is not None:
+            y.data = self.rate.data + Gamma_half @ y.data
+        else:
+            y.data = Gamma_half @ y.data
+
+        if verbose == True:
+            print("initial point")
+            print(y.data)
+
+        W = self.construct_covariance_matrix_laplace()
+        L = float(
+            scipy.sparse.linalg.eigsh(
+                W.cpu().numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-5
+            )
+        )
+
+        if stepsize is None:
+            eta = 1.0 / (L + 1)
+        else:
+            eta = stepsize / (L + 1)
+
+        # Noise scaling: Lambda times diag(1 / |Lambda @ x|).
+        D = lambda x: torch.diag(1.0 / torch.abs(Lambda @ x).view(-1))
+        sqrt_hessian = lambda x: Lambda @ D(x)
+
+        # Log barrier, its gradient and its Hessian. Only ``nabla_phi`` is used
+        # by the loop; ``phi`` and ``hessian_phi`` are referenced solely from
+        # the commented-out alternatives below.
+        phi = lambda x: -torch.sum(torch.log(Lambda @ x))
+        nabla_phi = lambda x: -torch.einsum(
+            "i,ij->j", 1.0 / (Lambda @ x).view(-1), Lambda
+        )
+        hessian_phi = (
+            lambda x: Lambda.T @ torch.diag(1.0 / (Lambda @ x).view(-1) ** 2) @ Lambda
+        )
+
+        for k in range(steps):
+            w = torch.randn(size=(self.get_m(), 1)).double()
+            nabla_val = nabla(y)
+            H = sqrt_hessian(y.data)
+            # Step taken in the space of barrier gradients.
+            z = (
+                nabla_phi(y.data).view(-1, 1)
+                - eta * nabla_val
+                + np.sqrt(2 * eta) * H @ w
+            )
+
+            # Alternative: solve nabla_phi(s) = z directly with a Newton solver.
+            # y.data = newton_solve(lambda s: nabla_phi(s).reshape(-1)-z.data.reshape(-1),y.reshape(-1),
+            #  					  verbose = verbose, grad = hessian_phi).view(-1,1)
+
+            # Minimization approach: find s whose barrier gradient matches z in
+            # the least-squares sense.
+            def objective(s):
+                return torch.sum((nabla_phi(s).reshape(-1) - z.reshape(-1)) ** 2)
+
+            # #
+
+            # x0 = y.reshape(-1).clone().detach().numpy()
+            # res = minimize(objective, x0, backend='torch', method='Newton-CG', precision='float64', tol=1e-5, hvp_type='vhp')
+            # y.data = torch.tensor(res.x)
+
+            # Warm-start the solve from the current iterate.
+            x0 = y.reshape(-1).clone()
+            res = minimize_torch(objective, x0, method="newton-cg", tol=1e-5)
+            y.data = res.x
+
+            if verbose:
+                print("Iter:", k)
+                print(y.T)
+
+        self.sampled_theta = y.data
+
+    def sample_mla_prime(self, steps=100, verbose=False, stepsize=None):
+        """
+        Mirror-Langevin-style sampler using the map ``y -> -1 / y``.
+
+        Each iteration forms ``w0 = eta * nabla(y) + 1 / y``, lets
+        ``get_increment`` (from ``stpy.approx_inference.sampling_helper``)
+        produce ``w`` from it, and maps back with ``y = -1 / w``.
+
+        :param steps: number of iterations
+        :param verbose: if truthy, print the initial point and every iterate
+        :param stepsize: optional multiplier; ``eta = stepsize / (L + 1)`` when
+            given, ``eta = 1 / (L + 1)`` otherwise, with ``L`` the
+            largest-magnitude eigenvalue of
+            ``invGamma_half.T @ construct_covariance_matrix_laplace() @ invGamma_half``
+        :return: None; ``invGamma_half @ y`` for the last iterate is stored in
+            ``self.sampled_theta``
+        :raises NotImplementedError: if data is present but the feedback model
+            is not ``"count-record"`` with ``dual == False``
+        """
+        Gamma_half, invGamma_half = self.packing.cov(inverse=True)
+        invGamma = invGamma_half.T @ invGamma_half
+        l, Lambda, u = self.get_constraints()
+        Lambda = torch.tensor(Lambda) @ Gamma_half
+
+        if self.data is not None:
+            if self.feedback == "count-record" and self.dual == False:
+                # ``phis`` is needed by both branches below; it used to be
+                # defined only in the first one, so the second raised NameError.
+                phis = self.phis @ invGamma_half
+                if self.observations is not None:
+                    observations = self.observations @ invGamma_half
+                    nabla = (
+                        lambda y: -torch.einsum(
+                            "i,ij->j", 1.0 / (observations @ y).view(-1), observations
+                        ).view(-1, 1)
+                        + torch.sum(phis, dim=0).view(-1, 1)
+                        + self.s * invGamma @ y.view(-1, 1)
+                    )
+                else:
+                    nabla = lambda theta: torch.sum(phis, dim=0).view(
+                        -1, 1
+                    ) + self.s * invGamma @ theta.view(-1, 1)
+            else:
+                # Previously this fell through and failed later with an unbound
+                # ``nabla``; fail early with an explicit error instead.
+                raise NotImplementedError(
+                    "sample_mla_prime supports only count-record feedback "
+                    "with dual == False."
+                )
+
+        else:
+            # No data: only the regularization term contributes.
+            nabla = lambda theta: self.s * invGamma @ theta.view(-1, 1)
+
+        # Small random offset added to self.b as the starting point.
+        y = (
+            self.b
+            + 0.05
+            * torch.rand(
+                size=(self.get_m(), 1), dtype=self.dtype, requires_grad=True
+            ).reshape(-1, 1)
+            ** 2
+        )
+        # Starting from the fitted rate is intentionally disabled:
+        # if self.rate is not None:
+        # 	y.data = Gamma_half @ self.rate.data.view(-1,1) + y.data
+
+        if verbose:
+            print("initial point")
+            print(y.data)
+
+        W = invGamma_half.T @ self.construct_covariance_matrix_laplace() @ invGamma_half
+        L = float(
+            scipy.sparse.linalg.eigsh(
+                W.cpu().numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-5
+            )
+        )
+
+        if stepsize is None:
+            eta = 1.0 / (L + 1)
+        else:
+            eta = stepsize / (L + 1)
+
+        from stpy.approx_inference.sampling_helper import get_increment
+
+        # Function handed to get_increment; it does not depend on the
+        # iteration, so it is defined once.
+        f = lambda w, n: n / torch.abs(w)
+
+        for k in range(steps):
+
+            nabla_val = nabla(y)
+
+            # cvxpy minimization
+            # x = cp.Variable((self.get_m(), 1))
+            # objective = cp.Minimize( eta * nabla_val.detach().numpy().T @ x - cp.sum(cp.log(x)) -(-1./y.data).T@x)
+            # constraints = [x >= 0.]
+            #
+            # prob = cp.Problem(objective, constraints)
+            # prob.solve(solver = cp.MOSEK)
+
+            # initial point for the solve
+            w0 = eta * nabla_val.data + 1.0 / y.data
+            # w0 = -1./( torch.tensor(x.value))
+
+            # simulate
+            w = get_increment(eta, 1000, f, w0, path=False)
+
+            # back mirror map
+            y.data = -1.0 / w
+
+            if verbose:
+                print("Iter:", k)
+                print(y.T)
+
+        self.sampled_theta = invGamma_half @ y.data
+
+    def sample_hessian_positive_langevin_2(
+        self, steps=500, verbose=False, stepsize=None, preconditioner=True
+    ):
+        """
+        Langevin-type sampler using the map ``y -> -1 / y`` shifted by ``self.b``.
+
+        Each iteration draws ``w = randn / |y|``, forms
+        ``z = -1 / y + b - eta * Gamma_half @ nabla(y) + sqrt(2 * eta) * Gamma_half @ w``
+        and sets ``y = -1 / z + b``.
+
+        :param steps: number of iterations
+        :param verbose: if truthy, print the initial point and every iterate
+        :param stepsize: optional multiplier; ``eta = stepsize / (L + 1)`` when
+            given, ``eta = 1 / (L + 1)`` otherwise, with ``L`` the
+            largest-magnitude eigenvalue of
+            ``construct_covariance_matrix_laplace(minimal=True)``
+        :param preconditioner: accepted for interface compatibility; not used
+        :return: None; ``invGamma_half @ y`` for the last iterate is stored in
+            ``self.sampled_theta``
+        :raises NotImplementedError: if data is present but the feedback model
+            is not ``"count-record"`` with ``dual == False``
+        """
+
+        Gamma_half, invGamma_half = self.packing.cov(inverse=True)
+        invGamma = invGamma_half @ invGamma_half
+        if self.data is not None:
+
+            if self.feedback == "count-record" and self.dual == False:
+
+                phis = self.phis @ invGamma_half
+
+                if self.observations is not None:
+                    # Transform the observations only after the None check;
+                    # doing it earlier made the else branch unreachable.
+                    observations = self.observations @ invGamma_half
+                    nabla = (
+                        lambda y: -torch.einsum(
+                            "i,ij->j", 1.0 / (observations @ y).view(-1), observations
+                        ).view(-1, 1)
+                        + torch.sum(phis, dim=0).view(-1, 1)
+                        + self.s * invGamma @ y.view(-1, 1)
+                    )
+                else:
+                    nabla = lambda theta: torch.sum(phis, dim=0).view(
+                        -1, 1
+                    ) + self.s * invGamma @ theta.view(-1, 1)
+
+            else:
+                # Previously this fell through and failed later with an unbound
+                # ``nabla``; fail early with an explicit error instead.
+                raise NotImplementedError(
+                    "sample_hessian_positive_langevin_2 supports only "
+                    "count-record feedback with dual == False."
+                )
+
+        else:
+            # No data: only the regularization term contributes.
+            nabla = lambda theta: self.s * invGamma @ theta.view(-1, 1)
+
+        # Squared uniform draws as the starting point.
+        y = (
+            torch.rand(
+                size=(self.get_m(), 1), dtype=self.dtype, requires_grad=True
+            ).view(-1)
+            ** 2
+        )
+        # Starting from the fitted rate is intentionally disabled:
+        # if self.rate is not None:
+        # 	y.data = Gamma_half @ self.rate.data + y.data
+
+        if verbose:
+            print("initial point")
+            print(y.data)
+
+        W = self.construct_covariance_matrix_laplace(minimal=True)
+        L = float(
+            scipy.sparse.linalg.eigsh(
+                W.cpu().numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-5
+            )
+        )
+
+        if stepsize is None:
+            eta = 1.0 / (L + 1)
+        else:
+            eta = stepsize / (L + 1)
+
+        for k in range(steps):
+            # Noise scaled elementwise by 1 / |y|.
+            w = torch.randn(size=(self.get_m(), 1)).double() / torch.abs(y.data).view(
+                -1, 1
+            )
+            nabla_val = nabla(y)
+            z = (
+                -1.0 / y.data.view(-1, 1)
+                + self.b
+                - eta * Gamma_half @ nabla_val
+                + np.sqrt(2 * eta) * Gamma_half @ w
+            )
+            # Map back.
+            y.data = -1.0 / z + self.b
+
+            if verbose:
+                print("Iter:", k)
+                print(y.T)
+
+        self.sampled_theta = invGamma_half @ y.data
+
+    def sample_newton_langevin(self, steps=1000, stepsize=None, verbose=False):
+        """
+        Newton-type Langevin sampler with a ``-bar * log(y)`` barrier term.
+
+        Each iteration applies
+        ``y <- y - solve(H, nabla(y)) + sqrt(2 * eta) * solve(chol(H), w)``
+        where ``H`` is the Hessian built below (data term, ``invGamma`` and the
+        barrier term ``diag(bar / y ** 2)``), ``w`` is standard normal noise,
+        ``bar = 10`` and ``eta = 1``.
+
+        :param steps: number of iterations
+        :param stepsize: accepted for interface compatibility; not used
+            (``eta`` is fixed to 1.0)
+        :param verbose: if truthy, print every iterate
+        :return: None; ``invGamma_half @ y`` for the last iterate is stored in
+            ``self.sampled_theta``
+        :raises NotImplementedError: if data is present but the feedback model
+            is not ``"count-record"`` with ``dual == False``
+        """
+        Gamma_half, invGamma_half = self.packing.cov(inverse=True)
+        invGamma = invGamma_half @ invGamma_half
+
+        # Stays None when there are no observations, so the Hessian can skip
+        # the data term instead of failing on an unbound name.
+        observations = None
+
+        if self.data is not None:
+
+            if self.feedback == "count-record" and self.dual == False:
+
+                phis = self.phis @ invGamma_half
+
+                if self.observations is not None:
+                    # Transform the observations only after the None check;
+                    # doing it earlier made the else branch unreachable.
+                    observations = self.observations @ invGamma_half
+                    nabla = (
+                        lambda y, bar: -torch.einsum(
+                            "i,ij->j", 1.0 / (observations @ y).view(-1), observations
+                        ).view(-1, 1)
+                        + torch.sum(phis, dim=0).view(-1, 1)
+                        + self.s * invGamma @ y.view(-1, 1)
+                        - bar * 1.0 / y
+                    )
+                else:
+                    nabla = (
+                        lambda theta, bar: torch.sum(phis, dim=0).view(-1, 1)
+                        + self.s * invGamma @ theta.view(-1, 1)
+                        - bar * 1.0 / theta
+                    )
+
+            else:
+                # Previously this fell through and failed later with an unbound
+                # ``nabla``; fail early with an explicit error instead.
+                raise NotImplementedError(
+                    "sample_newton_langevin supports only count-record "
+                    "feedback with dual == False."
+                )
+
+        else:
+            # No data: regularization and barrier terms only.
+            nabla = (
+                lambda theta, bar: self.s * invGamma @ theta.view(-1, 1)
+                - bar * 1.0 / theta
+            )
+
+        # Small squared uniform draws as the starting point.
+        y = (
+            0.05
+            * torch.rand(
+                size=(self.get_m(), 1), dtype=self.dtype, requires_grad=True
+            ).view(-1, 1)
+            ** 2
+        )
+
+        barrier = 10.0
+
+        def data_hessian(theta):
+            # Second derivative of the -sum(log(observations @ theta)) term;
+            # zero when that term is absent from ``nabla``.
+            if observations is None:
+                return 0.0
+            return (
+                observations.T
+                @ torch.diag(1 / (observations @ theta).view(-1) ** 2)
+                @ observations
+            )
+
+        # NOTE(review): ``nabla`` scales invGamma by self.s while this Hessian
+        # does not; kept exactly as in the original -- confirm intent.
+        def hessian(theta, bar):
+            return (
+                data_hessian(theta)
+                + invGamma
+                + torch.diag(bar / theta.view(-1) ** 2)
+            )
+
+        def hessian_sqrt(theta, bar):
+            # torch.linalg.cholesky replaces the deprecated torch.cholesky;
+            # both return the lower-triangular factor by default.
+            return torch.linalg.cholesky(hessian(theta, bar))
+
+        eta = 1.0
+
+        for k in range(steps):
+            w = torch.randn(size=(self.get_m(), 1)).double()
+            nabla_val = nabla(y, barrier)
+            y.data = (
+                y.data
+                - torch.linalg.solve(hessian(y.data, barrier), nabla_val)
+                + np.sqrt(2 * eta)
+                * torch.linalg.solve(hessian_sqrt(y.data, barrier), w)
+            )
+
+            if verbose:
+                print("Iter:", k)
+                print(y.T)
+
+        self.sampled_theta = invGamma_half @ y.data
+
+    # self.sampled_theta = y.data
+
+    def sample_hmc(self, steps=1000, stepsize=None, verbose=False):
+        """
+        Draw a sample with Hamiltonian Monte Carlo through ``hamiltorch``.
+
+        The log-probability passed to ``hamiltorch.sample`` is
+        ``sum(log(obs @ y)) - sum(phis @ y) - s * y.T @ y`` (without the log
+        term when there are no observations); the chain starts at ``self.rate``.
+
+        :param steps: passed to hamiltorch as ``num_steps_per_sample``
+        :param stepsize: passed to hamiltorch as ``step_size``; ``1e-8`` when None
+        :param verbose: accepted for interface compatibility; not used
+        :return: None; whatever ``hamiltorch.sample`` returns is stored in
+            ``self.sampled_theta`` and printed
+        :raises NotImplementedError: if the feedback model is not
+            ``"count-record"`` with ``dual == False``
+        """
+        import hamiltorch
+
+        phis = self.phis
+        if self.feedback == "count-record" and self.dual == False:
+            if self.observations is not None:
+                obs = self.observations
+                func = (
+                    lambda y: torch.sum(torch.log(obs @ y))
+                    - torch.sum((phis @ y))
+                    - self.s * y.T @ y
+                )
+            else:
+                func = lambda y: -torch.sum(phis @ y).view(-1, 1) - self.s * y.T @ y
+        else:
+            # Previously this fell through and failed later with an unbound
+            # ``func``; fail early with an explicit error instead.
+            raise NotImplementedError(
+                "sample_hmc supports only count-record feedback with dual == False."
+            )
+
+        num_samples = 1
+        num_steps_per_sample = steps
+        if stepsize is None:
+            step_size = 1e-8
+        else:
+            step_size = stepsize
+
+        params_init = self.rate
+        # The result used to be stored under the misspelled attribute
+        # ``sample_theta`` while the next line (and every other sampler) reads
+        # ``sampled_theta``.
+        # NOTE(review): the hamiltorch result is stored as returned; confirm
+        # whether consumers of ``sampled_theta`` need a single tensor instead.
+        self.sampled_theta = hamiltorch.sample(
+            log_prob_func=func,
+            params_init=params_init,
+            num_samples=num_samples,
+            step_size=step_size,
+            num_steps_per_sample=num_steps_per_sample,
+        )
+        print(self.sampled_theta)
+
+    def sample_variational(self, xtest, accuracy=1e-4, verbose=False, samples=1):
+        """
+        Fit a ``VMF_SGCP`` model and return posterior sample paths at ``xtest``.
+
+        The model is built on the borders ``[[-1, 1]]`` with ``self.x`` as data,
+        the kernel's ``kappa`` and ``gamma`` as covariance parameters,
+        ``self.m`` inducing points and 256 integration points.
+
+        :param xtest: points handed to ``sample_posterior``
+        :param accuracy: passed to the model as ``conv_crit``
+        :param verbose: accepted but not used
+        :param samples: accepted but not used; ``num_samples`` is fixed to 1.0
+        :return: the value returned by ``sample_posterior``
+        """
+        from stpy.approx_inference.variational_mf import VMF_SGCP
+
+        kernel_params = [self.kernel.kappa, self.kernel.gamma]
+        borders = np.array([[-1.0, 1.0]])
+
+        model = VMF_SGCP(
+            borders,
+            self.x,
+            kernel_params,
+            self.m,
+            num_integration_points=256,
+            update_hyperparams=False,
+            output=0,
+            conv_crit=accuracy,
+        )
+        model.run()
+        return model.sample_posterior(xtest, num_samples=1.0)
+
+    def sample(self, verbose=False, steps=None, domain=None):
+        """
+        Dispatch to the sampler selected by ``self.sampling``.
+
+        Fits the rate first (``self.fit_gp()``) when ``self.rate`` is None.
+        ``self.stepsize`` is forwarded only to the ``"hessian"``,
+        ``"hessian2"``, ``"mla_prime"`` and ``"hmc"`` samplers.
+
+        :param verbose: forwarded to the selected sampler
+        :param steps: number of steps; defaults to ``self.steps``
+        :param domain: forwarded as ``xtest`` to the variational sampler; not
+            used by the other samplers
+        :return: whatever the selected sampler returns
+        :raises NotImplementedError: if ``self.sampling`` is not recognised
+        """
+        if steps is None:
+            steps = self.steps
+
+        stepsize = self.stepsize
+
+        # NOTE(review): the result is not used here; the call is kept as in
+        # the original.
+        l, Lambda, u = self.get_constraints()
+        if self.rate is None:
+            self.fit_gp()
+
+        if self.sampling == "mirror":
+            r = self.sample_mirror_langevin(steps=steps, verbose=verbose)
+        elif self.sampling == "proximal+prox":
+            r = self.sample_proximal_langevin_prox(steps=steps, verbose=verbose)
+        elif self.sampling == "proximal+simple_prox":
+            r = self.sample_proximal_langevin_simple_prox(steps=steps, verbose=verbose)
+        elif self.sampling == "hessian":
+            r = self.sample_hessian_positive_langevin(
+                steps=steps, verbose=verbose, stepsize=stepsize
+            )
+        elif self.sampling == "hessian2":
+            r = self.sample_hessian_positive_langevin_2(
+                steps=steps, verbose=verbose, stepsize=stepsize
+            )
+        elif self.sampling == "mla_prime":
+            r = self.sample_mla_prime(steps=steps, verbose=verbose, stepsize=stepsize)
+        elif self.sampling == "hmc":
+            r = self.sample_hmc(steps=steps, verbose=verbose, stepsize=stepsize)
+        elif self.sampling == "polyia_variational":
+            # sample_variational requires ``xtest``; the call used to omit it
+            # and therefore always raised TypeError.
+            # NOTE(review): ``domain`` is assumed to be the intended test
+            # points -- confirm against callers.
+            r = self.sample_variational(
+                domain, accuracy=1.0 / steps, verbose=verbose
+            )
+        else:
+            raise NotImplementedError("Sampling of such is not supported.")
+
+        return r
+
+    def sampled_lcb_ucb(self, xtest, samples=100, delta=0.1):
+        """
+        Estimate pointwise lower and upper bounds from repeated sampling.
+
+        Calls ``self.sample()`` followed by ``self.sample_path_points(xtest)``
+        ``samples`` times, stacks the flattened paths row by row and takes
+        quantiles across the rows.
+
+        :param xtest: points at which each sampled path is evaluated
+        :param samples: number of paths to draw
+        :param delta: quantile level; the bounds are the ``delta`` and
+            ``1 - delta`` quantiles
+        :return: tuple ``(lcb, ucb)``
+        """
+
+        def draw_row():
+            # A fresh sample must be drawn before each path is evaluated.
+            self.sample()
+            return self.sample_path_points(xtest).view(1, -1)
+
+        stacked = torch.cat([draw_row() for _ in range(samples)], dim=0)
+        lower = torch.quantile(stacked, delta, dim=0)
+        upper = torch.quantile(stacked, 1 - delta, dim=0)
+        return lower, upper
+
+    def penalized_likelihood_fast(self):
+        """
+        Fit ``self.rate`` by minimizing the penalized objective with L-BFGS-B.
+
+        The objective is written in transformed coordinates (features are
+        multiplied by ``invGamma_half``) and minimized under the box bounds
+        ``(l[0] + eps, u[0])``. The solution is mapped through
+        ``invGamma_half`` and stored in ``self.rate``.
+
+        :return: the fitted ``self.rate``
+        """
+        # Only l and u are used below (for the optimizer bounds).
+        l, Lambda, u = self.get_constraints()
+        # assert torch.allclose(Lambda, torch.eye(self.m**self.d))
+
+        Gamma_half, invGamma_half = self.cov(inverse=True)
+        invGamma_half = invGamma_half.to(self.device)
+
+        # Regularization weight used in every objective below.
+        s = self.s * 0.5
+
+        if self.dual == False:
+            p = self.phis.to(self.device) @ invGamma_half
+            # using all points without anchor points
+            if self.observations is not None:
+                o = self.observations.to(self.device) @ invGamma_half
+
+                def objective(theta):
+                    return (
+                        -torch.sum(torch.log(o @ theta))
+                        + torch.sum(p @ theta)
+                        + s * torch.sum((invGamma_half @ theta) ** 2)
+                    )
+
+            else:
+
+                # No observations: the log term is absent.
+                def objective(theta):
+                    return torch.sum(p @ theta) + s * torch.sum(
+                        (invGamma_half @ theta) ** 2
+                    )
+
+        else:
+            # using anchor points
+            mask = self.bucketized_counts > 0
+            # NOTE(review): unlike the other tensors here, ``phis`` is not
+            # moved to self.device -- confirm it already lives there.
+            phis = self.varphis[mask, :]
+            tau = self.total_bucketized_time[mask].to(self.device)
+            p = phis @ invGamma_half
+
+            if self.observations is not None:
+                observations = self.anchor_points_emb.to(self.device)
+                weights = self.anchor_weights.to(self.device)
+                # ``mask`` is rebound; phis/tau above are already sliced.
+                mask = weights > 0.0
+
+                o = observations[mask, :] @ invGamma_half
+
+                def objective(theta):
+                    return (
+                        -torch.einsum(
+                            "i,i",
+                            weights[mask],
+                            torch.log(o @ theta),
+                        )
+                        + torch.einsum("i,i", tau, p @ theta)
+                        + s * torch.sum((invGamma_half @ theta) ** 2)
+                    )
+
+            else:
+
+                # No observations: the log term is absent.
+                def objective(theta):
+                    return torch.einsum("i,i", tau, p @ theta) + s * torch.sum(
+                        (invGamma_half @ theta) ** 2
+                    )
+
+        # Warm start from the current rate, zero-padded up to get_m() entries;
+        # otherwise start from zeros.
+        if isinstance(self.rate, torch.Tensor):
+            theta0 = torch.cat(
+                [
+                    self.rate.to(self.device),
+                    torch.zeros([self.get_m() - len(self.rate)], device=self.device),
+                ]
+            )
+        else:
+            theta0 = torch.zeros(size=(self.get_m(), 1)).view(-1).double()
+
+        # The lower bound is shifted up by eps.
+        eps = 1e-4
+        # ``minimize`` with backend="torch" is presumably autograd_minimize's
+        # wrapper around scipy.optimize -- verify against the file's imports.
+        res = minimize(
+            objective,
+            theta0.cpu().numpy(),
+            backend="torch",
+            method="L-BFGS-B",
+            bounds=(l[0] + eps, u[0]),
+            precision="float64",
+            tol=1e-8,
+            torch_device=str(self.device),
+            options={
+                "ftol": 1e-08,
+                "gtol": 1e-08,
+                "eps": 1e-08,
+                "maxfun": 15000,
+                "maxiter": 15000,
+                "maxls": 20,
+            },
+        )
+
+        # Map the solution back through invGamma_half.
+        self.rate = invGamma_half @ torch.tensor(res.x, device=self.device)
+        print(res.message)
+        return self.rate
+
+    def penalized_likelihood(self, threads=None):
+        """
+        Fit the rate by solving the penalized likelihood problem with cvxpy/MOSEK.
+
+        The objective is the negative log-likelihood of the observations plus
+        the integral term and ``0.5 * self.s * ||theta||^2``; with
+        ``self.dual`` set, the anchor-point formulation weighted by
+        ``self.anchor_weights`` and ``self.total_bucketized_time`` is used.
+        The box constraints from ``self.get_constraints()`` are applied to
+        ``(Lambda @ Gamma_half) @ theta``.
+
+        :param threads: number of MOSEK threads; when None, ``os.cpu_count() - 2``
+            (at least 1) is used, or 1 if the CPU count is unknown
+        :return: the fitted rate; if the solve fails, a message is printed and
+            the previous ``self.rate`` is returned unchanged
+        """
+        if threads is None:
+            cpu_count = os.cpu_count()
+            threads = max(cpu_count - 2, 1) if cpu_count is not None else 1
+
+        theta = cp.Variable(self.get_m())
+        l, Lambda, u = self.get_constraints()
+
+        Gamma_half = self.cov(inverse=False)
+
+        if self.dual == False:
+
+            # using all points without anchor points
+            phis = self.phis.cpu().numpy()
+            if self.observations is not None:
+                observations = self.observations.cpu().numpy()
+                objective = cp.Minimize(
+                    -cp.sum(cp.log(observations @ theta))
+                    + cp.sum(phis @ theta)
+                    + self.s * 0.5 * cp.sum_squares(theta)
+                )
+            else:
+                objective = cp.Minimize(
+                    cp.sum(phis @ theta) + self.s * 0.5 * cp.sum_squares(theta)
+                )
+
+        else:
+
+            # using anchor points
+            mask = self.bucketized_counts.clone().numpy() > 0
+            phis = self.varphis[mask, :].clone().numpy()
+            tau = self.total_bucketized_time[mask].clone().numpy()
+
+            if self.observations is not None:
+                observations = self.anchor_points_emb.cpu().numpy()
+                weights = self.anchor_weights.cpu().numpy()
+                # from here on `mask` selects anchor points with positive weight
+                mask = weights > 0.0
+                objective = cp.Minimize(
+                    -cp.sum(
+                        cp.multiply(
+                            weights[mask], cp.log(observations[mask, :] @ theta)
+                        )
+                    )
+                    + cp.sum(cp.multiply(tau, phis @ theta))
+                    + self.s * 0.5 * cp.sum_squares(theta)
+                )
+            else:
+                objective = cp.Minimize(
+                    cp.sum(cp.multiply(tau, phis @ theta))
+                    + self.s * 0.5 * cp.sum_squares(theta)
+                )
+
+        Lambda = (Lambda @ Gamma_half).cpu().numpy()
+        constraints = [
+            Lambda @ theta >= l.cpu().numpy(),
+            Lambda @ theta <= u.cpu().numpy(),
+        ]
+
+        prob = cp.Problem(objective, constraints)
+
+        if self.rate is not None:
+            # initial value: previous rate padded with zeros up to get_m()
+            theta.value = (
+                torch.cat([self.rate, torch.zeros([self.get_m() - len(self.rate)])])
+                .cpu()
+                .numpy()
+            )
+
+        try:
+            prob.solve(
+                solver=cp.MOSEK,
+                warm_start=False,
+                verbose=False,
+                mosek_params={
+                    mosek.iparam.num_threads: threads,
+                    mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                    mosek.dparam.intpnt_co_tol_pfeas: 1e-4,
+                    mosek.dparam.intpnt_co_tol_dfeas: 1e-4,
+                    mosek.dparam.intpnt_co_tol_rel_gap: 1e-4,
+                },
+            )
+
+            self.rate = torch.tensor(theta.value)
+            return self.rate
+        except Exception:
+            # `Exception` instead of a bare except: a solver failure keeps the
+            # old value, but KeyboardInterrupt / SystemExit still propagate.
+            print("Optimization failed. Using the old value.")
+            print(prob.status)
+            return self.rate
+
+    def penalized_likelihood_integral(self, threads=4):
+        """
+        Fit the rate from integrated actions and counts with cvxpy/MOSEK.
+
+        Minimizes ``-counts @ log(phis @ theta) + sum(phis @ theta)
+        + 0.5 * self.s * ||theta||^2`` subject to
+        ``l <= (Lambda @ Gamma_half) @ theta <= u``.
+
+        :param threads: number of MOSEK threads
+        :return: the fitted rate; if the solve fails, a message is printed and
+            the previous ``self.rate`` is returned unchanged
+        """
+        phis = self.phis.cpu().numpy()
+        counts = self.counts.cpu().numpy()
+
+        theta = cp.Variable(self.get_m())
+        l, Lambda, u = self.get_constraints()
+        Gamma_half = self.cov().numpy()
+        objective = cp.Minimize(
+            -cp.sum(counts @ cp.log(phis @ theta))
+            + cp.sum(phis @ theta)
+            + self.s * 0.5 * cp.sum_squares(theta)
+        )
+
+        Lambda = Lambda @ Gamma_half
+        constraints = [Lambda @ theta >= l, Lambda @ theta <= u]
+
+        # Build the problem outside the try block so that `prob` is always
+        # bound when the failure handler prints its status.
+        prob = cp.Problem(objective, constraints)
+        try:
+            prob.solve(
+                solver=cp.MOSEK,
+                warm_start=False,
+                verbose=False,
+                mosek_params={
+                    mosek.iparam.num_threads: threads,
+                    mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                    mosek.dparam.intpnt_co_tol_pfeas: 1e-4,
+                    mosek.dparam.intpnt_co_tol_dfeas: 1e-4,
+                    mosek.dparam.intpnt_co_tol_rel_gap: 1e-4,
+                },
+            )
+            self.rate = torch.tensor(theta.value)
+        except Exception:
+            # `Exception` instead of a bare except, so KeyboardInterrupt and
+            # SystemExit are not swallowed.
+            print("Optimization failed. Using the old value.")
+            print(prob.status)
+
+        return self.rate
+
+    def bucketization(self):
+        """
+        Aggregate the recorded data over the basic sets.
+
+        For every sample ``(S, obs, dt)`` in ``self.data`` and every basic set
+        for which ``S.inside(elementary)`` holds, the observations selected by
+        ``elementary.is_inside(obs)`` (an empty tensor when ``obs`` is None)
+        and the sensing time ``dt`` are recorded and the set's sensing counter
+        is incremented.
+
+        Results are stored in ``self.bucketized_obs``, ``self.bucketized_time``,
+        ``self.bucketized_counts``, ``self.total_bucketized_obs`` and
+        ``self.total_bucketized_time``. Nothing is returned.
+        """
+
+        # NOTE(review): `phis` is filled below but never stored or returned.
+        phis = []
+        observations = []
+
+        # project sets to smallest forms, and then sum on those only
+        basic_sets = self.basic_sets
+
+        data_basic = [[] for _ in range(len(basic_sets))]
+        sensing_times = [[] for _ in range(len(basic_sets))]
+        counts = torch.zeros(len(basic_sets)).int()
+        # NOTE(review): `total_data` is accumulated but never read afterwards.
+        total_data = 0.0
+        self.total_bucketized_obs = (
+            torch.zeros(size=(len(basic_sets), 1)).double().view(-1)
+        )
+        self.total_bucketized_time = (
+            torch.zeros(size=(len(basic_sets), 1)).double().view(-1)
+        )
+
+        for sample in self.data:
+            S, obs, dt = sample
+            if obs is not None:
+                total_data = total_data + obs.size()[0]  # total counts
+                for index, elementary in enumerate(
+                    basic_sets
+                ):  # iterate over basic sets
+                    mask = elementary.is_inside(
+                        obs
+                    )  # mask which belong to the elementary
+                    if S.inside(elementary) == True:
+                        data_basic[index].append(obs[mask])
+                        counts[index] += 1
+                        sensing_times[index].append(dt)
+            else:
+                # a sensing round without events still counts as a sensing of
+                # every basic set it covers
+                for index, elementary in enumerate(basic_sets):
+                    if S.inside(elementary) == True:
+                        data_basic[index].append(torch.tensor([]))
+                        counts[index] += 1
+                        sensing_times[index].append(dt)
+
+        for index, elementary in enumerate(basic_sets):
+            arr = np.array(
+                [int(elem.size()[0]) for elem in data_basic[index]]
+            )  # counts over sensing rounds
+            phi = self.packing.integral(elementary)  # * counts[index]
+
+            # totals over all sensing rounds of this basic set
+            self.total_bucketized_obs[index] = float(np.sum(arr))
+            self.total_bucketized_time[index] = float(np.sum(sensing_times[index]))
+
+            observations.append(arr)
+            phis.append(phi.view(1, -1))  # construct varphi_B
+
+        self.bucketized_obs = (
+            observations.copy()
+        )  # these are number of counts associated with sensings
+        self.bucketized_time = (
+            sensing_times.copy()
+        )  # these are times each basic set has been sensed
+        self.bucketized_counts = (
+            counts  # these are count each basic set has been sensed
+        )
+
+    def variance_correction(self, variance):
+        """
+        Return the multiplicative correction factor for a variance proxy.
+
+        When ``self.var_cor_on`` is not 1 the factor is 1.0. Otherwise the
+        factor ``k`` is the root, found by bisection on [1, 10000000], of
+        ``exp(x) - 1 - x - 0.5 * k * x**2`` with ``x = self.U / (k * variance)``.
+
+        :param variance: variance proxy the factor is computed for
+        :return: the correction factor
+        """
+        if self.var_cor_on != 1:
+            return 1.0
+
+        from scipy import optimize
+
+        def residual(k):
+            bound = self.U
+            return (
+                -0.5 * (bound**2) / ((variance**2) * k)
+                - bound / (variance * k)
+                + (np.exp(bound / (k * variance)) - 1)
+            )
+
+        return optimize.bisect(residual, 1, 10000000)
+
+    def least_squares_weighted(self, threads=4):
+        """
+        Fit the rate by weighted least squares on the bucketized data (cvxpy/MOSEK).
+
+        Uses only basic sets with a positive sensing count. Each residual
+        ``tau * (phis @ theta) - observations`` is divided by the square root
+        of the corrected variance ``variance * tau * variance_correction(variance * tau)``.
+        Only the upper constraint ``(Lambda @ Gamma_half) @ theta <= u`` is
+        imposed; the lower one is commented out.
+
+        :param threads: number of MOSEK threads
+        :return: the fitted rate, also stored in ``self.rate``
+        """
+
+        # if self.approx_fit == False:
+        # 	self.bucketization()
+
+        theta = cp.Variable(self.get_m())
+        l, Lambda, u = self.get_constraints()
+        Gamma_half = self.cov().numpy()
+
+        mask = self.bucketized_counts.clone().numpy() > 0
+        observations = self.total_bucketized_obs[mask].clone().numpy()
+        phis = self.varphis[mask, :].clone().numpy()
+        tau = self.total_bucketized_time.clone().numpy()
+
+        # cloned, so the in-place correction below does not touch self.variances
+        variances = self.variances.view(-1).clone().numpy()
+
+        for i in range(variances.shape[0]):
+            if mask[i] > 0:
+                variances[i] = (
+                    variances[i]
+                    * tau[i]
+                    * self.variance_correction(variances[i] * tau[i])
+                )
+
+        selected_variances = variances[mask]
+        objective = cp.Minimize(
+            cp.sum_squares(
+                (cp.multiply((phis @ theta), tau[mask]) - observations)
+                / (np.sqrt(selected_variances))
+            )
+            + 0.5 * self.s * cp.norm2(theta) ** 2
+        )
+
+        constraints = []
+        Lambda = Lambda @ Gamma_half
+        # constraints.append(Lambda @ theta >= l)
+        constraints.append(Lambda @ theta <= u)
+
+        prob = cp.Problem(objective, constraints)
+
+        # NOTE(review): unlike the penalized_likelihood* methods there is no
+        # failure handling around the solve here -- confirm that is intended.
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-4,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-4,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-4,
+            },
+        )
+        print(prob.status)
+        self.rate = torch.tensor(theta.value)
+        return self.rate
+
+    def least_sqaures_weighted_fast(self, threads=4):
+        """
+        Fit the rate by weighted least squares on the bucketized data with L-BFGS-B.
+
+        Same objective as ``least_squares_weighted``, but optimized over a
+        variable ``theta`` with ``rate = invGamma_half @ theta`` and with the
+        box bounds ``(l[0] + eps, u[0])`` taken from ``self.get_constraints()``.
+
+        :param threads: unused; kept for signature compatibility
+        :return: the fitted rate, also stored in ``self.rate``
+        """
+
+        l, Lambda, u = self.get_constraints()
+        Gamma_half, invGamma_half = self.cov(inverse=True)
+
+        mask = self.bucketized_counts > 0
+        observations = self.total_bucketized_obs[mask]
+        phis = self.varphis[mask, :]
+        tau = self.total_bucketized_time
+
+        # Work on a copy: `view(-1)` alone shares storage with self.variances,
+        # so the in-place correction below would rescale the stored variances
+        # again on every call.
+        variances = self.variances.view(-1).clone()
+        for i in range(variances.size()[0]):
+            if mask[i]:
+                variances[i] = (
+                    variances[i]
+                    * tau[i]
+                    * self.variance_correction(variances[i] * tau[i])
+                )
+        # loop-invariant pieces of the objective
+        selected_std = torch.sqrt(variances[mask])
+        selected_tau = tau[mask]
+
+        def objective(theta):
+            residual = (
+                selected_tau * (phis @ invGamma_half @ theta) - observations
+            ) / selected_std
+            return torch.sum(residual**2) + self.s * 0.5 * torch.sum(
+                (invGamma_half @ theta) ** 2
+            )
+
+        # warm start from the previous rate, mapped into the optimizer's
+        # parametrization; zeros otherwise
+        theta0 = torch.zeros(size=(self.get_m(), 1)).view(-1).double()
+        if self.rate is not None:
+            theta0.data = Gamma_half @ self.rate.data
+
+        eps = 1e-4
+        res = minimize(
+            objective,
+            theta0.cpu().numpy(),
+            backend="torch",
+            method="L-BFGS-B",
+            bounds=(l[0] + eps, u[0]),
+            precision="float64",
+            tol=1e-8,
+            options={
+                "ftol": 1e-06,
+                "gtol": 1e-06,
+                "eps": 1e-08,
+                "maxfun": 15000,
+                "maxiter": 15000,
+                "maxls": 20,
+            },
+        )
+        self.rate = invGamma_half @ torch.tensor(res.x)
+
+        return self.rate
+
+    def least_squares_weighted_integral(self, threads=4):
+        """
+        Fit the rate by weighted least squares on integrated actions (cvxpy/MOSEK).
+
+        Minimizes ``sum(((phis @ theta - counts) / sqrt(variances))**2)
+        + self.s * ||theta||^2`` subject to
+        ``l <= (Lambda @ Gamma_half) @ theta <= u``. The variances come from
+        ``self.variances_histogram`` when it is non-empty, otherwise from
+        ``S.volume() * self.B`` per data point; both are multiplied by
+        ``self.variance_correction``.
+
+        :param threads: number of MOSEK threads
+        :return: the fitted rate, also stored in ``self.rate``
+        """
+
+        theta = cp.Variable(self.get_m())
+        l, Lambda, u = self.get_constraints()
+        Gamma_half = self.cov().numpy()
+
+        phis = self.phis.clone().numpy()  # integrated actions
+
+        if len(self.variances_histogram) > 0:
+            # Clone first: `.cpu().numpy()` on a CPU tensor shares memory with
+            # it, so correcting in place would overwrite
+            # self.variances_histogram and compound on every call.
+            variances = self.variances_histogram.clone().cpu().numpy()
+
+            for i in range(variances.shape[0]):
+                variances[i] = variances[i] * self.variance_correction(variances[i])
+        else:
+            variances = np.zeros(len(self.data))
+            for i, (S, obs, dt) in enumerate(self.data):
+                variances[i] = S.volume() * self.B
+                variances[i] = variances[i] * self.variance_correction(variances[i])
+
+        observations = self.counts.clone().numpy()
+
+        objective = cp.Minimize(
+            cp.sum_squares((phis @ theta - observations) / np.sqrt(variances))
+            + self.s * cp.sum_squares(theta)
+        )
+        Lambda = Lambda @ Gamma_half
+        constraints = [Lambda @ theta >= l, Lambda @ theta <= u]
+        prob = cp.Problem(objective, constraints)
+
+        prob.solve(
+            solver=cp.MOSEK,
+            warm_start=False,
+            verbose=False,
+            mosek_params={
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.dual,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-6,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-6,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-6,
+            },
+        )
+
+        self.rate = torch.tensor(theta.value)
+
+        return self.rate
+
+    def penalized_likelihood_bins(self, threads=4):
+        """
+        Fit the rate from the bucketized counts with a penalized likelihood (cvxpy/CLARABEL).
+
+        Uses only basic sets with a positive sensing count and minimizes
+        ``-observations @ log(tau * (phis @ theta)) + sum(tau * (phis @ theta))
+        + 0.5 * self.s * ||theta||^2`` subject to
+        ``l <= (Lambda @ Gamma_half) @ theta <= u``.
+
+        :param threads: unused; kept for signature compatibility
+        :return: the fitted rate; if the solve fails, a message is printed and
+            the previous ``self.rate`` is returned unchanged
+        """
+        theta = cp.Variable(self.get_m())
+        l, Lambda, u = self.get_constraints()
+        Gamma_half = self.cov().numpy()
+
+        mask = self.bucketized_counts.clone().numpy() > 0
+        observations = self.total_bucketized_obs[mask].clone().numpy()
+        phis = self.varphis[mask, :].clone().numpy()
+        tau = self.total_bucketized_time[mask].clone().numpy()
+
+        Lambda = Lambda @ Gamma_half
+        constraints = [Lambda @ theta >= l, Lambda @ theta <= u]
+
+        objective = cp.Minimize(
+            -cp.sum(observations @ cp.log(cp.multiply(tau, phis @ theta)))
+            + cp.sum(cp.multiply(phis @ theta, tau))
+            + self.s * 0.5 * cp.sum_squares(theta)
+        )
+        prob = cp.Problem(objective, constraints)
+        try:
+            prob.solve(solver=cp.CLARABEL, warm_start=False, verbose=True)
+
+            self.rate = torch.tensor(theta.value)
+        except Exception:
+            # `Exception` instead of a bare except, so KeyboardInterrupt and
+            # SystemExit are not swallowed.
+            print("optimization failed.")
+        return self.rate
+
+    def penalized_likelihood_integral_bins(self, threads=4):
+        """
+        Fit the rate from integrated actions and counts (cvxpy/CLARABEL).
+
+        Minimizes ``-counts @ log(phis @ theta) + sum(phis @ theta)
+        + 0.5 * self.s * ||theta||^2`` subject to
+        ``l <= (Lambda @ Gamma_half) @ theta <= u``.
+
+        :param threads: unused; kept for signature compatibility
+        :return: the fitted rate; if the solve fails, a message is printed and
+            the previous ``self.rate`` is returned unchanged
+        """
+        phis = self.phis.cpu().numpy()
+        counts = self.counts.cpu().numpy()
+
+        theta = cp.Variable(self.get_m())
+        l, Lambda, u = self.get_constraints()
+        Gamma_half = self.cov().numpy()
+        objective = cp.Minimize(
+            -cp.sum(counts @ cp.log(phis @ theta))
+            + cp.sum(phis @ theta)
+            + self.s * 0.5 * cp.sum_squares(theta)
+        )
+
+        Lambda = Lambda @ Gamma_half
+        # Both constraints are always present, so the problem is always built
+        # with them (the former unconstrained fallback was unreachable).
+        constraints = [Lambda @ theta >= l, Lambda @ theta <= u]
+        prob = cp.Problem(objective, constraints)
+
+        try:
+            prob.solve(solver=cp.CLARABEL, warm_start=False, verbose=True)
+            self.rate = torch.tensor(theta.value)
+        except Exception:
+            # `Exception` instead of a bare except, so KeyboardInterrupt and
+            # SystemExit are not swallowed.
+            print("Optimization failed. Using the old value.")
+
+        return self.rate
+
+    def update_variances(self, value=False, force=False):
+        """
+        Refresh the variance proxies used by the estimators.
+
+        With count-record feedback and the least-squares estimator (or when
+        ``force`` is set) each entry of ``self.variances`` is updated per basic
+        set: with the minimum of its current value and ``self.ucb`` of the set,
+        or with ``self.mean_set`` when ``value`` is set. Otherwise, if data is
+        present, ``self.variances_histogram`` is rebuilt for all data points
+        (``self.peeking``) or extended by the entry for the last data point.
+        ``self.approx_fit`` is set to True for the duration of the update and
+        reset to False at the end.
+
+        :param value: use ``self.mean_set`` instead of the clipped UCB
+        :param force: force the per-basic-set update
+        """
+        self.approx_fit = True
+        per_basic_set = (
+            self.feedback == "count-record" and self.estimator == "least-sq"
+        ) or force == True
+
+        if per_basic_set:
+            print("updating variance")
+            for position, basic_set in enumerate(self.basic_sets):
+                if value == False:
+                    bound = self.ucb(basic_set)
+                    self.variances[position] = np.minimum(
+                        bound, self.variances[position]
+                    )
+                else:
+                    self.variances[position] = self.mean_set(basic_set)
+        elif self.data is not None:
+            if self.peeking == True:
+                # recompute the proxy for every recorded sensing
+                refreshed = [float(self.ucb(S)) * dt for S, _, dt in self.data]
+                self.variances_histogram = torch.tensor(refreshed).double()
+            else:
+                # only the most recent sensing gets a new entry
+                latest = self.data[-1]
+                addition = torch.tensor([self.ucb(latest[0]) * latest[2]]).double()
+                if len(self.variances_histogram) > 0:
+                    addition = torch.cat((self.variances_histogram, addition))
+                self.variances_histogram = addition
+        self.approx_fit = False
+
+    def ucb(self, S, dt=1.0, delta=0.5):
+        """
+        Upper confidence bound for sensing the set ``S`` for time ``dt``.
+
+        Without data or a fitted rate the bound is ``self.B * S.volume() * dt``.
+        Otherwise the bound comes from the method selected by
+        ``self.uncertainty`` (when ``self.approx`` is None) or from the
+        ellipsoid approximation (when ``self.approx`` is "ellipsoid"), which is
+        fitted first if ``self.approx_fit`` is False.
+
+        :param S: set
+        :param dt: sensing time
+        :param delta: passed on only to the conformal method
+        :return: the upper confidence bound
+        """
+        if self.data is None or self.rate is None:
+            return self.B * S.volume() * dt
+
+        if self.approx == None:
+            kind = self.uncertainty
+
+            if kind == "laplace":
+                bounds = self.mean_var_laplace_set(S, dt=dt, beta=self.beta(0))
+                return bounds[1]
+            if kind == "least-sq":
+                bounds = self.mean_var_reg_set(S, dt=dt, beta=self.beta(0))
+                return bounds[1]
+            if kind == "bins":
+                bounds = self.mean_var_bins_set(S, dt=dt, beta=self.beta(0))
+                return bounds[1]
+            if kind == "likelihood-ratio":
+                bounds = self.mean_var_ratio_set(S, dt=dt, beta=self.beta(0))
+                return bounds[1]
+            if kind == "conformal":
+                bounds = self.mean_var_conformal_set(S, dt=dt, delta=delta)
+                return bounds[2]
+
+            raise AssertionError("Not Implemented.")
+
+        if self.approx != "ellipsoid":
+            raise AssertionError("Not implemented.")
+
+        if self.approx_fit == False:
+            self.fit_ellipsoid_approx()
+            self.beta(0)
+            print("Fitting Approximation.")
+            self.approx_fit = True
+        approx_bounds = self.map_lcb_ucb_approx_action(S, dt=dt, beta=self.beta(0))
+        return approx_bounds[2]
+
+    def mean_std_per_action(self, S, W, dt, beta):
+        """
+        Point estimate and confidence bounds for sensing ``S`` for time ``dt``.
+
+        The bounds are obtained from two calls to
+        ``maximize_on_elliptical_slice`` (with the integrated feature vector
+        and with its negation), using ``W``, the current rate, ``beta`` and the
+        constraints ``l``, ``Lambda @ Gamma_half``, ``u``.
+
+        :param S: set
+        :param W: matrix passed on to ``maximize_on_elliptical_slice``
+        :param dt: sensing time
+        :param beta: size parameter passed on to ``maximize_on_elliptical_slice``
+        :return: tuple (estimate, upper bound, lower bound)
+        """
+        features = self.packing.integral(S) * dt
+        Gamma_half = self.cov().numpy()
+
+        l, Lambda, u = self.get_constraints()
+        Lambda = Lambda @ Gamma_half
+
+        features_np = features.cpu().numpy()
+        W_np = (W).numpy()
+        rate_np = self.rate.view(-1).cpu().numpy()
+
+        upper, _ = maximize_on_elliptical_slice(
+            features_np, W_np, rate_np, beta, l, Lambda, u
+        )
+        # maximizing the negated direction yields minus the lower bound
+        negated_lower, _ = maximize_on_elliptical_slice(
+            -features_np, W_np, rate_np, beta, l, Lambda, u
+        )
+        estimate = features @ self.rate
+
+        return estimate, float(upper), -float(negated_lower)
+
+    def mean_var_laplace_set(self, S, dt, beta=2.0):
+        """
+        Estimate and bounds for ``S`` using the Laplace covariance matrix.
+
+        The matrix is rebuilt into ``self.W`` only when ``self.approx_fit`` is
+        False; the result of ``self.mean_std_per_action`` is returned.
+        """
+        needs_fit = self.approx_fit == False
+        if needs_fit:
+            self.W, self.approx_fit = self.construct_covariance_matrix_laplace(), True
+        return self.mean_std_per_action(S, self.W, dt, beta)
+
+    def mean_var_reg_set(self, S, dt, beta=2.0):
+        """
+        Estimate and bounds for ``S`` using the regression covariance matrix.
+
+        The matrix is rebuilt into ``self.W`` only when ``self.approx_fit`` is
+        False; the result of ``self.mean_std_per_action`` is returned.
+        """
+        needs_fit = self.approx_fit == False
+        if needs_fit:
+            self.W, self.approx_fit = (
+                self.construct_covariance_matrix_regression(),
+                True,
+            )
+        return self.mean_std_per_action(S, self.W, dt, beta)
+
+    def mean_var_bins_set(self, S, dt, beta=2.0):
+        """
+        Estimate and bounds for ``S`` using the bins covariance matrix.
+
+        The matrix is rebuilt into ``self.W`` only when ``self.approx_fit`` is
+        False; the result of ``self.mean_std_per_action`` is returned.
+        """
+        needs_fit = self.approx_fit == False
+        if needs_fit:
+            self.W, self.approx_fit = self.construct_covariance_matrix_bins(), True
+        return self.mean_std_per_action(S, self.W, dt, beta)
+
+    def mean_var_ratio_set(self, S, dt, beta=2.0):
+        """
+        Estimate and likelihood-ratio confidence bounds for sensing ``S`` for time ``dt``.
+
+        Minimizes and maximizes ``x @ theta`` (``x`` being the integrated
+        feature vector times ``dt``) over all ``theta`` that satisfy the box
+        constraints and whose penalized negative log-likelihood is at most
+        ``v``, using cvxpy/MOSEK.
+
+        :param S: set
+        :param dt: sensing time
+        :param beta: not used by this method
+        :return: tuple (estimate, upper bound, lower bound)
+        """
+        x = self.packing.integral(S) * dt
+        map = x @ self.rate
+        # v = np.log(1. / 0.1) - torch.sum(self.counts.double() @ torch.log(self.phis.double() @ self.rate)) \
+        # 	+ torch.sum(self.phis.double() @ self.rate) + 0.5 * self.s * torch.norm(self.rate) ** 2
+        # Level of the likelihood constraint below.
+        # NOTE(review): the 0.1 inside the log looks like a hard-coded
+        # confidence level -- confirm.
+        v = (
+            np.log(1.0 / 0.1)
+            + self.likelihood
+            + 0.5 * self.s * torch.norm(self.rate) ** 2
+        )
+
+        phis = self.phis.cpu().numpy()
+        counts = self.counts.cpu().numpy()
+        theta = cp.Variable(self.get_m())
+        l, Lambda, u = self.get_constraints()
+        Gamma_half = self.cov().numpy()
+
+        objective_min = cp.Minimize(x @ theta)
+        objective_max = cp.Maximize(x @ theta)
+
+        constraints = []
+        Lambda = Lambda @ Gamma_half
+        constraints.append(Lambda @ theta >= l)
+        constraints.append(Lambda @ theta <= u)
+
+        # penalized negative log-likelihood must not exceed v
+        constraints.append(
+            -cp.sum(counts @ cp.log(phis @ theta))
+            + cp.sum(phis @ theta)
+            + self.s * 0.5 * cp.sum_squares(theta)
+            <= v
+        )
+
+        # the same variable is reused: read theta.value after each solve
+        prob = cp.Problem(objective_min, constraints)
+        prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False)
+        lcb = np.dot(theta.value, x)
+        prob = cp.Problem(objective_max, constraints)
+        prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False)
+        ucb = np.dot(theta.value, x)
+
+        return map, ucb, lcb
+
+    def map_lcb_ucb_approx_action(self, S, dt=1.0, beta=2.0):
+        """
+        Estimate and ellipsoid-approximation bounds for sensing ``S``.
+
+        The estimate is ``dt * phi @ self.rate`` and the bounds are the
+        estimate plus/minus ``beta * sqrt(phi @ self.W_inv_approx @ phi.T)``,
+        where ``phi`` is the integrated feature vector of ``S``. No clipping
+        is applied to the bounds.
+
+        :param S: set
+        :param dt: sensing time
+        :param beta: width multiplier
+        :return: tuple (estimate, lower bound, upper bound)
+        """
+        features = self.packing.integral(S)
+        center = dt * features @ self.rate
+        half_width = beta * np.sqrt(features @ self.W_inv_approx @ features.T)
+
+        upper = center + half_width
+        lower = center - half_width
+        return center, lower, upper
+
+    def fit_ellipsoid_approx(self):
+        """
+        Build ``self.W`` for the configured ``self.uncertainty`` and store its
+        pseudo-inverse in ``self.W_inv_approx``.
+
+        :raises AssertionError: for any uncertainty other than "laplace",
+            "least-sq" or "bins"
+        """
+        builder_names = {
+            "laplace": "construct_covariance_matrix_laplace",
+            "least-sq": "construct_covariance_matrix_regression",
+            "bins": "construct_covariance_matrix_bins",
+        }
+        builder_name = builder_names.get(self.uncertainty)
+        if builder_name is None:
+            raise AssertionError("Not implemented.")
+
+        self.W = getattr(self, builder_name)()
+        self.W_inv_approx = torch.pinverse(self.W)
+
+    def construct_covariance_matrix(self):
+        """
+        Build, store in ``self.W`` and return the matrix for ``self.estimator``.
+
+        :raises NotImplementedError: for any estimator other than "likelihood",
+            "least-sq" or "bins"
+        :return: the constructed matrix
+        """
+        builder_names = {
+            "likelihood": "construct_covariance_matrix_laplace",
+            "least-sq": "construct_covariance_matrix_regression",
+            "bins": "construct_covariance_matrix_bins",
+        }
+        builder_name = builder_names.get(self.estimator)
+        if builder_name is None:
+            raise NotImplementedError("This estimator is not implemented.")
+
+        self.W = getattr(self, builder_name)()
+        return self.W
+
+    def construct_covariance_matrix_laplace(self, theta=None):
+        """
+        Build the Laplace-approximation matrix plus the ``self.s``-scaled identity.
+
+        For count-record feedback the data term is
+        ``observations.T @ diag(1 / (observations @ point)**2) @ observations``.
+        For histogram feedback it is ``phis.T @ diag(counts / variances**2) @ phis``
+        with the corrected ``self.variances_histogram``. When there are no
+        observations (or no histogram variances yet) the data term is zero, so
+        only ``self.s * I`` is returned.
+
+        :param theta: point at which the count-record term is evaluated;
+            ``self.rate`` is used when None
+        :raises AssertionError: for any other feedback type
+        :return: matrix of size ``get_m() x get_m()``
+        """
+        W = torch.zeros(size=(self.get_m(), self.get_m())).double()
+
+        if self.feedback == "count-record":
+
+            if self.observations is not None:
+                point = self.rate if theta is None else theta
+                D = torch.diag(1.0 / ((self.observations @ point).view(-1) ** 2))
+                W = self.observations.T @ D @ self.observations
+
+        elif self.feedback == "histogram":
+            # D = torch.diag(self.counts / (self.phis @ self.rate).view(-1) ** 2)
+            if len(self.variances_histogram) > 0:
+                variances = self.variances_histogram.view(-1).clone()
+
+                for i in range(variances.shape[0]):
+                    variances[i] = variances[i] * self.variance_correction(variances[i])
+
+                D = torch.diag(self.counts / variances**2)
+                # Kept inside the guard: D only exists when there are
+                # histogram variances; otherwise W stays zero.
+                W = self.phis.T @ D @ self.phis
+        else:
+            raise AssertionError("Not implemented.")
+
+        return W + torch.eye(self.get_m()).double() * self.s
+
+    def construct_covariance_matrix_regression(self):
+        """
+        Build the regression matrix plus the ``self.s``-scaled identity.
+
+        For count-record feedback, every basic set with a positive sensing
+        count contributes ``varphi varphi^T * tau / (variance * k)`` with
+        ``k = self.variance_correction(tau * variance)``. For histogram
+        feedback the data term is ``phis.T @ diag(1 / variances) @ phis`` with
+        the corrected ``self.variances_histogram``. Without data the data term
+        is zero.
+
+        :return: matrix of size ``get_m() x get_m()``
+        """
+        dim = self.get_m()
+        W = torch.zeros(size=(dim, dim)).double()
+
+        if self.data is not None:
+            if self.feedback == "count-record":
+                sensed = self.bucketized_counts > 0
+                times = self.total_bucketized_time
+                for idx in range(len(self.bucketized_obs)):
+                    if not sensed[idx] > 0:
+                        continue
+                    row = self.varphis[idx, :]
+                    outer = row.view(-1, 1) @ row.view(1, -1) * times[idx]
+                    factor = self.variance_correction(
+                        times[idx] * self.variances[idx]
+                    )
+                    W = W + outer / (self.variances[idx] * factor)
+
+            elif self.feedback == "histogram":
+                if len(self.variances_histogram) > 0:
+                    corrected = self.variances_histogram.view(-1).clone()
+
+                    for i in range(corrected.shape[0]):
+                        corrected[i] = corrected[i] * self.variance_correction(
+                            corrected[i]
+                        )
+
+                    W = self.phis.T @ torch.diag(1.0 / corrected) @ self.phis
+
+        return W + torch.eye(self.get_m()).double() * self.s
+
+    def construct_covariance_matrix_bins(self):
+        """
+        Build the bins matrix plus the ``self.s``-scaled identity.
+
+        For count-record feedback the data term is
+        ``varphis.T @ diag(tau / variances) @ varphis`` over the basic sets with
+        a positive sensing count, where each variance is multiplied by
+        ``self.variance_correction(variance * tau)``; it is only formed when
+        ``self.observations`` is not None. For histogram feedback it is
+        ``phis.T @ diag(1 / variances) @ phis`` with the corrected
+        ``self.variances_histogram``.
+
+        :raises AssertionError: for any other feedback type
+        :return: matrix of size ``get_m() x get_m()``
+        """
+        dim = self.get_m()
+        W = torch.zeros(size=(dim, dim)).double()
+
+        if self.feedback == "count-record":
+            sensed = self.bucketized_counts > 0
+            times = self.total_bucketized_time
+            sensed_varphis = self.varphis[sensed, :]
+            corrected = self.variances.view(-1).clone()
+
+            for i in range(corrected.size()[0]):
+                if sensed[i] > 0:
+                    factor = self.variance_correction(corrected[i] * times[i])
+                    corrected[i] = corrected[i] * factor
+
+            corrected = corrected[sensed]
+            times = times[sensed]
+
+            if self.observations is not None:
+                W = sensed_varphis.T @ torch.diag(times / corrected) @ sensed_varphis
+
+        elif self.feedback == "histogram":
+            if len(self.variances_histogram) > 0:
+                corrected = self.variances_histogram.view(-1).clone()
+
+                for i in range(corrected.shape[0]):
+                    corrected[i] = corrected[i] * self.variance_correction(corrected[i])
+
+                W = self.phis.T @ torch.diag(1.0 / corrected) @ self.phis
+        else:
+            raise AssertionError("Not implemented.")
+
+        return W + torch.eye(self.get_m()).double() * self.s
+
+    def gap(self, S, actions, w, dt, beta=2.0):
+        """
+        Estimates the gap of an action S.
+
+        Without an approximation (``self.approx`` is None) the result is the
+        maximum of the values returned by ``maximize_on_elliptical_slice``
+        over the loop on ``actions``. With an approximation it is
+        ``w(S) * self.max_ucb - lcb``, where ``self.max_ucb`` is the largest
+        ``ucb / w(action)`` over ``actions`` (recomputed only while
+        ``self.ucb_identified`` is False) and ``lcb`` comes from
+        ``self.map_lcb_ucb_approx_action``.
+
+        :param S: set whose gap is estimated
+        :param actions: candidate sets
+        :param w: callable applied to a set; used as a divisor/multiplier of the bounds
+        :param dt: sensing time
+        :param beta: size parameter used in the non-approximate branch
+        :return: the estimated gap; ``(self.B - self.b) * S.volume()`` when an
+            approximation is configured but there is no data
+        """
+        phi = self.packing.integral(S) * dt
+        # NOTE(review): other methods in this class use self.cov(); here
+        # self.packing.cov() is called -- confirm both are equivalent.
+        Gamma_half = self.packing.cov().numpy()
+
+        if self.approx is None:
+            l, Lambda, u = self.get_constraints()
+            Lambda = Lambda @ Gamma_half
+            ucbs = []
+            for action in actions:
+                # NOTE(review): phi_a is computed but the call below uses phi,
+                # so every iteration yields the same value; the commented-out
+                # call used phi_a - phi. Confirm which one is intended.
+                phi_a = self.packing.integral(action) * dt
+                # ucb, _ = maximize_on_elliptical_slice(phi_a.cpu().numpy()-phi.cpu().numpy(), self.W.cpu().numpy(), self.rate.view(-1).numpy(), beta, l, Lambda, u)
+                ucb, _ = maximize_on_elliptical_slice(
+                    phi.cpu().numpy(),
+                    self.W.cpu().numpy(),
+                    self.rate.view(-1).numpy(),
+                    beta,
+                    l,
+                    Lambda,
+                    u,
+                )
+                ucbs.append(float(ucb))
+            gap = torch.max(torch.tensor(ucbs))
+
+        else:
+            if self.data is None:
+                return (self.B - self.b) * S.volume()
+
+            if self.ucb_identified == False:
+                # cache the best weighted UCB and its action on the instance
+                print("Recomputing UCB.....")
+                self.ucb_identified = True
+                self.fit_ellipsoid_approx()
+                self.max_ucb = -1000
+                self.ucb_action = None
+
+                for action in actions:
+                    _, __, ucb = self.map_lcb_ucb_approx_action(
+                        action, dt=dt, beta=self.beta(0)
+                    )
+                    ucb = ucb / w(action)
+
+                    if ucb > self.max_ucb:
+                        self.max_ucb = ucb
+                        self.ucb_action = action
+
+            # this branch uses self.beta(0) rather than the `beta` argument
+            map, lcb, ucb = self.map_lcb_ucb_approx_action(S, dt=dt, beta=self.beta(0))
+            gap = w(S) * self.max_ucb - lcb
+        return gap
+
+    def information(self, S, dt, precomputed=None):
+        """
+        Information-type score for sensing the set ``S`` for time ``dt``.
+
+        For count-record feedback the score is the difference of two
+        log-determinants of ``varphi_UCB @ M @ varphi_UCB.T`` -- with
+        ``M = self.W_inv_approx`` and with ``M = G``, the matrix computed below
+        from the features of the basic sets inside ``S`` -- plus a small
+        constant. For histogram feedback it is
+        ``log(1 + phi @ self.W_inv_approx @ phi * dt**2)``.
+
+        :param S: set
+        :param dt: sensing time
+        :param precomputed: optional mapping from a set to its feature rows,
+            used instead of scanning ``self.basic_sets``
+        :return: 1.0 when there is no data, otherwise the score; nothing is
+            returned explicitly for other feedback types
+        """
+
+        if self.data is None:
+            return 1.0
+
+        # NOTE(review): only self.W is ensured here, but the formulas below
+        # read self.W_inv_approx -- confirm it is always fitted beforehand.
+        if self.W is None:
+            self.construct_covariance_matrix()
+
+        if self.feedback == "count-record":
+            # NOTE(review): self.ucb_action is assigned in gap(); presumably
+            # gap() must have run first -- verify against callers.
+            varphi_UCB = self.packing.integral(self.ucb_action).view(1, -1) * dt
+
+            if precomputed is not None:
+                Upsilon = precomputed[S] * dt
+            else:
+                # collect the rows of varphis for the basic sets inside S
+                ind = []
+                for index, set in enumerate(self.basic_sets):
+                    if S.inside(set):
+                        ind.append(index)
+                Upsilon = self.varphis[ind, :] * dt
+
+            I = torch.eye(Upsilon.size()[0]).double()
+            G = (
+                self.W_inv_approx
+                - self.W_inv_approx
+                @ Upsilon.T
+                @ torch.inverse(I + Upsilon @ Upsilon.T)
+                @ Upsilon
+                @ self.W_inv_approx
+            )
+            # NOTE(review): 10e-4 equals 1e-3 -- confirm 1e-4 was not intended.
+            return (
+                10e-4
+                + torch.logdet(varphi_UCB @ self.W_inv_approx @ varphi_UCB.T)
+                - torch.logdet(varphi_UCB @ G @ varphi_UCB.T)
+            )
+
+        elif self.feedback == "histogram":
+
+            return torch.log(
+                1
+                + self.packing.integral(S)
+                @ self.W_inv_approx
+                @ self.packing.integral(S)
+                * dt**2
+            )
+
+    def map_lcb_ucb_approx(self, S, n, beta=2.0, delta=0.01):
+        """
+        Pointwise estimate and ellipsoid-approximation bounds on a discretization of ``S``.
+
+        The ellipsoid approximation is refitted, then for every discretization
+        point ``x`` the bounds are the estimate plus/minus
+        ``beta * sqrt(x^T W_inv_approx x)``, clipped to ``self.B`` from above
+        and ``self.b`` from below.
+
+        :param S: set
+        :param n: discretization
+        :param beta: width multiplier
+        :param delta: unused; kept for signature compatibility
+        :return: tuple (estimate, lower bound, upper bound); without data the
+            constant values (``self.b``, ``self.b``, ``self.B``) per point
+        """
+        xtest = S.return_discretization(n)
+        if self.data is None:
+            baseline = 0 * xtest[:, 0].view(-1, 1)
+            return (self.b + baseline, self.b + baseline, self.B + baseline)
+
+        # One fit is enough: it rebuilds W and its pseudo-inverse from the
+        # current state, so the former second call only repeated the work.
+        self.fit_ellipsoid_approx()
+
+        Phi = self.packing.embed(xtest).double()
+        mean = Phi @ self.rate
+        N = Phi.size()[0]
+
+        ucb = torch.zeros(size=(N, 1)).double()
+        lcb = torch.zeros(size=(N, 1)).double()
+
+        for i in range(N):
+            x = Phi[i, :].view(-1, 1)
+            # shared by both bounds, so computed once per point
+            width = beta * np.sqrt(x.T @ self.W_inv_approx @ x)
+            ucb[i, 0] = np.minimum(mean[i] + width, self.B)
+            lcb[i, 0] = np.maximum(mean[i] - width, self.b)
+        return mean, lcb, ucb
+
+    def map_lcb_ucb(self, S, n, beta=2.0):
+        """
+        Calculate exact confidence bounds on a whole set domain using the covariance selected by self.uncertainty (laplace, least-sq or bins)
+        :param S: set
+        :param n: discretization
+        :param beta: beta
+        :return: (map, lcb, ucb)
+        """
+
+        xtest = S.return_discretization(n)
+        if self.data is None:
+            return (
+                self.b + 0 * xtest[:, 0].view(-1, 1),
+                self.b + 0 * xtest[:, 0].view(-1, 1),
+                self.B + 0 * xtest[:, 0].view(-1, 1),
+            )
+
+        N = xtest.size()[0]
+        Phi = self.packing.embed(xtest)
+        map = Phi @ self.rate
+
+        if self.uncertainty == "laplace":
+            W = self.construct_covariance_matrix_laplace()
+        elif self.uncertainty == "least-sq":
+            W = self.construct_covariance_matrix_regression()
+        elif self.uncertainty == "bins":
+            W = self.construct_covariance_matrix_bins()
+        else:
+            raise AssertionError("Not implemented ")
+
+        Gamma_half = self.cov().numpy()
+        l, Lambda, u = self.get_constraints()
+        Lambda = Lambda @ Gamma_half
+        ucb = torch.zeros(size=(N, 1)).double()
+        lcb = torch.zeros(size=(N, 1)).double()
+
+        for i in range(N):
+            x = Phi[i, :]
+            ucbi, _ = maximize_on_elliptical_slice(
+                x.cpu().numpy(),
+                (W).numpy(),
+                self.rate.view(-1).numpy(),
+                np.sqrt(beta),
+                l,
+                Lambda,
+                u,
+            )
+            lcbi, _ = maximize_on_elliptical_slice(
+                -x.cpu().numpy(),
+                (W).numpy(),
+                self.rate.view(-1).numpy(),
+                np.sqrt(beta),
+                l,
+                Lambda,
+                u,
+            )
+            ucb[i, 0] = ucbi
+            lcb[i, 0] = -lcbi
+
+        return map, lcb, ucb
+
+    def map_lcb_ucb_likelihood_ratio(self, S, n, delta=0.1, current=False):
+        xtest = S.return_discretization(n)
+
+        if self.data is None:
+            return (
+                self.b + 0 * xtest[:, 0].view(-1, 1),
+                self.b + 0 * xtest[:, 0].view(-1, 1),
+                self.B + 0 * xtest[:, 0].view(-1, 1),
+            )
+
+        N = xtest.size()[0]
+        Phi = self.packing.embed(xtest)
+        map = Phi @ self.rate
+
+        ucb = torch.zeros(size=(N, 1)).double()
+        lcb = torch.zeros(size=(N, 1)).double()
+
+        phis = self.phis.cpu().numpy()
+
+        if current:
+            if self.observations is not None:
+                v = (
+                    np.log(1.0 / delta)
+                    - torch.sum(torch.log(self.observations @ self.rate))
+                    + torch.sum(self.phis @ self.rate)
+                    + self.s * 0.5 * torch.sum(self.rate**2)
+                )
+            else:
+                v = (
+                    np.log(1.0 / delta)
+                    + torch.sum(self.phis @ self.rate)
+                    + self.s * 0.5 * torch.sum(self.rate**2)
+                )
+        else:
+            if self.feedback == "count-record":
+                v = (
+                    np.log(1.0 / delta)
+                    + self.loglikelihood
+                    + 0.5 * self.s * torch.sum(self.rate**2)
+                )
+            elif self.feedback == "histogram":
+                v = (
+                    np.log(1.0 / delta)
+                    + self.loglikelihood
+                    + 0.5 * self.s * torch.sum(self.rate**2)
+                )
+            else:
+                raise NotImplementedError("Not compatible with given feedback model ")
+
+        l, Lambda, u = self.get_constraints()
+        Gamma_half = self.cov().numpy()
+        Lambda = Lambda @ Gamma_half
+
+        for i in range(N):
+            x = Phi[i, :].cpu().numpy()
+
+            theta = cp.Variable(self.get_m())
+
+            objective_min = cp.Minimize(x @ theta)
+            objective_max = cp.Maximize(x @ theta)
+
+            constraints = []
+            constraints.append(Lambda @ theta >= l)
+            constraints.append(Lambda @ theta <= u)
+
+            if self.feedback == "count-record":
+                if self.observations is not None:
+                    observations = self.observations.cpu().numpy()
+
+                    constraints.append(
+                        -cp.sum(cp.log(observations @ theta))
+                        + cp.sum(phis @ theta)
+                        + self.s * 0.5 * cp.sum_squares(theta)
+                        <= v
+                    )
+                else:
+                    constraints.append(
+                        cp.sum(phis @ theta) + self.s * 0.5 * cp.sum_squares(theta) <= v
+                    )
+
+            elif self.feedback == "histogram":
+                constraints.append(
+                    -cp.sum(cp.log(phis @ theta))
+                    + cp.sum(phis @ theta)
+                    + self.s * 0.5 * cp.sum_squares(theta)
+                    <= v
+                )
+            else:
+                raise NotImplementedError("Does not exist.")
+
+            prob = cp.Problem(objective_min, constraints)
+            prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False)
+            lcb[i, 0] = float(np.dot(theta.value, x))
+
+            prob = cp.Problem(objective_max, constraints)
+            prob.solve(solver=cp.MOSEK, warm_start=False, verbose=False)
+            ucb[i, 0] = float(np.dot(theta.value, x))
+
+        return map, lcb, ucb
+
+    def mean_var_conformal_set(self, S, dt, beta=2.0, max_val=None, delta=0.05):
+        # self.bucketize_prepare()
+        if max_val is None:
+            max_val = int(self.B * self.basic_sets[0].volume() * dt) + 1
+        map, lcb, ucb = self.conformal_confidence_set(
+            S, delta=delta, max_val=max_val, dt=dt
+        )
+        return map, lcb, ucb
+
+    def conformal_score_func(self, theta, new, index):
+
+        if new[1] is None:
+            n_new = 0
+        else:
+            n_new = new[1].size()[0]
+
+        varphi = self.packing.integral(new[0]) * new[2]
+        err_new = abs(float(n_new) - float(varphi @ theta))
+        n = len(self.bucketized_obs[index])
+
+        if n > 0:
+
+            phis = self.varphis[index].repeat(n, 1)
+            res = torch.tensor(self.bucketized_obs[index]).double()
+
+            err = torch.abs(res - (phis @ theta.view(-1, 1)).view(-1))
+
+            return torch.sum(err < err_new).double() / float(n + 1.0) + 1.0 / (
+                float(n) + 1.0
+            )
+
+        else:
+            return 0.0
+
+    def conformal_confidence(self, delta=0.05, max_val=20, dt=1, step=1):
+        lcb = []
+        ucb = []
+        map = []
+
+        if self.data is not None:
+            self.bucketization(time=True)
+
+        for S in self.basic_sets:
+            m, u, l = self.conformal_confidence_set(
+                S, delta=delta, max_val=max_val, dt=dt, step=step
+            )
+
+            map.append(m)
+            ucb.append(u)
+            lcb.append(l)
+
+        return (
+            torch.tensor(map).double(),
+            torch.tensor(ucb).double(),
+            torch.tensor(lcb).double(),
+        )
+
+    def conformal_confidence_set(self, S, delta=0.05, max_val=20, dt=1.0, step=1):
+        """
+        :return: (map, ucb, lcb)
+        """
+
+        if self.data is not None:
+            if self.feedback == "count-record":
+                self.penalized_likelihood()
+            elif self.feedback == "histogram":
+                self.penalized_likelihood_integral()
+
+            # identify the set in basic sets
+            index = 0
+            for set in self.basic_sets:
+                if set.inside(S):
+                    break
+                index += 1
+
+            # calculate map estimate
+            map = float(self.rate @ self.packing.integral(S))
+        else:
+            map = self.b
+            return map, self.B, self.b
+
+        scores = []
+        j = 0
+        score = 1.0
+        lowest = 0
+        n = float(len(self.bucketized_obs[index]))
+
+        while score > np.ceil((1 - delta) * (n + 1)) / (n + 1) and j <= max_val:
+            lowest = j
+            if j > 0:
+                obs = torch.zeros(size=(j, self.d)).double()
+                for i in range(self.d):
+                    obs[:, i] = torch.tensor(
+                        np.random.uniform(S.bounds[i, 0], S.bounds[i, 1], size=j)
+                    )
+            else:
+                obs = None
+
+            # new observation
+            new = (S, obs, dt)
+
+            old_phis, old_observations, old_counts = self.add_data_point_and_remove(new)
+
+            if self.feedback == "count-record":
+                theta_new = self.penalized_likelihood()
+            elif self.feedback == "histogram":
+                theta_new = self.penalized_likelihood_integral()
+
+            # restore back the data
+            self.phis = old_phis
+            self.observations = old_observations
+            self.counts = old_counts
+
+            # calculate the score
+            score = self.conformal_score_func(theta_new, new, index)
+            n = float(len(self.bucketized_obs[index]))
+
+            print(j, "/", max_val, score, np.ceil((1 - delta) * (n + 1)) / (n + 1))
+            j = j + 1
+
+        j = max_val
+        score = 1.0
+        largest = max_val
+
+        while score > np.ceil((1 - delta) * (n + 1)) / (n + 1) and j > lowest:
+            largest = j
+            if j > 0:
+                obs = torch.zeros(size=(j, self.d)).double()
+                for i in range(self.d):
+                    obs[:, i] = torch.tensor(
+                        np.random.uniform(S.bounds[i, 0], S.bounds[i, 1], size=j)
+                    )
+            else:
+                obs = None
+
+            # new observation
+            new = (S, obs, dt)
+
+            old_phis, old_observations, old_counts = self.add_data_point_and_remove(new)
+
+            if self.feedback == "count-record":
+                theta_new = self.penalized_likelihood()
+            elif self.feedback == "histogram":
+                theta_new = self.penalized_likelihood_integral()
+
+            # restore back the data
+            self.phis = old_phis
+            self.observations = old_observations
+            self.counts = old_counts
+
+            # calculate the score
+            score = self.conformal_score_func(theta_new, new, index)
+            n = float(len(self.bucketized_obs[index]))
+
+            print(j, "/", max_val, score, np.ceil((1 - delta) * (n + 1)) / (n + 1))
+            j = j - 1
+        # scores = np.array(scores)
+        # mask = scores < np.ceil((1-delta)*(n+1))/(n+1)
+
+        # if np.sum(mask) == 0:
+        # 	lowest = 0
+        # 	largest = max_val
+        # else:
+        # 	lowest = np.min(np.arange(0,max_val,step)[mask])
+        # 	largest = np.max(np.arange(0, max_val, step)[mask])
+
+        lcb = lowest / dt / S.volume()
+        ucb = largest / dt / S.volume()
+
+        return (map, ucb, lcb)
diff --git a/stpy/point_processes/positive_basis_estimator.py b/stpy/point_processes/positive_basis_estimator.py
index 3d09bc6..d404af6 100644
--- a/stpy/point_processes/positive_basis_estimator.py
+++ b/stpy/point_processes/positive_basis_estimator.py
@@ -9,125 +9,124 @@
 from stpy.borel_set import BorelSet
 from stpy.point_processes.poisson import PoissonPointProcess
 
-class RateEstimator():
 
-	def __init__(self):
-		pass
-
-
-	def get_min_max(self):
-		basic_sets = self.hierarchy.get_sets_level(self.hierarchy.levels)
-		volumes = []
-		for index, elementary in enumerate(basic_sets):
-			volumes.append(elementary.volume())
-
-		return (np.min(volumes), np.max(volumes))
-
-
-
-	def load_data(self, data, times = True):
-		self.approx_fit = False
-
-		if len(data) > 0:
-			self.approx_fit = False
-			phis = []
-			observations = []
-			self.data = data.copy()
-			counts = []
-			#times_arr = []
-
-			for sample in data:
-				S, obs, dt = sample
-				count = torch.Tensor([0])
-
-				if obs is not None:
-					if times == True:
-						emb = self.packing.embed(obs) * dt
-					else:
-						emb = self.packing.embed(obs)
-
-					phi = self.packing.integral(S) * dt
-					observations.append(emb)
-					count = torch.Tensor([emb.size()[0]])
-					phis.append(phi.view(1, -1))
-
-
-					if self.dual == True:
-						self.global_dt = dt
-						dist_matrix = torch.cdist(obs, self.anchor_points, p = 2)
-						for k in range(obs.size()[0]):
-							index = torch.argmin(dist_matrix[k,:])
-							self.anchor_weights[index] = self.anchor_weights[index] + 1.
-				else:
-					phi = self.packing.integral(S) * dt
-					phis.append(phi.view(1, -1))
-				counts.append(count)
-
-			self.counts = torch.cat(counts, dim=0)  # n(A_i)
-			self.phis = torch.cat(phis, dim=0)  # integrals of A_i
-
-			if len(observations) > 0:
-				self.observations = torch.cat(observations, dim=0)  # \{x_i\}_{i=1}^{n(A_i)}
-			else:
-				self.observations = None
-
-			if self.feedback == "count-record":
-				self.bucketization()
-
-	def add_data_point(self, new_data, times = True):
-		self.approx_fit = False
-
-		if self.data is None:
-			self.load_data([new_data])
-			return
-
-		self.data.append(new_data)
-
-		# update standard form data
-		S, obs, dt = new_data
-		if obs is not None:
-
-			if times == True:
-				emb = self.packing.embed(obs) * dt
-			else:
-				emb = self.packing.embed(obs)
-
-			phi = self.packing.integral(S).view(1, -1) * dt
-
-			count = torch.Tensor([emb.size()[0]])
-
-			if self.observations is not None:
-				self.observations = torch.cat((self.observations, emb), dim=0)
-				#self.times = torch.cat((self.times, dt * torch.ones(size=(emb.size()[0],1)).view(-1).double() ))
-			else:
-				self.observations = emb
-				#self.times =  dt * torch.ones(size=(emb.size()[0],1)).view(-1).double()
-
-
-			if self.dual == True:
-
-				dist_matrix = torch.cdist(obs, self.anchor_points, p=2)
-				for k in range(obs.size()[0]):
-					index = torch.argmin(dist_matrix[k, :])
-					self.anchor_weights[index] += 1.
-		else:
-			count = torch.Tensor([0])
-			phi = self.packing.integral(S).view(1, -1) * dt
-
-
-		self.phis = torch.cat((self.phis, phi), dim=0)
-		self.counts = torch.cat((self.counts, count))
-
-		if self.feedback == "count-record":
-
-			for index, elementary in enumerate(self.basic_sets):
-
-				if S.inside(elementary) == True:
-					if obs is not None:
-						mask = elementary.is_inside(obs)
-						self.total_bucketized_obs[index] += float(obs[mask].size()[0])
-					else:
-						self.total_bucketized_obs[index] += 0.0
-
-					self.bucketized_counts[index] += 1
-					self.total_bucketized_time[index] += dt
+class RateEstimator:
+
+    def __init__(self):
+        pass
+
+    def get_min_max(self):
+        basic_sets = self.hierarchy.get_sets_level(self.hierarchy.levels)
+        volumes = []
+        for index, elementary in enumerate(basic_sets):
+            volumes.append(elementary.volume())
+
+        return (np.min(volumes), np.max(volumes))
+
+    def load_data(self, data, times=True):
+        self.approx_fit = False
+
+        if len(data) > 0:
+            self.approx_fit = False
+            phis = []
+            observations = []
+            self.data = data.copy()
+            counts = []
+            # times_arr = []
+
+            for sample in data:
+                S, obs, dt = sample
+                count = torch.tensor([0])
+
+                if obs is not None:
+                    if times == True:
+                        emb = self.packing.embed(obs) * dt
+                    else:
+                        emb = self.packing.embed(obs)
+
+                    phi = self.packing.integral(S) * dt
+                    observations.append(emb)
+                    count = torch.tensor([emb.size()[0]])
+                    phis.append(phi.view(1, -1))
+
+                    if self.dual == True:
+                        self.global_dt = dt
+                        dist_matrix = torch.cdist(obs, self.anchor_points, p=2)
+                        for k in range(obs.size()[0]):
+                            index = torch.argmin(dist_matrix[k, :])
+                            self.anchor_weights[index] = (
+                                self.anchor_weights[index] + 1.0
+                            )
+                else:
+                    phi = self.packing.integral(S) * dt
+                    phis.append(phi.view(1, -1))
+                counts.append(count)
+
+            self.counts = torch.cat(counts, dim=0)  # n(A_i)
+            self.phis = torch.cat(phis, dim=0)  # integrals of A_i
+
+            if len(observations) > 0:
+                self.observations = torch.cat(
+                    observations, dim=0
+                )  # \{x_i\}_{i=1}^{n(A_i)}
+            else:
+                self.observations = None
+
+            if self.feedback == "count-record":
+                self.bucketization()
+
+    def add_data_point(self, new_data, times=True):
+        self.approx_fit = False
+
+        if self.data is None:
+            self.load_data([new_data])
+            return
+
+        self.data.append(new_data)
+
+        # update standard form data
+        S, obs, dt = new_data
+        if obs is not None:
+
+            if times == True:
+                emb = self.packing.embed(obs) * dt
+            else:
+                emb = self.packing.embed(obs)
+
+            phi = self.packing.integral(S).view(1, -1) * dt
+
+            count = torch.tensor([emb.size()[0]])
+
+            if self.observations is not None:
+                self.observations = torch.cat((self.observations, emb), dim=0)
+                # self.times = torch.cat((self.times, dt * torch.ones(size=(emb.size()[0],1)).view(-1).double() ))
+            else:
+                self.observations = emb
+                # self.times =  dt * torch.ones(size=(emb.size()[0],1)).view(-1).double()
+
+            if self.dual == True:
+
+                dist_matrix = torch.cdist(obs, self.anchor_points, p=2)
+                for k in range(obs.size()[0]):
+                    index = torch.argmin(dist_matrix[k, :])
+                    self.anchor_weights[index] += 1.0
+        else:
+            count = torch.tensor([0])
+            phi = self.packing.integral(S).view(1, -1) * dt
+
+        self.phis = torch.cat((self.phis, phi), dim=0)
+        self.counts = torch.cat((self.counts, count))
+
+        if self.feedback == "count-record":
+
+            for index, elementary in enumerate(self.basic_sets):
+
+                if S.inside(elementary) == True:
+                    if obs is not None:
+                        mask = elementary.is_inside(obs)
+                        self.total_bucketized_obs[index] += float(obs[mask].size()[0])
+                    else:
+                        self.total_bucketized_obs[index] += 0.0
+
+                    self.bucketized_counts[index] += 1
+                    self.total_bucketized_time[index] += dt
diff --git a/stpy/point_processes/rate_estimator.py b/stpy/point_processes/rate_estimator.py
index 016661f..ed6dbcc 100644
--- a/stpy/point_processes/rate_estimator.py
+++ b/stpy/point_processes/rate_estimator.py
@@ -1,191 +1,214 @@
+from typing import List
 import numpy as np
 import torch
 
 
-class RateEstimator():
-
-	def __init__(self):
-		pass
-
-	def get_min_max(self):
-		basic_sets = self.hierarchy.get_sets_level(self.hierarchy.levels)
-		volumes = []
-		for index, elementary in enumerate(basic_sets):
-			volumes.append(elementary.volume())
-
-		return (np.min(volumes), np.max(volumes))
-
-	def load_data(self, data, times=True):
-		self.approx_fit = False
-
-		if len(data) > 0:
-			self.approx_fit = False
-			phis = []
-			observations = []
-			self.data = data.copy()
-			counts = []
-			# times_arr = []
-			x = []
-			for sample in data:
-				S, obs, dt = sample
-				count = torch.Tensor([0])
-
-				if obs is not None:
-					x.append(obs)
-
-				if obs is not None:
-					obs, _, duplicates = torch.unique(obs, dim=0, return_inverse=True, return_counts=True)
-					#obs = torch.diag(torch.exp(duplicates.double()))@obs\
-					obs = torch.einsum('ij,i->ij', obs, duplicates)
-
-					if times == True:
-						emb = self.packing.embed(obs) * dt
-					else:
-						emb = self.packing.embed(obs)
-
-					phi = self.packing.integral(S) * dt
-					observations.append(emb)
-					count = torch.Tensor([emb.size()[0]])
-					phis.append(phi.view(1, -1))
-
-					if self.dual == True:
-						self.global_dt = dt
-						dist_matrix = torch.cdist(obs, self.anchor_points, p=2)
-						for k in range(obs.size()[0]):
-							index = torch.argmin(dist_matrix[k, :])
-							self.anchor_weights[index] = self.anchor_weights[index] + 1.
-				else:
-					phi = self.packing.integral(S) * dt
-					phis.append(phi.view(1, -1))
-				counts.append(count)
-
-			self.counts = torch.cat(counts, dim=0)  # n(A_i)
-			self.phis = torch.cat(phis, dim=0)  # integrals of A_i
-			if len(x) > 0:
-				self.x = torch.cat(x, dim=0)
-			else:
-				self.x = None
-
-			if len(observations) > 0:
-				self.observations = torch.cat(observations, dim=0)  # \{x_i\}_{i=1}^{n(A_i)}
-			else:
-				self.observations = None
-
-			if self.feedback == "count-record":
-				self.bucketization()
-
-	def add_data_point(self, new_data, times=True):
-		self.approx_fit = False
-
-		if self.data is None:
-			self.load_data([new_data])
-			return
-
-		self.data.append(new_data)
-
-		# update standard form data
-		S, obs, dt = new_data
-		if obs is not None:
-
-			if times == True:
-				emb = self.packing.embed(obs) * dt
-			else:
-				emb = self.packing.embed(obs)
-
-			phi = self.packing.integral(S).view(1, -1) * dt
-
-			count = torch.Tensor([emb.size()[0]])
-
-			if self.observations is not None:
-				self.observations = torch.cat((self.observations, emb), dim=0)
-			# self.times = torch.cat((self.times, dt * torch.ones(size=(emb.size()[0],1)).view(-1).double() ))
-			else:
-				self.observations = emb
-			# self.times =  dt * torch.ones(size=(emb.size()[0],1)).view(-1).double()
-
-			if self.dual == True:
-
-				dist_matrix = torch.cdist(obs, self.anchor_points, p=2)
-				for k in range(obs.size()[0]):
-					index = torch.argmin(dist_matrix[k, :])
-					self.anchor_weights[index] += 1.
-		else:
-			count = torch.Tensor([0])
-			phi = self.packing.integral(S).view(1, -1) * dt
-
-		self.phis = torch.cat((self.phis, phi), dim=0)
-		self.counts = torch.cat((self.counts, count))
-
-		if self.feedback == "count-record":
-
-			for index, elementary in enumerate(self.basic_sets):
-
-				if S.inside(elementary) == True:
-					if obs is not None:
-						mask = elementary.is_inside(obs)
-						self.total_bucketized_obs[index] += float(obs[mask].size()[0])
-					else:
-						self.total_bucketized_obs[index] += 0.0
-
-					self.bucketized_counts[index] += 1
-					self.total_bucketized_time[index] += dt
-
-	def get_m(self):
-		return self.packing.get_m()
-
-	def mean_rate(self, S, n=128):
-		xtest = S.return_discretization(n)
-		if self.rate is not None:
-			return self.packing.embed(xtest) @ self.rate.view(-1, 1)
-		else:
-			return self.packing.embed(xtest)[:, 0].view(-1, 1) * 0 + self.b
-
-	def mean_rate_points(self, xtest):
-		if self.rate is not None:
-			return self.packing.embed(xtest) @ self.rate.view(-1, 1)
-		else:
-			return self.packing.embed(xtest)[:, 0].view(-1, 1) * 0 + self.b
-
-	def mean_set(self, S, dt=1):
-		phi = self.packing.integral(S) * dt
-		map = phi @ self.rate.view(-1, 1)
-		return map
-
-	def rate_value(self, x, dt=1):
-		phi = self.packing.embed(x) * dt
-
-		if self.rate is not None:
-			map = phi @ self.rate.view(-1, 1)
-		else:
-			print("Rate function not fitted!")
-			map = 0 * phi[:, 0].view(-1, 1) + self.b
-
-		return map
-
-	def sample_value(self, S):
-		"""
-		Given a pre-sampled value evaluate certain portions of the domain S
-		:param S:
-		:return:
-		"""
-		return self.packing.integral(S) @ self.sampled_theta
-
-	def sample_path(self, S, n=128):
-		xtest = S.return_discretization(n)
-		return self.packing.embed(xtest) @ self.sampled_theta
-
-	def sample_path_points(self, xtest):
-		return self.packing.embed(xtest) @ self.sampled_theta.view(-1, 1)
-
-	def get_observations(self):
-		if self.data is not None:
-			points = []
-			for datapoint in self.data:
-				if datapoint[1] is not None:
-					points.append(datapoint[1])
-			if len(points) > 0:
-				return torch.vstack(points)
-			else:
-				return None
-		else:
-			return None
+class RateEstimator:
+
+    def __init__(self):
+        pass
+
+    def get_min_max(self):
+        basic_sets = self.hierarchy.get_sets_level(self.hierarchy.levels)
+        volumes = []
+        for index, elementary in enumerate(basic_sets):
+            volumes.append(elementary.volume())
+
+        return (np.min(volumes), np.max(volumes))
+
+    def load_data(self, data: List, times=True):
+        r"""Load the data and save $\phi(x)$ into `self.observations`, $n(A_i)$ in
+        `self.counts` and $\int_{A_i} \phi_j(x) dx$ into `self.phis`
+
+
+        Parameters
+        ----------
+        data
+
+                List of samples, where each sample is a tuple of
+
+                        * The Borel set on which the data lies
+                        * A tensor of the data points themselves, i.e. of shape
+                          [num_data_points, self.d...]
+                        * The amount of time in minutes that the data spans
+                          i.e. max time - min time of all data points
+
+        times, optional
+                If True, multiply each sample's embedded observations by its dt; by default True
+        """
+        self.approx_fit = False
+
+        if len(data) > 0:
+            self.approx_fit = False
+            phis = []
+            observations = []
+            self.data = data.copy()
+            counts = []
+            # times_arr = []
+            x = []
+            for sample in data:
+                S, obs, dt = sample
+                count = torch.tensor([0])
+
+                if obs is not None:
+                    x.append(obs)
+
+                if obs is not None:
+                    if times == True:
+                        emb = self.packing.embed(obs) * dt
+                    else:
+                        emb = self.packing.embed(obs)
+
+                    phi = self.packing.integral(S) * dt
+                    observations.append(emb)
+                    count = torch.tensor([emb.size()[0]])
+                    phis.append(phi.view(1, -1))
+
+                    if self.dual == True:
+                        self.global_dt = dt
+                        dist_matrix = torch.cdist(obs, self.anchor_points, p=2)
+                        for k in range(obs.size()[0]):
+                            index = torch.argmin(dist_matrix[k, :])
+                            self.anchor_weights[index] = (
+                                self.anchor_weights[index] + 1.0
+                            )
+                else:
+                    phi = self.packing.integral(S) * dt
+                    phis.append(phi.view(1, -1))
+                counts.append(count)
+
+            self.counts = torch.cat(counts, dim=0)  # n(A_i)
+            self.phis = torch.cat(phis, dim=0)  # integrals of A_i
+            if len(x) > 0:
+                self.x = torch.cat(x, dim=0)
+            else:
+                self.x = None
+
+            if len(observations) > 0:
+                self.observations = torch.cat(
+                    observations, dim=0
+                )  # \{x_i\}_{i=1}^{n(A_i)}
+            else:
+                self.observations = None
+
+            if self.feedback == "count-record" and self.dual:
+                self.bucketization()
+
+    def add_data_point(self, new_data, times=True):
+        self.approx_fit = False
+
+        if self.data is None:
+            self.load_data([new_data])
+            return
+
+        self.data.append(new_data)
+
+        # update standard form data
+        S, obs, dt = new_data
+        if obs is not None:
+
+            if times == True:
+                emb = self.packing.embed(obs) * dt
+            else:
+                emb = self.packing.embed(obs)
+
+            phi = self.packing.integral(S).view(1, -1) * dt
+
+            count = torch.tensor([emb.size()[0]])
+
+            if self.observations is not None:
+                self.observations = torch.cat((self.observations, emb), dim=0)
+            # self.times = torch.cat((self.times, dt * torch.ones(size=(emb.size()[0],1)).view(-1).double() ))
+            else:
+                self.observations = emb
+            # self.times =  dt * torch.ones(size=(emb.size()[0],1)).view(-1).double()
+
+            if self.dual == True:
+
+                dist_matrix = torch.cdist(obs, self.anchor_points, p=2)
+                for k in range(obs.size()[0]):
+                    index = torch.argmin(dist_matrix[k, :])
+                    self.anchor_weights[index] += 1.0
+        else:
+            count = torch.tensor([0])
+            phi = self.packing.integral(S).view(1, -1) * dt
+
+        self.phis = torch.cat((self.phis, phi), dim=0)
+        self.counts = torch.cat((self.counts, count))
+
+        if self.feedback == "count-record":
+
+            for index, elementary in enumerate(self.basic_sets):
+
+                if S.inside(elementary) == True:
+                    if obs is not None:
+                        mask = elementary.is_inside(obs)
+                        self.total_bucketized_obs[index] += float(obs[mask].size()[0])
+                    else:
+                        self.total_bucketized_obs[index] += 0.0
+
+                    self.bucketized_counts[index] += 1
+                    self.total_bucketized_time[index] += dt
+
+    def get_m(self):
+        return self.packing.get_m()
+
+    def mean_rate(self, S, n=128):
+        xtest = S.return_discretization(n)
+        if self.rate is not None:
+            return self.packing.embed(xtest) @ self.rate.view(-1, 1)
+        else:
+            return self.packing.embed(xtest)[:, 0].view(-1, 1) * 0 + self.b
+
+    def mean_rate_points(self, xtest):
+        if self.rate is not None:
+            return self.packing.embed(xtest) @ self.rate.view(-1, 1)
+        else:
+            return self.packing.embed(xtest)[:, 0].view(-1, 1) * 0 + self.b
+
+    def mean_set(self, S, dt=1):
+        phi = self.packing.integral(S) * dt
+        map = phi @ self.rate.view(-1, 1)
+        return map
+
+    def rate_value(self, x, dt=1):
+        phi = self.packing.embed(x) * dt
+
+        if self.rate is not None:
+            map = phi @ self.rate.view(-1, 1)
+        else:
+            print("Rate function not fitted!")
+            map = 0 * phi[:, 0].view(-1, 1) + self.b
+
+        return map
+
+    def sample_value(self, S):
+        """
+        Given a pre-sampled value evaluate certain portions of the domain S
+        :param S:
+        :return:
+        """
+        return self.packing.integral(S) @ self.sampled_theta
+
+    def sample_path(self, S, n=128):
+        xtest = S.return_discretization(n)
+        return self.packing.embed(xtest) @ self.sampled_theta
+
+    def sample_path_points(self, xtest):
+        return self.packing.embed(xtest) @ self.sampled_theta.view(-1, 1)
+
+    def get_observations(self):
+        if self.data is not None:
+            points = []
+            for datapoint in self.data:
+                if datapoint[1] is not None:
+                    points.append(datapoint[1])
+            if len(points) > 0:
+                return torch.vstack(points)
+            else:
+                return None
+        else:
+            return None
+
+    def fit(self):
+        raise NotImplementedError()
diff --git a/stpy/point_processes/seasonal_point_process.py b/stpy/point_processes/seasonal_point_process.py
index 3590e71..c75cf55 100644
--- a/stpy/point_processes/seasonal_point_process.py
+++ b/stpy/point_processes/seasonal_point_process.py
@@ -6,77 +6,98 @@
 
 class SeasonalPoissonPointProcess(PoissonPointProcess):
 
-	def __init__(self, *args, seasonality=lambda t: 1., **kwargs):
-		self.seasonality = seasonality
-
-	def rate_default(self, x, t, dt=1.):
-		return (self.B * torch.sum(torch.exp(-(x + 1)) * torch.sin(2 * x * np.pi) ** 2, dim=1).view(-1,
-																									1) + self.b) * dt
-
-	def rate_volume(self, S, t, dt=1, rate=None):
-		if self.rate_volume_f is None:
-			# integrate rate numerically over S
-			import scipy.integrate as integrate
-			if rate is None:
-				rate = self.rate
-			else:
-				rate = rate
-			integral = 0
-			if self.d == 1:
-				# integrate = S.volume()* self.rate(torch.from_numpy(S.bounds[0,1]).view(1))
-				integral, _ = integrate.quad(lambda x: rate(torch.Tensor([x]).view(1, 1), t).numpy(),
-											 float(S.bounds[0, 0]), float(S.bounds[0, 1]))
-			elif self.d == 2:
-				integrand = lambda x, y: rate(torch.Tensor([x, y], t).view(1, 2).double()).numpy()
-				integral, _ = integrate.dblquad(integrand, float(S.bounds[0, 0]), float(S.bounds[0, 1]),
-												lambda x: float(S.bounds[1, 0]), lambda x: float(S.bounds[1, 1]))
-
-			return integral * dt
-		else:
-			return self.rate_volume_f(S) * dt
-
-	def sample(self, S, t, dt=1., verbose=False, rate=None):
-		"""
-
-		:param S: set where it should be sampled
-		:return:
-		"""
-		if self.exact == True:
-			return self.sample_discretized(S, t, dt=dt)
-		else:
-
-			lam = self.rate_volume(S, t, dt)
-			n = np.random.poisson(lam=lam)
-			new_sample = []
-			vol = S.volume()
-			size = 0
-
-			alpha = 1. / lam
-
-			while size < n:
-				# uniform sample g(s) = 1/vol(S)
-				sample = S.uniform_sample(1)
-
-				t = self.rate(sample, t) / (alpha * lam)
-				p = np.random.uniform(0, 1)
-				if p < t:
-					new_sample.append(sample.view(1, -1))
-					size = size + 1
-
-			if len(new_sample) > 1:
-				x = torch.cat(new_sample, dim=0)
-			else:
-				return None
-			return x
-
-	def sample_discretized(self, S, t, dt, n=50):
-		lam = float(self.rate_volume(S, t, dt))
-		count = np.random.poisson(lam=lam)
-		if count > 0:
-			x = S.return_discretization(n)
-			r = self.rate(x, t) * dt
-			sample = torch.from_numpy(
-				np.random.choice(np.arange(0, x.size()[0], 1), size=count, p=(r / torch.sum(r)).numpy().reshape(-1)))
-			return x[sample, :]
-		else:
-			return None
+    def __init__(self, *args, seasonality=lambda t: 1.0, **kwargs):
+        self.seasonality = seasonality
+
+    def rate_default(self, x, t, dt=1.0):
+        return (
+            self.B
+            * torch.sum(
+                torch.exp(-(x + 1)) * torch.sin(2 * x * np.pi) ** 2, dim=1
+            ).view(-1, 1)
+            + self.b
+        ) * dt
+
+    def rate_volume(self, S, t, dt=1, rate=None):
+        if self.rate_volume_f is None:
+            # integrate rate numerically over S
+            import scipy.integrate as integrate
+
+            if rate is None:
+                rate = self.rate
+            else:
+                rate = rate
+            integral = 0
+            if self.d == 1:
+                # integrate = S.volume()* self.rate(torch.from_numpy(S.bounds[0,1]).view(1))
+                integral, _ = integrate.quad(
+                    lambda x: rate(torch.tensor([x]).view(1, 1), t).numpy(),
+                    float(S.bounds[0, 0]),
+                    float(S.bounds[0, 1]),
+                )
+            elif self.d == 2:
+                integrand = lambda x, y: rate(
+                    torch.tensor([x, y], t).view(1, 2).double()
+                ).numpy()
+                integral, _ = integrate.dblquad(
+                    integrand,
+                    float(S.bounds[0, 0]),
+                    float(S.bounds[0, 1]),
+                    lambda x: float(S.bounds[1, 0]),
+                    lambda x: float(S.bounds[1, 1]),
+                )
+
+            return integral * dt
+        else:
+            return self.rate_volume_f(S) * dt
+
+    def sample(self, S, t, dt=1.0, verbose=False, rate=None):
+        """
+
+        :param S: set where it should be sampled
+        :return:
+        """
+        if self.exact == True:
+            return self.sample_discretized(S, t, dt=dt)
+        else:
+
+            lam = self.rate_volume(S, t, dt)
+            n = np.random.poisson(lam=lam)
+            new_sample = []
+            vol = S.volume()
+            size = 0
+
+            alpha = 1.0 / lam
+
+            while size < n:
+                # uniform sample g(s) = 1/vol(S)
+                sample = S.uniform_sample(1)
+
+                t = self.rate(sample, t) / (alpha * lam)
+                p = np.random.uniform(0, 1)
+                if p < t:
+                    new_sample.append(sample.view(1, -1))
+                    size = size + 1
+
+            if len(new_sample) > 1:
+                x = torch.cat(new_sample, dim=0)
+            else:
+                return None
+            return x
+
+    def sample_discretized(self, S, t, dt, n=50):
+        lam = float(self.rate_volume(S, t, dt))
+        count = np.random.poisson(lam=lam)
+        if count > 0:
+            x = S.return_discretization(n)
+            r = self.rate(x, t) * dt
+            sample = torch.from_numpy(
+                np.random.choice(
+                    np.arange(0, x.size()[0], 1),
+                    size=count,
+                    p=(r / torch.sum(r)).numpy().reshape(-1),
+                )
+            )
+            return x[sample, :]
+        else:
+            return None
diff --git a/stpy/probability/bernoulli_likelihood.py b/stpy/probability/bernoulli_likelihood.py
index 2630337..14148e4 100644
--- a/stpy/probability/bernoulli_likelihood.py
+++ b/stpy/probability/bernoulli_likelihood.py
@@ -6,79 +6,103 @@
 from stpy.probability.gaussian_likelihood import GaussianLikelihood
 import scipy
 
+
 class BernoulliLikelihoodCanonical(GaussianLikelihood):
 
     def __init__(self):
         super().__init__()
 
-    def evaluate_datapoint(self, theta, d, mask = None):
+    def evaluate_datapoint(self, theta, d, mask=None):
         if mask is None:
-            mask = 1.
+            mask = 1.0
         x, y = d
-        r = -y*(x@theta) + torch.log(1+torch.exp(x@theta))
+        r = -y * (x @ theta) + torch.log(1 + torch.exp(x @ theta))
         r = r * mask
         return r
 
     def link(self, s):
-        return 1./(1.+ torch.exp(-s))
+        return 1.0 / (1.0 + torch.exp(-s))
 
-    def scale(self, mask = None):
-        return 1.
+    def scale(self, mask=None):
+        return 1.0
 
-    def get_objective_cvxpy(self, mask = None):
+    def get_objective_cvxpy(self, mask=None):
         if mask is None:
+
             def likelihood(theta):
-                return -self.y.T@(self.x @ theta) + cp.sum(cp.logistic(self.x @ theta))
+                return -self.y.T @ (self.x @ theta) + cp.sum(
+                    cp.logistic(self.x @ theta)
+                )
+
         else:
+
             def likelihood(theta):
-                if torch.sum(mask.double())>1e-8:
-                    return -(mask*self.y)@(self.x @ theta) + mask @ cp.logistic(self.x @ theta)
+                if torch.sum(mask.double()) > 1e-8:
+                    return -(mask * self.y) @ (self.x @ theta) + mask @ cp.logistic(
+                        self.x @ theta
+                    )
                 else:
-                    return cp.sum(theta*0)
+                    return cp.sum(theta * 0)
+
         return likelihood
 
     def lipschitz_constant(self, b):
         return np.exp(b)
 
-    def get_confidence_set_cvxpy(self,
-                                 theta: cp.Variable,
-                                 type: Union[str, None] = None,
-                                 params: Dict = {},
-                                 delta: float = 0.1):
+    def get_confidence_set_cvxpy(
+        self,
+        theta: cp.Variable,
+        type: Union[str, None] = None,
+        params: Dict = {},
+        delta: float = 0.1,
+    ):
         if self.fitted == True:
             return self.set_fn(theta)
 
-        theta_fit = params['estimate']
-        H = params['regularizer_hessian']
+        theta_fit = params["estimate"]
+        H = params["regularizer_hessian"]
         lam = torch.max(torch.linalg.eigvalsh(H))
-        B = params['bound']
-        d_eff = params['d_eff']
+        B = params["bound"]
+        d_eff = params["d_eff"]
 
-        if type in ['faubry']:
-            D = torch.diag(1./(self.x @ theta_fit).view(-1))
+        if type in ["faubry"]:
+            D = torch.diag(1.0 / (self.x @ theta_fit).view(-1))
             V = self.x.T @ D @ self.x + H
 
-            beta = np.sqrt(lam*B) / 2. + 2. / np.sqrt(lam*B) * (torch.logdet(V) - torch.logdet(H)) + 2 / np.sqrt(
-                lam*B) * np.log(1 / delta) * d_eff
+            beta = (
+                np.sqrt(lam * B) / 2.0
+                + 2.0 / np.sqrt(lam * B) * (torch.logdet(V) - torch.logdet(H))
+                + 2 / np.sqrt(lam * B) * np.log(1 / delta) * d_eff
+            )
 
             L = torch.from_numpy(scipy.linalg.sqrtm(V.numpy()))
-            self.set_fn = lambda theta: [cp.sum_squares(L @ (theta - theta_fit)) <= beta]
+            self.set_fn = lambda theta: [
+                cp.sum_squares(L @ (theta - theta_fit)) <= beta
+            ]
             set = self.set_fn(theta)
 
-        elif type in ['laplace']:
-            sigma = 1./4.
+        elif type in ["laplace"]:
+            sigma = 1.0 / 4.0
             V = self.x.T @ self.x / sigma**2 + H
             L = torch.from_numpy(scipy.linalg.sqrtm(V.numpy()))
-            beta = 2. * self.lipschitz_constant(B)
-            self.set_fn = lambda theta: [cp.sum_squares(L @ (theta - theta_fit)) <= beta]
+            beta = 2.0 * self.lipschitz_constant(B)
+            self.set_fn = lambda theta: [
+                cp.sum_squares(L @ (theta - theta_fit)) <= beta
+            ]
             set = self.set_fn(theta)
 
         elif type in ["adaptive-AB"]:
-            sigma = 1./4.
+            sigma = 1.0 / 4.0
             V = self.x.T @ self.x / sigma**2 + H
             L = torch.from_numpy(scipy.linalg.sqrtm(V.numpy()))
-            beta = 2 * np.log(1. / delta) + (torch.logdet(V + H) - torch.logdet(H)) + lam * B
-            self.set_fn = lambda theta:  [cp.sum_squares(L@(theta - theta_fit)) <= beta]
+            beta = (
+                2 * np.log(1.0 / delta)
+                + (torch.logdet(V + H) - torch.logdet(H))
+                + lam * B
+            )
+            self.set_fn = lambda theta: [
+                cp.sum_squares(L @ (theta - theta_fit)) <= beta
+            ]
             set = self.set_fn(theta)
 
         elif type == "LR":
@@ -86,7 +110,9 @@ def get_confidence_set_cvxpy(self,
             set = self.lr_confidence_set_cvxpy(theta, beta, params)
 
         else:
-            raise NotImplementedError("The desired confidence set type is not supported.")
+            raise NotImplementedError(
+                "The desired confidence set type is not supported."
+            )
 
         self.set = set
         self.fitted = True
@@ -94,35 +120,43 @@ def get_confidence_set_cvxpy(self,
         return set
 
     def information_matrix(self):
-        V = self.x.T@self.x/self.sigma
+        V = self.x.T @ self.x / self.sigma
         return V
 
-    def confidence_parameter(self, delta, params, type = None):
-        H = params['regularizer_hessian']
+    def confidence_parameter(self, delta, params, type=None):
+        H = params["regularizer_hessian"]
         lam = torch.max(torch.linalg.eigvalsh(H))
-        B = params['bound']
-        d_eff = params['d_eff']
+        B = params["bound"]
+        d_eff = params["d_eff"]
 
         if type is None or type == "none" or type == "laplace":
             # this is a common heuristic
-            beta =  2.0
+            beta = 2.0
 
         elif type == "adaptive-AB":
-            sigma = 1./4.
-            V = self.x.T @ self.x / sigma ** 2 + H
-            beta = 2 * np.log(1. / delta) + (torch.logdet(V + H) - torch.logdet(H)) + lam * B
+            sigma = 1.0 / 4.0
+            V = self.x.T @ self.x / sigma**2 + H
+            beta = (
+                2 * np.log(1.0 / delta)
+                + (torch.logdet(V + H) - torch.logdet(H))
+                + lam * B
+            )
 
         elif type == "LR":
             # this is based on sequential LR test
             beta = self.confidence_parameter_likelihood_ratio(delta, params)
 
         elif type == "Faubry":
-            H = params['regularizer_hessian']
-            lam = H[0., 0]
-            theta_fit = params['estimate']
-            D = torch.diag(1./(self.x @ theta_fit).view(-1))
+            H = params["regularizer_hessian"]
+            lam = H[0.0, 0]
+            theta_fit = params["estimate"]
+            D = torch.diag(1.0 / (self.x @ theta_fit).view(-1))
             V = self.x.T @ D @ self.x + H
-            beta = np.sqrt(lam)/2. + 2./np.sqrt(lam)*(torch.logdet(V) - torch.logdet(H)) + 2/np.sqrt(lam)* np.log(1/delta)*d_eff
+            beta = (
+                np.sqrt(lam) / 2.0
+                + 2.0 / np.sqrt(lam) * (torch.logdet(V) - torch.logdet(H))
+                + 2 / np.sqrt(lam) * np.log(1 / delta) * d_eff
+            )
         else:
             raise NotImplementedError("Not implemented")
         return beta
diff --git a/stpy/probability/gaussian_likelihood.py b/stpy/probability/gaussian_likelihood.py
index bdbf2fe..53a66e7 100644
--- a/stpy/probability/gaussian_likelihood.py
+++ b/stpy/probability/gaussian_likelihood.py
@@ -5,24 +5,27 @@
 from stpy.probability.likelihood import Likelihood
 import scipy
 
+
 class GaussianLikelihood(Likelihood):
 
-    def __init__(self, sigma = 0.1, Sigma=None):
+    def __init__(self, sigma=0.1, Sigma=None):
         super().__init__()
         self.sigma = sigma
         self.Sigma = Sigma
 
-    def scale(self, err = None, bound = None):
+    def scale(self, err=None, bound=None):
         if self.Sigma is None:
             return self.sigma**2
         else:
-            return torch.max(self.Sigma.T@self.Sigma)
+            return torch.max(self.Sigma.T @ self.Sigma)
 
     def evaluate_log(self, f):
         if self.Sigma is None:
-            res = torch.sum((f - self.y)**2)/self.sigma**2
+            res = torch.sum((f - self.y) ** 2) / self.sigma**2
         else:
-            res = ((f - self.y).T @ torch.inverse(self.Sigma.T@self.Sigma)  @ (f - self.y) )
+            res = (
+                (f - self.y).T @ torch.inverse(self.Sigma.T @ self.Sigma) @ (f - self.y)
+            )
         return res
 
     def load_data(self, D):
@@ -30,81 +33,119 @@ def load_data(self, D):
         self.fitted = False
 
     def add_data_point(self, d):
-        x,y = d
-        self.x = torch.vstack(self.x,x)
-        self.y = torch.vstack(self.y,y)
+        x, y = d
+        self.x = torch.vstack(self.x, x)
+        self.y = torch.vstack(self.y, y)
         self.fitted = False
 
-    def evaluate_datapoint(self, theta, d, mask = None):
-        x,y = d
+    def evaluate_datapoint(self, theta, d, mask=None):
+        x, y = d
         if mask is None:
-            mask = 1.
+            mask = 1.0
 
         if self.Sigma is None:
-            return mask*((x @ theta - y) ** 2)/ (2*self.sigma ** 2)
+            return mask * ((x @ theta - y) ** 2) / (2 * self.sigma**2)
         else:
-            return mask*(x @ theta - y).T @ torch.linalg.inv(self.Sigma.T @ self.Sigma) @ (
-                            x @ theta - y)
+            return (
+                mask
+                * (x @ theta - y).T
+                @ torch.linalg.inv(self.Sigma.T @ self.Sigma)
+                @ (x @ theta - y)
+            )
 
     def normalization(self, d):
-        return 1./np.sqrt(2.*np.pi*self.sigma**2)
+        return 1.0 / np.sqrt(2.0 * np.pi * self.sigma**2)
 
     def get_objective_torch(self):
 
         if self.Sigma is None:
-            def likelihood(theta): return torch.sum((self.x@theta - self.y)**2)/(2*self.sigma**2)
+
+            def likelihood(theta):
+                return torch.sum((self.x @ theta - self.y) ** 2) / (2 * self.sigma**2)
 
         else:
-            def likelihood(theta): return (self.x@theta - self.y).T@torch.linalg.inv(self.Sigma.T@self.Sigma*2)@(self.x@theta - self.y)
+
+            def likelihood(theta):
+                return (
+                    (self.x @ theta - self.y).T
+                    @ torch.linalg.inv(self.Sigma.T @ self.Sigma * 2)
+                    @ (self.x @ theta - self.y)
+                )
+
         return likelihood
 
-    def get_objective_cvxpy(self, mask = None):
+    def get_objective_cvxpy(self, mask=None):
         if mask is None:
             if self.Sigma is None:
-                def likelihood(theta): return cp.sum_squares(self.x@theta - self.y)/(2*self.sigma**2)
+
+                def likelihood(theta):
+                    return cp.sum_squares(self.x @ theta - self.y) / (2 * self.sigma**2)
 
             else:
-                def likelihood(theta): return cp.matrix_frac(self.x@theta - self.y,2*self.Sigma.T@self.Sigma)
+
+                def likelihood(theta):
+                    return cp.matrix_frac(
+                        self.x @ theta - self.y, 2 * self.Sigma.T @ self.Sigma
+                    )
+
         else:
             if self.Sigma is None:
+
                 def likelihood(theta):
-                    if torch.sum(mask.int())>1e-8:
-                        return cp.sum_squares(cp.multiply(mask.double().view(-1,1),(self.x @ theta - self.y)) )/ (2*self.sigma ** 2)
+                    if torch.sum(mask.int()) > 1e-8:
+                        return cp.sum_squares(
+                            cp.multiply(
+                                mask.double().view(-1, 1), (self.x @ theta - self.y)
+                            )
+                        ) / (2 * self.sigma**2)
                     else:
-                        return cp.sum(theta*0)
+                        return cp.sum(theta * 0)
 
             else:
+
                 def likelihood(theta):
-                    if torch.sum(mask.int())>1e-8:
-                        return cp.matrix_frac(cp.multiply(mask.double().view(-1,1),(self.x @ theta - self.y)), 2*self.Sigma.T @ self.Sigma)
+                    if torch.sum(mask.int()) > 1e-8:
+                        return cp.matrix_frac(
+                            cp.multiply(
+                                mask.double().view(-1, 1), (self.x @ theta - self.y)
+                            ),
+                            2 * self.Sigma.T @ self.Sigma,
+                        )
                     else:
-                        return cp.sum(theta*0)
+                        return cp.sum(theta * 0)
+
         return likelihood
 
-    def information_matrix(self, mask = None):
+    def information_matrix(self, mask=None):
         if mask is None:
             if self.Sigma is None:
-                V = self.x.T@self.x/(2*self.sigma**2)
+                V = self.x.T @ self.x / (2 * self.sigma**2)
             else:
-                V = self.x.T@torch.linalg.inv(self.Sigma.T@self.Sigma*2)@self.x
+                V = self.x.T @ torch.linalg.inv(self.Sigma.T @ self.Sigma * 2) @ self.x
             return V
         else:
             if self.Sigma is None:
-                V = self.x[mask,:].T@self.x[mask,:]/(2*self.sigma**2)
+                V = self.x[mask, :].T @ self.x[mask, :] / (2 * self.sigma**2)
             else:
-                V = self.x[mask,:].T@torch.linalg.inv(self.Sigma.T@self.Sigma*2)@self.x[mask,:]
+                V = (
+                    self.x[mask, :].T
+                    @ torch.linalg.inv(self.Sigma.T @ self.Sigma * 2)
+                    @ self.x[mask, :]
+                )
             return V
 
-    def get_confidence_set_cvxpy(self,
-                                 theta: cp.Variable,
-                                 type: Union[str,None] = None,
-                                 params: Dict = {},
-                                 delta: float  = 0.1):
+    def get_confidence_set_cvxpy(
+        self,
+        theta: cp.Variable,
+        type: Union[str, None] = None,
+        params: Dict = {},
+        delta: float = 0.1,
+    ):
         if self.fitted == True:
             return self.set_fn(theta)
 
-        theta_fit = params['estimate']
-        H = params['regularizer_hessian']
+        theta_fit = params["estimate"]
+        H = params["regularizer_hessian"]
 
         if H is not None:
             V = self.information_matrix() + H
@@ -112,23 +153,29 @@ def get_confidence_set_cvxpy(self,
             V = self.information_matrix()
 
         if type in ["none", None, "fixed"]:
-#            L = torch.linalg.cholesky(V).double()
+            #            L = torch.linalg.cholesky(V).double()
             L = torch.from_numpy(scipy.linalg.sqrtm(V.numpy()))
             beta = self.confidence_parameter(delta, params, type=type)
-            self.set_fn = lambda theta:  [cp.sum_squares(L@(theta - theta_fit)) <= beta]
+            self.set_fn = lambda theta: [
+                cp.sum_squares(L @ (theta - theta_fit)) <= beta
+            ]
             set = self.set_fn(theta)
 
         elif type in ["adaptive-AB"]:
             L = torch.from_numpy(scipy.linalg.sqrtm(V.numpy()))
             beta = self.confidence_parameter(delta, params, type=type)
-            self.set_fn = lambda theta:  [cp.sum_squares(L@(theta - theta_fit)) <= beta]
+            self.set_fn = lambda theta: [
+                cp.sum_squares(L @ (theta - theta_fit)) <= beta
+            ]
             set = self.set_fn(theta)
 
         elif type in ["adaptive-optimized"]:
             beta = self.confidence_parameter(delta, params, type=type)
             sqrtV = scipy.linalg.sqrtm(V)
-            L = torch.linalg.cholesky(V+sqrtV).double()
-            self.set_fn = lambda theta: [cp.sum_squares(L @ (theta - theta_fit)) <= beta]
+            L = torch.linalg.cholesky(V + sqrtV).double()
+            self.set_fn = lambda theta: [
+                cp.sum_squares(L @ (theta - theta_fit)) <= beta
+            ]
 
         elif type == "LR":
             beta = self.confidence_parameter_likelihood_ratio(delta, params)
@@ -139,20 +186,22 @@ def get_confidence_set_cvxpy(self,
             beta = self.confidence_parameter_prior_posterior(delta, params)
             set = self.prior_posterior_lr_confidence_set_cvxpy(theta, beta, params)
         else:
-            raise NotImplementedError("The desired confidence set type is not supported.")
-        print (type, "USING BETA: ", beta)
+            raise NotImplementedError(
+                "The desired confidence set type is not supported."
+            )
+        print(type, "USING BETA: ", beta)
 
         self.set = set
         self.fitted = True
 
         return set
 
-    def confidence_parameter(self, delta, params, type = None):
-        print (type)
+    def confidence_parameter(self, delta, params, type=None):
+        print(type)
 
         if type is None or type == "none":
             # this is a common heuristic
-            beta =  2.0 * np.log(1/delta)
+            beta = 2.0 * np.log(1 / delta)
 
         # elif type == "LR" or type == "LR-vovk":
         #     # this is based on sequential LR test
@@ -160,26 +209,37 @@ def confidence_parameter(self, delta, params, type = None):
         #     beta = self.confidence_parameter_likelihood_ratio(delta, params)
 
         else:
-            if 'd_eff' in params.keys():
+            if "d_eff" in params.keys():
                 n = self.x.size()[0]
-                d = params['d_eff']
+                d = params["d_eff"]
             else:
-                d = params['m']
+                d = params["m"]
 
-            B = params['bound']
-            H = params['regularizer_hessian']
+            B = params["bound"]
+            H = params["regularizer_hessian"]
             lam = torch.max(torch.linalg.eigvalsh(H))
 
             if type == "fixed":
                 # this is fixed design
-                beta = d + 2 * np.log(1 / delta) + 2 * np.sqrt(d * np.log(1 / delta)) + lam*B
+                beta = (
+                    d
+                    + 2 * np.log(1 / delta)
+                    + 2 * np.sqrt(d * np.log(1 / delta))
+                    + lam * B
+                )
 
             elif type == "adaptive-AB":
-                print ("calculating: adaptive-AB")
+                print("calculating: adaptive-AB")
                 # this takes the pseudo-maximization with a fixed mixture
                 V = self.information_matrix()
-                beta = 2*np.log(1./delta) + (torch.logdet(V+H) - torch.logdet(H)) + lam*B
+                beta = (
+                    2 * np.log(1.0 / delta)
+                    + (torch.logdet(V + H) - torch.logdet(H))
+                    + lam * B
+                )
             else:
-                raise NotImplementedError("The desired confidence set type is not supported.")
+                raise NotImplementedError(
+                    "The desired confidence set type is not supported."
+                )
 
-        return beta
\ No newline at end of file
+        return beta
diff --git a/stpy/probability/huber_likelihood.py b/stpy/probability/huber_likelihood.py
index f9321c7..66f7180 100644
--- a/stpy/probability/huber_likelihood.py
+++ b/stpy/probability/huber_likelihood.py
@@ -8,7 +8,7 @@
 
 class HuberLikelihood(GaussianLikelihood):
 
-    def __init__(self, sigma=0.1, M=1.):
+    def __init__(self, sigma=0.1, M=1.0):
         super().__init__()
         self.sigma = sigma
         self.M = M
@@ -16,23 +16,23 @@ def __init__(self, sigma=0.1, M=1.):
     def evaluate_log(self, f):
         pass
 
-    def scale(self, err = None):
+    def scale(self, err=None):
         if self.Sigma is None:
             return self.sigma**2
         else:
-            return torch.max(self.Sigma.T@self.Sigma)
+            return torch.max(self.Sigma.T @ self.Sigma)
 
-    def evaluate_datapoint(self, theta, d, mask = None):
+    def evaluate_datapoint(self, theta, d, mask=None):
         if mask is None:
-            mask = 1.
+            mask = 1.0
         x, y = d
         res = (x @ theta - y) / self.sigma
         mask1 = torch.abs(res) < self.M
         mask2 = torch.abs(res) >= self.M
         v = res
         v[mask1] = res[mask1] ** 2
-        v[mask2] = 2 * self.M * torch.abs(res[mask2]) - self.M ** 2
-        return torch.sum(v)*mask
+        v[mask2] = 2 * self.M * torch.abs(res[mask2]) - self.M**2
+        return torch.sum(v) * mask
 
     def add_data_point(self, d):
         x, y = d
@@ -46,20 +46,27 @@ def load_data(self, D):
 
     def get_objective_cvxpy(self, mask=None):
         if mask is None:
+
             def likelihood(theta):
                 return cp.sum(cp.huber((self.x @ theta - self.y) / self.sigma))
+
         else:
+
             def likelihood(theta):
                 if torch.sum(mask.int()) > 0:
-                    return cp.sum(cp.huber((self.x[mask, :] @ theta - self.y[mask, :]) / self.sigma))
+                    return cp.sum(
+                        cp.huber(
+                            (self.x[mask, :] @ theta - self.y[mask, :]) / self.sigma
+                        )
+                    )
                 else:
                     return cp.sum(theta * 0)
+
         return likelihood
 
     def information_matrix(self):
         V = self.x.T @ self.x / self.sigma
         return V
 
-
     def get_objective_torch(self):
         raise NotImplementedError("Implement me please.")
diff --git a/stpy/probability/laplace_likelihood.py b/stpy/probability/laplace_likelihood.py
index 732c82c..afb4912 100644
--- a/stpy/probability/laplace_likelihood.py
+++ b/stpy/probability/laplace_likelihood.py
@@ -6,62 +6,76 @@
 from stpy.probability.likelihood import Likelihood
 from stpy.probability.gaussian_likelihood import GaussianLikelihood
 
+
 class LaplaceLikelihood(GaussianLikelihood):
 
-    def __init__(self, b = 0.1):
+    def __init__(self, b=0.1):
         super().__init__()
         self.b = b
 
-    def scale(self, err = None, bound = None):
+    def scale(self, err=None, bound=None):
         return self.b
 
     def evaluate_log(self, f):
-        res = torch.sum(torch.abs(f - self.y))/self.b
+        res = torch.sum(torch.abs(f - self.y)) / self.b
         return res
 
-    def evaluate_datapoint(self, theta, d, mask = None):
+    def evaluate_datapoint(self, theta, d, mask=None):
         if mask is None:
-            mask = 1.
+            mask = 1.0
         x, y = d
-        return mask* (torch.abs(x @ theta - y)) / self.b
+        return mask * (torch.abs(x @ theta - y)) / self.b
 
-    def get_objective_cvxpy(self, mask = None):
+    def get_objective_cvxpy(self, mask=None):
         if mask is None:
-             def likelihood(theta): return cp.sum(cp.abs(self.x@theta - self.y)/self.b)
+
+            def likelihood(theta):
+                return cp.sum(cp.abs(self.x @ theta - self.y) / self.b)
+
         else:
+
             def likelihood(theta):
-                if torch.sum(mask.int())>0:
-                    return cp.sum(cp.abs(self.x[mask,:]@theta - self.y[mask,:])/self.b)
+                if torch.sum(mask.int()) > 0:
+                    return cp.sum(
+                        cp.abs(self.x[mask, :] @ theta - self.y[mask, :]) / self.b
+                    )
                 else:
-                    return cp.sum(theta*0)
+                    return cp.sum(theta * 0)
+
         return likelihood
 
-    def get_confidence_set_cvxpy(self,
-                                 theta: cp.Variable,
-                                 type: Union[str, None] = None,
-                                 params: Dict = {},
-                                 delta: float = 0.1):
+    def get_confidence_set_cvxpy(
+        self,
+        theta: cp.Variable,
+        type: Union[str, None] = None,
+        params: Dict = {},
+        delta: float = 0.1,
+    ):
         if self.fitted == True:
             return self.set_fn(theta)
 
-        theta_fit = params['estimate']
-        H = params['regularizer_hessian']
+        theta_fit = params["estimate"]
+        H = params["regularizer_hessian"]
 
         if H is not None:
             V = self.information_matrix() + H
         else:
             V = self.information_matrix()
 
-        if type in ["none","sub-exp"]:
+        if type in ["none", "sub-exp"]:
             L = torch.from_numpy(scipy.linalg.sqrtm(V.numpy()))
             beta = self.confidence_parameter(delta, params, type=type)
-            self.set_fn = lambda theta: [cp.sum_squares(L @ (theta - theta_fit)) <= beta]
+            self.set_fn = lambda theta: [
+                cp.sum_squares(L @ (theta - theta_fit)) <= beta
+            ]
             set = self.set_fn(theta)
 
         elif type == "adaptive-AB":
             L = torch.from_numpy(scipy.linalg.sqrtm(V.numpy()))
             beta = self.confidence_parameter(delta, params, type=type)
-            self.set_fn = lambda theta: [cp.sum_squares(L @ (theta - theta_fit)) <= beta]
+            self.set_fn = lambda theta: [
+                cp.sum_squares(L @ (theta - theta_fit)) <= beta
+            ]
             set = self.set_fn(theta)
 
         elif type == "LR":
@@ -69,7 +83,9 @@ def get_confidence_set_cvxpy(self,
             set = self.lr_confidence_set_cvxpy(theta, beta, params)
 
         else:
-            raise NotImplementedError("The desired confidence set type is not supported.")
+            raise NotImplementedError(
+                "The desired confidence set type is not supported."
+            )
         print(type, "USING BETA: ", beta)
 
         self.set = set
@@ -78,49 +94,62 @@ def get_confidence_set_cvxpy(self,
         return set
 
     def information_matrix(self):
-        V = self.x.T@self.x/(2*self.b)**2
+        V = self.x.T @ self.x / (2 * self.b) ** 2
         return V
 
-
-    def get_objective_torch(self, mask = None):
+    def get_objective_torch(self, mask=None):
         if mask is None:
-            def likelihood(theta): return torch.sum(torch.abs(self.x@theta - self.y)/self.sigma)
+
+            def likelihood(theta):
+                return torch.sum(torch.abs(self.x @ theta - self.y) / self.sigma)
+
         else:
+
             def likelihood(theta):
-                if torch.sum(mask.int())>0:
-                    return torch.sum(torch.abs(self.x[mask,:]@theta - self.y[mask,:])/self.sigma)
+                if torch.sum(mask.int()) > 0:
+                    return torch.sum(
+                        torch.abs(self.x[mask, :] @ theta - self.y[mask, :])
+                        / self.sigma
+                    )
                 else:
-                    return torch.sum(theta*0)
-        return likelihood
-
+                    return torch.sum(theta * 0)
 
+        return likelihood
 
-    def confidence_parameter(self, delta, params, type = None):
-        print (type)
+    def confidence_parameter(self, delta, params, type=None):
+        print(type)
 
         if type is None or type == "none":
-            beta =  2.0 * np.log(1/delta)
+            beta = 2.0 * np.log(1 / delta)
 
         else:
-            if 'd_eff' in params.keys():
+            if "d_eff" in params.keys():
                 n = self.x.size()[0]
-                d = params['d_eff']
+                d = params["d_eff"]
             else:
-                d = params['m']
+                d = params["m"]
 
-            B = params['bound']
-            H = params['regularizer_hessian']
+            B = params["bound"]
+            H = params["regularizer_hessian"]
             lam = torch.max(torch.linalg.eigvalsh(H))
 
             if type == "sub-exp":
                 # this takes the pseudo-maximization with a fixed mixture
                 V = self.information_matrix()
-                L = 1.
+                L = 1.0
                 size = V.size()[0]
-                beta = (lam*(B + self.b/L) + L/(self.b*np.sqrt(lam))*(d*np.log(2)+np.log(1./delta)+0.5*torch.slogdet(V*lam+torch.eye(size))[1]))
+                beta = lam * (B + self.b / L) + L / (self.b * np.sqrt(lam)) * (
+                    d * np.log(2)
+                    + np.log(1.0 / delta)
+                    + 0.5 * torch.slogdet(V * lam + torch.eye(size))[1]
+                )
             elif type == "adaptive-AB":
                 V = self.information_matrix()
-                beta = 2*np.log(1./delta) + (torch.logdet(V+H) - torch.logdet(H)) + lam*B
+                beta = (
+                    2 * np.log(1.0 / delta)
+                    + (torch.logdet(V + H) - torch.logdet(H))
+                    + lam * B
+                )
             else:
                 raise NotImplementedError("given confidence sets are not implemented.")
-        return beta
\ No newline at end of file
+        return beta
diff --git a/stpy/probability/likelihood.py b/stpy/probability/likelihood.py
index c950cc0..939cd12 100644
--- a/stpy/probability/likelihood.py
+++ b/stpy/probability/likelihood.py
@@ -3,6 +3,7 @@
 import numpy as np
 import torch
 
+
 class Likelihood(ABC):
 
     def __init__(self):
@@ -14,7 +15,7 @@ def evaluate_log(self, f):
         pass
 
     @abstractmethod
-    def scale(self, err = None, bound = None):
+    def scale(self, err=None, bound=None):
         return
 
     @abstractmethod
@@ -22,7 +23,7 @@ def normalization(self, d):
         return
 
     @abstractmethod
-    def evaluate_datapoint(self, f, d, mask = None):
+    def evaluate_datapoint(self, f, d, mask=None):
         pass
 
     @abstractmethod
@@ -33,20 +34,18 @@ def get_confidence_set_cvxpy(self, theta, type, params, delta):
     def information_matrix(self, theta_fit):
         pass
 
-
     @abstractmethod
-    def get_objective_cvxpy(self, mask = None):
+    def get_objective_cvxpy(self, mask=None):
         pass
 
     @abstractmethod
     def get_objective_torch(self):
         pass
 
-
     def add_data_point(self, d):
-        x,y = d
-        self.x = torch.vstack(self.x,x)
-        self.y = torch.vstack(self.y,y)
+        x, y = d  # d is an (x, y) pair of rows to append to the stored data
+        self.x = torch.vstack((self.x, x))  # vstack takes ONE sequence of tensors
+        self.y = torch.vstack((self.y, y))  # new data invalidates the cached fit below
         self.fitted = False
 
     def load_data(self, D):
@@ -60,18 +59,18 @@ def confidence_parameter_likelihood_ratio(self, delta, params):
         :param params:
         :return:
         """
-        evidence = params['evidence']
-        estimators = params['estimator_sequence']
+        evidence = params["evidence"]
+        estimators = params["estimator_sequence"]
 
-        val = 0.
-        for i in range(len(estimators)-1):
+        val = 0.0
+        for i in range(len(estimators) - 1):
             ev = evidence[i]
             est = estimators[i]
             if est is not None:
-                xx = self.x[i,:].view(1,-1)
-                yy = self.y[i,:].view(1,-1)
-                val += self.evaluate_datapoint(est, (xx, yy), mask = ev)
-        val = np.log(1/delta) + val
+                xx = self.x[i, :].view(1, -1)
+                yy = self.y[i, :].view(1, -1)
+                val += self.evaluate_datapoint(est, (xx, yy), mask=ev)
+        val = np.log(1 / delta) + val
         return val
 
     def lr_confidence_set_cvxpy(self, theta, beta, params):
@@ -82,20 +81,24 @@ def lr_confidence_set_cvxpy(self, theta, beta, params):
         :param params:
         :return:
         """
-        evidence = torch.Tensor(params['evidence']).bool()
-        self.set_fn = lambda theta:  [self.get_objective_cvxpy(mask = evidence)(theta) <= beta]
+        evidence = torch.tensor(params["evidence"]).bool()
+        self.set_fn = lambda theta: [
+            self.get_objective_cvxpy(mask=evidence)(theta) <= beta
+        ]
         set = self.set_fn(theta)
         return set
 
-
-    def confidence_parameter_prior_posterior(self, delta,params):
-        H = params['regularizer_hessian']
-        sigma = params['sigma']
+    def confidence_parameter_prior_posterior(self, delta, params):
+        H = params["regularizer_hessian"]
+        sigma = params["sigma"]
         n = self.x.size()[0]
-        K = (self.x@self.x.T + torch.max(H)*sigma**2*torch.eye(n))
-        evidence_of_the_data = -0.5*self.y.T@torch.linalg.solve(K,self.y)-0.5*torch.linalg.slogdet(K)[1]#-(n/2)*np.log(2*np.pi) ## remove this as in likelihood not added
-        evidence_of_the_data = evidence_of_the_data #- np.log(2*np.pi*sigma**2)
-        return np.log(1./delta) - evidence_of_the_data
+        K = self.x @ self.x.T + torch.max(H) * sigma**2 * torch.eye(n)
+        evidence_of_the_data = (
+            -0.5 * self.y.T @ torch.linalg.solve(K, self.y)
+            - 0.5 * torch.linalg.slogdet(K)[1]
+        )  # -(n/2)*np.log(2*np.pi) ## remove this as in likelihood not added
+        evidence_of_the_data = evidence_of_the_data  # - np.log(2*np.pi*sigma**2)
+        return np.log(1.0 / delta) - evidence_of_the_data
 
     def prior_posterior_lr_confidence_set_cvxpy(self, theta, beta, params):
         """
@@ -106,11 +109,11 @@ def prior_posterior_lr_confidence_set_cvxpy(self, theta, beta, params):
         :return:
         """
         # create a Gaussian likelihood
-        sigma = params['sigma']
-        def gauss_likelihood(theta): return cp.sum_squares(self.x @ theta - self.y) / (2 * sigma ** 2)
-        self.set_fn = lambda theta:  [gauss_likelihood(theta)<= beta]
-        set = self.set_fn(theta)
-        return set
-
+        sigma = params["sigma"]
 
+        def gauss_likelihood(theta):
+            return cp.sum_squares(self.x @ theta - self.y) / (2 * sigma**2)
 
+        self.set_fn = lambda theta: [gauss_likelihood(theta) <= beta]
+        set = self.set_fn(theta)
+        return set
diff --git a/stpy/probability/noise_models.py b/stpy/probability/noise_models.py
index d736646..19db7a8 100644
--- a/stpy/probability/noise_models.py
+++ b/stpy/probability/noise_models.py
@@ -9,366 +9,421 @@
 
 
 class NoiseModel(ABC):
-	"""
-	Class provides an interface to sample noise observations and evaluate their likelihood
-	"""
-	def __init__(self):
-		pass
-
-	@abstractmethod
-	def sample(self, xs, theta):
-		pass
-
-	@abstractmethod
-	def sample_noise(self, xs):
-		pass
-
-	def joint_log_likelihood(self, ys, xs, theta: Union[np.array, cp.Variable]) -> Union[np.array, cp.Expression]:
-		""" Returns the sum of the lls, i.e. the joint ll"""
-		if isinstance(theta, cp.Variable):
-			return cp.sum(self.log_likelihood(ys, xs, theta))
-		else:
-			return np.sum(self.log_likelihood(ys, xs, theta))
-
-
-
-	def get_mosek_params(self, threads=4):
-		if self.convex:
-			return {
-				mosek.iparam.num_threads: threads,
-				mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
-				mosek.dparam.intpnt_co_tol_pfeas: 1e-4,
-				mosek.dparam.intpnt_co_tol_dfeas: 1e-4,
-				mosek.dparam.intpnt_co_tol_rel_gap: 1e-4
-			}
-		else:
-			raise AttributeError("Fetching mosek parameters disallowed for non-convex problems")
-
-	@abstractmethod
-	def convex(self) -> bool:
-		pass
+    """
+    Class provides an interface to sample noise observations and evaluate their likelihood
+    """
+
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def sample(self, xs, theta):
+        pass
+
+    @abstractmethod
+    def sample_noise(self, xs):
+        pass
+
+    def joint_log_likelihood(
+        self, ys, xs, theta: Union[np.array, cp.Variable]
+    ) -> Union[np.array, cp.Expression]:
+        """Returns the sum of the lls, i.e. the joint ll"""
+        if isinstance(theta, cp.Variable):
+            return cp.sum(self.log_likelihood(ys, xs, theta))
+        else:
+            return np.sum(self.log_likelihood(ys, xs, theta))
+
+    def get_mosek_params(self, threads=4):
+        if self.convex:
+            return {
+                mosek.iparam.num_threads: threads,
+                mosek.iparam.intpnt_solve_form: mosek.solveform.primal,
+                mosek.dparam.intpnt_co_tol_pfeas: 1e-4,
+                mosek.dparam.intpnt_co_tol_dfeas: 1e-4,
+                mosek.dparam.intpnt_co_tol_rel_gap: 1e-4,
+            }
+        else:
+            raise AttributeError(
+                "Fetching mosek parameters disallowed for non-convex problems"
+            )
+
+    @abstractmethod
+    def convex(self) -> bool:
+        pass
 
 
 class AdditiveHomoscedasticNoiseModel(NoiseModel):
-	"""
-	Assume a linear model. Only thing left to implement is the eta log-likelihood in both cvxpy and numpy
+    """
+    Assume a linear model. Only thing left to implement is the eta log-likelihood in both cvxpy and numpy
 
-	TODO discuss whether xs @ theta should be replaced by a f_noiseless type function you can pass at initialization?
-	"""
-	@abstractmethod
-	def sample_noise(self, xs):
-		""" pass xs in order to know how large noise should be. Also able to deal with heteroscedastic later on """
-		pass
+    TODO discuss whether xs @ theta should be replaced by a f_noiseless type function you can pass at initialization?
+    """
 
-	def sample(self, xs, theta):
-		return xs @ theta + self.sample_noise(xs)
+    @abstractmethod
+    def sample_noise(self, xs):
+        """pass xs in order to know how large noise should be. Also able to deal with heteroscedastic later on"""
+        pass
 
-	def log_likelihood(self, ys, xs, theta):  # TODO change base class
-		if ys.shape[0] == 0:
-			return 0. # this is to avoid problems with cvxpy variables of size 0, which it doesn't like
-		if isinstance(theta, cp.Variable):
-			return self.cvxpy_noise_log_likelihood(ys - (xs @ theta))
-		else:
-			return self.noise_log_likelihood(ys - (xs @ theta))
+    def sample(self, xs, theta):
+        return xs @ theta + self.sample_noise(xs)
 
+    def log_likelihood(self, ys, xs, theta):  # TODO change base class
+        if ys.shape[0] == 0:
+            return 0.0  # this is to avoid problems with cvxpy variables of size 0, which it doesn't like
+        if isinstance(theta, cp.Variable):
+            return self.cvxpy_noise_log_likelihood(ys - (xs @ theta))
+        else:
+            return self.noise_log_likelihood(ys - (xs @ theta))
 
 
 class PoissonNoise(NoiseModel):
 
-	def __init__(self, lam):
-		self.lam = lam
+    def __init__(self, lam):
+        self.lam = lam
 
-	def sample_noise(self, xs):
-		return torch.poisson(self.lam(xs).view(-1)).view(-1,1)
-	def convex(self) -> bool:
-		pass
+    def sample_noise(self, xs):
+        return torch.poisson(self.lam(xs).view(-1)).view(-1, 1)
 
-	def sample(self, xs, theta):
-		pass
+    def convex(self) -> bool:
+        pass
 
-	def mean(self, xs):
-		return self.lam(xs)
-class GaussianNoise(AdditiveHomoscedasticNoiseModel):
-	def __init__(self, sigma=0.1):
-		"""
-		:param sigma: standard deviation
-		"""
-		super().__init__()
-		self.sigma = sigma
+    def sample(self, xs, theta):
+        pass
 
-	def sample_noise(self, xs):
-		return self.sigma*np.random.normal(scale=1.0, size=(xs.shape[0], 1))
+    def mean(self, xs):
+        return self.lam(xs)
 
-	def noise_log_likelihood(self, etas, xs=None):
-		return -(0.5*((etas) ** 2))/(self.sigma **  2) - 0.5*np.log(2*np.pi*(self.sigma**2))
 
-	def cvxpy_noise_log_likelihood(self, etas, xs=None):
-		return -0.5 * cp.square(etas) / (self.sigma ** 2) - 0.5*np.log(2 * np.pi * self.sigma ** 2)
+class GaussianNoise(AdditiveHomoscedasticNoiseModel):
+    def __init__(self, sigma=0.1):
+        """
+        :param sigma: standard deviation
+        """
+        super().__init__()
+        self.sigma = sigma
 
-	@property
-	def convex(self) -> bool:
-		return True
+    def sample_noise(self, xs):
+        return self.sigma * np.random.normal(scale=1.0, size=(xs.shape[0], 1))
 
-	def __str__(self):
-		return "GaussianAdditive"
+    def noise_log_likelihood(self, etas, xs=None):
+        return -(0.5 * ((etas) ** 2)) / (self.sigma**2) - 0.5 * np.log(
+            2 * np.pi * (self.sigma**2)
+        )
 
+    def cvxpy_noise_log_likelihood(self, etas, xs=None):
+        return -0.5 * cp.square(etas) / (self.sigma**2) - 0.5 * np.log(
+            2 * np.pi * self.sigma**2
+        )
 
+    @property
+    def convex(self) -> bool:
+        return True
 
-class HuberNoise(AdditiveHomoscedasticNoiseModel):
-	def __init__(self, sigma=0.1):
-		"""
-		:param sigma: standard deviation
-		"""
-		super().__init__()
-		self.sigma = sigma
+    def __str__(self):
+        return "GaussianAdditive"
 
-	def sample_noise(self, xs):
-		return self.sigma*(np.random.normal(scale=1.0, size=(xs.shape[0], 1)) +  np.random.laplace(scale=self.sigma, size=(xs.shape[0], 1)))/2.
 
-	@property
-	def convex(self) -> bool:
-		return True
+class HuberNoise(AdditiveHomoscedasticNoiseModel):
+    def __init__(self, sigma=0.1):
+        """
+        :param sigma: standard deviation
+        """
+        super().__init__()
+        self.sigma = sigma
+
+    def sample_noise(self, xs):
+        return (
+            self.sigma
+            * (
+                np.random.normal(scale=1.0, size=(xs.shape[0], 1))
+                + np.random.laplace(scale=self.sigma, size=(xs.shape[0], 1))
+            )
+            / 2.0
+        )
+
+    @property
+    def convex(self) -> bool:
+        return True
+
+    def __str__(self):
+        return "GaussianAdditive"
 
-	def __str__(self):
-		return "GaussianAdditive"
 
 class AdditiveBoundedNoise(GaussianNoise):
-	""" Sub-Gaussian bounded norm, with a Gaussian Likelihood"""
-	def __init__(self, lower, upper):
-		super().__init__(upper-lower)
-		self.lower = lower
-		self.upper = upper
+    """Sub-Gaussian bounded norm, with a Gaussian Likelihood"""
+
+    def __init__(self, lower, upper):
+        super().__init__(upper - lower)
+        self.lower = lower
+        self.upper = upper
 
-	def sample_noise(self, xs):
-		raw = np.random.random_sample(size=(xs.shape[0], 1))
-		rescaled = self.lower + raw * self.sigma
-		print(rescaled)
-		return rescaled  # sigma is the length of the interval
+    def sample_noise(self, xs):
+        raw = np.random.random_sample(size=(xs.shape[0], 1))
+        rescaled = self.lower + raw * self.sigma
+        print(rescaled)
+        return rescaled  # sigma is the length of the interval
 
-	def __str__(self):
-		return "BoundedNoiseAdditive"
+    def __str__(self):
+        return "BoundedNoiseAdditive"
 
 
 class MisspecifiedAdditiveGaussianNoise(GaussianNoise):
-	def __init__(self, sigma=1.0, actual_sigma=0.1):
-		"""
-		:param sigma: standard deviation
-		"""
-		super().__init__(sigma=sigma)
-		self.actual_sigma = actual_sigma
+    def __init__(self, sigma=1.0, actual_sigma=0.1):
+        """
+        :param sigma: standard deviation
+        """
+        super().__init__(sigma=sigma)
+        self.actual_sigma = actual_sigma
 
-	def sample_noise(self, xs):
-		return self.actual_sigma*np.random.normal(scale=1.0, size=(xs.shape[0], 1))
+    def sample_noise(self, xs):
+        return self.actual_sigma * np.random.normal(scale=1.0, size=(xs.shape[0], 1))
 
-	def __str__(self):
-		return "MisspecifiedGaussianAdditive"
+    def __str__(self):
+        return "MisspecifiedGaussianAdditive"
 
 
 class LaplaceNoise(GaussianNoise):
-	def __init__(self, b):
-		"""
-		:param sigma: this is sometimes also denoted as b
-		"""
-		super().__init__()
-		self.b = b
+    def __init__(self, b):
+        """
+        :param b: scale parameter of the Laplace distribution (sometimes also denoted sigma)
+        """
+        super().__init__()
+        self.b = b
 
-	def noise_log_likelihood(self, etas):
-		return -np.log(2*self.b) - np.abs(etas)/self.b
+    def noise_log_likelihood(self, etas):
+        return -np.log(2 * self.b) - np.abs(etas) / self.b
 
-	def cvxpy_noise_log_likelihood(self, etas):
-		return -np.log(2*self.b) - cp.abs(etas)/self.b
+    def cvxpy_noise_log_likelihood(self, etas):
+        return -np.log(2 * self.b) - cp.abs(etas) / self.b
 
-	def sample_noise(self, xs):
-		return np.random.laplace(loc = 0, scale=self.b, size=(xs.shape[0], 1))
+    def sample_noise(self, xs):
+        return np.random.laplace(loc=0, scale=self.b, size=(xs.shape[0], 1))
 
-	def __str__(self):
-		return "Laplace"
+    def __str__(self):
+        return "Laplace"
 
-	@property
-	def convex(self) -> bool:
-		return True
+    @property
+    def convex(self) -> bool:
+        return True
 
 
 class AdditiveGumbelNoise(AdditiveHomoscedasticNoiseModel):
-	def __init__(self, beta, mu):
-		super().__init__()
-		self.beta = beta
-		self.mu = mu
+    def __init__(self, beta, mu):
+        super().__init__()
+        self.beta = beta
+        self.mu = mu
 
-	def sample_noise(self, xs):
-		return np.random.gumbel(loc=self.mu, scale=self.beta, size=(xs.shape[0],))
+    def sample_noise(self, xs):
+        return np.random.gumbel(loc=self.mu, scale=self.beta, size=(xs.shape[0],))
 
-	def noise_log_likelihood(self, etas):
-		return -np.log(self.beta) - 1/self.beta*(etas - self.mu) - np.exp(-1/self.beta*(etas-self.mu))
+    def noise_log_likelihood(self, etas):
+        return (
+            -np.log(self.beta)
+            - 1 / self.beta * (etas - self.mu)
+            - np.exp(-1 / self.beta * (etas - self.mu))
+        )
 
-	def cvxpy_noise_log_likelihood(self, etas):
-		return -np.log(self.beta) - 1/self.beta*(etas - self.mu) - cp.exp(-1/self.beta*(etas-self.mu))
+    def cvxpy_noise_log_likelihood(self, etas):
+        return (
+            -np.log(self.beta)
+            - 1 / self.beta * (etas - self.mu)
+            - cp.exp(-1 / self.beta * (etas - self.mu))
+        )
 
-	def __str__(self):
-		return "Gumbel"
+    def __str__(self):
+        return "Gumbel"
+
+    @property
+    def convex(self) -> bool:
+        return True
 
-	@property
-	def convex(self) -> bool:
-		return True
 
 class AdditiveTwoSidedWeibullNoise(AdditiveHomoscedasticNoiseModel):
-	def __init__(self, scale, shape):
-		"""
-		:param scale: lambda
-		:param shape: k
-		"""
-		super().__init__()
-		self.scale = scale
-		self.shape = shape
-
-	def noise_log_likelihood(self, etas):
-		etas = np.abs(etas)
-		return np.log(0.5*self.shape/self.scale) + (self.shape - 1)*np.log(etas/self.scale) - np.power(etas/self.scale, self.shape)
-
-	def cvxpy_noise_log_likelihood(self, etas):
-		raise NotImplementedError("cvxpy makes no sense for non-convex sets")
-
-	def sample_noise(self, xs):
-		signs = np.sign(np.random.normal(size=xs.shape[0]))
-		weibull = np.random.weibull(self.shape, size=xs.shape[0])
-		return self.scale * signs * weibull
-
-	def __str__(self):
-		return "TwoSidedWeibull"
-
-	@property
-	def convex(self) -> bool:
-		return False
+    def __init__(self, scale, shape):
+        """
+        :param scale: lambda
+        :param shape: k
+        """
+        super().__init__()
+        self.scale = scale
+        self.shape = shape
+
+    def noise_log_likelihood(self, etas):
+        etas = np.abs(etas)
+        return (
+            np.log(0.5 * self.shape / self.scale)
+            + (self.shape - 1) * np.log(etas / self.scale)
+            - np.power(etas / self.scale, self.shape)
+        )
+
+    def cvxpy_noise_log_likelihood(self, etas):
+        raise NotImplementedError("cvxpy makes no sense for non-convex sets")
+
+    def sample_noise(self, xs):
+        signs = np.sign(np.random.normal(size=xs.shape[0]))
+        weibull = np.random.weibull(self.shape, size=xs.shape[0])
+        return self.scale * signs * weibull
+
+    def __str__(self):
+        return "TwoSidedWeibull"
+
+    @property
+    def convex(self) -> bool:
+        return False
+
 
 class BernoulliNoise(NoiseModel):
 
-	def __init__(self, prob):
-		"""
-		:param scale: lambda
-			Note lambda should work for both cvxpy and np parameter inputs and takes xs, theta
-		:param shape: p
-		"""
-		super().__init__()
-		self.prob = prob # lambda , $lambda^(1/a) to connect to sampling below
+    def __init__(self, prob):
+        """
+        :param prob: callable mapping xs to Bernoulli success probabilities;
+                it is invoked as self.prob(xs) and its result is passed to
+                torch.bernoulli, so it should return a torch tensor with values in [0, 1]
+        """
+        super().__init__()
+        self.prob = prob  # lambda , $lambda^(1/a) to connect to sampling below
 
-	def mean(self, xs):
-		return self.prob(xs)
+    def mean(self, xs):
+        return self.prob(xs)
 
-	def sample_noise(self, xs):
-		bernouli = torch.bernoulli(self.prob(xs).view(-1))
-		return bernouli.view(-1,1)
+    def sample_noise(self, xs):
+        bernouli = torch.bernoulli(self.prob(xs).view(-1))
+        return bernouli.view(-1, 1)
 
-	def convex(self):
-		pass
+    def convex(self):
+        pass
 
-	def sample(self, xs, theta):
-		pass
+    def sample(self, xs, theta):
+        pass
 
-	def log_likelihood(self, ys, xs, theta: Union[np.array, cp.Variable]) -> Union[np.array, cp.Expression]:
-		pass
+    def log_likelihood(
+        self, ys, xs, theta: Union[np.array, cp.Variable]
+    ) -> Union[np.array, cp.Expression]:
+        pass
 
 
 class LogWeibullNoise(NoiseModel):
-	def __init__(self, lam, p = 2, lam_form = lambda x, y: np.exp(x@y)):
-		"""
-		:param scale: lambda
-			Note lambda should work for both cvxpy and np parameter inputs and takes xs, theta
-		:param shape: p
-		"""
-		super().__init__()
-		self.lam = lam # lambda , $lambda^(1/a) to connect to sampling below 
-		self.p = p #  
-		self.lam_form = lam_form
-
-	def sample(self,xs,theta):
-		pass
-
-	def log_likelihood(self, ys, xs, theta):
-		assert(xs is not None)
-		if isinstance(theta, cp.Variable):
-			return self.cvxpy_log_likelihood(ys, xs, theta)
-		else:
-			return self.noise_log_likelihood(ys, xs, theta)
-	
-	def noise_log_likelihood(self,ys, xs, theta):
-		return np.log(self.lam_form(xs, theta).reshape(-1)) + self.p*ys.reshape(-1) - np.exp(ys).reshape(-1)**self.p*self.lam_form(xs, theta).reshape(-1)
-		# notice that lam(xs) = exp(\theta^\top xs) in common parametrization hence the loglikelihood becomes 
-		# xs @ theta + p*y - np.exp(y)**p*np.exp(xs@\theta) # which is strongly convex in theta
-
-	def sample_noise(self, xs):
-		weibull = (self.lam(xs)**(1/self.p)).reshape(-1)*np.random.weibull(self.p, size=xs.shape[0])
-		weibull = weibull.reshape(-1,1)
-		return np.log(weibull)
-
-	def mean(self, xs):
-		return (np.log(self.lam(xs)) - np.euler_gamma)/self.p
-
-	def cvxpy_log_likelihood(self, ys, xs, theta):
-		# This works only fi 
-		return xs @ theta + self.p*ys - cp.multiply((np.exp(ys)**self.p).reshape(-1),cp.exp(xs@theta))
-
-	def __str__(self):
-		return "logWeibull"
-
-	@property
-	def convex(self) -> bool:
-		return True
-
-class WeibullNoise(LogWeibullNoise):
-
-	def noise_log_likelihood(self,ys, xs, theta):
-		return np.log(self.lam_form(xs, theta).reshape(-1)) + np.log(self.p * (ys.reshape(-1)**(self.p-1))) - self.lam_form(xs, theta).reshape(-1)*(ys.reshape(-1)**self.p)
-		# notice that lam(xs) = exp(\theta^\top xs) in common parametrization hence the loglikelihood becomes
-		# xs @ theta + p*y - np.exp(y)**p*np.exp(xs@\theta) # which is strongly convex in theta
+    def __init__(self, lam, p=2, lam_form=lambda x, y: np.exp(x @ y)):
+        """
+        :param scale: lambda
+                Note lambda should work for both cvxpy and np parameter inputs and takes xs, theta
+        :param shape: p
+        """
+        super().__init__()
+        self.lam = lam  # lambda , $lambda^(1/a) to connect to sampling below
+        self.p = p  #
+        self.lam_form = lam_form
+
+    def sample(self, xs, theta):
+        pass
+
+    def log_likelihood(self, ys, xs, theta):
+        assert xs is not None
+        if isinstance(theta, cp.Variable):
+            return self.cvxpy_log_likelihood(ys, xs, theta)
+        else:
+            return self.noise_log_likelihood(ys, xs, theta)
+
+    def noise_log_likelihood(self, ys, xs, theta):
+        return (
+            np.log(self.lam_form(xs, theta).reshape(-1))
+            + self.p * ys.reshape(-1)
+            - np.exp(ys).reshape(-1) ** self.p * self.lam_form(xs, theta).reshape(-1)
+        )
+        # notice that lam(xs) = exp(\theta^\top xs) in common parametrization hence the loglikelihood becomes
+        # xs @ theta + p*y - np.exp(y)**p*np.exp(xs@\theta) # which is strongly convex in theta
+
+    def sample_noise(self, xs):
+        weibull = (self.lam(xs) ** (1 / self.p)).reshape(-1) * np.random.weibull(
+            self.p, size=xs.shape[0]
+        )
+        weibull = weibull.reshape(-1, 1)
+        return np.log(weibull)
+
+    def mean(self, xs):
+        return (np.log(self.lam(xs)) - np.euler_gamma) / self.p
+
+    def cvxpy_log_likelihood(self, ys, xs, theta):
+        # This works only if lam_form(xs, theta) = exp(xs @ theta), which is hard-coded below
+        return (
+            xs @ theta
+            + self.p * ys
+            - cp.multiply((np.exp(ys) ** self.p).reshape(-1), cp.exp(xs @ theta))
+        )
+
+    def __str__(self):
+        return "logWeibull"
+
+    @property
+    def convex(self) -> bool:
+        return True
 
-	def noise_likelihood(self,ys, xs, theta):
-		return self.lam_form(xs, theta).reshape(-1)*(self.p * (ys.reshape(-1)**(self.p-1)))*np.exp(- self.lam_form(xs, theta).reshape(-1)*(ys.reshape(-1)**self.p))
-		# notice that lam(xs) = exp(\theta^\top xs) in common parametrization hence the loglikelihood becomes
-		# xs @ theta + p*y - np.exp(y)**p*np.exp(xs@\theta) # which is strongly convex in theta
 
-	def sample_noise(self, xs):
-		convert_lambda = (1/self.lam(xs))**(1/self.p)
-		weibull = convert_lambda.view(-1)*np.random.weibull(self.p, size=xs.shape[0])
-		weibull = weibull.reshape(-1,1)
-		return weibull
-
-	def mode(self, xs):
-		convert_lambda = (1/self.lam(xs))**(1/self.p)
-		return convert_lambda*((((self.p-1)/self.p))**(1/self.p))
+class WeibullNoise(LogWeibullNoise):
 
-	def mean(self, xs):
-		convert_lambda = (1/self.lam(xs))**(1/self.p)
-		return  convert_lambda*scipy.special.gamma(1. + 1./self.p)
+    def noise_log_likelihood(self, ys, xs, theta):
+        return (
+            np.log(self.lam_form(xs, theta).reshape(-1))
+            + np.log(self.p * (ys.reshape(-1) ** (self.p - 1)))
+            - self.lam_form(xs, theta).reshape(-1) * (ys.reshape(-1) ** self.p)
+        )
+        # notice that lam(xs) = exp(\theta^\top xs) in common parametrization hence the loglikelihood becomes
+        # xs @ theta + p*y - np.exp(y)**p*np.exp(xs@\theta) # which is strongly convex in theta
+
+    def noise_likelihood(self, ys, xs, theta):
+        return (
+            self.lam_form(xs, theta).reshape(-1)
+            * (self.p * (ys.reshape(-1) ** (self.p - 1)))
+            * np.exp(-self.lam_form(xs, theta).reshape(-1) * (ys.reshape(-1) ** self.p))
+        )
+        # notice that lam(xs) = exp(\theta^\top xs) in common parametrization hence the loglikelihood becomes
+        # xs @ theta + p*y - np.exp(y)**p*np.exp(xs@\theta) # which is strongly convex in theta
+
+    def sample_noise(self, xs):
+        convert_lambda = (1 / self.lam(xs)) ** (1 / self.p)
+        weibull = convert_lambda.view(-1) * np.random.weibull(self.p, size=xs.shape[0])
+        weibull = weibull.reshape(-1, 1)
+        return weibull
+
+    def mode(self, xs):
+        convert_lambda = (1 / self.lam(xs)) ** (1 / self.p)
+        return convert_lambda * ((((self.p - 1) / self.p)) ** (1 / self.p))
+
+    def mean(self, xs):
+        convert_lambda = (1 / self.lam(xs)) ** (1 / self.p)
+        return convert_lambda * scipy.special.gamma(1.0 + 1.0 / self.p)
 
 
 if __name__ == "__main__":
-	import matplotlib.pyplot as plt
-
-	d = 2
-	p = 2
-	lam = lambda x: torch.exp(torch.sum(x, dim = 1))
-	lam_form = lambda x,theta: torch.exp(x@theta)
-
-	W = WeibullNoise(lam, p = p, lam_form=lam_form)
-
-	tstar = torch.ones(size = (2,1)).double()
-	x = torch.ones(size = (1,2)).double()
-	print(lam(x), lam_form(x,tstar))
-	pdf = lambda y: W.noise_likelihood(y,x,tstar)#torch.exp(W.noise_log_likelihood(y,x,tstar))
-
-	y = torch.linspace(0,5,1000).double()
-	#plt.plot(y, pdf(y))
-	samples = []
-	mean = float(np.log(lam(x)))
-	for _ in range(10000):
-		samples.append(-np.log(float(W.sample_noise(x).view(-1)))*p - np.euler_gamma - mean)
-
-	print (np.mean(samples))
-	print( (np.pi**2/6))
-	print (np.var(samples))
-	#plt.plot(np.exp(W.mode(x)),pdf(W.mode(x)),'ko')
-
-	plt.hist(samples, density=True)
-	plt.show()
-
-
+    import matplotlib.pyplot as plt
+
+    d = 2
+    p = 2
+    lam = lambda x: torch.exp(torch.sum(x, dim=1))
+    lam_form = lambda x, theta: torch.exp(x @ theta)
+
+    W = WeibullNoise(lam, p=p, lam_form=lam_form)
+
+    tstar = torch.ones(size=(2, 1)).double()
+    x = torch.ones(size=(1, 2)).double()
+    print(lam(x), lam_form(x, tstar))
+    pdf = lambda y: W.noise_likelihood(
+        y, x, tstar
+    )  # torch.exp(W.noise_log_likelihood(y,x,tstar))
+
+    y = torch.linspace(0, 5, 1000).double()
+    # plt.plot(y, pdf(y))
+    samples = []
+    mean = float(np.log(lam(x)))
+    for _ in range(10000):
+        samples.append(
+            -np.log(float(W.sample_noise(x).view(-1))) * p - np.euler_gamma - mean
+        )
+
+    print(np.mean(samples))
+    print((np.pi**2 / 6))
+    print(np.var(samples))
+    # plt.plot(np.exp(W.mode(x)),pdf(W.mode(x)),'ko')
+
+    plt.hist(samples, density=True)
+    plt.show()
diff --git a/stpy/probability/poisson_likelihood.py b/stpy/probability/poisson_likelihood.py
index d588fb7..af110a0 100644
--- a/stpy/probability/poisson_likelihood.py
+++ b/stpy/probability/poisson_likelihood.py
@@ -6,6 +6,7 @@
 from stpy.probability.gaussian_likelihood import GaussianLikelihood
 import scipy
 
+
 class PoissonLikelihoodCanonical(GaussianLikelihood):
 
     def __init__(self):
@@ -13,66 +14,80 @@ def __init__(self):
 
     def evaluate_datapoint(self, theta, d, mask):
         if mask is None:
-            mask = 1.
+            mask = 1.0  # no mask supplied: multiplying by 1.0 returns r unweighted
         x, y = d
-        r = -y*x@theta + torch.exp(x@theta)
+        r = -y * x @ theta + torch.exp(x @ theta)  # groups as ((-y) * x) @ theta + exp(x @ theta)
         r = r * mask
         return r
 
     def link(self, s):
         return torch.exp(s)
 
-    def scale(self, err = None, bound = None):
+    def scale(self, err=None, bound=None):  # err is never read; bound must be passed, np.exp(None) raises
         return np.exp(bound)
 
-    def get_objective_cvxpy(self, mask = None):
+    def get_objective_cvxpy(self, mask=None):  # returns a closure mapping theta to a cvxpy expression
         if mask is None:
+
             def likelihood(theta):
-                return -self.y.T@(self.x @ theta) + cp.sum(cp.exp(self.x@theta))
+                return -self.y.T @ (self.x @ theta) + cp.sum(cp.exp(self.x @ theta))  # all rows, unweighted
+
         else:
+
             def likelihood(theta):
-                if torch.sum(mask.double())>1e-8:
-                    return -(mask*self.y).T@(self.x @ theta) + mask.T @ cp.exp(self.x@theta)
+                if torch.sum(mask.double()) > 1e-8:  # mask entries sum to more than 1e-8
+                    return -(mask * self.y).T @ (self.x @ theta) + mask.T @ cp.exp(  # mask weights both terms
+                        self.x @ theta
+                    )
                 else:
-                    return cp.sum(theta*0)
+                    return cp.sum(theta * 0)  # zero-valued expression that still references theta
+
         return likelihood
 
-    def get_confidence_set_cvxpy(self,
-                                 theta: cp.Variable,
-                                 type: Union[str, None] = None,
-                                 params: Dict = {},
-                                 delta: float = 0.1):
+    def get_confidence_set_cvxpy(
+        self,
+        theta: cp.Variable,
+        type: Union[str, None] = None,
+        params: Dict = {},
+        delta: float = 0.1,
+    ):
         if self.fitted == True:
             return self.set_fn(theta)
 
-        theta_fit = params['estimate']
-        H = params['regularizer_hessian']
+        theta_fit = params["estimate"]
+        H = params["regularizer_hessian"]
         lam = torch.max(torch.linalg.eigvalsh(H))
-        B = params['bound']
-        d_eff = params['d_eff']
-        bound = params['bound']
+        B = params["bound"]
+        d_eff = params["d_eff"]
+        bound = params["bound"]
         if type == "LR":
             beta = self.confidence_parameter(delta, params, type=type)
             set = self.lr_confidence_set_cvxpy(theta, beta, params)
 
-        elif type in ['mutny']:
+        elif type in ["mutny"]:
             vars = np.exp(bound)
-            V = self.x.T @torch.diag(vars)@ self.x + H
+            V = self.x.T @ torch.diag(vars) @ self.x + H
             L = torch.from_numpy(scipy.linalg.sqrtm(V.numpy()))
-            beta = 2.*np.log(1./delta)
-            self.set_fn = lambda theta: [cp.sum_squares(L @ (theta - theta_fit)) <= beta]
+            beta = 2.0 * np.log(1.0 / delta)
+            self.set_fn = lambda theta: [
+                cp.sum_squares(L @ (theta - theta_fit)) <= beta
+            ]
             set = self.set_fn(theta)
 
-        elif type in ['laplace']:
-            vars = torch.exp(self.x@ theta_fit).view(-1)
-            V = self.x.T @torch.diag(vars) @ self.x  + H
+        elif type in ["laplace"]:
+            vars = torch.exp(self.x @ theta_fit).view(-1)
+            V = self.x.T @ torch.diag(vars) @ self.x + H
             L = torch.from_numpy(scipy.linalg.sqrtm(V.numpy()))
-            beta = 2.*np.log(1./delta)
-            self.set_fn = lambda theta: [cp.sum_squares(L @ (theta - theta_fit)) <= beta]
+            beta = 2.0 * np.log(1.0 / delta)
+            self.set_fn = lambda theta: [
+                cp.sum_squares(L @ (theta - theta_fit)) <= beta
+            ]
             set = self.set_fn(theta)
 
         else:
-            raise NotImplementedError("The desired confidence set type is not supported.")
+            raise NotImplementedError(
+                "The desired confidence set type is not supported."
+            )
 
         self.set = set
         self.fitted = True
@@ -80,35 +95,43 @@ def get_confidence_set_cvxpy(self,
         return set
 
     def information_matrix(self):
-        V = self.x.T@self.x/self.sigma
+        V = self.x.T @ self.x / self.sigma  # evaluated left to right: (x.T @ x) / sigma
         return V
 
-    def confidence_parameter(self, delta, params, type = None):
-        H = params['regularizer_hessian']
+    def confidence_parameter(self, delta, params, type=None):  # returns beta for the requested set type
+        H = params["regularizer_hessian"]
         lam = torch.max(torch.linalg.eigvalsh(H))
-        B = params['bound']
-        d_eff = params['d_eff']
+        B = params["bound"]  # only read by the "adaptive-AB" branch
+        d_eff = params["d_eff"]  # only read by the "Faubry" branch
 
         if type is None or type == "none" or type == "laplace":
             # this is a common heuristic
-            beta =  2.0
+            beta = 2.0
 
         elif type == "adaptive-AB":
-            sigma = 1./4.
-            V = self.x.T @ self.x / sigma ** 2 + H
-            beta = 2 * np.log(1. / delta) + (torch.logdet(V + H) - torch.logdet(H)) + lam * B
+            sigma = 1.0 / 4.0
+            V = self.x.T @ self.x / sigma**2 + H
+            beta = (
+                2 * np.log(1.0 / delta)
+                + (torch.logdet(V + H) - torch.logdet(H))  # NOTE(review): V already includes H, so H enters twice; confirm
+                + lam * B
+            )
 
         elif type == "LR":
             # this is based on sequential LR test
             beta = self.confidence_parameter_likelihood_ratio(delta, params)
 
         elif type == "Faubry":
-            H = params['regularizer_hessian']
-            lam = H[0., 0]
-            theta_fit = params['estimate']
-            D = torch.diag(1./(self.x @ theta_fit).view(-1))
+            H = params["regularizer_hessian"]
+            lam = H[0, 0]  # integer index (a float index raises); replaces the max-eigenvalue lam from above
+            theta_fit = params["estimate"]
+            D = torch.diag(1.0 / (self.x @ theta_fit).view(-1))  # diagonal of reciprocals of x @ theta_fit
             V = self.x.T @ D @ self.x + H
-            beta = np.sqrt(lam)/2. + 2./np.sqrt(lam)*(torch.logdet(V) - torch.logdet(H)) + 2/np.sqrt(lam)* np.log(1/delta)*d_eff
+            beta = (
+                np.sqrt(lam) / 2.0
+                + 2.0 / np.sqrt(lam) * (torch.logdet(V) - torch.logdet(H))
+                + 2 / np.sqrt(lam) * np.log(1 / delta) * d_eff
+            )
         else:
             raise NotImplementedError("Not implemented")
         return beta
diff --git a/stpy/probability/robust_likelihood.py b/stpy/probability/robust_likelihood.py
index 0cab487..8cc2dd7 100644
--- a/stpy/probability/robust_likelihood.py
+++ b/stpy/probability/robust_likelihood.py
@@ -4,9 +4,10 @@
 from typing import Union, Dict, List
 from stpy.probability.likelihood import Likelihood
 
+
 class RobustGraphicalLikelihood(Likelihood):
 
-    def __init__(self, coin, supp,  sigma = 0.1):
+    def __init__(self, coin, supp, sigma=0.1):
         super().__init__()
         self.coin = coin
         self.supp = supp
@@ -20,48 +21,68 @@ def evaluate_datapoint(self, theta, d):
         return torch.log(1 + torch.exp())
 
     def add_data_point(self, d):
-        x,y = d
-        self.x = torch.vstack(self.x,x)
-        self.y = torch.vstack(self.y,y)
+        x, y = d  # d unpacks into the rows to append to self.x and self.y
+        self.x = torch.vstack((self.x, x))  # vstack takes ONE sequence of tensors, not separate arguments
+        self.y = torch.vstack((self.y, y))  # stored data changed, so fitted is reset just below
         self.fitted = False
 
     def load_data(self, D):
         self.x, self.y = D
         self.fitted = False
 
-    def get_objective_cvxpy(self, mask = None):
+    def get_objective_cvxpy(self, mask=None):  # returns a closure: theta -> sum of absolute scaled residuals
         if mask is None:
             if self.Sigma is None:
-                def likelihood(theta): return cp.sum(cp.abs(self.x@theta - self.y)/self.sigma)
+
+                def likelihood(theta):
+                    return cp.sum(cp.abs(self.x @ theta - self.y) / self.sigma)  # scalar scale sigma
 
             else:
-                def likelihood(theta): return cp.sum(cp.abs(torch.linalg.inv(self.Sigma)@(self.x@theta - self.y)))
+
+                def likelihood(theta):
+                    return cp.sum(  # residuals are multiplied by inv(Sigma) instead of divided by sigma
+                        cp.abs(torch.linalg.inv(self.Sigma) @ (self.x @ theta - self.y))
+                    )
+
         else:
             if self.Sigma is None:
+
                 def likelihood(theta):
-                    if torch.sum(mask.int())>0:
-                        return cp.sum(cp.abs(self.x[mask,:]@theta - self.y[mask,:])/self.sigma)
+                    if torch.sum(mask.int()) > 0:  # at least one row is selected
+                        return cp.sum(
+                            cp.abs(self.x[mask, :] @ theta - self.y[mask, :])  # mask indexes rows of x and y
+                            / self.sigma
+                        )
                     else:
-                        return cp.sum(theta*0)
+                        return cp.sum(theta * 0)  # zero-valued expression that still references theta
 
             else:
+
                 def likelihood(theta):
-                    if torch.sum(mask.int())>0:
-                        return cp.sum(cp.abs(torch.linalg.inv(self.Sigma)@(self.x[mask,:]@theta - self.y[mask,:])))
+                    if torch.sum(mask.int()) > 0:
+                        return cp.sum(
+                            cp.abs(
+                                torch.linalg.inv(self.Sigma)  # NOTE(review): Sigma is not row-masked here; confirm shapes agree
+                                @ (self.x[mask, :] @ theta - self.y[mask, :])
+                            )
+                        )
                     else:
-                        return cp.sum(theta*0)
+                        return cp.sum(theta * 0)
+
         return likelihood
 
-    def get_confidence_set_cvxpy(self,
-                                 theta: cp.Variable,
-                                 type: Union[str, None] = None,
-                                 params: Dict = {},
-                                 delta: float = 0.1):
+    def get_confidence_set_cvxpy(
+        self,
+        theta: cp.Variable,
+        type: Union[str, None] = None,
+        params: Dict = {},
+        delta: float = 0.1,
+    ):
         if self.fitted == True:
             return self.set_fn(theta)
 
-        theta_fit = params['estimate']
-        H = params['regularizer_hessian']
+        theta_fit = params["estimate"]
+        H = params["regularizer_hessian"]
 
         beta = self.confidence_parameter(delta, params, type=type)
 
@@ -78,13 +99,14 @@ def get_confidence_set_cvxpy(self,
             set = self.lr_confidence_set_cvxpy(theta, beta, params)
 
         else:
-            raise NotImplementedError("The desired confidence set type is not supported.")
+            raise NotImplementedError(
+                "The desired confidence set type is not supported."
+            )
 
         self.set = set
         self.fitted = True
 
         return set
 
-
     def get_objective_torch(self):
         raise NotImplementedError("Implement me please.")
diff --git a/stpy/probability/weibul_likelihood.py b/stpy/probability/weibul_likelihood.py
index 6d11179..77c509c 100644
--- a/stpy/probability/weibul_likelihood.py
+++ b/stpy/probability/weibul_likelihood.py
@@ -15,21 +15,19 @@ def __init__(self, p):
     def information_matrix(self, theta_fit):
         pass
 
-
     def normalization(self, d):
         pass
 
-
-    def evaluate_datapoint(self, theta, d, mask = None):
+    def evaluate_datapoint(self, theta, d, mask=None):  # loss for one datapoint d = (x, y), multiplied by mask
         if mask is None:
-            mask = 1.
+            mask = 1.0  # no mask supplied: leave the loss unscaled
         x, y = d
         lam = torch.exp(x @ theta)
         l = -torch.log(lam) + (y ** (self.p)) * lam
         l = l * mask
         return l
 
-    def scale(self, err = None, bound = None):
+    def scale(self, err=None, bound=None):  # err is never read; bound must be passed, np.exp(None) raises
         return np.exp(bound)
 
     def add_data_point(self, d):
@@ -50,33 +48,44 @@ def get_objective_torch(self):
 
     def get_objective_cvxpy(self, mask=None):
         if mask is None:
+
             def likelihood(theta):
-                return -cp.sum(self.x@theta) + cp.sum(cp.diag(self.y**(self.p))@cp.exp(self.x @ theta))
+                return -cp.sum(self.x @ theta) + cp.sum(  # -sum(x @ theta) + sum(y**p weighted exp(x @ theta))
+                    cp.diag(self.y ** (self.p)) @ cp.exp(self.x @ theta)
+                )
+
         else:
+
             def likelihood(theta):
-                if torch.sum(mask.int())>0:
-                    return - cp.sum(self.x[mask,:] @ theta) + cp.sum(cp.diag(self.y[mask,:]**(self.p))@cp.exp(self.x[mask,:] @ theta))
+                if torch.sum(mask.int()) > 0:  # at least one row is selected
+                    return -cp.sum(self.x[mask, :] @ theta) + cp.sum(  # same expression on the masked rows
+                        cp.diag(self.y[mask, :] ** (self.p))
+                        @ cp.exp(self.x[mask, :] @ theta)
+                    )
                 else:
                     return cp.sum(theta * 0)
+
         return likelihood
 
-    def get_confidence_set_cvxpy(self,
-                                 theta: cp.Variable,
-                                 type: Union[str, None] = None,
-                                 params: Dict = {},
-                                 delta: float = 0.1):
+    def get_confidence_set_cvxpy(
+        self,
+        theta: cp.Variable,
+        type: Union[str, None] = None,
+        params: Dict = {},
+        delta: float = 0.1,
+    ):
         if self.fitted == True:
             return self.set_fn(theta)
 
-        theta_fit = params['estimate']
-        H = params['regularizer_hessian']
+        theta_fit = params["estimate"]
+        H = params["regularizer_hessian"]
 
         beta = self.confidence_parameter(delta, params, type=type)
 
         if type in ["laplace"]:
             V = self.information_matrix(theta_fit)
             if H is not None:
-                 V += H
+                V += H
             self.set_fn = lambda theta: [cp.quad_form(theta - theta_fit, V) <= beta]
             set = self.set_fn(theta)
 
@@ -84,18 +93,22 @@ def get_confidence_set_cvxpy(self,
             set = self.lr_confidence_set_cvxpy(theta, beta, params)
 
         else:
-            raise NotImplementedError("The desired confidence set type is not supported.")
+            raise NotImplementedError(
+                "The desired confidence set type is not supported."
+            )
 
         self.set = set
         self.fitted = True
         return set
 
-    def confidence_parameter(self, delta, params, type = None):
+    def confidence_parameter(self, delta, params, type=None):  # returns beta for type "LR" or "laplace"
         if type == "LR":
             # this is based on sequential LR test
             beta = self.confidence_parameter_likelihood_ratio(delta, params)
         elif type == "laplace":
-            beta = 2.
+            beta = 2.0  # fixed constant; delta and params are not read in this branch
         else:
-            raise NotImplementedError("The desired confidence set type is not supported.")
-        return beta
\ No newline at end of file
+            raise NotImplementedError(  # any other type, including the default None, is rejected
+                "The desired confidence set type is not supported."
+            )
+        return beta
diff --git a/stpy/random_process.py b/stpy/random_process.py
index 85aa85a..9839a28 100755
--- a/stpy/random_process.py
+++ b/stpy/random_process.py
@@ -3,335 +3,546 @@
 import matplotlib.pyplot as plt
 import matplotlib
 
+
 class RandomProcess:
 
-	def visualize_function(self,xtest,f_trues, filename = None, colors = None):
-		from mpl_toolkits.mplot3d import axes3d, Axes3D
-		d = xtest.size()[1]
-		if d == 1:
-			if isinstance(f_trues, list):
-				for f_true in f_trues:
-					plt.plot(xtest,f_true(xtest))
-			else:
-				plt.plot(xtest, f_trues(xtest))
-		elif d == 2:
-			from scipy.interpolate import griddata
-			plt.figure(figsize=(15, 7))
-			plt.clf()
-			ax = plt.axes(projection='3d')
-			xx = xtest[:, 0].numpy()
-			yy = xtest[:, 1].numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			if isinstance(f_trues, list):
-				for index, f_true in enumerate(f_trues):
-					grid_z = griddata((xx, yy), f_true(xtest)[:, 0].numpy(), (grid_x, grid_y), method='linear')
-					if colors is not None:
-						color = colors[index]
-					ax.plot_surface(grid_x, grid_y, grid_z, alpha=0.4, color = color)
-			else:
-				grid_z = griddata((xx, yy), f_trues(xtest)[:, 0].numpy(), (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z, alpha=0.4)
-
-			if filename is not None:
-				plt.xticks(fontsize=20, rotation=0)
-				plt.yticks(fontsize=20, rotation=0)
-				plt.savefig(filename, dpi = 300)
-
-
-
-
-	def visualize_function_contour(self, xtest, f_true, filename = None, levels = 10, figsize = (15, 7)):
-		from mpl_toolkits.mplot3d import axes3d, Axes3D
-		d = xtest.size()[1]
-		if d ==1:
-			pass
-		elif d == 2:
-			from scipy.interpolate import griddata
-			xx = xtest[:, 0].numpy()
-			yy = xtest[:, 1].numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			f = f_true(xtest)
-			grid_z_f = griddata((xx, yy), f[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-
-			fig, ax = plt.subplots(figsize=figsize)
-			cs = ax.contourf(grid_x, grid_y, grid_z_f,levels= levels)
-			ax.contour(cs, colors='k')
-			cbar = fig.colorbar(cs)
-			#if self.x is not None:
-			#	ax.scatter(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), c='r', s=100, marker="o")
-			ax.grid(c='k', ls='-', alpha=0.1)
-
-			if filename is not None:
-				plt.xticks(fontsize=24, rotation=0)
-				plt.yticks(fontsize=24, rotation=0)
-				plt.savefig(filename, dpi = 300)
-			#plt.show()
-
-	def visualize(self,xtest,f_true = None, points = True, show = True, size = 2,
-				  norm = 1, fig = True, sqrtbeta = 2, constrained = None, d = None, matheron_kernel=None):
-		from mpl_toolkits.mplot3d import axes3d, Axes3D
-
-		[mu, std] = self.mean_std(xtest)
-
-		if d is None:
-			d = self.d
-
-		if d == 1:
-			if fig == True:
-				plt.figure(figsize=(15, 7))
-				plt.clf()
-			if self.x is not None:
-				plt.plot(self.x.detach().numpy(), self.y.detach().numpy(), 'r+', ms=10, marker="o")
-			if size > 0:
-
-				if matheron_kernel is not None:
-					z = self.sample_matheron(xtest,matheron_kernel, size=size).numpy().T
-				else:
-					z  = self.sample(xtest, size=size).numpy().T
-
-				for z_arr,label in zip(z,['sample']+[None for _ in range(size-1)]):
-					plt.plot(xtest.view(-1).numpy(),z_arr, 'k--', lw = 2, label = label)
-
-			plt.fill_between(xtest.numpy().flat, (mu - sqrtbeta * std).numpy().flat, (mu + sqrtbeta * std).numpy().flat,color="#dddddd")
-			if f_true is not None:
-				plt.plot(xtest.numpy(),f_true(xtest).numpy(),'b-',lw = 2, label = "truth")
-			plt.plot(xtest.numpy(), mu.numpy(), 'r-', lw=2, label="posterior mean")
-			#plt.title('Posterior mean prediction plus 2 st.deviation')
-			plt.legend()
-			if show == True:
-				plt.show()
-
-		elif d == 2:
-			from scipy.interpolate import griddata
-			plt.figure(figsize=(15,7))
-			plt.clf()
-			ax = plt.axes(projection='3d')
-			xx = xtest[:, 0].numpy()
-			yy = xtest[:, 1].numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-			if f_true is not None:
-				grid_z = griddata((xx, yy), f_true(xtest)[:,0].numpy(), (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z, color='b', alpha=0.4, label = "truth")
-			if points == True and self.fit == True:
-				ax.scatter(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), self.y[:,0].detach().numpy(), c='r', s=100, marker="o", depthshade=False)
-			if self.beta is not None:
-				beta = self.beta(norm = norm)
-				grid_z2 = griddata((xx, yy), (mu.detach()+beta*std.detach())[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z2, color='gray', alpha=0.2)
-				grid_z3 = griddata((xx, yy), (mu.detach()-beta*std.detach())[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z3, color='gray', alpha=0.2)
-
-			ax.plot_surface(grid_x, grid_y, grid_z_mu, color='r', alpha=0.4)
-			#plt.title('Posterior mean prediction plus 2 st.deviation')
-			plt.show()
-
-		else:
-			print("Visualization not implemented")
-
-	def visualize_subopt(self,xtest,f_true = None, points = True, show = True, size = 2, norm = 1, fig = True, beta = 2):
-		from mpl_toolkits.mplot3d import axes3d, Axes3D
-		[mu, std] = self.mean_std(xtest)
-
-		print ("Visualizing in: ", self.d, "dimensions...")
-
-		if self.d == 1:
-			if fig == True:
-				plt.figure(figsize=(15, 7))
-				plt.clf()
-			if self.x is not None:
-				plt.plot(self.x.detach().numpy(), self.y.detach().numpy(), 'r+', ms=10, marker="o")
-			plt.plot(xtest.numpy(), self.sample(xtest, size=size).numpy(), 'k--', lw=2, label="sample")
-			plt.fill_between(xtest.numpy().flat, (mu - 2 * std).numpy().flat, (mu + 2 * std).numpy().flat,color="#dddddd")
-			if f_true is not None:
-				plt.plot(xtest.numpy(),f_true(xtest).numpy(),'b-',lw = 2, label = "truth")
-			plt.plot(xtest.numpy(), mu.numpy(), 'r-', lw=2, label="posterior mean")
-
-			min = torch.max(mu - beta*std)
-			mask = (mu + beta*std < min)
-			v = torch.min(mu - beta * std).numpy()-1
-			plt.plot(xtest.numpy()[mask], 0*xtest.numpy()[mask]+v,'ko', lw = 6,label = "Discarted Region")
-
-
-
-			plt.title('Posterior mean prediction plus 2 st.deviation')
-			plt.legend()
-
-			if show == True:
-				plt.show()
-
-	def visualize_slice(self,xtest,slice, show = True, eps = None, size = 1, beta = 2):
-		append = torch.ones(size = (xtest.size()[0],1), dtype=torch.float64)*slice
-		xtest2 = torch.cat((xtest,append), dim = 1)
-
-		[mu, std] = self.mean_std(xtest2)
-
-		plt.figure(figsize=(15, 7))
-		plt.clf()
-		plt.plot(xtest.numpy(), self.sample(xtest, size=size).numpy(), 'k--', lw=2, label="sample")
-		print(std.size(), mu.size())
-		if self.x is not None:
-			plt.plot(self.x[:,0].detach().numpy(), self.y.detach().numpy(), 'r+', ms=10, marker="o")
-		plt.fill_between(xtest.numpy().flat, (mu - 2 * std).numpy().flat, (mu + 2 * std).numpy().flat, color="#dddddd")
-		plt.fill_between(xtest.numpy().flat, (mu + 2 * std).numpy().flat, (mu + 2 * std + 2*self.s).numpy().flat, color="#bbdefb")
-		plt.fill_between(xtest.numpy().flat, (mu - 2 * std - 2*self.s).numpy().flat, (mu - 2 * std).numpy().flat, color="#bbdefb")
-
-		if eps is not None:
-			mask = (beta*std < eps)
-			v = torch.min(mu - beta * std - 2*self.s).numpy()
-			plt.plot(xtest.numpy()[mask], 0*xtest.numpy()[mask]+v,'k', lw = 6,label = "$\\mathcal{D}_E$ - $\\epsilon$ accurate domain in a subspace")
-
-		plt.plot(xtest.numpy(), mu.numpy(), 'r-', lw=2, label="posterior mean")
-		plt.title('Posterior mean prediction plus 2 st.deviation')
-		plt.legend()
-		if show == True:
-			plt.show()
-
-
-
-	def visualize_contour_with_gap(self,xtest,f_true = None, gap = None, show = False):
-		[mu, _] = self.mean_std(xtest)
-
-		if self.d == 2:
-			from scipy.interpolate import griddata
-			xx = xtest[:, 0].detach().numpy()
-			yy = xtest[:, 1].detach().numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-
-			fig, ax = plt.subplots(figsize=(15, 7))
-			cs = ax.contourf(grid_x, grid_y, grid_z_mu)
-			ax.contour(cs, colors='k')
-
-			ax.plot(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), 'ro', ms=10)
-			cbar = fig.colorbar(cs)
-
-			ax.grid(c='k', ls='-', alpha=0.1)
-
-			if f_true is not None:
-				f = f_true(xtest)
-				grid_z_f = griddata((xx, yy), f[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-				fig, ax = plt.subplots(figsize=(15, 7))
-				cs = ax.contourf(grid_x, grid_y, grid_z_f)
-				ax.contour(cs, colors='k')
-				cbar = fig.colorbar(cs)
-				ax.grid(c='k', ls='-', alpha=0.1)
-			if show == True:
-				plt.show()
-
-	def visualize_contour(self,xtest,f_true = None, show = True, points = True, ms = 5, levels = 20):
-		[mu, _] = self.mean_std(xtest)
-
-		if self.d == 2:
-			from scipy.interpolate import griddata
-			xx = xtest[:, 0].detach().numpy()
-			yy = xtest[:, 1].detach().numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-			fig, ax = plt.subplots(figsize=(15, 7))
-			cs = ax.contourf(grid_x, grid_y, grid_z_mu)
-			ax.contour(cs, colors='k')
-			if points == True:
-				ax.plot(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), 'wo', ms=ms, alpha = 0.5)
-			cbar = fig.colorbar(cs)
-			ax.grid(c='k', ls='-', alpha=0.1)
-
-			if f_true is not None:
-				f = f_true(xtest)
-				grid_z_f = griddata((xx, yy), f[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-				fig, ax = plt.subplots(figsize=(15, 7))
-				cs = ax.contourf(grid_x, grid_y, grid_z_f, levels = levels)
-				ax.contour(cs, colors='k')
-				cbar = fig.colorbar(cs)
-				ax.grid(c='k', ls='-', alpha=0.1)
-			if show == True:
-				plt.show()
-			return ax
-
-	def visualize_quiver(self,xtest, size = 2,norm = 1):
-		from mpl_toolkits.mplot3d import axes3d, Axes3D
-		[mu, std] = self.mean_std(xtest)
-		if self.d == 2:
-			from scipy.interpolate import griddata
-			plt.figure(figsize=(15,7))
-			plt.clf()
-			ax = plt.axes(projection='3d')
-			xx = xtest[:, 0].detach().numpy()
-			yy = xtest[:, 1].detach().numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z_mu = griddata((xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-			#
-
-			ax.scatter(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), self.y[:,0].detach().numpy(), c='r', s=100, marker="o", depthshade=False)
-
-			if self.beta is not None:
-				beta = self.beta(norm = norm)
-				grid_z2 = griddata((xx, yy), (mu.detach()+beta*std.detach())[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z2, color='gray', alpha=0.2)
-				grid_z3 = griddata((xx, yy), (mu.detach()-beta*std.detach())[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-				ax.plot_surface(grid_x, grid_y, grid_z3, color='gray', alpha=0.2)
-
-			ax.plot_surface(grid_x, grid_y, grid_z_mu, color='r', alpha=0.4)
-			plt.title('Posterior mean prediction plus 2 st.deviation')
-
-
-			derivatives = torch.zeros(xtest.size()[0],2)
-			for index,point in enumerate(xtest):
-				derivatives[index,:] = self.mean_gradient_hessian(point.view(-1,2))
-				print (derivatives[index,:] )
-
-			print (derivatives.size())
-
-
-			grid_der_x_mu = griddata((xx, yy), derivatives[:, 0].detach().numpy(), (grid_x, grid_y), method='linear')
-			grid_der_y_mu = griddata((xx, yy), derivatives[:, 1].detach().numpy(), (grid_x, grid_y), method='linear')
-
-			fig, ax = plt.subplots(figsize=(15, 7))
-			cs = ax.contourf(grid_x, grid_y, grid_z_mu)
-
-			ax.contour(cs, colors='k')
-
-			# Plot grid.
-			ax.grid(c='k', ls='-', alpha=0.1)
-			ax.quiver(grid_x, grid_y, grid_der_x_mu, grid_der_y_mu)
-
-			plt.show()
-
-		else:
-			print("Visualization not implemented")
+    def visualize_function(self, xtest, f_trues, filename=None, colors=None):
+        """Plot one callable, or a list of callables, evaluated on xtest.
+
+        One input column gives line plots; two columns give 3D surfaces that
+        are interpolated onto a 100x100 grid. colors optionally holds one
+        surface color per entry of an f_trues list. filename, if given, saves
+        the two-column figure at 300 dpi. Other input widths draw nothing.
+        """
+        from mpl_toolkits.mplot3d import axes3d, Axes3D
+
+        d = xtest.size()[1]
+        if d == 1:
+            if isinstance(f_trues, list):
+                for f_true in f_trues:
+                    plt.plot(xtest, f_true(xtest))
+            else:
+                plt.plot(xtest, f_trues(xtest))
+        elif d == 2:
+            from scipy.interpolate import griddata
+            plt.figure(figsize=(15, 7))
+            plt.clf()
+            ax = plt.axes(projection="3d")
+            xx = xtest[:, 0].numpy()
+            yy = xtest[:, 1].numpy()
+            grid_x, grid_y = np.mgrid[
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+
+            def to_grid(f):
+                z = f(xtest)[:, 0].numpy()  # first output column only
+                return griddata((xx, yy), z, (grid_x, grid_y), method="linear")
+
+            if isinstance(f_trues, list):
+                for index, f_true in enumerate(f_trues):
+                    # Always bind color: it was unbound when colors is None.
+                    color = None if colors is None else colors[index]
+                    grid_z = to_grid(f_true)
+                    ax.plot_surface(grid_x, grid_y, grid_z, alpha=0.4, color=color)
+            else:
+                ax.plot_surface(grid_x, grid_y, to_grid(f_trues), alpha=0.4)
+
+            if filename is not None:
+                plt.xticks(fontsize=20, rotation=0)
+                plt.yticks(fontsize=20, rotation=0)
+                plt.savefig(filename, dpi=300)
+
+    def visualize_function_contour(
+        self, xtest, f_true, filename=None, levels=10, figsize=(15, 7)
+    ):  # filled contour plot of f_true over a two-column xtest, optionally saved to filename
+        from mpl_toolkits.mplot3d import axes3d, Axes3D
+
+        d = xtest.size()[1]  # number of input columns
+        if d == 1:
+            pass  # one-column inputs are deliberately a no-op
+        elif d == 2:
+            from scipy.interpolate import griddata
+
+            xx = xtest[:, 0].numpy()
+            yy = xtest[:, 1].numpy()
+            grid_x, grid_y = np.mgrid[  # 100x100 regular grid spanning the range of xtest
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+            f = f_true(xtest)
+            grid_z_f = griddata(  # linear interpolation of the first output column onto the grid
+                (xx, yy), f[:, 0].detach().numpy(), (grid_x, grid_y), method="linear"
+            )
+
+            fig, ax = plt.subplots(figsize=figsize)
+            cs = ax.contourf(grid_x, grid_y, grid_z_f, levels=levels)
+            ax.contour(cs, colors="k")  # black contour lines at the filled-contour levels
+            cbar = fig.colorbar(cs)  # cbar itself is not used afterwards
+            # if self.x is not None:
+            #     ax.scatter(self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), c='r', s=100, marker="o")
+            ax.grid(c="k", ls="-", alpha=0.1)
+
+            if filename is not None:
+                plt.xticks(fontsize=24, rotation=0)
+                plt.yticks(fontsize=24, rotation=0)
+                plt.savefig(filename, dpi=300)
+            # plt.show()
+
+    def visualize(
+        self,
+        xtest,
+        f_true=None,
+        points=True,
+        show=True,
+        size=2,
+        norm=1,
+        fig=True,
+        sqrtbeta=2,
+        constrained=None,
+        d=None,
+        matheron_kernel=None,
+    ):
+        from mpl_toolkits.mplot3d import axes3d, Axes3D
+
+        [mu, std] = self.mean_std(xtest)
+
+        if d is None:
+            d = self.d
+
+        if d == 1:
+            if fig == True:
+                plt.figure(figsize=(15, 7))
+                plt.clf()
+            if self.x is not None:
+                plt.plot(
+                    self.x.detach().numpy(),
+                    self.y.detach().numpy(),
+                    "r+",
+                    ms=10,
+                    marker="o",
+                )
+            if size > 0:
+
+                if matheron_kernel is not None:
+                    z = (
+                        self.sample_matheron(xtest, matheron_kernel, size=size)
+                        .numpy()
+                        .T
+                    )
+                else:
+                    z = self.sample(xtest, size=size).numpy().T
+
+                for z_arr, label in zip(
+                    z, ["sample"] + [None for _ in range(size - 1)]
+                ):
+                    plt.plot(xtest.view(-1).numpy(), z_arr, "k--", lw=2, label=label)
+
+            plt.fill_between(
+                xtest.numpy().flat,
+                (mu - sqrtbeta * std).numpy().flat,
+                (mu + sqrtbeta * std).numpy().flat,
+                color="#dddddd",
+            )
+            if f_true is not None:
+                plt.plot(
+                    xtest.numpy(), f_true(xtest).numpy(), "b-", lw=2, label="truth"
+                )
+            plt.plot(xtest.numpy(), mu.numpy(), "r-", lw=2, label="posterior mean")
+            # plt.title('Posterior mean prediction plus 2 st.deviation')
+            plt.legend()
+            if show == True:
+                plt.show()
+
+        elif d == 2:
+            from scipy.interpolate import griddata
+
+            plt.figure(figsize=(15, 7))
+            plt.clf()
+            ax = plt.axes(projection="3d")
+            xx = xtest[:, 0].numpy()
+            yy = xtest[:, 1].numpy()
+            grid_x, grid_y = np.mgrid[
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+            grid_z_mu = griddata(
+                (xx, yy), mu[:, 0].detach().numpy(), (grid_x, grid_y), method="linear"
+            )
+            if f_true is not None:
+                grid_z = griddata(
+                    (xx, yy),
+                    f_true(xtest)[:, 0].numpy(),
+                    (grid_x, grid_y),
+                    method="linear",
+                )
+                ax.plot_surface(
+                    grid_x, grid_y, grid_z, color="b", alpha=0.4, label="truth"
+                )
+            if points == True and self.fit == True:
+                ax.scatter(
+                    self.x[:, 0].detach().numpy(),
+                    self.x[:, 1].detach().numpy(),
+                    self.y[:, 0].detach().numpy(),
+                    c="r",
+                    s=100,
+                    marker="o",
+                    depthshade=False,
+                )
+            if self.beta is not None:
+                beta = self.beta(norm=norm)
+                grid_z2 = griddata(
+                    (xx, yy),
+                    (mu.detach() + beta * std.detach())[:, 0].detach().numpy(),
+                    (grid_x, grid_y),
+                    method="linear",
+                )
+                ax.plot_surface(grid_x, grid_y, grid_z2, color="gray", alpha=0.2)
+                grid_z3 = griddata(
+                    (xx, yy),
+                    (mu.detach() - beta * std.detach())[:, 0].detach().numpy(),
+                    (grid_x, grid_y),
+                    method="linear",
+                )
+                ax.plot_surface(grid_x, grid_y, grid_z3, color="gray", alpha=0.2)
+
+            ax.plot_surface(grid_x, grid_y, grid_z_mu, color="r", alpha=0.4)
+            # plt.title('Posterior mean prediction plus 2 st.deviation')
+            plt.show()
+
+        else:
+            print("Visualization not implemented")
+
+    def visualize_subopt(
+        self,
+        xtest,
+        f_true=None,
+        points=True,
+        show=True,
+        size=2,
+        norm=1,
+        fig=True,
+        beta=2,
+    ):
+        """Plot the 1D posterior and mark the test points discarded as suboptimal.
+
+        A test point is marked when its upper bound ``mu + beta * std`` lies below
+        the largest lower bound ``mu - beta * std`` found on ``xtest``. Nothing
+        is drawn unless ``self.d == 1``.
+
+        Args:
+            xtest: test inputs handed to ``self.mean_std`` and ``self.sample``.
+            f_true: optional callable; ``f_true(xtest)`` is drawn as the truth.
+            points, norm: accepted but not read by this method.
+            show: call ``plt.show()`` once the figure is complete.
+            size: forwarded to ``self.sample`` as ``size``.
+            fig: open a fresh 15x7 figure before drawing.
+            beta: multiplier of ``std`` in the bounds that define the mask.
+        """
+        from mpl_toolkits.mplot3d import axes3d, Axes3D
+
+        [mu, std] = self.mean_std(xtest)
+        print("Visualizing in: ", self.d, "dimensions...")
+
+        if self.d == 1:
+            if fig == True:
+                plt.figure(figsize=(15, 7))
+                plt.clf()
+            if self.x is not None:
+                # "ro" draws what "r+" plus marker="o" drew (the keyword won),
+                # without matplotlib's "redundantly defined" marker warning.
+                x_obs, y_obs = self.x.detach().numpy(), self.y.detach().numpy()
+                plt.plot(x_obs, y_obs, "ro", ms=10)
+            draws = self.sample(xtest, size=size).numpy()
+            plt.plot(xtest.numpy(), draws, "k--", lw=2, label="sample")
+            # The shaded band is fixed at two standard deviations (not ``beta``).
+            lo, hi = (mu - 2 * std).numpy().flat, (mu + 2 * std).numpy().flat
+            plt.fill_between(xtest.numpy().flat, lo, hi, color="#dddddd")
+            if f_true is not None:
+                plt.plot(
+                    xtest.numpy(), f_true(xtest).numpy(), "b-", lw=2, label="truth"
+                )
+            plt.plot(xtest.numpy(), mu.numpy(), "r-", lw=2, label="posterior mean")
+
+            # Named best_lower rather than ``min`` so the builtin is not shadowed.
+            best_lower = torch.max(mu - beta * std)
+            mask = mu + beta * std < best_lower
+            v = torch.min(mu - beta * std).numpy() - 1
+            x_np = xtest.numpy()
+            plt.plot(
+                x_np[mask], 0 * x_np[mask] + v, "ko", lw=6, label="Discarted Region"
+            )
+            plt.title("Posterior mean prediction plus 2 st.deviation")
+            plt.legend()
+
+            if show == True:
+                plt.show()
+
+    def visualize_slice(self, xtest, slice, show=True, eps=None, size=1, beta=2):
+        """Plot the posterior along a 1D slice obtained by fixing the last input.
+
+        Every row of ``xtest`` gets ``slice`` appended as an extra column; the
+        mean, standard deviation and samples are evaluated at these points.
+        Observations are drawn from the first column of ``self.x`` only.
+
+        Args:
+            xtest: free coordinates of the slice (used as the x-axis).
+            slice: value written into the appended column.
+            show: call ``plt.show()`` once the figure is complete.
+            eps: if given, mark the points where ``beta * std < eps``.
+            size: forwarded to ``self.sample`` as ``size``.
+            beta: multiplier of ``std`` used for the ``eps`` mask.
+        """
+        append = torch.ones(size=(xtest.size()[0], 1), dtype=torch.float64) * slice
+        xtest2 = torch.cat((xtest, append), dim=1)
+        [mu, std] = self.mean_std(xtest2)
+        x_np = xtest.numpy()
+
+        plt.figure(figsize=(15, 7))
+        plt.clf()
+        # Sample at the augmented points: the model is queried with xtest2
+        # everywhere else, and xtest alone lacks the appended coordinate.
+        draws = self.sample(xtest2, size=size).numpy()
+        plt.plot(x_np, draws, "k--", lw=2, label="sample")
+        print(std.size(), mu.size())
+
+        if self.x is not None:
+            # "ro" == "r+" with marker="o" (the keyword won), minus the warning.
+            x_obs, y_obs = self.x[:, 0].detach().numpy(), self.y.detach().numpy()
+            plt.plot(x_obs, y_obs, "ro", ms=10)
+
+        # Gray: mean +/- 2 std. Blue: a further 2 * self.s beyond either edge.
+        upper, lower = mu + 2 * std, mu - 2 * std
+        bands = [
+            (lower, upper, "#dddddd"),
+            (upper, upper + 2 * self.s, "#bbdefb"),
+            (lower - 2 * self.s, lower, "#bbdefb"),
+        ]
+        for lo, hi, c in bands:
+            plt.fill_between(x_np.flat, lo.numpy().flat, hi.numpy().flat, color=c)
+
+        if eps is not None:
+            mask = beta * std < eps
+            v = torch.min(mu - beta * std - 2 * self.s).numpy()
+            plt.plot(
+                x_np[mask],
+                0 * x_np[mask] + v,
+                "k",
+                lw=6,
+                label="$\\mathcal{D}_E$ - $\\epsilon$ accurate domain in a subspace",
+            )
+
+        plt.plot(x_np, mu.numpy(), "r-", lw=2, label="posterior mean")
+        plt.title("Posterior mean prediction plus 2 st.deviation")
+        plt.legend()
+        if show == True:
+            plt.show()
+
+    def visualize_contour_with_gap(self, xtest, f_true=None, gap=None, show=False):
+        """Draw a filled contour plot of the posterior mean of a 2D model.
+
+        The mean at ``xtest`` is linearly interpolated onto a 100x100 grid and
+        the stored inputs ``self.x`` are overlaid as red dots. Only the mean is
+        evaluated, and nothing is drawn, unless ``self.d == 2``.
+
+        Args:
+            xtest: test inputs; columns 0 and 1 are the plot axes.
+            f_true: optional callable; ``f_true(xtest)`` gets its own figure.
+            gap: accepted but not read by this method.
+            show: call ``plt.show()`` once the figures are complete.
+        """
+        mean, _ = self.mean_std(xtest)
+        if self.d != 2:
+            return
+
+        from scipy.interpolate import griddata
+
+        xs = xtest[:, 0].detach().numpy()
+        ys = xtest[:, 1].detach().numpy()
+        gx, gy = np.mgrid[min(xs) : max(xs) : 100j, min(ys) : max(ys) : 100j]
+
+        def contour_figure(values):
+            # Interpolate onto the grid, then filled contours plus black lines.
+            surface = griddata((xs, ys), values, (gx, gy), method="linear")
+            figure, axes = plt.subplots(figsize=(15, 7))
+            filled = axes.contourf(gx, gy, surface)
+            axes.contour(filled, colors="k")
+            return figure, axes, filled
+
+        figure, axes, filled = contour_figure(mean[:, 0].detach().numpy())
+        axes.plot(
+            self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy(), "ro", ms=10
+        )
+        figure.colorbar(filled)
+        axes.grid(c="k", ls="-", alpha=0.1)
+        if f_true is not None:
+            truth = f_true(xtest)
+            figure, axes, filled = contour_figure(truth[:, 0].detach().numpy())
+            figure.colorbar(filled)
+            axes.grid(c="k", ls="-", alpha=0.1)
+        if show == True:
+            plt.show()
+
+    def visualize_contour(
+        self, xtest, f_true=None, show=True, points=True, ms=5, levels=20
+    ):
+        """Draw a filled contour plot of the posterior mean and return the axes.
+
+        The mean at ``xtest`` is linearly interpolated onto a 100x100 grid. When
+        ``f_true`` is given its values get a second figure, and the returned
+        axes belong to that figure. Returns ``None`` unless ``self.d == 2``.
+
+        Args:
+            xtest: test inputs; columns 0 and 1 are the plot axes.
+            f_true: optional callable evaluated at ``xtest``.
+            show: call ``plt.show()`` before returning.
+            points: overlay ``self.x`` as white dots of marker size ``ms``.
+            levels: passed to ``contourf`` for the ``f_true`` figure only.
+        """
+        mean, _ = self.mean_std(xtest)
+        if self.d != 2:
+            return None
+
+        from scipy.interpolate import griddata
+        xs = xtest[:, 0].detach().numpy()
+        ys = xtest[:, 1].detach().numpy()
+        gx, gy = np.mgrid[min(xs) : max(xs) : 100j, min(ys) : max(ys) : 100j]
+        mean_values = mean[:, 0].detach().numpy()
+        surface = griddata((xs, ys), mean_values, (gx, gy), method="linear")
+        figure, axes = plt.subplots(figsize=(15, 7))
+        filled = axes.contourf(gx, gy, surface)
+        axes.contour(filled, colors="k")
+        if points == True:
+            x0, x1 = self.x[:, 0].detach().numpy(), self.x[:, 1].detach().numpy()
+            axes.plot(x0, x1, "wo", ms=ms, alpha=0.5)
+        figure.colorbar(filled)
+        axes.grid(c="k", ls="-", alpha=0.1)
+
+        if f_true is not None:
+            truth_values = f_true(xtest)[:, 0].detach().numpy()
+            surface = griddata((xs, ys), truth_values, (gx, gy), method="linear")
+            figure, axes = plt.subplots(figsize=(15, 7))
+            filled = axes.contourf(gx, gy, surface, levels=levels)
+            axes.contour(filled, colors="k")
+            figure.colorbar(filled)
+            axes.grid(c="k", ls="-", alpha=0.1)
+        if show == True:
+            plt.show()
+        return axes
+
+    def visualize_quiver(self, xtest, size=2, norm=1):
+        """Plot the 2D posterior as a 3D surface and as contours with a quiver field.
+
+        Figure 1 shows the interpolated mean surface, the observations and, when
+        ``self.beta`` is set, the surfaces ``mu +/- beta * std``. Figure 2 lays
+        arrows built from ``self.mean_gradient_hessian`` (evaluated at every
+        test point) over a contour plot of the mean. Always ends by calling
+        ``plt.show()``. For ``self.d != 2`` only a message is printed.
+
+        Args:
+            xtest: test inputs; columns 0 and 1 are the plot axes.
+            size: accepted but not read by this method.
+            norm: forwarded to ``self.beta(norm=norm)``.
+
+        Returns:
+            None.
+        """
+        from mpl_toolkits.mplot3d import axes3d, Axes3D
+
+        [mu, std] = self.mean_std(xtest)
+        if self.d != 2:
+            print("Visualization not implemented")
+            return
+
+        from scipy.interpolate import griddata
+
+        # Figure 1: 3D view of the posterior.
+        plt.figure(figsize=(15, 7))
+        plt.clf()
+        ax3d = plt.axes(projection="3d")
+        xs = xtest[:, 0].detach().numpy()
+        ys = xtest[:, 1].detach().numpy()
+        # 100x100 interpolation grid spanning the range of the test inputs.
+        gx, gy = np.mgrid[min(xs) : max(xs) : 100j, min(ys) : max(ys) : 100j]
+
+        def on_grid(col):
+            # Linearly interpolate one column of per-test-point values.
+            return griddata((xs, ys), col.detach().numpy(), (gx, gy), method="linear")
+
+        mean_grid = on_grid(mu[:, 0])
+        ax3d.scatter(
+            self.x[:, 0].detach().numpy(),
+            self.x[:, 1].detach().numpy(),
+            self.y[:, 0].detach().numpy(),
+            c="r",
+            s=100,
+            marker="o",
+            depthshade=False,
+        )
+
+        if self.beta is not None:
+            beta = self.beta(norm=norm)
+            # Upper bound first, then lower, both as translucent gray surfaces.
+            upper = (mu.detach() + beta * std.detach())[:, 0]
+            ax3d.plot_surface(gx, gy, on_grid(upper), color="gray", alpha=0.2)
+            lower = (mu.detach() - beta * std.detach())[:, 0]
+            ax3d.plot_surface(gx, gy, on_grid(lower), color="gray", alpha=0.2)
+
+        ax3d.plot_surface(gx, gy, mean_grid, color="r", alpha=0.4)
+        plt.title("Posterior mean prediction plus 2 st.deviation")
+
+        # NOTE(review): every result is written into a length-2 row, so the call
+        # presumably returns the gradient of the mean at that point -- confirm.
+        # One model call per test point; each filled row is printed, which is
+        # debug output that callers may still rely on seeing.
+        derivatives = torch.zeros(xtest.size()[0], 2)
+        for index, point in enumerate(xtest):
+            derivatives[index, :] = self.mean_gradient_hessian(point.view(-1, 2))
+            print(derivatives[index, :])
+
+        print(derivatives.size())
+
+        grad_x_grid = on_grid(derivatives[:, 0])
+        grad_y_grid = on_grid(derivatives[:, 1])
+
+        # Figure 2: mean contours with the interpolated arrows on top.
+        fig, ax = plt.subplots(figsize=(15, 7))
+        filled = ax.contourf(gx, gy, mean_grid)
+        ax.contour(filled, colors="k")
+        ax.grid(c="k", ls="-", alpha=0.1)
+        ax.quiver(gx, gy, grad_x_grid, grad_y_grid)
+
+        plt.show()
 
 
 if __name__ == "__main__":
-	from stpy.continuous_processes.gauss_procc import GaussianProcess
-	from stpy.continuous_processes.fourier_fea import GaussianProcessFF
-	from stpy.continuous_processes.kernelized_features import KernelizedFeatures
-	from stpy.kernels import KernelFunction
-	from stpy.embeddings.embedding import HermiteEmbedding, RFFEmbedding
-	import stpy
-	import torch
-	import matplotlib.pyplot as plt
-	import numpy as np
-
-	n = 1024
-	N = 256
-	gamma = 0.09
-	s = 0.1
-	# benchmark = stpy.test_functions.benchmarks.GaussianProcessSample(d =1, gamma = gamma, sigma = s, n = n)
-	benchmark = stpy.test_functions.benchmarks.Simple1DFunction(d=1, sigma=s)
-
-	x = benchmark.initial_guess(N, adv_inv=True)
-	y = benchmark.eval(x)
-	xtest = benchmark.interval(1024)
-
-	# GP = GaussianProcess(gamma=gamma, s=s)
-	# GP.fit_gp(x, y)
-	# GP.visualize(xtest, show=False, size=5)
-	# plt.show()
-
-	m = 64
-	kernel = KernelFunction(gamma=gamma)
-	embedding = HermiteEmbedding(gamma=gamma, m=m)
-	RFF = KernelizedFeatures(embedding=embedding, s=s, m=m)
-	RFF.fit_gp(x, y)
-	RFF.visualize(xtest, fig = False, show=False, size=5, matheron_kernel = kernel)
-	plt.show()
\ No newline at end of file
+    from stpy.continuous_processes.gauss_procc import GaussianProcess
+    from stpy.continuous_processes.fourier_fea import GaussianProcessFF
+    from stpy.continuous_processes.kernelized_features import KernelizedFeatures
+    from stpy.kernels import KernelFunction
+    from stpy.embeddings.embedding import HermiteEmbedding, RFFEmbedding
+    import stpy
+    import torch
+    import matplotlib.pyplot as plt
+    import numpy as np
+
+    n = 1024
+    N = 256  # number of points requested from benchmark.initial_guess
+    gamma = 0.09  # shared by the kernel and the Hermite embedding
+    s = 0.1  # passed as the benchmark's sigma and as the model's s
+    # Disabled alternative: benchmarks.GaussianProcessSample(d=1, gamma=gamma, sigma=s, n=n)
+    benchmark = stpy.test_functions.benchmarks.Simple1DFunction(d=1, sigma=s)
+
+    x = benchmark.initial_guess(N, adv_inv=True)
+    y = benchmark.eval(x)
+    xtest = benchmark.interval(1024)
+
+    # Exact-GP baseline, disabled; uncomment these lines to plot it as well:
+    # GP = GaussianProcess(gamma=gamma, s=s)
+    # GP.fit_gp(x, y)
+    # GP.visualize(xtest, show=False, size=5)
+
+    m = 64  # handed to both the embedding and the feature model
+    kernel = KernelFunction(gamma=gamma)
+    embedding = HermiteEmbedding(gamma=gamma, m=m)
+    RFF = KernelizedFeatures(embedding=embedding, s=s, m=m)  # Hermite features, despite the name
+    RFF.fit_gp(x, y)
+    RFF.visualize(xtest, fig=False, show=False, size=5, matheron_kernel=kernel)
+    plt.show()
diff --git a/stpy/regularization/constraints.py b/stpy/regularization/constraints.py
index 20a94df..77c945c 100644
--- a/stpy/regularization/constraints.py
+++ b/stpy/regularization/constraints.py
@@ -48,7 +48,7 @@ class AbsoluteValueConstraint(Constraints):
 
     def __init__(self, c=None):
         if c is None:
-            self.c = 1.
+            self.c = 1.0
         else:
             self.c = c
 
@@ -65,7 +65,7 @@ class QuadraticInequalityConstraint(Constraints):
     def __init__(self, Q, b=None, c=None):
         self.Q = Q
         if c is None:
-            self.c = 1.
+            self.c = 1.0
         else:
             self.c = c
         if b is None:
@@ -87,7 +87,6 @@ def __init__(self, q, c, d, groups):
         self.groups = groups
         self.convex = False
 
-
     def get_list_cvxpy_constraints(self, theta):
         w = self.q / (1 - self.q)
         set_of_constraints = []
@@ -97,7 +96,7 @@ def get_list_cvxpy_constraints(self, theta):
             # l1 constraint
             constraints = []
             weights = np.ones(d) * w
-            weights[i] = 1.
+            weights[i] = 1.0
             group = self.groups[i]
             constraints.append(cp.norm(theta[group]).T * weights[i] <= self.c)
             # l_infinity constraint
@@ -114,6 +113,7 @@ def get_constraint_cvxpy(self, theta):
         ## Does not work for non-convex constraints
         return None
 
+
 class NonConvexNormConstraint(Constraints):
 
     def __init__(self, q, c, d):
@@ -132,7 +132,7 @@ def construct(self, q, d):
             polytope = copy.copy(square)
             zero = np.zeros(d).reshape(1, -1)
             appex = copy.copy(zero)
-            appex[0, i // 2] = (float(i % 2) - 0.5) * 2.
+            appex[0, i // 2] = (float(i % 2) - 0.5) * 2.0
             polytope = np.concatenate((appex, polytope))
             self.vertex_description.append(polytope)
             self.polyhedra_vertex_description.append(polytope)
@@ -167,7 +167,7 @@ def get_list_cvxpy_constraints(self, theta):
             # l1 constraint
             constraints = []
             weights = np.ones(self.d) * w
-            weights[i] = 1.
+            weights[i] = 1.0
             constraints.append(cp.abs(theta).T @ weights <= self.c)
             # l_infinity constraint
             for j in range(self.d):
diff --git a/stpy/regularization/regularizer.py b/stpy/regularization/regularizer.py
index 21888f0..9aa91c7 100644
--- a/stpy/regularization/regularizer.py
+++ b/stpy/regularization/regularizer.py
@@ -8,7 +8,7 @@
 
 class Regularizer(ABC):
 
-    def __init__(self, lam=1.):
+    def __init__(self, lam=1.0):
         self.lam = lam
         self.groups = None
         self.convex = True
@@ -19,7 +19,9 @@ def eval(self, theta):
 
     @abstractmethod
     def get_regularizer_cvxpy(self):
-        def reg(theta): return 0
+        def reg(theta):
+            return 0
+
         return reg
 
     def is_convex(self):
@@ -28,8 +30,10 @@ def is_convex(self):
     def get_constraint_set_cvxpy(self, theta, c):
         return [self.get_regularizer_cvxpy()(theta) <= c]
 
-    def get_constraint_object(self,c):
-        return CustomConstraint(None, lambda theta: self.get_constraint_set_cvxpy(theta, c))
+    def get_constraint_object(self, c):
+        return CustomConstraint(
+            None, lambda theta: self.get_constraint_set_cvxpy(theta, c)
+        )
 
     def hessian(self, theta_fit):
         pass
@@ -37,27 +41,30 @@ def hessian(self, theta_fit):
 
 class L2Regularizer(Regularizer):
 
-    def __init__(self, lam=1.):
-        super().__init__(lam = lam)
+    def __init__(self, lam=1.0):
+        super().__init__(lam=lam)
 
     def get_regularizer_cvxpy(self):
-        def reg(theta): return self.lam*cp.sum_squares(theta)/2.
+        def reg(theta):
+            return self.lam * cp.sum_squares(theta) / 2.0
+
         return reg
 
     def eval(self, theta):
-        return self.lam*torch.sum(theta**2)/2.
+        return self.lam * torch.sum(theta**2) / 2.0
 
     def hessian(self, theta):
-        return self.lam * torch.eye(n = theta.size()[0]).double()/2.
+        return self.lam * torch.eye(n=theta.size()[0]).double() / 2.0
+
 
 class NonConvexLqRegularizer(Regularizer):
 
-    def __init__(self, lam=1., q = 0.5):
-        super().__init__(lam = lam)
+    def __init__(self, lam=1.0, q=0.5):
+        super().__init__(lam=lam)
         self.q = q
 
     def eval(self, theta):
-        return self.lam*torch.sum(torch.abs(theta)**self.q)
+        return self.lam * torch.sum(torch.abs(theta) ** self.q)
 
     def hessian(self, theta):
         return None
@@ -67,14 +74,16 @@ def is_convex(self):
 
     def get_regularizer_cvxpy(self, eta):
         def reg(theta):
-            norm = cp.sum_squares(theta/eta.reshape(-1,1))
-            return self.q*0.5*norm*self.lam
+            norm = cp.sum_squares(theta / eta.reshape(-1, 1))
+            return self.q * 0.5 * norm * self.lam
+
         return reg
 
+
 class GroupNonCovexLqRegularizer(NonConvexLqRegularizer):
 
-    def __init__(self, lam=1., q = 0.5, groups = None):
-        super().__init__(lam = lam)
+    def __init__(self, lam=1.0, q=0.5, groups=None):
+        super().__init__(lam=lam)
         self.q = q
         self.groups = groups
 
@@ -82,43 +91,44 @@ def eval(self, theta):
         val = None
         for group in self.groups:
             if val is None:
-                val = torch.norm(theta[group])**self.q
+                val = torch.norm(theta[group]) ** self.q
             else:
                 val += torch.norm(theta[group]) ** self.q
-        return self.lam*val
+        return self.lam * val
 
     def get_regularizer_cvxpy(self, eta):
         def reg(theta):
             val = None
-            for i,group in enumerate(self.groups):
+            for i, group in enumerate(self.groups):
                 if val is None:
-                    val = cp.sum_squares(theta[group])/eta[i].reshape(-1,1)
+                    val = cp.sum_squares(theta[group]) / eta[i].reshape(-1, 1)
                 else:
-                    val += cp.sum_squares(theta[group])/eta[i].reshape(-1,1)
-            return val*self.lam
+                    val += cp.sum_squares(theta[group]) / eta[i].reshape(-1, 1)
+            return val * self.lam
+
         return reg
 
 
 class L1Regularizer(Regularizer):
-    def __init__(self, lam=1.):
-        super().__init__(lam = lam)
+    def __init__(self, lam=1.0):
+        super().__init__(lam=lam)
 
     def get_regularizer_cvxpy(self):
         def reg(theta):
-            return self.lam*cp.norm1(theta)
+            return self.lam * cp.norm1(theta)
+
         return reg
 
     def eval(self, theta):
-        return self.lam*torch.sum(torch.abs(theta))
+        return self.lam * torch.sum(torch.abs(theta))
 
     def hessian(self, theta):
-        return self.lam * torch.eye(n = theta.size()[0]).double()
-
+        return self.lam * torch.eye(n=theta.size()[0]).double()
 
 
 class GroupL1L2Regularizer(Regularizer):
 
-    def __init__(self, lam = 1., groups = None):
+    def __init__(self, lam=1.0, groups=None):
         self.groups = groups
         self.lam = lam
         pass
@@ -137,14 +147,16 @@ def reg(theta):
                     norm = cp.norm2(theta[group])
                 else:
                     norm += cp.norm2(theta[group])
-            return cp.square(norm)*self.lam
+            return cp.square(norm) * self.lam
+
         return reg
 
     def hessian(self, theta):
         return None
 
+
 class NestedGroupL1Regularizer(Regularizer):
-    def __init__(self, lam = 1., groups = None, weights = None):
+    def __init__(self, lam=1.0, groups=None, weights=None):
         self.groups = groups
         self.lam = lam
         self.weights = weights
@@ -153,7 +165,7 @@ def __init__(self, lam = 1., groups = None, weights = None):
     def eval(self, theta):
         norm = 0
         for i, group in enumerate(self.groups):
-            norm += self.weights[i]*torch.sum(torch.abs(theta[group]))
+            norm += self.weights[i] * torch.sum(torch.abs(theta[group]))
         return norm**2 * self.lam
 
     def get_regularizer_cvxpy(self):
@@ -167,16 +179,17 @@ def reg(theta):
                 else:
                     norm += self.weights[i] * cp.norm1(theta[group])
 
-            return norm*self.lam
+            return norm * self.lam
 
         return reg
 
     def hessian(self, theta):
         return None
 
+
 class NestedGroupL1L2Regularizer(Regularizer):
 
-    def __init__(self, lam = 1., groups = None, weights = None):
+    def __init__(self, lam=1.0, groups=None, weights=None):
         self.groups = groups
         self.lam = lam
         self.weights = weights
@@ -199,7 +212,7 @@ def reg(theta):
                 else:
                     norm += self.weights[i] * cp.norm2(theta[group])
 
-            return cp.square(norm)*self.lam
+            return cp.square(norm) * self.lam
 
         return reg
 
@@ -208,7 +221,7 @@ def hessian(self, theta):
 
 
 class NonConvexNormRegularizer(Regularizer):
-    def __init__(self, lam = 1., q = 1. , groups = None):
+    def __init__(self, lam=1.0, q=1.0, groups=None):
         self.groups = groups
         self.lam = lam
         self.q = q
@@ -230,6 +243,6 @@ def reg(theta):
                 else:
                     norm += self.weights[i] * cp.norm2(theta[group])
 
-            return cp.square(norm)*self.lam
+            return cp.square(norm) * self.lam
 
         return reg
diff --git a/stpy/regularization/sdp_constraint.py b/stpy/regularization/sdp_constraint.py
index cb8d080..0c3d460 100644
--- a/stpy/regularization/sdp_constraint.py
+++ b/stpy/regularization/sdp_constraint.py
@@ -2,21 +2,23 @@
 from stpy.regularization.constraints import Constraints
 import cvxpy as cp
 
+
 class SDPConstraint(Constraints):
 
-    def __init__(self, type="trace", rank=1.):
+    def __init__(self, type="trace", rank=1.0):
 
         super().__init__()
 
         self.trace_constraint = None
         self.lambda_max_constraint = None
         self.psd_constraint = "Yes"
-        self.matrix_bound = 1.
+        self.matrix_bound = 1.0
         self.type = type
         self.rank = rank
-        self.custom_regularization= None
+        self.custom_regularization = None
 
         self.fit_params()
+
     def fit_params(self):
         if self.type == "stable-rank":
             self.matrix_bound = self.rank
@@ -24,12 +26,14 @@ def fit_params(self):
     def get_type(self):
         return self.type
 
-    def get_constraint_cvxpy(self,A,l,s_value):
+    def get_constraint_cvxpy(self, A, l, s_value):
         constraints = []
 
         # add a classical psd constraint
         if self.matrix_bound is not None:
-            constraints+=[cp.trace(A) <= self.matrix_bound * l] + [cp.lambda_max(A) <= l]
+            constraints += [cp.trace(A) <= self.matrix_bound * l] + [
+                cp.lambda_max(A) <= l
+            ]
 
         # trace regularization
         if self.trace_constraint is not None:
@@ -37,13 +41,13 @@ def get_constraint_cvxpy(self,A,l,s_value):
 
         # restrict the max eigenvalue
         if s_value is not None:
-            constraints += [l<=s_value]
+            constraints += [l <= s_value]
 
         # lambda_max regularization
         if self.lambda_max_constraint is not None:
             constraints += [cp.lambda_max(A) <= self.lambda_max_constraint]
 
         if self.custom_regularization is not None:
-            constraints += [self.custom_regularization(A,l,s_value)]
+            constraints += [self.custom_regularization(A, l, s_value)]
 
-        return constraints
\ No newline at end of file
+        return constraints
diff --git a/stpy/regularization/simplex_regularizer.py b/stpy/regularization/simplex_regularizer.py
index 1383812..e2b7a93 100644
--- a/stpy/regularization/simplex_regularizer.py
+++ b/stpy/regularization/simplex_regularizer.py
@@ -2,48 +2,57 @@
 import cvxpy as cp
 import numpy as np
 import torch
+
+
 class ProbabilityRegularizer(Regularizer):
 
     def __init__(self, lam=1, w=None, d=1, **kwargs):
         super().__init__(lam)
         self.lam = lam
         if w is None:
-            self.w = torch.ones(d).double()/d
+            self.w = torch.ones(d).double() / d
         else:
             self.w = w
         self.convex = True
         self.dcp = True
         self.d = d
         self.name = "default"
+
+
 class SupRegularizer(ProbabilityRegularizer):
 
-    def __init__(self, constrained = False, version = '1',**kwargs):
+    def __init__(self, constrained=False, version="1", **kwargs):
         super().__init__(**kwargs)
         self.convex = False
         self.name = "sup"
         self.constrained = constrained
         self.version = version
+
     def get_regularizer_cvxpy(self):
         pass
 
     def get_cvxpy_objectives_constraints_variables(self, d):
         if not self.constrained:
-            print (d, self.w )
-            objectives = [lambda x: cp.inv_pos(x[i])*self.lam/self.w[i] for i in range(d)]
+            print(d, self.w)
+            objectives = [
+                lambda x, i=i: cp.inv_pos(x[i]) * self.lam / self.w[i] for i in range(d)
+            ]
             constriants = [lambda x: [] for i in range(d)]
             return objectives, constriants, []
-        elif self.version == '1':
-            objectives = [lambda x: 0. for i in range(d)]
-            #constriants = [lambda x: [cp.inv_pos(x[i])<=1/self.lam]+[cp.max(x)<=x[i]] for i in range(d)]
-            constriants = [lambda x: [x[i] >= self.lam]  for i in range(d)]
+        elif self.version == "1":
+            objectives = [lambda x: 0.0 for i in range(d)]
+            # i=i (as above) binds each index; a bare closure sees only the last i
+            constriants = [lambda x, i=i: [x[i] >= self.lam] for i in range(d)]
             return objectives, constriants, []
         else:
-            objectives = [lambda x: 0.]
+            objectives = [lambda x: 0.0]
             I = np.eye(d)
-            constriants = [lambda x: [ I*self.lam*cp.sum(x) << d*cp.diag(x)]]
+            constriants = [lambda x: [I * self.lam * cp.sum(x) << d * cp.diag(x)]]
             return objectives, constriants, []
+
     def eval(self, theta):
-        return self.lam/torch.max(self.w*theta)
+        return self.lam / torch.max(self.w * theta)
+
 
 class DirichletRegularizer(ProbabilityRegularizer):
 
@@ -52,11 +61,12 @@ def __init__(self, **kwargs):
         self.name = "dirichlet"
 
     def get_regularizer_cvxpy(self):
-        return lambda x: cp.sum((self.w-1)@cp.log(x)) * self.lam
+        return lambda x: cp.sum((self.w - 1) @ cp.log(x)) * self.lam
 
     def eval(self, theta):
         return self.lam / torch.sum(torch.abs(theta))
 
+
 class WeightedAitchisonRegularizer(ProbabilityRegularizer):
 
     def __init__(self, **kwargs):
@@ -64,21 +74,20 @@ def __init__(self, **kwargs):
         self.dcp = False
         self.name = "aitchison"
 
-
     def get_regularizer_cvxpy(self):
         def reg(x):
-           # outer = sum([cp.log(x[j])*cp.log(x[i]) for i,j in zip(range(self.d),range(self.d)) if i!=j])
-            return 2*self.lam*(cp.sum(cp.log(x)**2))
+            # outer = sum([cp.log(x[j])*cp.log(x[i]) for i,j in zip(range(self.d),range(self.d)) if i!=j])
+            return 2 * self.lam * (cp.sum(cp.log(x) ** 2))
 
         return reg
+
     def eval(self, theta):
         return self.lam / torch.sum(torch.abs(theta))
 
 
 class L1MeasureRegularizer(ProbabilityRegularizer):
     def get_regularizer_cvxpy(self):
-        return lambda x: cp.norm1(x)*self.lam
+        return lambda x: cp.norm1(x) * self.lam
 
     def eval(self, theta):
-        return self.lam/torch.sum(torch.abs(theta))
-
+        return self.lam / torch.sum(torch.abs(theta))
diff --git a/stpy/sampling/hmc.py b/stpy/sampling/hmc.py
index 879fd17..1e6ce13 100644
--- a/stpy/sampling/hmc.py
+++ b/stpy/sampling/hmc.py
@@ -1,5 +1,7 @@
-params_hmc = hamiltorch.sample(log_prob_func=log_prob_func,
-							   params_init=params_init,
-							   num_samples=num_samples,
-							   step_size=step_size,
-							   num_steps_per_sample=num_steps_per_sample)
+params_hmc = hamiltorch.sample(
+    log_prob_func=log_prob_func,
+    params_init=params_init,
+    num_samples=num_samples,
+    step_size=step_size,
+    num_steps_per_sample=num_steps_per_sample,
+)
diff --git a/stpy/sampling/langevin.py b/stpy/sampling/langevin.py
index c7255b7..1914ae5 100644
--- a/stpy/sampling/langevin.py
+++ b/stpy/sampling/langevin.py
@@ -2,25 +2,30 @@
 import torch
 import scipy
 
-class LangevinSampler():
 
-	def __init__(self, verbose = False):
-		self.verbose = verbose
-		pass
+class LangevinSampler:
 
-	def calculate(self, HessianF,theta0):
-		W = HessianF(theta0)
-		L = float(scipy.sparse.linalg.eigsh(W.numpy(), k=1, which='LM', return_eigenvectors=False, tol=1e-3))
-		return L
+    def __init__(self, verbose=False):
+        self.verbose = verbose
+        pass
 
-	def sample(self, F, nablaF, HessianF, theta0, steps = 100):
-		L = self.calculate(HessianF, theta0)
-		eta = 0.5 / (L + 1)
-		m = theta0.size()[0]
-		theta = theta0
-		for k in range(steps):
-			w = torch.randn(size=(m, 1)).double()
-			theta = theta - eta * nablaF(theta) + np.sqrt(2 * eta) * w
-			if self.verbose == True:
-				print("Iter:", k, theta.T)
-		return theta
\ No newline at end of file
+    def calculate(self, HessianF, theta0):
+        W = HessianF(theta0)
+        L = float(
+            scipy.sparse.linalg.eigsh(
+                W.numpy(), k=1, which="LM", return_eigenvectors=False, tol=1e-3
+            )
+        )
+        return L
+
+    def sample(self, F, nablaF, HessianF, theta0, steps=100):
+        L = self.calculate(HessianF, theta0)
+        eta = 0.5 / (L + 1)
+        m = theta0.size()[0]
+        theta = theta0
+        for k in range(steps):
+            w = torch.randn(size=(m, 1)).double()
+            theta = theta - eta * nablaF(theta) + np.sqrt(2 * eta) * w
+            if self.verbose == True:
+                print("Iter:", k, theta.T)
+        return theta
diff --git a/stpy/sampling/proximal_langevin.py b/stpy/sampling/proximal_langevin.py
index 2cf69c1..322ccab 100644
--- a/stpy/sampling/proximal_langevin.py
+++ b/stpy/sampling/proximal_langevin.py
@@ -2,20 +2,26 @@
 import torch
 import numpy as np
 
+
 def ProximalLangevin(LangevinSampler):
 
-	def sample(self, F, nablaF, HessianF, theta0, prox, steps = 100):
-		L = self.calculate(HessianF, theta0)
-		eta = 0.5 / (L + 1)
-		m = theta0.size()[0]
-		theta = theta0
-		for k in range(steps):
-			w = torch.randn(size=(m, 1)).double()
-			theta = (1 - eta) * theta - eta * nablaF(theta) + eta * prox(theta) + np.sqrt(2 * eta) * w
-			if self.verbose == True:
-				print("Iter:", k, theta.T)
-		return prox(theta)
+    def sample(self, F, nablaF, HessianF, theta0, prox, steps=100):
+        L = self.calculate(HessianF, theta0)
+        eta = 0.5 / (L + 1)
+        m = theta0.size()[0]
+        theta = theta0
+        for k in range(steps):
+            w = torch.randn(size=(m, 1)).double()
+            theta = (
+                (1 - eta) * theta
+                - eta * nablaF(theta)
+                + eta * prox(theta)
+                + np.sqrt(2 * eta) * w
+            )
+            if self.verbose == True:
+                print("Iter:", k, theta.T)
+        return prox(theta)
 
 
 def MirrorLangevin(LangvinSampler):
-	pass
\ No newline at end of file
+    pass
diff --git a/stpy/sampling/sampling_helper.py b/stpy/sampling/sampling_helper.py
index 6cbef04..0024a3a 100644
--- a/stpy/sampling/sampling_helper.py
+++ b/stpy/sampling/sampling_helper.py
@@ -1,58 +1,56 @@
-
-
 import torch
 import numpy as np
 import matplotlib.pyplot as plt
 
-def get_increment(eta, steps, f, w0, path = False):
-	"""
 
-	:param eta: terminal time
-	:param steps: number of steps
-	:param f: the operator
-	:param w0: initial point
-	:return:
-	"""
+def get_increment(eta, steps, f, w0, path=False):
+    """
 
-	tau = eta/steps
-	w = w0
-	sequence = []
+    :param eta: terminal time
+    :param steps: number of steps
+    :param f: the operator
+    :param w0: initial point
+    :return: the list of iterates if path is True, otherwise the final iterate
+    """
 
-	for i in range(steps):
+    tau = eta / steps
+    w = w0
+    sequence = []
 
+    for i in range(steps):
 
-		n = torch.randn(size = w0.size()).double()
-		w = w + np.sqrt(2*tau)*f(w,n)
-		if path:
-			sequence.append(w)
+        n = torch.randn(size=w0.size()).double()
+        w = w + np.sqrt(2 * tau) * f(w, n)
+        if path:
+            sequence.append(w)
 
-	if path:
-		return sequence
-	else:
-		return w
+    if path:
+        return sequence
+    else:
+        return w
 
-if __name__ == "__main__":
 
-	f = lambda w: torch.diag(1./torch.abs(w.view(-1)))
-	d = 1
-	w0 = torch.zeros(size = (d,1)).double() + 2
-	step = 100
-	path = get_increment(2, step, f, w0, path = True)
-	#plt.plot(path)
-
-	i = 0
-	colors = ['k','r','b','orange','brown','purple']
-	for steps in [5,10,20,100,200,500]:
-
-		repeats = 100
-		ws = []
-		for _ in range(repeats):
-			path = get_increment(2,steps,f,w0, path = True)
-			xtest = torch.linspace(0,2,steps)
-			plt.plot(xtest, path, color = colors[i])
-		i = i + 1
-	#	plt.hist(np.array(ws), label = str(step))
-
-	plt.legend()
-	plt.show()
+if __name__ == "__main__":
 
+    f = lambda w: torch.diag(1.0 / torch.abs(w.view(-1)))
+    d = 1
+    w0 = torch.zeros(size=(d, 1)).double() + 2
+    step = 100
+    path = get_increment(2, step, f, w0, path=True)
+    # plt.plot(path)
+
+    i = 0
+    colors = ["k", "r", "b", "orange", "brown", "purple"]
+    for steps in [5, 10, 20, 100, 200, 500]:
+
+        repeats = 100
+        ws = []
+        for _ in range(repeats):
+            path = get_increment(2, steps, f, w0, path=True)
+            xtest = torch.linspace(0, 2, steps)
+            plt.plot(xtest, path, color=colors[i])
+        i = i + 1
+    # 	plt.hist(np.array(ws), label = str(step))
+
+    plt.legend()
+    plt.show()
diff --git a/stpy/test_functions/benchmarks.py b/stpy/test_functions/benchmarks.py
index 9cd2c47..8432ca4 100755
--- a/stpy/test_functions/benchmarks.py
+++ b/stpy/test_functions/benchmarks.py
@@ -6,513 +6,566 @@
 from stpy.continuous_processes.gauss_procc import GaussianProcess
 
 
-class BenchmarkFunction():
-
-	def __init__(self, type="discrete", d=1, gamma=1.0, dts=None, **kwargs):
-		self.scale = 1.0
-		self.type = type
-		self.gamma = gamma
-		self.d = d
-		self.dts = None
-		self.groups = None
-
-	def eval_noiseless(self, X):
-		if X.size()[1] != self.d:
-			raise AssertionError("Invalid dimension for the Benchmark function ...")
-		pass
-
-	def eval(self, X, sigma=None):
-		z = self.eval_noiseless(X)
-		if sigma is None:
-			y = z/self.scale + self.s * torch.randn(X.size()[0], 1, dtype=torch.float64)
-		else:
-			y = z/self.scale + sigma * torch.randn(X.size()[0], 1, dtype=torch.float64)
-		return y
-
-	def optimum(self):
-		return 1.0
-
-	def maximum(self, xtest=None):
-		if self.type == "discrete":
-			self.max = self.maximum_discrete(xtest)
-		else:
-			self.max = self.maximum_continuous()
-		return self.max
-
-	def maximum_discrete(self, xtest):
-		maximum =torch.max(self.eval_noiseless(xtest))
-		return maximum
-
-	def maximum_continuous(self):
-		return 1.0
-
-	def scale_max(self, xtest=None):
-		self.scale = self.maximum(xtest=xtest)
-		print("Scaling with", self.scale)
-
-	def optimize(self, xtest, sigma, restarts=5):
-		(n, d) = xtest.size()
-		ytest = self.eval(xtest, sigma=sigma)
-		kernel = stpy.kernels.KernelFunction(kernel_name="ard", gamma=torch.ones(d, dtype=torch.float64) * 0.1,
-											 groups=self.groups)
-		GP = stpy.continuous_processes.gauss_procc.GaussianProcess(kernel_custom=kernel, s=sigma, d=d)
-		GP.fit(xtest, ytest)
-		GP.optimize_params(type="bandwidth", restarts=restarts)
-		print("Optimized")
-		# GP.visualize(xtest)
-		self.gamma = torch.min(kernel.gamma)
-		return self.gamma
-
-	def return_params(self):
-		return (self.gamma, self.groups, self.d)
-
-	def bandwidth(self):
-		return self.gamma
-
-	def set_group_param(self, groups):
-		self.groups = groups
-
-	def bounds(self):
-		b = tuple([(-0.5, 0.5) for i in range(self.d)])
-		return b
-
-	def initial_guess(self, N, adv_inv=False):
-		if adv_inv == False:
-			x = torch.from_numpy(np.random.uniform(-0.5, 0.5, size=(N, self.d)))
-		else:
-			x = torch.from_numpy(np.random.uniform(-0.5, 0., size=(N, self.d)))
-		return x
-
-	def interval(self, n, L_infinity_ball=0.5):
-		if n == None:
-			xtest = None
-		else:
-			xtest = torch.from_numpy(stpy.helpers.helper.interval(n, self.d, L_infinity_ball=L_infinity_ball))
-		return xtest
-
-	def visualize(self, xtest):
-		import matplotlib.pyplot as plt
-		d = xtest.size()[1]
-		if d == 1:
-			plt.figure(figsize=(15, 7))
-			plt.clf()
-			plt.plot(xtest.numpy(), self.eval_noiseless(xtest)[:, 0].numpy())
-			plt.show()
-		elif d == 2:
-			from scipy.interpolate import griddata
-			plt.figure(figsize=(15, 7))
-			plt.clf()
-			ax = plt.axes(projection='3d')
-			xx = xtest[:, 0].numpy()
-			yy = xtest[:, 1].numpy()
-			grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-			grid_z = griddata((xx, yy), self.eval_noiseless(xtest)[:, 0].numpy(), (grid_x, grid_y), method='linear')
-			ax.plot_surface(grid_x, grid_y, grid_z, color='b', alpha=0.4)
-			plt.show()
+class BenchmarkFunction:
+
+    def __init__(self, type="discrete", d=1, gamma=1.0, dts=None, **kwargs):
+        self.scale = 1.0
+        self.type = type
+        self.gamma = gamma
+        self.d = d
+        self.dts = None
+        self.groups = None
+
+    def eval_noiseless(self, X):
+        if X.size()[1] != self.d:
+            raise AssertionError("Invalid dimension for the Benchmark function ...")
+        pass
+
+    def eval(self, X, sigma=None):
+        z = self.eval_noiseless(X)
+        if sigma is None:
+            y = z / self.scale + self.s * torch.randn(
+                X.size()[0], 1, dtype=torch.float64
+            )
+        else:
+            y = z / self.scale + sigma * torch.randn(
+                X.size()[0], 1, dtype=torch.float64
+            )
+        return y
+
+    def optimum(self):
+        return 1.0
+
+    def maximum(self, xtest=None):
+        if self.type == "discrete":
+            self.max = self.maximum_discrete(xtest)
+        else:
+            self.max = self.maximum_continuous()
+        return self.max
+
+    def maximum_discrete(self, xtest):
+        maximum = torch.max(self.eval_noiseless(xtest))
+        return maximum
+
+    def maximum_continuous(self):
+        return 1.0
+
+    def scale_max(self, xtest=None):
+        self.scale = self.maximum(xtest=xtest)
+        print("Scaling with", self.scale)
+
+    def optimize(self, xtest, sigma, restarts=5):
+        (n, d) = xtest.size()
+        ytest = self.eval(xtest, sigma=sigma)
+        kernel = stpy.kernels.KernelFunction(
+            kernel_name="ard",
+            gamma=torch.ones(d, dtype=torch.float64) * 0.1,
+            groups=self.groups,
+        )
+        GP = stpy.continuous_processes.gauss_procc.GaussianProcess(
+            kernel_custom=kernel, s=sigma, d=d
+        )
+        GP.fit(xtest, ytest)
+        GP.optimize_params(type="bandwidth", restarts=restarts)
+        print("Optimized")
+        # GP.visualize(xtest)
+        self.gamma = torch.min(kernel.gamma)
+        return self.gamma
+
+    def return_params(self):
+        return (self.gamma, self.groups, self.d)
+
+    def bandwidth(self):
+        return self.gamma
+
+    def set_group_param(self, groups):
+        self.groups = groups
+
+    def bounds(self):
+        b = tuple([(-0.5, 0.5) for i in range(self.d)])
+        return b
+
+    def initial_guess(self, N, adv_inv=False):
+        if adv_inv == False:
+            x = torch.from_numpy(np.random.uniform(-0.5, 0.5, size=(N, self.d)))
+        else:
+            x = torch.from_numpy(np.random.uniform(-0.5, 0.0, size=(N, self.d)))
+        return x
+
+    def interval(self, n, L_infinity_ball=0.5):
+        if n == None:
+            xtest = None
+        else:
+            xtest = torch.from_numpy(
+                stpy.helpers.helper.interval(n, self.d, L_infinity_ball=L_infinity_ball)
+            )
+        return xtest
+
+    def visualize(self, xtest):
+        import matplotlib.pyplot as plt
+
+        d = xtest.size()[1]
+        if d == 1:
+            plt.figure(figsize=(15, 7))
+            plt.clf()
+            plt.plot(xtest.numpy(), self.eval_noiseless(xtest)[:, 0].numpy())
+            plt.show()
+        elif d == 2:
+            from scipy.interpolate import griddata
+
+            plt.figure(figsize=(15, 7))
+            plt.clf()
+            ax = plt.axes(projection="3d")
+            xx = xtest[:, 0].numpy()
+            yy = xtest[:, 1].numpy()
+            grid_x, grid_y = np.mgrid[
+                min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j
+            ]
+            grid_z = griddata(
+                (xx, yy),
+                self.eval_noiseless(xtest)[:, 0].numpy(),
+                (grid_x, grid_y),
+                method="linear",
+            )
+            ax.plot_surface(grid_x, grid_y, grid_z, color="b", alpha=0.4)
+            plt.show()
 
 
 class CamelbackBenchmark(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
-		self.d = 2
-
-	def eval_noiseless(self, X):
-		super().eval_noiseless(X)
-		xx = X[:, 0] * 4
-		yy = X[:, 1] * 2
-		y = (4. - 2.1 * xx ** 2 + (xx ** 4) / 3.) * (xx ** 2) + xx * yy + (-4. + 4 * (yy ** 2)) * (yy ** 2)
-		y = -y.view(X.size()[0], 1)
-		# y = np.tanh(y)
-		y = y / 5.
-		return y / self.scale
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.d = 2
+
+    def eval_noiseless(self, X):
+        super().eval_noiseless(X)
+        xx = X[:, 0] * 4
+        yy = X[:, 1] * 2
+        y = (
+            (4.0 - 2.1 * xx**2 + (xx**4) / 3.0) * (xx**2)
+            + xx * yy
+            + (-4.0 + 4 * (yy**2)) * (yy**2)
+        )
+        y = -y.view(X.size()[0], 1)
+        # y = np.tanh(y)
+        y = y / 5.0
+        return y / self.scale
 
 
 # def optimize(self,xtest,sigma, restarts = 5):
 # self.gamma = 0.3
 
+
 # self.gamma = 0.3
 class QuadraticBenchmark(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
-		self.d = kwargs['d']
-		self.type = "continuous"
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.d = kwargs["d"]
+        self.type = "continuous"
 
-		if 'R' in kwargs:
-			self.R = kwargs['R']
-			print("Quadratic Problem: Rotating - no longer additive.")
-			print(self.R)
-		else:
-			self.R = torch.eye(self.d, self.d, dtype=torch.float64)
-			print("Quadratic Problem: Additive.")
+        if "R" in kwargs:
+            self.R = kwargs["R"]
+            print("Quadratic Problem: Rotating - no longer additive.")
+            print(self.R)
+        else:
+            self.R = torch.eye(self.d, self.d, dtype=torch.float64)
+            print("Quadratic Problem: Additive.")
 
-	def eval_noiseless(self, X):
-		D = torch.diag(torch.Tensor([1., 2.]).double())
-		super().eval_noiseless(X)
-		(n, d) = X.size()
-		X = X @ self.R
-		sum_ = torch.sum((X @ D) ** 2, dim=1)
-		print(sum_.size())
-		return -sum_.view(-1, 1) / self.scale + 1
+    def eval_noiseless(self, X):
+        D = torch.diag(torch.tensor([1.0, 2.0]).double())
+        super().eval_noiseless(X)
+        (n, d) = X.size()
+        X = X @ self.R
+        sum_ = torch.sum((X @ D) ** 2, dim=1)
+        print(sum_.size())
+        return -sum_.view(-1, 1) / self.scale + 1
 
-	def bandwidth(self):
-		return 0.2
+    def bandwidth(self):
+        return 0.2
 
 
 class PolynomialBenchmark(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
-		self.d = kwargs['d']
-		self.type = "continuous"
-
-		if 'R' in kwargs:
-			self.R = kwargs['R']
-			print("Quadratic Problem: Rotating - no longer additive.")
-			print(self.R)
-		else:
-			self.R = torch.eye(self.d, self.d, dtype=torch.float64)
-			print("Quadratic Problem: Additive.")
-
-	def eval_noiseless(self, X):
-		D = torch.diag(torch.Tensor([1., 2.]).double())
-		super().eval_noiseless(X)
-		(n, d) = X.size()
-		X = X @ self.R
-		sum_ = torch.sum((X @ D) ** 2, dim=1) + torch.sum((X @ D) ** 3, dim=1) * 0.5 + torch.sum((X @ D) ** 4, dim=1)
-		print(sum_.size())
-		return -sum_.view(-1, 1) / self.scale + 1
-
-	def bandwidth(self):
-		return 0.2
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.d = kwargs["d"]
+        self.type = "continuous"
+
+        if "R" in kwargs:
+            self.R = kwargs["R"]
+            print("Quadratic Problem: Rotating - no longer additive.")
+            print(self.R)
+        else:
+            self.R = torch.eye(self.d, self.d, dtype=torch.float64)
+            print("Quadratic Problem: Additive.")
+
+    def eval_noiseless(self, X):
+        D = torch.diag(torch.tensor([1.0, 2.0]).double())
+        super().eval_noiseless(X)
+        (n, d) = X.size()
+        X = X @ self.R
+        sum_ = (
+            torch.sum((X @ D) ** 2, dim=1)
+            + torch.sum((X @ D) ** 3, dim=1) * 0.5
+            + torch.sum((X @ D) ** 4, dim=1)
+        )
+        print(sum_.size())
+        return -sum_.view(-1, 1) / self.scale + 1
+
+    def bandwidth(self):
+        return 0.2
 
 
 class MichalBenchmark(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
-		self.d = kwargs['d']
-		self.type = "continuous"
-
-		if 'R' in kwargs:
-			self.R = kwargs['R']
-			print("Michal Problem: Rotating - no longer additive.")
-			print(self.R)
-		else:
-			self.R = torch.eye(self.d, self.d, dtype=torch.float64)
-			print("Michal Problem: Additive.")
-
-	def eval_noiseless(self, X):
-		super().eval_noiseless(X)
-		(n, d) = X.size()
-		X = X @ self.R
-		X = X / 0.75
-		X = (X + 0.5) * np.pi
-		ar = torch.from_numpy(np.arange(1, d + 1, 1, dtype=np.float64))
-		sum_ = torch.sin(X) * torch.pow(torch.sin(ar * X / np.pi), int(2 * d))
-		sum_ = torch.sum(sum_, dim=1).view(-1, 1)
-		return sum_ / self.scale
-
-	def optimize(self, xtest, sigma, restarts=5, n=512):
-		xtest = torch.zeros(n, self.d, dtype=torch.float64)
-		xtest[:, 0] = torch.linspace(-0.5, 0.5, n, dtype=torch.float64)
-		ytest = self.eval(xtest, sigma=sigma)
-		kernel = stpy.kernels.KernelFunction(kernel_name="ard", gamma=torch.ones(self.d, dtype=torch.float64) * 0.1,
-											 groups=self.groups)
-		GP = GaussianProcess(kernel=kernel, s=sigma, d=self.d)
-		GP.fit_gp(xtest, ytest)
-		#GP.optimize_params(type="bandwidth", restarts=restarts)
-		#print("Optimized")
-		#GP.back_prop
-		self.gamma = torch.min(kernel.gamma)
-		return self.gamma
-
-	def bandwidth(self):
-		return 0.2
-
-	def maximum_continuous(self):
-		opt = np.ones(shape=(20))
-		# holds with different constnat
-		opt[0] = 2.93254
-		opt[1] = 2.34661
-		opt[2] = 1.64107
-		opt[3] = 1.24415
-		opt[4] = 0.999643
-		opt[5] = 0.834879
-		opt[6] = 2.1089
-		opt[7] = 1.84835
-		opt[8] = 1.64448
-		opt[9] = 1.48089
-		opt[10] = 1.34678
-		opt[11] = 1.2349
-		opt[12] = 1.89701
-		opt[13] = 1.76194
-		opt[14] = 1.64477
-		opt[15] = 1.54218
-		opt[16] = 1.45162
-		opt[17] = 1.37109
-		opt[18] = 1.81774
-		return float(opt[self.d])
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.d = kwargs["d"]
+        self.type = "continuous"
+
+        if "R" in kwargs:
+            self.R = kwargs["R"]
+            print("Michal Problem: Rotating - no longer additive.")
+            print(self.R)
+        else:
+            self.R = torch.eye(self.d, self.d, dtype=torch.float64)
+            print("Michal Problem: Additive.")
+
+    def eval_noiseless(self, X):
+        super().eval_noiseless(X)
+        (n, d) = X.size()
+        X = X @ self.R
+        X = X / 0.75
+        X = (X + 0.5) * np.pi
+        ar = torch.from_numpy(np.arange(1, d + 1, 1, dtype=np.float64))
+        sum_ = torch.sin(X) * torch.pow(torch.sin(ar * X / np.pi), int(2 * d))
+        sum_ = torch.sum(sum_, dim=1).view(-1, 1)
+        return sum_ / self.scale
+
+    def optimize(self, xtest, sigma, restarts=5, n=512):
+        xtest = torch.zeros(n, self.d, dtype=torch.float64)
+        xtest[:, 0] = torch.linspace(-0.5, 0.5, n, dtype=torch.float64)
+        ytest = self.eval(xtest, sigma=sigma)
+        kernel = stpy.kernels.KernelFunction(
+            kernel_name="ard",
+            gamma=torch.ones(self.d, dtype=torch.float64) * 0.1,
+            groups=self.groups,
+        )
+        GP = GaussianProcess(kernel=kernel, s=sigma, d=self.d)
+        GP.fit_gp(xtest, ytest)
+        # GP.optimize_params(type="bandwidth", restarts=restarts)
+        # print("Optimized")
+        # GP.back_prop
+        self.gamma = torch.min(kernel.gamma)
+        return self.gamma
+
+    def bandwidth(self):
+        return 0.2
+
+    def maximum_continuous(self):
+        opt = np.ones(shape=(20))
+        # holds with different constant
+        opt[0] = 2.93254
+        opt[1] = 2.34661
+        opt[2] = 1.64107
+        opt[3] = 1.24415
+        opt[4] = 0.999643
+        opt[5] = 0.834879
+        opt[6] = 2.1089
+        opt[7] = 1.84835
+        opt[8] = 1.64448
+        opt[9] = 1.48089
+        opt[10] = 1.34678
+        opt[11] = 1.2349
+        opt[12] = 1.89701
+        opt[13] = 1.76194
+        opt[14] = 1.64477
+        opt[15] = 1.54218
+        opt[16] = 1.45162
+        opt[17] = 1.37109
+        opt[18] = 1.81774
+        return float(opt[self.d])
 
 
 class StybTangBenchmark(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
-		self.d = kwargs['d']
-		self.type = "discrete"
-		if 'R' in kwargs:
-			self.R = kwargs['R']
-			print("Stybtang Problem: Rotating - no longer additive.")
-			print(self.R)
-		else:
-			self.R = torch.eye(self.d, self.d, dtype=torch.float64)
-			print("Stybtang Problem: Additive")
-
-	def eval_noiseless(self, X):
-		super().eval_noiseless(X)
-		(n, d) = X.size()
-		X = X @ self.R
-		X = X * 8
-		Y = X ** 2
-		sum_ = torch.sum(Y ** 2 - 16. * Y + 5 * X, dim=1).view(-1, 1)
-		return -(0.5 * sum_ / (d * 200.) + 0.5)/self.scale
-
-	# def maximum_continuous(self):
-	# 	opt = np.ones(shape=(self.d)) * (-2.9035) / 8
-	# 	opt = torch.from_numpy(opt.reshape(1, -1))
-	# 	value = self.eval_noiseless(opt)[0][0] * 16
-	# 	return value
-	#
-	# def optimize(self, xtest, sigma, restarts=5, n=512):
-	# 	xtest = torch.zeros(n, self.d, dtype=torch.float64)
-	# 	xtest[:, 0] = torch.linspace(-0.5, 0.5, n, dtype=torch.float64)
-	# 	ytest = self.eval(xtest, sigma=sigma)
-	# 	kernel = stpy.kernels.KernelFunction(kernel_name="ard", gamma=torch.ones(self.d, dtype=torch.float64) * 0.1,
-	# 										 groups=self.groups)
-	# 	GP = GaussianProcess(kernel_custom=kernel, s=sigma, d=self.d)
-	# 	GP.fit(xtest, ytest)
-	# 	GP.optimize_params(type="bandwidth", restarts=restarts)
-	# 	print("Optimized")
-	# 	self.gamma = torch.min(kernel.gamma)
-	# 	return self.gamma
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.d = kwargs["d"]
+        self.type = "discrete"
+        if "R" in kwargs:
+            self.R = kwargs["R"]
+            print("Stybtang Problem: Rotating - no longer additive.")
+            print(self.R)
+        else:
+            self.R = torch.eye(self.d, self.d, dtype=torch.float64)
+            print("Stybtang Problem: Additive")
+
+    def eval_noiseless(self, X):
+        super().eval_noiseless(X)
+        (n, d) = X.size()
+        X = X @ self.R
+        X = X * 8
+        Y = X**2
+        sum_ = torch.sum(Y**2 - 16.0 * Y + 5 * X, dim=1).view(-1, 1)
+        return -(0.5 * sum_ / (d * 200.0) + 0.5) / self.scale
+
+    # def maximum_continuous(self):
+    # 	opt = np.ones(shape=(self.d)) * (-2.9035) / 8
+    # 	opt = torch.from_numpy(opt.reshape(1, -1))
+    # 	value = self.eval_noiseless(opt)[0][0] * 16
+    # 	return value
+    #
+    # def optimize(self, xtest, sigma, restarts=5, n=512):
+    # 	xtest = torch.zeros(n, self.d, dtype=torch.float64)
+    # 	xtest[:, 0] = torch.linspace(-0.5, 0.5, n, dtype=torch.float64)
+    # 	ytest = self.eval(xtest, sigma=sigma)
+    # 	kernel = stpy.kernels.KernelFunction(kernel_name="ard", gamma=torch.ones(self.d, dtype=torch.float64) * 0.1,
+    # 										 groups=self.groups)
+    # 	GP = GaussianProcess(kernel_custom=kernel, s=sigma, d=self.d)
+    # 	GP.fit(xtest, ytest)
+    # 	GP.optimize_params(type="bandwidth", restarts=restarts)
+    # 	print("Optimized")
+    # 	self.gamma = torch.min(kernel.gamma)
+    # 	return self.gamma
+
 
 class GeneralizedAdditiveOverlap(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
-		self.d = kwargs['d']
-		self.type = "continuous"
-
-	def eval_noiseless(self, X):
-		super().eval_noiseless(X)
-		(n, d) = X.size()
-		sum_ = torch.sum(torch.exp(-(torch.from_numpy(np.diff(X.numpy(), axis=1) / 0.25)) ** 2), dim=1).view(-1, 1)
-		return 0.5 * sum_ / self.scale
-
-	def maximum_continuous(self):
-		opt = torch.from_numpy(np.zeros(shape=(1, self.d)))
-		value = self.eval_noiseless(opt)[0][0]
-		return value
-
-	def optimize(self, xtest, sigma, restarts=5, n=512):
-		xtest = torch.zeros(n, self.d, dtype=torch.float64)
-		xtest[:, 0] = torch.linspace(-0.5, 0.5, n, dtype=torch.float64)
-		ytest = self.eval(xtest, sigma=sigma)
-		kernel = stpy.kernels.KernelFunction(kernel_name="ard", gamma=torch.ones(self.d, dtype=torch.float64) * 0.1,
-											 groups=self.groups)
-		GP = stpy.continuous_processes.gauss_procc.GaussianProcess(kernel_custom=kernel, s=sigma, d=self.d)
-		GP.fit(xtest, ytest)
-		GP.optimize_params(type="bandwidth", restarts=restarts)
-		print("Optimized")
-		# self.gamma = torch.min(kernel.gamma)
-		# self.gamma = torch.zeros(1,1,dtype = torch.DoubleTensor)
-		# self.gamma[0,0] =0.35
-		self.gamma = torch.Tensor([0.35]).double()
-		return self.gamma
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.d = kwargs["d"]
+        self.type = "continuous"
+
+    def eval_noiseless(self, X):
+        super().eval_noiseless(X)
+        (n, d) = X.size()
+        sum_ = torch.sum(
+            torch.exp(-((torch.from_numpy(np.diff(X.numpy(), axis=1) / 0.25)) ** 2)),
+            dim=1,
+        ).view(-1, 1)
+        return 0.5 * sum_ / self.scale
+
+    def maximum_continuous(self):
+        opt = torch.from_numpy(np.zeros(shape=(1, self.d)))
+        value = self.eval_noiseless(opt)[0][0]
+        return value
+
+    def optimize(self, xtest, sigma, restarts=5, n=512):
+        xtest = torch.zeros(n, self.d, dtype=torch.float64)
+        xtest[:, 0] = torch.linspace(-0.5, 0.5, n, dtype=torch.float64)
+        ytest = self.eval(xtest, sigma=sigma)
+        kernel = stpy.kernels.KernelFunction(
+            kernel_name="ard",
+            gamma=torch.ones(self.d, dtype=torch.float64) * 0.1,
+            groups=self.groups,
+        )
+        GP = stpy.continuous_processes.gauss_procc.GaussianProcess(
+            kernel_custom=kernel, s=sigma, d=self.d
+        )
+        GP.fit(xtest, ytest)
+        GP.optimize_params(type="bandwidth", restarts=restarts)
+        print("Optimized")
+        # self.gamma = torch.min(kernel.gamma)
+        # self.gamma = torch.zeros(1,1,dtype = torch.DoubleTensor)
+        # self.gamma[0,0] =0.35
+        self.gamma = torch.tensor([0.35]).double()
+        return self.gamma
 
 
 class SwissFEL(BenchmarkFunction):
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
-		self.d = kwargs['d']
-		name = kwargs['dts']
-		self.Simulator = FelSimulator(self.d, 0.0, "quadrupoles_2d")
-		self.Simulator.load_fresh(name, dts='0')
-		#self.groups = stpy.helpers.helper.full_group(self.d)
-		GP = GaussianProcess(kernel_name="ard", d = self.d)
-		self.Simulator.fit_simulator(GP, optimize="bandwidth", restarts=2)
-		self.type = "continuous"
-		self.s = self.Simulator.s
-
-	def eval_noiseless(self, X):
-		super().eval_noiseless(X)
-		y = self.Simulator.eval(X, sigma=0)
-		return y
-
-	def maximum(self, xtest=None):
-		return torch.max(self.Simulator.eval(xtest,sigma = 0))
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.d = kwargs["d"]
+        name = kwargs["dts"]
+        self.Simulator = FelSimulator(self.d, 0.0, "quadrupoles_2d")
+        self.Simulator.load_fresh(name, dts="0")
+        # self.groups = stpy.helpers.helper.full_group(self.d)
+        GP = GaussianProcess(kernel_name="ard", d=self.d)
+        self.Simulator.fit_simulator(GP, optimize="bandwidth", restarts=2)
+        self.type = "continuous"
+        self.s = self.Simulator.s
+
+    def eval_noiseless(self, X):
+        super().eval_noiseless(X)
+        y = self.Simulator.eval(X, sigma=0)
+        return y
+
+    def maximum(self, xtest=None):
+        return torch.max(self.Simulator.eval(xtest, sigma=0))
 
 
 class CustomBenchmark(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__(**kwargs)
-		if 'func' in kwargs:
-			self.eval_f = kwargs['func']
-		else:
-			self.eval_f = lambda x: x[:, 0].view(-1, 1) * 0
-		if 'likelihood' in kwargs:
-			self.likelihood = kwargs['likelihood']
-		else:
-			self.likelihood = None
-
-	def set_eval(self, f, scale=1.):
-		self.eval_f = f
-		self.scale = scale
-
-	def eval_noiseless(self, X):
-		#super().eval_noiseless(X)
-		y = self.eval_f(X)
-		return y / self.scale
-
-	def eval(self, X):
-		if self.likelihood is not None:
-			return self.eval_noiseless(X)+self.likelihood.sample_noise(X)
-		else:
-			return self.eval_noiseless(X)
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if "func" in kwargs:
+            self.eval_f = kwargs["func"]
+        else:
+            self.eval_f = lambda x: x[:, 0].view(-1, 1) * 0
+        if "likelihood" in kwargs:
+            self.likelihood = kwargs["likelihood"]
+        else:
+            self.likelihood = None
+
+    def set_eval(self, f, scale=1.0):
+        self.eval_f = f
+        self.scale = scale
+
+    def eval_noiseless(self, X):
+        # super().eval_noiseless(X)
+        y = self.eval_f(X)
+        return y / self.scale
+
+    def eval(self, X):
+        if self.likelihood is not None:
+            return self.eval_noiseless(X) + self.likelihood.sample_noise(X)
+        else:
+            return self.eval_noiseless(X)
+
 
 class GaussianProcessSample(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__()
-		self.d = kwargs['d']
-		self.kernel_name = kwargs['name']
-		self.gamma = kwargs['gamma']
-		self.sigma = kwargs['sigma']
-		self.n = kwargs['n']
-		self.sample(self.n)
-
-	def sample(self, n):
-		self.xtest = self.interval(n)
-		GP = stpy.continuous_processes.gauss_procc.GaussianProcess(s=self.sigma, gamma=self.gamma,
-																   kernel=self.kernel_name)
-		self.sample = GP.sample(self.xtest).numpy()
-
-	def isin(self, element, test_elements, assume_unique=False):
-		(n, d) = element.shape
-		(m, d) = test_elements.shape
-		maskFull = np.full((n), False, dtype=bool)
-		for j in range(m):
-			mask = np.full((n), True, dtype=bool)
-			for i in range(d):
-				mask = np.logical_and(mask, np.in1d(element[:, i], test_elements[j, i], assume_unique=assume_unique))
-			# mask = np.logical_and(mask, np.isclose(element[:, i], test_elements[j, i], atol=1e-02))
-			# print (j, i, mask)
-			maskFull = np.logical_or(mask, maskFull)
-		# print (maskFull)
-		return maskFull
-
-	def eval_noiseless(self, X):
-		super().eval_noiseless(X)
-		mask = self.isin(self.xtest.numpy(), X.numpy())
-		y = torch.from_numpy(self.sample[mask, :]).view(-1, 1)
-		return y / self.scale
-
-	def initial_guess(self, N, adv_inv=False):
-		x = self.xtest[np.random.permutation(np.arange(0, self.xtest.size()[0], 1))[0:N], :]
-		x = torch.sort(x, dim=0)[0]
-		return x
-
-	def scale_max(self, xtest=None):
-		pass
-
-	def optimize(self, xtest, sigma, restarts=5):
-		pass
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.d = kwargs["d"]
+        self.kernel_name = kwargs["name"]
+        self.gamma = kwargs["gamma"]
+        self.sigma = kwargs["sigma"]
+        self.n = kwargs["n"]
+        self.sample(self.n)
+
+    def sample(self, n):
+        self.xtest = self.interval(n)
+        GP = stpy.continuous_processes.gauss_procc.GaussianProcess(
+            s=self.sigma, gamma=self.gamma, kernel=self.kernel_name
+        )
+        self.sample = GP.sample(self.xtest).numpy()
+
+    def isin(self, element, test_elements, assume_unique=False):
+        (n, d) = element.shape
+        (m, d) = test_elements.shape
+        maskFull = np.full((n), False, dtype=bool)
+        for j in range(m):
+            mask = np.full((n), True, dtype=bool)
+            for i in range(d):
+                mask = np.logical_and(
+                    mask,
+                    np.in1d(
+                        element[:, i], test_elements[j, i], assume_unique=assume_unique
+                    ),
+                )
+            # mask = np.logical_and(mask, np.isclose(element[:, i], test_elements[j, i], atol=1e-02))
+            # print (j, i, mask)
+            maskFull = np.logical_or(mask, maskFull)
+        # print (maskFull)
+        return maskFull
+
+    def eval_noiseless(self, X):
+        super().eval_noiseless(X)
+        mask = self.isin(self.xtest.numpy(), X.numpy())
+        y = torch.from_numpy(self.sample[mask, :]).view(-1, 1)
+        return y / self.scale
+
+    def initial_guess(self, N, adv_inv=False):
+        x = self.xtest[
+            np.random.permutation(np.arange(0, self.xtest.size()[0], 1))[0:N], :
+        ]
+        x = torch.sort(x, dim=0)[0]
+        return x
+
+    def scale_max(self, xtest=None):
+        pass
+
+    def optimize(self, xtest, sigma, restarts=5):
+        pass
 
 
 class KernelizedSample(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__()
-		self.d = kwargs['d']
-		# self.kernel_name = kwargs['name']
-		# self.gamma = kwargs['gamma']
-		self.sigma = kwargs['sigma']
-		# self.n = kwargs['n']
-		self.embed = kwargs['embed']
-		self.m = kwargs['m']
-		self.sample()
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.d = kwargs["d"]
+        # self.kernel_name = kwargs['name']
+        # self.gamma = kwargs['gamma']
+        self.sigma = kwargs["sigma"]
+        # self.n = kwargs['n']
+        self.embed = kwargs["embed"]
+        self.m = kwargs["m"]
+        self.sample()
 
-	def set_theta(self, theta):
-		self.theta = theta
+    def set_theta(self, theta):
+        self.theta = theta
 
-	def set_cutoff(self, cutoff):
-		self.theta[cutoff:, 0] = 0
+    def set_cutoff(self, cutoff):
+        self.theta[cutoff:, 0] = 0
 
-	def sample(self):
-		print("basis size:", self.m)
-		GP = stpy.continuous_processes.kernelized_features.KernelizedFeatures(d=self.d, m=self.m, embeding=self.embed)
-		self.theta = GP.sample_theta(size=1)
-		print(self.theta)
+    def sample(self):
+        print("basis size:", self.m)
+        GP = stpy.continuous_processes.kernelized_features.KernelizedFeatures(
+            d=self.d, m=self.m, embeding=self.embed
+        )
+        self.theta = GP.sample_theta(size=1)
+        print(self.theta)
 
-	def eval_noiseless(self, X):
-		super().eval_noiseless(X)
-		y = torch.mm(self.embed(X), self.theta)
-		return y / self.scale
+    def eval_noiseless(self, X):
+        super().eval_noiseless(X)
+        y = torch.mm(self.embed(X), self.theta)
+        return y / self.scale
 
-	def scale_max(self, xtest=None):
-		pass
+    def scale_max(self, xtest=None):
+        pass
 
-	def optimize(self, xtest, sigma, restarts=5):
-		pass
+    def optimize(self, xtest, sigma, restarts=5):
+        pass
 
 
 class Simple1DFunction(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__()
-		self.d = kwargs['d']
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.d = kwargs["d"]
+
+    def eval_noiseless(self, X):
+        super().eval_noiseless(X)
+        z = (X + 0.5) * 1.2
+        y = -(1.4 - 3 * z) * torch.sin(18 * z)
+        return y
 
-	def eval_noiseless(self, X):
-		super().eval_noiseless(X)
-		z = (X+0.5)*1.2
-		y = -(1.4-3*z)*torch.sin(18*z)
-		return y
+    def maximum(self, xtest):
+        return torch.max(torch.abs(self.eval_noiseless(xtest)))
 
-	def maximum(self, xtest):
-		return torch.max(torch.abs(self.eval_noiseless(xtest)))
 
 class MultiRKHS(BenchmarkFunction):
 
-	def __init__(self, **kwargs):
-		super().__init__()
-		self.d = 1
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.d = 1
 
-	def eval_noiseless(self, X):
-		y = 10 * X ** 2  # + 0.1*torch.sin(10*X) #+ torch.sum(torch.exp(-(X-Xi)**2)*Wi)
-		return y
+    def eval_noiseless(self, X):
+        y = 10 * X**2  # + 0.1*torch.sin(10*X) #+ torch.sum(torch.exp(-(X-Xi)**2)*Wi)
+        return y
 
-	def maximum(self, xtest=None):
-		pass
+    def maximum(self, xtest=None):
+        pass
 
 
 class LinearBenchmark(BenchmarkFunction):
 
-	def __init__(self, d, s):
-		self.d = d
-		self.s = s
-		# sample a plane
-		self.theta = torch.randn(d, 1, dtype=torch.float64)
-
-	def eval_noiseless(self, X):
-		y = torch.mm(X, self.theta)
-		return y
-
-	def eval(self, X, sigma=None):
-		if sigma is None:
-			sigma = self.s
-		z = self.eval_noiseless(X)
-		y = z + sigma * torch.randn(X.size()[0], 1, dtype=torch.float64)
-		return y
+    def __init__(self, d, s):
+        self.d = d
+        self.s = s
+        # sample a plane
+        self.theta = torch.randn(d, 1, dtype=torch.float64)
+
+    def eval_noiseless(self, X):
+        y = torch.mm(X, self.theta)
+        return y
+
+    def eval(self, X, sigma=None):
+        if sigma is None:
+            sigma = self.s
+        z = self.eval_noiseless(X)
+        y = z + sigma * torch.randn(X.size()[0], 1, dtype=torch.float64)
+        return y
diff --git a/stpy/test_functions/neural_net.py b/stpy/test_functions/neural_net.py
index 82807cd..82d770d 100755
--- a/stpy/test_functions/neural_net.py
+++ b/stpy/test_functions/neural_net.py
@@ -4,206 +4,232 @@
 
 
 def matlab_style_gauss2D(shape=(3, 3), sigma=0.5):
-	m, n = [(ss - 1.) / 2. for ss in shape]
-	y, x = np.ogrid[-m:m + 1, -n:n + 1]
-	h = np.exp(-(x * x + y * y) / (2. * sigma * sigma))
-	h[h < np.finfo(h.dtype).eps * h.max()] = 0
-	sumh = h.sum()
-	if sumh != 0:
-		h /= sumh
-	return h
+    m, n = [(ss - 1.0) / 2.0 for ss in shape]
+    y, x = np.ogrid[-m : m + 1, -n : n + 1]
+    h = np.exp(-(x * x + y * y) / (2.0 * sigma * sigma))
+    h[h < np.finfo(h.dtype).eps * h.max()] = 0
+    sumh = h.sum()
+    if sumh != 0:
+        h /= sumh
+    return h
 
 
 def gaussian_filters(shape, sigmas):
-	(height, width, enter, leave) = shape
-	G = np.zeros(shape=shape)
-	for q in range(enter):
-		for index, val in enumerate(sigmas):
-			G[:, :, q, index] = matlab_style_gauss2D(shape=(height, width), sigma=val)
-	return G
+    (height, width, enter, leave) = shape
+    G = np.zeros(shape=shape)
+    for q in range(enter):
+        for index, val in enumerate(sigmas):
+            G[:, :, q, index] = matlab_style_gauss2D(shape=(height, width), sigma=val)
+    return G
 
 
 def gaussian_filters_tf(shape, sigmas):
-	G = gaussian_filters(shape, sigmas)
-	return tf.cast(tf.Variable(G), tf.float32)
+    G = gaussian_filters(shape, sigmas)
+    return tf.cast(tf.Variable(G), tf.float32)
 
 
 def deepnn(x, initialization_params, no_filters_1=32, no_filters_2=64):
-	"""deepnn builds the graph for a deep net for classifying digits.
-	Args:
-			x: an input tensor with the dimensions (N_examples, 784), where 784 is the
-			number of pixels in a standard MNIST image.
-	Returns:
-			A tuple (y, keep_prob). y is a tensor of shape (N_examples, 10), with values
-			equal to the logits of classifying the digit into one of 10 classes (the
-			digits 0-9). keep_prob is a scalar placeholder for the probability of
-			dropout.
-	"""
-
-	# Reshape to use within a convolutional neural net.
-	# Last dimension is for "features" - there is only one here, since images are
-	# grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
-	with tf.name_scope('reshape'):
-		x_image = tf.reshape(x, [-1, 28, 28, 1])
-
-	# First convolutional layer - maps one grayscale image to 32 feature maps.
-	with tf.name_scope('conv1'):
-		# W_conv1 = weight_variable([5, 5, 1, no_filters_1])
-		W_conv1 = gaussian_filters_tf([5, 5, 1, no_filters_1], initialization_params[0:no_filters_1])
-		b_conv1 = bias_variable([no_filters_1])
-		h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
-
-	# Pooling layer - downsamples by 2X.
-	with tf.name_scope('pool1'):
-		h_pool1 = max_pool_2x2(h_conv1)
-
-	# Second convolutional layer -- maps 32 feature maps to 64.
-	with tf.name_scope('conv2'):
-		# W_conv2 = weight_variable([5, 5, no_filters_1, no_filters_2])
-		W_conv2 = gaussian_filters_tf([5, 5, no_filters_1, no_filters_2], initialization_params[no_filters_1:])
-		b_conv2 = bias_variable([no_filters_2])
-		h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
-
-	# Second pooling layer.
-	with tf.name_scope('pool2'):
-		h_pool2 = max_pool_2x2(h_conv2)
-
-	# Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
-	# is down to 7x7x64 feature maps -- maps this to 1024 features.
-	with tf.name_scope('fc1'):
-		W_fc1 = weight_variable([7 * 7 * no_filters_2, 1024])
-		b_fc1 = bias_variable([1024])
-
-		h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * no_filters_2])
-		h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
-
-	# Dropout - controls the complexity of the model, prevents co-adaptation of
-	# features.
-	with tf.name_scope('dropout'):
-		keep_prob = tf.placeholder(tf.float32)
-		h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
-
-	# Map the 1024 features to 10 classes, one for each digit
-	with tf.name_scope('fc2'):
-		W_fc2 = weight_variable([1024, 10])
-		b_fc2 = bias_variable([10])
-
-		y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
-	return y_conv, keep_prob
+    """deepnn builds the graph for a deep net for classifying digits.
+    Args:
+                    x: an input tensor with the dimensions (N_examples, 784), where 784 is the
+                    number of pixels in a standard MNIST image.
+    Returns:
+                    A tuple (y, keep_prob). y is a tensor of shape (N_examples, 10), with values
+                    equal to the logits of classifying the digit into one of 10 classes (the
+                    digits 0-9). keep_prob is a scalar placeholder for the probability of
+                    dropout.
+    """
+
+    # Reshape to use within a convolutional neural net.
+    # Last dimension is for "features" - there is only one here, since images are
+    # grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
+    with tf.name_scope("reshape"):
+        x_image = tf.reshape(x, [-1, 28, 28, 1])
+
+    # First convolutional layer - maps one grayscale image to 32 feature maps.
+    with tf.name_scope("conv1"):
+        # W_conv1 = weight_variable([5, 5, 1, no_filters_1])
+        W_conv1 = gaussian_filters_tf(
+            [5, 5, 1, no_filters_1], initialization_params[0:no_filters_1]
+        )
+        b_conv1 = bias_variable([no_filters_1])
+        h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
+
+    # Pooling layer - downsamples by 2X.
+    with tf.name_scope("pool1"):
+        h_pool1 = max_pool_2x2(h_conv1)
+
+    # Second convolutional layer -- maps 32 feature maps to 64.
+    with tf.name_scope("conv2"):
+        # W_conv2 = weight_variable([5, 5, no_filters_1, no_filters_2])
+        W_conv2 = gaussian_filters_tf(
+            [5, 5, no_filters_1, no_filters_2], initialization_params[no_filters_1:]
+        )
+        b_conv2 = bias_variable([no_filters_2])
+        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
+
+    # Second pooling layer.
+    with tf.name_scope("pool2"):
+        h_pool2 = max_pool_2x2(h_conv2)
+
+    # Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
+    # is down to 7x7x64 feature maps -- maps this to 1024 features.
+    with tf.name_scope("fc1"):
+        W_fc1 = weight_variable([7 * 7 * no_filters_2, 1024])
+        b_fc1 = bias_variable([1024])
+
+        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * no_filters_2])
+        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
+
+    # Dropout - controls the complexity of the model, prevents co-adaptation of
+    # features.
+    with tf.name_scope("dropout"):
+        keep_prob = tf.placeholder(tf.float32)
+        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
+
+    # Map the 1024 features to 10 classes, one for each digit
+    with tf.name_scope("fc2"):
+        W_fc2 = weight_variable([1024, 10])
+        b_fc2 = bias_variable([10])
+
+        y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
+    return y_conv, keep_prob
 
 
 def conv2d(x, W):
-	"""conv2d returns a 2d convolution layer with full stride."""
-	return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
+    """conv2d returns a 2d convolution layer with full stride."""
+    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME")
 
 
 def max_pool_2x2(x):
-	"""max_pool_2x2 downsamples a feature map by 2X."""
-	return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
-						  strides=[1, 2, 2, 1], padding='SAME')
+    """max_pool_2x2 downsamples a feature map by 2X."""
+    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
 
 
 def weight_variable(shape):
-	"""weight_variable generates a weight variable of a given shape."""
-	initial = tf.truncated_normal(shape, stddev=0.1)
-	return tf.Variable(initial)
+    """weight_variable generates a weight variable of a given shape."""
+    initial = tf.truncated_normal(shape, stddev=0.1)
+    return tf.Variable(initial)
 
 
 def bias_variable(shape):
-	"""bias_variable generates a bias variable of a given shape."""
-	initial = tf.constant(0.1, shape=shape)
-	return tf.Variable(initial)
-
-
-def train_network(mnist, verbose=True, initialization_params=None, min_steps_val=10,
-				  val_size=3000, dropout=0.5, learning_rate=10e-4, maxiter=500, val_count=1, batch_size=80, **kwargs):
-	# Import data
-	# Create the model
-	x = tf.placeholder(tf.float32, [None, 784])
-
-	# Define loss and optimizer
-	y_ = tf.placeholder(tf.float32, [None, 10])
-
-	# Build the graph for the deep net
-	y_conv, keep_prob = deepnn(x, initialization_params)
-
-	with tf.name_scope('loss'):
-		cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv)
-
-	cross_entropy = tf.reduce_mean(cross_entropy)
-
-	with tf.name_scope('adam_optimizer'):
-		train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)
-
-	with tf.name_scope('accuracy'):
-		correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
-		correct_prediction = tf.cast(correct_prediction, tf.float32)
-	accuracy = tf.reduce_mean(correct_prediction)
-
-	# graph_location = tempfile.mkdtemp()
-	# print('Saving graph to: %s' % graph_location)
-	# train_writer = tf.summary.FileWriter(graph_location)
-	# train_writer.add_graph(tf.get_default_graph())
-
-	init = tf.initialize_all_variables()
-
-	with tf.Session() as sess:
-		sess.run(init)
-		# sess.run(tf.global_variables_initializer())
-
-		oldval_scores = np.zeros((min_steps_val))
-		j = 0
-		for i in range(maxiter):
-			batch = mnist.train.next_batch(batch_size)
-			train_accuracy = accuracy.eval(feed_dict={
-				x: batch[0], y_: batch[1], keep_prob: 1.0})
-			if i % val_count == 0:
-
-				val_accuracy = accuracy.eval(feed_dict={
-					x: mnist.validation.images[0:val_size],
-					y_: mnist.validation.labels[0:val_size], keep_prob: 1.0})
-				oldval_scores[j % min_steps_val] = val_accuracy
-				j = j + 1
-				if verbose == True:
-					print('step %d, training accuracy: %f, validation accuracy: %f' % (i, train_accuracy, val_accuracy))
-				## validation stopping 
-				if i > min_steps_val:
-					if np.mean(oldval_scores) > val_accuracy:
-						if verbose == True:
-							print("Validation stopping")
-						break
-			train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: dropout})
-
-		test_accuracy = accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0})
-		if verbose == True:
-			print('test accuracy %g' % test_accuracy)
-
-		return (i, test_accuracy)
+    """bias_variable generates a bias variable of a given shape."""
+    initial = tf.constant(0.1, shape=shape)
+    return tf.Variable(initial)
+
+
+def train_network(
+    mnist,
+    verbose=True,
+    initialization_params=None,
+    min_steps_val=10,
+    val_size=3000,
+    dropout=0.5,
+    learning_rate=10e-4,
+    maxiter=500,
+    val_count=1,
+    batch_size=80,
+    **kwargs
+):
+    # Import data
+    # Create the model
+    x = tf.placeholder(tf.float32, [None, 784])
+
+    # Define loss and optimizer
+    y_ = tf.placeholder(tf.float32, [None, 10])
+
+    # Build the graph for the deep net
+    y_conv, keep_prob = deepnn(x, initialization_params)
+
+    with tf.name_scope("loss"):
+        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+            labels=y_, logits=y_conv
+        )
+
+    cross_entropy = tf.reduce_mean(cross_entropy)
+
+    with tf.name_scope("adam_optimizer"):
+        train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)
+
+    with tf.name_scope("accuracy"):
+        correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
+        correct_prediction = tf.cast(correct_prediction, tf.float32)
+    accuracy = tf.reduce_mean(correct_prediction)
+
+    # graph_location = tempfile.mkdtemp()
+    # print('Saving graph to: %s' % graph_location)
+    # train_writer = tf.summary.FileWriter(graph_location)
+    # train_writer.add_graph(tf.get_default_graph())
+
+    init = tf.initialize_all_variables()
+
+    with tf.Session() as sess:
+        sess.run(init)
+        # sess.run(tf.global_variables_initializer())
+
+        oldval_scores = np.zeros((min_steps_val))
+        j = 0
+        for i in range(maxiter):
+            batch = mnist.train.next_batch(batch_size)
+            train_accuracy = accuracy.eval(
+                feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0}
+            )
+            if i % val_count == 0:
+
+                val_accuracy = accuracy.eval(
+                    feed_dict={
+                        x: mnist.validation.images[0:val_size],
+                        y_: mnist.validation.labels[0:val_size],
+                        keep_prob: 1.0,
+                    }
+                )
+                oldval_scores[j % min_steps_val] = val_accuracy
+                j = j + 1
+                if verbose == True:
+                    print(
+                        "step %d, training accuracy: %f, validation accuracy: %f"
+                        % (i, train_accuracy, val_accuracy)
+                    )
+                ## validation stopping
+                if i > min_steps_val:
+                    if np.mean(oldval_scores) > val_accuracy:
+                        if verbose == True:
+                            print("Validation stopping")
+                        break
+            train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: dropout})
+
+        test_accuracy = accuracy.eval(
+            feed_dict={x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}
+        )
+        if verbose == True:
+            print("test accuracy %g" % test_accuracy)
+
+        return (i, test_accuracy)
 
 
 if __name__ == "__main__":
-	N = 16
-	N2 = 32
-	sigmas = np.random.randn(N + N2)
-	sigmas = sigmas ** 2
-	# print (train_network("/tmp/tensorflow", dropout = 0.7, verbose = False, val_size = 1,  initialization_params = sigmas, no_filters_1=N, no_filters_2=N2))
-
-	##############################################
-	########  Visualization of Filters ###########
-	##############################################
-	import matplotlib as mpl
-
-	V = gaussian_filters((5, 5, 1, N), sigmas[0:N])
-	fig, axes = plt.subplots(nrows=4, ncols=int(N / 4))
-	for index, ax in enumerate(axes.flat):
-		im = ax.imshow(V[:, :, 0, index], interpolation='nearest', vmin=0, vmax=1)
-	cax, kw = mpl.colorbar.make_axes([ax for ax in axes.flat])
-	plt.colorbar(im, cax=cax, **kw)
-
-	V2 = gaussian_filters((5, 5, N, N2), sigmas[N:])
-	fig, axes = plt.subplots(nrows=8, ncols=int(N2 / 8))
-	for index, ax in enumerate(axes.flat):
-		im = ax.imshow(V2[:, :, 0, index], interpolation='nearest', vmin=0, vmax=1)
-	cax, kw = mpl.colorbar.make_axes([ax for ax in axes.flat])
-	plt.colorbar(im, cax=cax, **kw)
-	plt.show()
+    N = 16
+    N2 = 32
+    sigmas = np.random.randn(N + N2)
+    sigmas = sigmas**2
+    # print (train_network("/tmp/tensorflow", dropout = 0.7, verbose = False, val_size = 1,  initialization_params = sigmas, no_filters_1=N, no_filters_2=N2))
+
+    ##############################################
+    ########  Visualization of Filters ###########
+    ##############################################
+    import matplotlib as mpl
+
+    V = gaussian_filters((5, 5, 1, N), sigmas[0:N])
+    fig, axes = plt.subplots(nrows=4, ncols=int(N / 4))
+    for index, ax in enumerate(axes.flat):
+        im = ax.imshow(V[:, :, 0, index], interpolation="nearest", vmin=0, vmax=1)
+    cax, kw = mpl.colorbar.make_axes([ax for ax in axes.flat])
+    plt.colorbar(im, cax=cax, **kw)
+
+    V2 = gaussian_filters((5, 5, N, N2), sigmas[N:])
+    fig, axes = plt.subplots(nrows=8, ncols=int(N2 / 8))
+    for index, ax in enumerate(axes.flat):
+        im = ax.imshow(V2[:, :, 0, index], interpolation="nearest", vmin=0, vmax=1)
+    cax, kw = mpl.colorbar.make_axes([ax for ax in axes.flat])
+    plt.colorbar(im, cax=cax, **kw)
+    plt.show()
diff --git a/stpy/test_functions/parallel_coordinates_plot.py b/stpy/test_functions/parallel_coordinates_plot.py
index e9d9a15..836ed92 100755
--- a/stpy/test_functions/parallel_coordinates_plot.py
+++ b/stpy/test_functions/parallel_coordinates_plot.py
@@ -8,63 +8,65 @@
 
 
 def parallel_coordinates_bo(X, Y, names=None, scaling=None, fig_size=(20, 10)):
-	"""
-		Parallel plot graph
+    """
+    Parallel plot graph
 
-		X : 2D numpy array of parameters [points,parameters]
-		Y : 1D numpy array of values
-		names: list of names size of (parameters)
-		scaling:
-			"stat": statistical scaling
-			None : no scaling
-			(low,hig): tuple, scales to [-1,1]
-		fig_size: fig size in inches
-	"""
+    X : 2D numpy array of parameters [points,parameters]
+    Y : 1D numpy array of values
+    names: list of names size of (parameters)
+    scaling:
+            "stat": statistical scaling
+            None : no scaling
+            (low,hig): tuple, scales to [-1,1]
+    fig_size: fig size in inches
+    """
 
-	if scaling == "stat":
-		scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
-		scaler.fit(X)
-		Z = scaler.transform(X)
-	elif scaling is None:
-		Z = X
-	else:
-		try:
-			Z = X
-			up, low = scaling
-			d = X.shape[1]
-			for i in range(d):
-				Z[:, i] = (2 * X[:, i]) / (up[i] - low[i]) + (1.0 - 2 * up[i] / (up[i] - low[i]))
-		except:
-			pass
+    if scaling == "stat":
+        scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
+        scaler.fit(X)
+        Z = scaler.transform(X)
+    elif scaling is None:
+        Z = X
+    else:
+        try:
+            Z = X
+            up, low = scaling
+            d = X.shape[1]
+            for i in range(d):
+                Z[:, i] = (2 * X[:, i]) / (up[i] - low[i]) + (
+                    1.0 - 2 * up[i] / (up[i] - low[i])
+                )
+        except:
+            pass
 
-	D = np.append(Z, Y, axis=1)
-	data = pd.DataFrame(D)
-	data = data.sort_values(by=Z.shape[1])
-	names = copy.copy(names)
-	names.append(Z.shape[1])
-	if names is not None:
-		data.columns = names
-	plt.figure(figsize=(fig_size))
-	plt.xticks(rotation=45)
-	ax = parallel_coordinates(data, Z.shape[1], colormap="summer")
-	ax.get_legend().remove()
-	plt.show()
+    D = np.append(Z, Y, axis=1)
+    data = pd.DataFrame(D)
+    data = data.sort_values(by=Z.shape[1])
+    names = copy.copy(names)
+    names.append(Z.shape[1])
+    if names is not None:
+        data.columns = names
+    plt.figure(figsize=(fig_size))
+    plt.xticks(rotation=45)
+    ax = parallel_coordinates(data, Z.shape[1], colormap="summer")
+    ax.get_legend().remove()
+    plt.show()
 
 
 if __name__ == "__main__":
-	from stpy.test_functions.protein_benchmark import ProteinBenchmark
+    from stpy.test_functions.protein_benchmark import ProteinBenchmark
 
-	Benchmark = ProteinBenchmark("protein_data_gb1.h5", dim=3, ref=['A', 'B', 'C', 'D'])
-	names = Benchmark.data['P1'].values
-	Benchmark.self_translate()
-	vals = Benchmark.data['P1'].values
+    Benchmark = ProteinBenchmark("protein_data_gb1.h5", dim=3, ref=["A", "B", "C", "D"])
+    names = Benchmark.data["P1"].values
+    Benchmark.self_translate()
+    vals = Benchmark.data["P1"].values
 
-	print(Benchmark.data)
-	X = Benchmark.data.values[0:8000, 0:3]
-	Y = Benchmark.data.values[0:8000, 5].reshape(-1, 1)
-	print(X.shape, Y.shape)
-	names = ["P1", "P2", "P3"]
-	# plt.yticks(vals, names)
-	parallel_coordinates_bo(X, Y, names=names)
+    print(Benchmark.data)
+    X = Benchmark.data.values[0:8000, 0:3]
+    Y = Benchmark.data.values[0:8000, 5].reshape(-1, 1)
+    print(X.shape, Y.shape)
+    names = ["P1", "P2", "P3"]
+    # plt.yticks(vals, names)
+    parallel_coordinates_bo(X, Y, names=names)
 
-	plt.show()
+    plt.show()
diff --git a/stpy/test_functions/protein_benchmark.py b/stpy/test_functions/protein_benchmark.py
index 1dc4189..008bfe4 100755
--- a/stpy/test_functions/protein_benchmark.py
+++ b/stpy/test_functions/protein_benchmark.py
@@ -3,407 +3,489 @@
 import pandas as pd
 import torch
 
-#import stpy.helpers.helper as helper
-#from stpy.test_functions.benchmarks import BenchmarkFunction
-
-
-class ProteinOperator():
-
-	def __init__(self):
-
-		self.real_names = {'A': 'Ala', 'R': 'Arg', 'N': 'Asn', 'D': 'Asp', 'C': 'Cys', 'Q': 'Gln', 'E': 'Glu',
-						   'G': 'Gly',
-						   'H': 'His', 'I': 'Iso', 'L': 'Leu', 'K': 'Lys', 'M': 'Met', 'F': 'Phe',
-						   'P': 'Pro', 'S': 'Ser', 'T': 'Thr', 'W': 'Trp', 'Y': 'Tyr', 'V': 'Val', 'B': 'Asx'}
-
-		self.dictionary = {'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4, 'Q': 5, 'E': 6, 'G': 7,
-						   'H': 8, 'I': 9, 'L': 10, 'K': 11, 'M': 12, 'F': 13,
-						   'P': 14, 'S': 15, 'T': 16, 'W': 17, 'Y': 18, 'V': 19, 'B': 3}
-
-		self.inv_dictionary = {v: k for k, v in self.dictionary.items()}
-
-		self.inv_real_names = {v: k for k, v in self.real_names.items()}
-
-		self.Negative = ['D', 'E']
-		self.Positive = ['R', 'K', 'H']
-		self.Aromatic = ['F', 'W', 'Y', 'H']
-		self.Polar = ['N', 'Q', 'S', 'T', 'Y']
-		self.Aliphatic = ['A', 'G', 'I', 'L', 'V']
-		self.Amide = ['N', 'Q']
-		self.Sulfur = ['C', 'M']
-		self.Hydroxil = ['S', 'T']
-		self.Small = ['A', 'S', 'T', 'P', 'G', 'V']
-		self.Medium = ['M', 'L', 'I', 'C', 'N', 'Q', 'K', 'D', 'E']
-		self.Large = ['R', 'H', 'W', 'F', 'Y']
-		self.Hydro = ['M', 'L', 'I', 'V', 'A']
-		self.Cyclic = ['P']
-		self.Random = ['F', 'W', 'L', 'S', 'D']
-
-	def translate(self, X):
-		f = lambda x: self.dictionary[x]
-		Y = np.zeros(shape=X.shape).astype(int)
-		for i in range(X.shape[0]):
-			for j in range(X.shape[1]):
-				Y[i, j] = f(X[i, j])
-		return Y
-
-	def remove_wild_type_mutations(self, mutation):
-		mutation_split = mutation.split("+")
-		output = []
-		for mut in mutation_split:
-			if mut[0] != mut[-1]:
-				output.append(mut)
-		return "+".join(output)
-
-	def get_variant_code(self, mutation):
-		mutation_split = mutation.split("+")
-		return "".join([mut[-1] for mut in mutation_split])
-
-	def get_substitutes_from_mutation(self, mutation):
-		mutation_split = mutation.split("+")
-		original = []
-		new = []
-		positions = []
-
-		for mut in mutation_split:
-			original.append(mut[0])
-			new.append(mut[-1])
-			positions.append(int(mut[1:-1]))
-
-		return (original, new, positions)
-
-	def mutation(self, original_seq, positions, new_seq):
-		old_seq = list(original_seq)
-		new_seq = list(new_seq)
-		identifier = []
-		for old, new, position in zip(old_seq, new_seq, positions):
-			if old != new:
-				identifier.append(old + str(position) + new)
-		return '+'.join(identifier)
-
-	def interval_number(self, dim=None):
-		if dim is None:
-			dim = self.dim
-		arr = self.interval_letters(dim=dim)
-		out = self.translate(arr)
-		return out
-
-	def interval_onehot(self, dim=None):
-		if dim is None:
-			dim = self.dim
-		arr = self.interval_letters(dim=dim)
-		out = self.translate_one_hot(arr)
-		return out
-
-	def interval_letters(self, dim=None):
-		if dim is None:
-			dim = self.dim
-
-		names = list(self.dictionary.keys())
-		names.remove('B')
-		arr = []
-		for i in range(dim):
-			arr.append(names)
-		out = helper.cartesian(arr)
-		return out
-
-	def translate_amino_acid(self, letter):
-		return self.dictionary[letter]
-
-	def translate_mutation_series(self, series):
-		f = lambda x: np.array(list(map(int, [self.dictionary[a] for a in list(str(x))]))).reshape(-1, 1)
-		xtest = np.concatenate(series.apply(f).values, axis=1).T
-		return xtest
-
-	def translate_one_hot(self, X):
-		try:
-			Y = self.translate(X)
-		except:
-			Y = X
-		n, d = list(X.shape)
-		Z = np.zeros(shape=(n, d * self.total))
-		for i in range(n):
-			for j in range(d):
-				Z[i, Y[i, j] + j * self.total] = 1.0
-
-		return Z
-
-	def get_real_name(self, name):
-		out = []
-		for i in name:
-			out.append(self.real_names[i])
-		return out
-
-
-class ProteinBenchmark():
-
-	def __init__(self, fname, dim=1, ref=['D', 'D', 'D', 'D'], avg=False, scale=True):
-		"""
-		initialize the protein benchmark
-
-		 fname : dataset name
-		 dim : dimension of the dataset
-		 ref : for smaller dimensions what is the reference in the 4 dim space?
-		 avg : average the effect over other combinations in lower dimensions
-		"""
-
-		"""
+# import stpy.helpers.helper as helper
+# from stpy.test_functions.benchmarks import BenchmarkFunction
+
+
+class ProteinOperator:
+
+    def __init__(self):
+
+        self.real_names = {
+            "A": "Ala",
+            "R": "Arg",
+            "N": "Asn",
+            "D": "Asp",
+            "C": "Cys",
+            "Q": "Gln",
+            "E": "Glu",
+            "G": "Gly",
+            "H": "His",
+            "I": "Iso",
+            "L": "Leu",
+            "K": "Lys",
+            "M": "Met",
+            "F": "Phe",
+            "P": "Pro",
+            "S": "Ser",
+            "T": "Thr",
+            "W": "Trp",
+            "Y": "Tyr",
+            "V": "Val",
+            "B": "Asx",
+        }
+
+        self.dictionary = {
+            "A": 0,
+            "R": 1,
+            "N": 2,
+            "D": 3,
+            "C": 4,
+            "Q": 5,
+            "E": 6,
+            "G": 7,
+            "H": 8,
+            "I": 9,
+            "L": 10,
+            "K": 11,
+            "M": 12,
+            "F": 13,
+            "P": 14,
+            "S": 15,
+            "T": 16,
+            "W": 17,
+            "Y": 18,
+            "V": 19,
+            "B": 3,
+        }
+
+        self.inv_dictionary = {v: k for k, v in self.dictionary.items()}
+
+        self.inv_real_names = {v: k for k, v in self.real_names.items()}
+
+        self.Negative = ["D", "E"]
+        self.Positive = ["R", "K", "H"]
+        self.Aromatic = ["F", "W", "Y", "H"]
+        self.Polar = ["N", "Q", "S", "T", "Y"]
+        self.Aliphatic = ["A", "G", "I", "L", "V"]
+        self.Amide = ["N", "Q"]
+        self.Sulfur = ["C", "M"]
+        self.Hydroxil = ["S", "T"]
+        self.Small = ["A", "S", "T", "P", "G", "V"]
+        self.Medium = ["M", "L", "I", "C", "N", "Q", "K", "D", "E"]
+        self.Large = ["R", "H", "W", "F", "Y"]
+        self.Hydro = ["M", "L", "I", "V", "A"]
+        self.Cyclic = ["P"]
+        self.Random = ["F", "W", "L", "S", "D"]
+
+    def translate(self, X):
+        f = lambda x: self.dictionary[x]
+        Y = np.zeros(shape=X.shape).astype(int)
+        for i in range(X.shape[0]):
+            for j in range(X.shape[1]):
+                Y[i, j] = f(X[i, j])
+        return Y
+
+    def remove_wild_type_mutations(self, mutation):
+        mutation_split = mutation.split("+")
+        output = []
+        for mut in mutation_split:
+            if mut[0] != mut[-1]:
+                output.append(mut)
+        return "+".join(output)
+
+    def get_variant_code(self, mutation):
+        mutation_split = mutation.split("+")
+        return "".join([mut[-1] for mut in mutation_split])
+
+    def get_substitutes_from_mutation(self, mutation):
+        mutation_split = mutation.split("+")
+        original = []
+        new = []
+        positions = []
+
+        for mut in mutation_split:
+            original.append(mut[0])
+            new.append(mut[-1])
+            positions.append(int(mut[1:-1]))
+
+        return (original, new, positions)
+
+    def mutation(self, original_seq, positions, new_seq):
+        old_seq = list(original_seq)
+        new_seq = list(new_seq)
+        identifier = []
+        for old, new, position in zip(old_seq, new_seq, positions):
+            if old != new:
+                identifier.append(old + str(position) + new)
+        return "+".join(identifier)
+
+    def interval_number(self, dim=None):
+        if dim is None:
+            dim = self.dim
+        arr = self.interval_letters(dim=dim)
+        out = self.translate(arr)
+        return out
+
+    def interval_onehot(self, dim=None):
+        if dim is None:
+            dim = self.dim
+        arr = self.interval_letters(dim=dim)
+        out = self.translate_one_hot(arr)
+        return out
+
+    def interval_letters(self, dim=None):
+        if dim is None:
+            dim = self.dim
+
+        names = list(self.dictionary.keys())
+        names.remove("B")
+        arr = []
+        for i in range(dim):
+            arr.append(names)
+        out = helper.cartesian(arr)
+        return out
+
+    def translate_amino_acid(self, letter):
+        return self.dictionary[letter]
+
+    def translate_mutation_series(self, series):
+        f = lambda x: np.array(
+            list(map(int, [self.dictionary[a] for a in list(str(x))]))
+        ).reshape(-1, 1)
+        xtest = np.concatenate(series.apply(f).values, axis=1).T
+        return xtest
+
+    def translate_one_hot(self, X):
+        """One-hot encode X: row i gets a 1 at column Y[i, j] + j * self.total for each j."""
+        try:
+            Y = self.translate(X)
+        except Exception:  # translation failed: index with X itself
+            Y = X
+        n, d = list(X.shape)
+        Z = np.zeros(shape=(n, d * self.total))
+        for i in range(n):
+            for j in range(d):
+                Z[i, Y[i, j] + j * self.total] = 1.0
+        return Z
+
+    def get_real_name(self, name):
+        out = []
+        for i in name:
+            out.append(self.real_names[i])
+        return out
+
+
+class ProteinBenchmark:
+
+    def __init__(self, fname, dim=1, ref=["D", "D", "D", "D"], avg=False, scale=True):
+        """
+        initialize the protein benchmark
+
+         fname : dataset name
+         dim : dimension of the dataset
+         ref : for smaller dimensions what is the reference in the 4 dim space?
+         avg : average the effect over other combinations in lower dimensions
+        """
+
+        """
 		Convention of the following dictionary is to map B->D as B can stand for N and D. 
 		"""
 
-		self.dictionary = {'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4, 'Q': 5, 'E': 6, 'G': 7,
-						   'H': 8, 'I': 9, 'L': 10, 'K': 11, 'M': 12, 'F': 13,
-						   'P': 14, 'S': 15, 'T': 16, 'W': 17, 'Y': 18, 'V': 19, 'B': 3}
-
-		f = lambda x: self.dictionary[x]
-
-		self.total = 20
-		self.dim = dim
-		self.ref = ref
-		self.ref_translated = [f(x) for x in self.ref]
-
-		dset = pd.read_hdf(fname)
-
-		# average the effect over others
-		if avg == False:
-			mask = np.full(dset.shape[0], True, dtype=bool)
-			for j in range(4 - dim):
-				mask = np.logical_and(mask, dset["P" + str(4 - j)] == ref[3 - j])
-			self.data = dset[mask]
-		else:
-			# avg. not implemented
-			pass
-
-		if scale == True:
-			maximum = np.max(self.data[:]['Fitness'])
-			self.data[:]['Fitness'] = self.data[:]['Fitness'] / maximum
-		else:
-			pass
-
-		self.real_names = {'A': 'Ala', 'R': 'Arg', 'N': 'Asn', 'D': 'Asp', 'C': 'Cys', 'Q': 'Gln', 'E': 'Glu',
-						   'G': 'Gly',
-						   'H': 'His', 'I': 'Iso', 'L': 'Leu', 'K': 'Lys', 'M': 'Met', 'F': 'Phe',
-						   'P': 'Pro', 'S': 'Ser', 'T': 'Thr', 'W': 'Trp', 'Y': 'Tyr', 'V': 'Val', 'B': 'Asx'}
-
-		self.inv_real_names = {v: k for k, v in self.real_names.items()}
-
-		self.Negative = ['D', 'E']
-		self.Positive = ['R', 'K', 'H']
-		self.Aromatic = ['F', 'W', 'Y', 'H']
-		self.Polar = ['N', 'Q', 'S', 'T', 'Y']
-		self.Aliphatic = ['A', 'G', 'I', 'L', 'V']
-		self.Amide = ['N', 'Q']
-		self.Sulfur = ['C', 'M']
-		self.Hydroxil = ['S', 'T']
-		self.Small = ['A', 'S', 'T', 'P', 'G', 'V']
-		self.Medium = ['M', 'L', 'I', 'C', 'N', 'Q', 'K', 'D', 'E']
-		self.Large = ['R', 'H', 'W', 'F', 'Y']
-		self.Hydro = ['M', 'L', 'I', 'V', 'A']
-		self.Cyclic = ['P']
-		self.Random = ['F', 'W', 'L', 'S', 'D']
-
-	def get_real_name(self, name):
-		out = []
-		for i in name:
-			out.append(self.real_names[i])
-		return out
-
-	def data_summary(self):
-		y = self.data['Fitness'].values
-		maximum = np.max(y)
-		minimum = np.min(y)
-		return (maximum, minimum)
-
-	def translate(self, X):
-		f = lambda x: self.dictionary[x]
-		Y = np.zeros(shape=X.shape).astype(int)
-		for i in range(X.shape[0]):
-			for j in range(X.shape[1]):
-				Y[i, j] = f(X[i, j])
-		return Y
-
-	def translate_one_hot(self, X):
-		try:
-			Y = self.translate(X)
-		except:
-			Y = X
-		n, d = list(X.shape)
-		Z = np.zeros(shape=(n, d * self.total))
-		for i in range(n):
-			for j in range(d):
-				Z[i, Y[i, j] + j * self.total] = 1.0
-
-		return Z
-
-	def self_translate(self):
-		"""
-		self translate from
-		:return:
-		"""
-		f = lambda x: self.dictionary[x]
-		for j in range(4):
-			self.data['P' + str(j + 1)] = self.data['P' + str(j + 1)].apply(f)
-
-	def set_fidelity(self, F):
-		self.Fidelity = F
-
-	def scale(self):
-		self.scale = 1
-
-	def eval_noiseless(self, X):
-		"""
-		evaluate depends on the dimension
-		"""
-		res = []
-
-		# append
-		n = X.shape[0]
-		C = np.tile(self.ref_translated[self.dim:4], (n, 1))
-		X_ = np.concatenate((X, C), axis=1)
-		for i in range(n):
-			x = X_[i, :]
-			mask = np.full(self.data.shape[0], True, dtype=bool)
-			for j in range(4):
-				# print (x[j],self.data["P" + str(j + 1)])
-				mask = np.logical_and(mask, self.data["P" + str(j + 1)] == x[j])
-			res.append(self.data[mask]['Fitness'].values)
-		return np.array(res).reshape(-1, 1)
-
-	# def actions(self):
-	# 	number_of_actions = self.dim*(20**(self.dim-1))
-	#
-	# 	actions = []
-	#
-	# 	## this includes (20,d) actions
-	# 	one_dim = self.interval_onehot(dim = 1)
-	# 	#print (one_dim)
-	# 	#print ("one dim",one_dim.shape)
-	# 	if self.dim - 1>0:
-	# 		# this includes (20**(d-1), d) actions
-	# 		others = self.interval_onehot(dim = self.dim - 1)
-	# 		#print ("others:", others.shape)
-	# 		for fix_dim in range(self.dim):
-	# 			#print (fix_dim)
-	# 			action = np.zeros(shape=(20 ** (self.dim - 1), 20 * self.dim))
-	# 			for elem in one_dim:
-	# 				#print (fix_dim*20+(fix_dim+1)*20)
-	# 				action[:,fix_dim*20:(fix_dim+1)*20]=elem
-	# 				action[:,0:fix_dim*20] = others[:,0:fix_dim*20]
-	# 				action[:,(fix_dim+1) * 20:] = others[:,fix_dim*20:]
-	# 				actions.append(action)
-	# 		return actions
-	# 	else:
-	# 		return one_dim
-
-	def actions(self):
-		number_of_actions = self.dim * (20 ** (self.dim - 1))
-
-		actions = []
-
-		## this includes (20,d) actions
-		one_dim = self.interval_onehot(dim=1)
-		# print (one_dim)
-		# print ("one dim",one_dim.shape)
-		if self.dim - 1 > 0:
-			# this includes (20**(d-1), d) actions
-			others = self.interval_onehot(dim=self.dim - 1)
-			# print ("others:", others.shape)
-			for elem in others:
-				for fix_dim in range(self.dim):
-					action = np.zeros(shape=(20, 20 * self.dim))
-					action[:, fix_dim * 20:(fix_dim + 1) * 20] = one_dim
-					j = 0
-					for i in range(self.dim):
-						if i != fix_dim:
-							action[:, i * 20:(i + 1) * 20] = elem[j * 20:(j + 1) * 20]
-							j = j + 1
-
-					actions.append(action)
-			return actions
-		else:
-			return one_dim
-
-	def subsample_dts_indice_only(self, N, split=0.9):
-		self.self_translate()
-		xtest = self.interval_onehot()
-
-		indices = np.arange(0, N, 1)
-		sample = indices
-		np.random.shuffle(indices)
-
-		train = sample[0:int(np.round(split * N))]
-		test = sample[int(np.round(split * N)):N]
-
-		return (train, test)
-
-	def subsample_dts(self, N, split=0.90):
-		self.self_translate()
-		xtest = self.interval_onehot()
-		indices = np.arange(0, N, 1)
-
-		indices = np.random.shuffle(indices)
-		sample = xtest[indices, :]
-
-		y_sample = self.eval_one_hot(sample)
-
-		x_train = sample[0:int(np.round(split * N)), :]
-		y_train = y_sample[0:int(np.round(split * N)), :]
-		x_test = sample[int(np.round(split * N)):N, :]
-		y_test = y_sample[int(np.round(split * N)):N, :]
-
-		return (x_train, y_train, x_test, y_test)
-
-	def eval_fidelity(self, X):
-		return self.Fidelity(X)
-
-	def eval(self, X):
-		z = self.eval_noiseless(X)
-		return z
-
-	def eval_one_hot(self, X):
-		n, d = list(X.shape)
-		Z = np.zeros(shape=(n, self.dim))
-		for i in range(n):
-			for j in range(d):
-				if X[i, j] > 0:
-					Z[i, j // self.total] = j % self.total
-		Z = Z.astype(int)
-		Y = self.eval(Z)
-		return Y
-
-	def plot_one_site_map(self, kernel, save=None, dim=1):
-		plt.figure()
-		names = list(self.dictionary.keys())
-		names.remove('B')
-		real_names = self.get_real_name(names)
-		real_names = helper.cartesian([real_names for i in range(dim)])
-
-		xtest = torch.from_numpy(self.interval_onehot(dim=dim))
-		real_names = [','.join(list(i)) for i in real_names]
-		ax = plt.imshow(kernel(xtest, xtest).detach().numpy())
-		plt.colorbar()
-		plt.xticks(range(xtest.shape[0]), real_names, fontsize=10, rotation=60)
-		plt.yticks(range(xtest.shape[0]), real_names, fontsize=10)
-		plt.margins(0.2)
-		if save is not None:
-			plt.savefig(save)
-		else:
-			plt.show()
+        self.dictionary = {
+            "A": 0,
+            "R": 1,
+            "N": 2,
+            "D": 3,
+            "C": 4,
+            "Q": 5,
+            "E": 6,
+            "G": 7,
+            "H": 8,
+            "I": 9,
+            "L": 10,
+            "K": 11,
+            "M": 12,
+            "F": 13,
+            "P": 14,
+            "S": 15,
+            "T": 16,
+            "W": 17,
+            "Y": 18,
+            "V": 19,
+            "B": 3,
+        }
+
+        f = lambda x: self.dictionary[x]
+
+        self.total = 20
+        self.dim = dim
+        self.ref = ref
+        self.ref_translated = [f(x) for x in self.ref]
+
+        dset = pd.read_hdf(fname)
+
+        # average the effect over others
+        if avg == False:
+            mask = np.full(dset.shape[0], True, dtype=bool)
+            for j in range(4 - dim):
+                mask = np.logical_and(mask, dset["P" + str(4 - j)] == ref[3 - j])
+            self.data = dset[mask]
+        else:
+            # avg. not implemented
+            pass
+
+        if scale == True:
+            maximum = np.max(self.data[:]["Fitness"])
+            self.data[:]["Fitness"] = self.data[:]["Fitness"] / maximum
+        else:
+            pass
+
+        self.real_names = {
+            "A": "Ala",
+            "R": "Arg",
+            "N": "Asn",
+            "D": "Asp",
+            "C": "Cys",
+            "Q": "Gln",
+            "E": "Glu",
+            "G": "Gly",
+            "H": "His",
+            "I": "Iso",
+            "L": "Leu",
+            "K": "Lys",
+            "M": "Met",
+            "F": "Phe",
+            "P": "Pro",
+            "S": "Ser",
+            "T": "Thr",
+            "W": "Trp",
+            "Y": "Tyr",
+            "V": "Val",
+            "B": "Asx",
+        }
+
+        self.inv_real_names = {v: k for k, v in self.real_names.items()}
+
+        self.Negative = ["D", "E"]
+        self.Positive = ["R", "K", "H"]
+        self.Aromatic = ["F", "W", "Y", "H"]
+        self.Polar = ["N", "Q", "S", "T", "Y"]
+        self.Aliphatic = ["A", "G", "I", "L", "V"]
+        self.Amide = ["N", "Q"]
+        self.Sulfur = ["C", "M"]
+        self.Hydroxil = ["S", "T"]
+        self.Small = ["A", "S", "T", "P", "G", "V"]
+        self.Medium = ["M", "L", "I", "C", "N", "Q", "K", "D", "E"]
+        self.Large = ["R", "H", "W", "F", "Y"]
+        self.Hydro = ["M", "L", "I", "V", "A"]
+        self.Cyclic = ["P"]
+        self.Random = ["F", "W", "L", "S", "D"]
+
+    def get_real_name(self, name):
+        out = []
+        for i in name:
+            out.append(self.real_names[i])
+        return out
+
+    def data_summary(self):
+        y = self.data["Fitness"].values
+        maximum = np.max(y)
+        minimum = np.min(y)
+        return (maximum, minimum)
+
+    def translate(self, X):
+        f = lambda x: self.dictionary[x]
+        Y = np.zeros(shape=X.shape).astype(int)
+        for i in range(X.shape[0]):
+            for j in range(X.shape[1]):
+                Y[i, j] = f(X[i, j])
+        return Y
+
+    def translate_one_hot(self, X):
+        """One-hot encode X: row i gets a 1 at column Y[i, j] + j * self.total for each j."""
+        try:
+            Y = self.translate(X)
+        except Exception:  # translation failed: index with X itself
+            Y = X
+        n, d = list(X.shape)
+        Z = np.zeros(shape=(n, d * self.total))
+        for i in range(n):
+            for j in range(d):
+                Z[i, Y[i, j] + j * self.total] = 1.0
+        return Z
+
+    def self_translate(self):
+        """
+        translate columns P1-P4 of self.data from letters to integer codes (in place)
+        :return:
+        """
+        f = lambda x: self.dictionary[x]
+        for j in range(4):
+            self.data["P" + str(j + 1)] = self.data["P" + str(j + 1)].apply(f)
+
+    def set_fidelity(self, F):
+        self.Fidelity = F
+
+    def scale(self):
+        self.scale = 1
+
+    def eval_noiseless(self, X):
+        """
+        evaluate fitness at X; the missing sites P(dim+1)..P4 are taken from the reference
+        """
+        res = []
+
+        # append
+        n = X.shape[0]
+        C = np.tile(self.ref_translated[self.dim : 4], (n, 1))
+        X_ = np.concatenate((X, C), axis=1)
+        for i in range(n):
+            x = X_[i, :]
+            mask = np.full(self.data.shape[0], True, dtype=bool)
+            for j in range(4):
+                # print (x[j],self.data["P" + str(j + 1)])
+                mask = np.logical_and(mask, self.data["P" + str(j + 1)] == x[j])
+            res.append(self.data[mask]["Fitness"].values)
+        return np.array(res).reshape(-1, 1)
+
+    # def actions(self):
+    # 	number_of_actions = self.dim*(20**(self.dim-1))
+    #
+    # 	actions = []
+    #
+    # 	## this includes (20,d) actions
+    # 	one_dim = self.interval_onehot(dim = 1)
+    # 	#print (one_dim)
+    # 	#print ("one dim",one_dim.shape)
+    # 	if self.dim - 1>0:
+    # 		# this includes (20**(d-1), d) actions
+    # 		others = self.interval_onehot(dim = self.dim - 1)
+    # 		#print ("others:", others.shape)
+    # 		for fix_dim in range(self.dim):
+    # 			#print (fix_dim)
+    # 			action = np.zeros(shape=(20 ** (self.dim - 1), 20 * self.dim))
+    # 			for elem in one_dim:
+    # 				#print (fix_dim*20+(fix_dim+1)*20)
+    # 				action[:,fix_dim*20:(fix_dim+1)*20]=elem
+    # 				action[:,0:fix_dim*20] = others[:,0:fix_dim*20]
+    # 				action[:,(fix_dim+1) * 20:] = others[:,fix_dim*20:]
+    # 				actions.append(action)
+    # 		return actions
+    # 	else:
+    # 		return one_dim
+
+    def actions(self):
+        number_of_actions = self.dim * (20 ** (self.dim - 1))
+
+        actions = []
+
+        ## this includes (20,d) actions
+        one_dim = self.interval_onehot(dim=1)
+        # print (one_dim)
+        # print ("one dim",one_dim.shape)
+        if self.dim - 1 > 0:
+            # this includes (20**(d-1), d) actions
+            others = self.interval_onehot(dim=self.dim - 1)
+            # print ("others:", others.shape)
+            for elem in others:
+                for fix_dim in range(self.dim):
+                    action = np.zeros(shape=(20, 20 * self.dim))
+                    action[:, fix_dim * 20 : (fix_dim + 1) * 20] = one_dim
+                    j = 0
+                    for i in range(self.dim):
+                        if i != fix_dim:
+                            action[:, i * 20 : (i + 1) * 20] = elem[
+                                j * 20 : (j + 1) * 20
+                            ]
+                            j = j + 1
+
+                    actions.append(action)
+            return actions
+        else:
+            return one_dim
+
+    def subsample_dts_indice_only(self, N, split=0.9):
+        self.self_translate()
+        xtest = self.interval_onehot()
+
+        indices = np.arange(0, N, 1)
+        sample = indices
+        np.random.shuffle(indices)
+
+        train = sample[0 : int(np.round(split * N))]
+        test = sample[int(np.round(split * N)) : N]
+
+        return (train, test)
+
+    def subsample_dts(self, N, split=0.90):
+        """Shuffle the first N one-hot inputs, evaluate them, and split into train/test."""
+        self.self_translate()
+        xtest = self.interval_onehot()
+        indices = np.arange(0, N, 1)
+        # shuffle permutes in place and returns None: rebinding its result
+        # made xtest[indices, :] index with None instead of a permutation
+        np.random.shuffle(indices)
+        sample = xtest[indices, :]
+        y_sample = self.eval_one_hot(sample)
+
+        cut = int(np.round(split * N))  # number of training rows
+        x_train, x_test = sample[0:cut, :], sample[cut:N, :]
+        y_train, y_test = y_sample[0:cut, :], y_sample[cut:N, :]
+
+        return (x_train, y_train, x_test, y_test)
+
+    def eval_fidelity(self, X):
+        return self.Fidelity(X)
+
+    def eval(self, X):
+        z = self.eval_noiseless(X)
+        return z
+
+    def eval_one_hot(self, X):
+        n, d = list(X.shape)
+        Z = np.zeros(shape=(n, self.dim))
+        for i in range(n):
+            for j in range(d):
+                if X[i, j] > 0:
+                    Z[i, j // self.total] = j % self.total
+        Z = Z.astype(int)
+        Y = self.eval(Z)
+        return Y
+
+    def plot_one_site_map(self, kernel, save=None, dim=1):
+        plt.figure()
+        names = list(self.dictionary.keys())
+        names.remove("B")
+        real_names = self.get_real_name(names)
+        real_names = helper.cartesian([real_names for i in range(dim)])
+
+        xtest = torch.from_numpy(self.interval_onehot(dim=dim))
+        real_names = [",".join(list(i)) for i in real_names]
+        ax = plt.imshow(kernel(xtest, xtest).detach().numpy())
+        plt.colorbar()
+        plt.xticks(range(xtest.shape[0]), real_names, fontsize=10, rotation=60)
+        plt.yticks(range(xtest.shape[0]), real_names, fontsize=10)
+        plt.margins(0.2)
+        if save is not None:
+            plt.savefig(save)
+        else:
+            plt.show()
 
 
 if __name__ == "__main__":
-	Benchmark = ProteinBenchmark("protein_data_gb1.h5", dim=2, ref=['A', 'B', 'C', 'D'])
-	# print (Benchmark.data)
-	Benchmark.self_translate()
-	Benchmark.data.plot.scatter(x='P1', y='P2', c=Benchmark.data['Fitness'], s=200)
-	# print (Benchmark.data)
-	X = np.array([['F', 'C'], ['D', 'C']])
-	X_ = Benchmark.translate(X)
-	print(X, X_)
-	X__ = Benchmark.translate_one_hot(X)
+    Benchmark = ProteinBenchmark("protein_data_gb1.h5", dim=2, ref=["A", "B", "C", "D"])
+    # print (Benchmark.data)
+    Benchmark.self_translate()
+    Benchmark.data.plot.scatter(x="P1", y="P2", c=Benchmark.data["Fitness"], s=200)
+    # print (Benchmark.data)
+    X = np.array([["F", "C"], ["D", "C"]])
+    X_ = Benchmark.translate(X)
+    print(X, X_)
+    X__ = Benchmark.translate_one_hot(X)
 
-	print(Benchmark.translate_one_hot(X))
+    print(Benchmark.translate_one_hot(X))
 
-	print(Benchmark.eval(X_))
+    print(Benchmark.eval(X_))
 
-	print(Benchmark.eval_one_hot(X__))
+    print(Benchmark.eval_one_hot(X__))
diff --git a/stpy/test_functions/swissfel_simulator.py b/stpy/test_functions/swissfel_simulator.py
index f7eaec1..f82c51d 100755
--- a/stpy/test_functions/swissfel_simulator.py
+++ b/stpy/test_functions/swissfel_simulator.py
@@ -5,116 +5,123 @@
 from stpy.helpers.helper import *
 
 
-class FelSimulator():
-
-	def __init__(self, d, sigma, name):
-		self.d = d
-		self.sigma = sigma
-		self.exp_name = name
-
-	def help(self, reload=False):
-		print("Help for the FelSimulator")
-
-	def load_pickle(self, file_name):
-
-		self.GP = pickle.load(open(file_name, "rb"))
-		self.d = self.GP.d
-		self.exp_name = self.GP.exp_name
-
-	def save(self, file_name):
-		self.GP.exp_name = self.exp_name
-		pickle.dump(self.GP, open(file_name, "wb"), -1)
-
-	def load_fresh(self, file_name, dts='1'):
-		f = File(file_name, 'r')
-		dset = f[dts]
-		print(dset)
-		n = dset[str("x")].shape[0]
-		mask = np.full(n, False, dtype=bool)
-		for j in range(self.d):
-			maskNew = dset["line_id"] == j
-			mask = np.logical_or(mask, maskNew)
-		print("Using ", np.sum(mask), "points to fit the model.")
-		self.x = dset["x"][mask, 0:self.d].reshape(-1, self.d)
-		self.y = dset["y"][mask].reshape(-1, 1)
-		# y response and scale, x scale to [-0.5,0.5]
-		scale = np.max(np.abs(self.y))
-		self.y = self.y / scale
-		for j in range(self.d):
-			a = np.min(self.x[:, j])
-			b = np.max(self.x[:, j])
-			self.x[:, j] = (self.x[:, j] / (b - a)) - 0.5 - a / (b - a)
-		# noise structure
-		self.s = np.max(dset["y_std"][mask] / scale)
-		print("The noise level estimated to be:", self.s)
-		self.x = torch.from_numpy(self.x)
-		self.y = torch.from_numpy(self.y)
-
-		f.close()
-
-	def fit_simulator(self, GP, optimize="bandwidth", restarts=10):
-		self.GP = GP
-		self.GP.s = self.s
-		self.GP.fit(self.x, self.y)
-		print("Model fitted.")
-		self.GP.optimize_params(type=optimize, restarts=restarts)
-		self.GP.back_prop = True
-
-	def bounds(self, N, n):
-		x = torch.from_numpy(np.random.uniform(-0.5, 0.5, size=(N, self.GP.d)))
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1) for i in range(self.GP.d)]
-			xtest = cartesian(arrays)
-			xtest = torch.from_numpy(xtest)
-		return (x, xtest, self.GP.d, None)
-
-	def opt_bounds(self):
-		bounds = tuple([(-0.5, 0.5) for i in range(self.GP.d)])
-		return bounds
-
-	def constraint(self, X):
-		return True
-
-	def eval(self, X, sigma=None):
-		if sigma is None:
-			sigma = self.sigma
-		[mu, _] = self.GP.mean_std(X)
-		return mu + sigma * torch.randn(X.size()[0], 1, dtype=torch.float64)
-
-	def eval_sample(self, X, sigma=None):
-		if sigma is None:
-			sigma = self.sigma
-		f = self.GP.sample(X)
-		self.x = torch.cat((self.x, X), dim=0)
-		self.y = torch.cat((self.y, f), dim=0)
-		self.GP.fit(self.x, self.y)
-		return f
-
-	def optimum(self):
-		## find optimum using backpropagation optimize eval_sample given X
-		x = torch.randn(self.d, 1, requires_grad=True)
-		x0 = x
-
-		from scipy.optimize import minimize
-
-		def fun(x):
-			x = np.array([x])
-			return -self.eval(torch.from_numpy(x)).numpy()[0][0]
-
-		def grad(x):
-			z = torch.from_numpy(np.array([x]))
-			z.requires_grad_(True)
-			y = -self.eval(z)
-			y.backward()
-			return z.grad.numpy()[0]
-
-		mybounds = self.opt_bounds()
-		res = minimize(fun, x0.detach().numpy(), method="L-BFGS-B", jac=grad, tol=0.0001, bounds=mybounds)
-		solution = res.x
-
-		val = self.eval(torch.from_numpy(solution).unsqueeze(0))
-		loc = torch.from_numpy(solution).unsqueeze(0)
-
-		return (val, loc)
+class FelSimulator:
+
+    def __init__(self, d, sigma, name):
+        self.d = d
+        self.sigma = sigma
+        self.exp_name = name
+
+    def help(self, reload=False):
+        print("Help for the FelSimulator")
+
+    def load_pickle(self, file_name):
+        with open(file_name, "rb") as handle:  # close the file once it is read
+            self.GP = pickle.load(handle)  # NOTE: only unpickle trusted files
+        self.d = self.GP.d
+        self.exp_name = self.GP.exp_name
+
+    def save(self, file_name):
+        self.GP.exp_name = self.exp_name
+        pickle.dump(self.GP, open(file_name, "wb"), -1)
+
+    def load_fresh(self, file_name, dts="1"):
+        f = File(file_name, "r")
+        dset = f[dts]
+        print(dset)
+        n = dset[str("x")].shape[0]
+        mask = np.full(n, False, dtype=bool)
+        for j in range(self.d):
+            maskNew = dset["line_id"] == j
+            mask = np.logical_or(mask, maskNew)
+        print("Using ", np.sum(mask), "points to fit the model.")
+        self.x = dset["x"][mask, 0 : self.d].reshape(-1, self.d)
+        self.y = dset["y"][mask].reshape(-1, 1)
+        # y response and scale, x scale to [-0.5,0.5]
+        scale = np.max(np.abs(self.y))
+        self.y = self.y / scale
+        for j in range(self.d):
+            a = np.min(self.x[:, j])
+            b = np.max(self.x[:, j])
+            self.x[:, j] = (self.x[:, j] / (b - a)) - 0.5 - a / (b - a)
+        # noise structure
+        self.s = np.max(dset["y_std"][mask] / scale)
+        print("The noise level estimated to be:", self.s)
+        self.x = torch.from_numpy(self.x)
+        self.y = torch.from_numpy(self.y)
+
+        f.close()
+
+    def fit_simulator(self, GP, optimize="bandwidth", restarts=10):
+        self.GP = GP
+        self.GP.s = self.s
+        self.GP.fit(self.x, self.y)
+        print("Model fitted.")
+        self.GP.optimize_params(type=optimize, restarts=restarts)
+        self.GP.back_prop = True
+
+    def bounds(self, N, n):
+        x = torch.from_numpy(np.random.uniform(-0.5, 0.5, size=(N, self.GP.d)))
+        if n == None:
+            xtest = None
+        else:
+            arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1) for i in range(self.GP.d)]
+            xtest = cartesian(arrays)
+            xtest = torch.from_numpy(xtest)
+        return (x, xtest, self.GP.d, None)
+
+    def opt_bounds(self):
+        bounds = tuple([(-0.5, 0.5) for i in range(self.GP.d)])
+        return bounds
+
+    def constraint(self, X):
+        return True
+
+    def eval(self, X, sigma=None):
+        if sigma is None:
+            sigma = self.sigma
+        [mu, _] = self.GP.mean_std(X)
+        return mu + sigma * torch.randn(X.size()[0], 1, dtype=torch.float64)
+
+    def eval_sample(self, X, sigma=None):
+        if sigma is None:
+            sigma = self.sigma
+        f = self.GP.sample(X)
+        self.x = torch.cat((self.x, X), dim=0)
+        self.y = torch.cat((self.y, f), dim=0)
+        self.GP.fit(self.x, self.y)
+        return f
+
+    def optimum(self):
+        """Maximize eval over the box from opt_bounds; return (value, location)."""
+        # scipy.optimize.minimize needs a 1-D start point, so flatten the (d, 1) draw
+        x0 = torch.randn(self.d, 1).numpy().ravel()
+
+        from scipy.optimize import minimize
+
+        def fun(x):
+            x = np.array([x])
+            return -self.eval(torch.from_numpy(x)).numpy()[0][0]
+
+        def grad(x):
+            z = torch.from_numpy(np.array([x]))
+            z.requires_grad_(True)
+            y = -self.eval(z)
+            y.backward()
+            return z.grad.numpy()[0]
+
+        mybounds = self.opt_bounds()
+        res = minimize(
+            fun,
+            x0,
+            method="L-BFGS-B",
+            jac=grad,
+            tol=0.0001,
+            bounds=mybounds,
+        )
+        solution = res.x
+
+        val = self.eval(torch.from_numpy(solution).unsqueeze(0))
+        loc = torch.from_numpy(solution).unsqueeze(0)
+
+        return (val, loc)
diff --git a/stpy/test_functions/test_functions.py b/stpy/test_functions/test_functions.py
index 11e6fed..84121ab 100755
--- a/stpy/test_functions/test_functions.py
+++ b/stpy/test_functions/test_functions.py
@@ -4,677 +4,792 @@
 
 import stpy
 import stpy.continuous_processes.gauss_procc
+
 # from tensorflow.examples.tutorials.mnist import input_data
 from stpy.helpers.helper import *
 from stpy.test_functions.neural_net import train_network
 
 
 def isin(element, test_elements, assume_unique=False):
-	(n, d) = element.shape
-	(m, d) = test_elements.shape
-	maskFull = np.full((n), False, dtype=bool)
-	for j in range(m):
-		mask = np.full((n), True, dtype=bool)
-		for i in range(d):
-			# mask = np.logical_and(mask,np.in1d(element[:,i],test_elements[j,i], assume_unique=assume_unique))
-			mask = np.logical_and(mask, np.isclose(element[:, i], test_elements[j, i], atol=1e-01))
-		# print (j, i, mask)
-		maskFull = np.logical_or(mask, maskFull)
-	# print (maskFull)
-	return maskFull
+    (n, d) = element.shape
+    (m, d) = test_elements.shape
+    maskFull = np.full((n), False, dtype=bool)
+    for j in range(m):
+        mask = np.full((n), True, dtype=bool)
+        for i in range(d):
+            # mask = np.logical_and(mask,np.in1d(element[:,i],test_elements[j,i], assume_unique=assume_unique))
+            mask = np.logical_and(
+                mask, np.isclose(element[:, i], test_elements[j, i], atol=1e-01)
+            )
+        # print (j, i, mask)
+        maskFull = np.logical_or(mask, maskFull)
+    # print (maskFull)
+    return maskFull
 
 
 class test_function:
 
-	def __init__(self):
-		"nothing"
-		self.sampled = False
-		self.init = False
-		self.scale = 1.0
-
-	## General F
-	def f(self, X, sigma=0.00001, a=0.5):
-		# in X rows are points, cols are features
-		X = X * 8
-		y = -np.sin(a * np.sum(X ** 2, axis=1)).reshape(X.shape[0], 1)
-		y = y + sigma * np.random.randn(X.shape[0], 1)
-		return y
-
-	def f_bounds(self, N, n, d=1, L_infinity_ball=1.):
-		x = np.random.uniform(-L_infinity_ball, L_infinity_ball, size=(N, d))
-		# grid
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(-L_infinity_ball, L_infinity_ball, n).reshape(n, 1) for i in range(d)]
-			xtest = cartesian(arrays)
-		return (d, xtest, x, 0.15)
-
-	def f_opt_bounds(self, d=1, L_infinity_ball=1):
-		b = tuple([(-L_infinity_ball, L_infinity_ball) for i in range(d)])
-		return b
-
-	def optimize_f(self, d=1, a=0.5, L_infinity_ball=1):
-		from scipy.optimize import minimize
-
-		grad = lambda x: np.cos(np.sum(x ** 2) / 2) * x
-		fun = lambda x: np.sin(np.sum(x ** 2) / 2) + 1
-
-		bounds = self.f_opt_bounds(d=d, L_infinity_ball=L_infinity_ball)
-		r = []
-		for _ in range(500):
-			(d, _, x, _) = self.f_bounds(1, None, d=d, L_infinity_ball=L_infinity_ball)
-			x0 = x[0, :]
-			res = minimize(fun, x0, method="SLSQP", jac=grad, tol=0.0001, bounds=bounds)
-			r.append(fun(res.x))
-
-		print(d, max(r))
-
-	def sample_ss(self, X, sigma=0.001, gamma=1.0, GP=None):
-		# in X rows are points, cols are features
-		if self.sampled == False:
-			# print ("sampling")
-			if GP == None:
-				GP = stpy.continuous_processes.gauss_procc.GaussianProcess(s=sigma, gamma=gamma)
-				self.sample = GP.sample(torch.from_numpy(self.xtest)).numpy()
-				mask = isin(self.xtest, X)
-				self.sampled = True
-				return self.sample[mask, :].numpy() + np.random.randn(X.shape[0], 1) * sigma
-			else:
-				self.sample = GP.sample(torch.from_numpy(self.xtest)).numpy()
-				mask = isin(self.xtest, X)
-				self.sampled = True
-				return self.sample[mask, :] + np.random.randn(X.shape[0], 1) * sigma
-		else:
-			mask = isin(self.xtest, X)
-			return self.sample[mask, :] + np.random.randn(X.shape[0], 1) * sigma
-
-	def sample_ss_bounds(self, N, n, d=1, L_infinity_ball=1., gamma=1.0):
-		# self.sampled = False
-		# grid
-		arrays = [np.linspace(-L_infinity_ball, L_infinity_ball, n).reshape(n, 1) for i in range(d)]
-		xtest = cartesian(arrays)
-		self.xtest = xtest
-		self.n = n
-		# x = self.xtest[np.random.randint(0,n,size = N),:]
-		x = self.xtest[np.random.permutation(np.arange(0, self.xtest.shape[0], 1))[0:N], :]
-		x = np.sort(x, axis=0)
-		return (d, xtest, x, gamma)
-
-	def sample_ss_reset(self):
-		self.samples = False
-
-	def optimize(self, xtest, ytest, groups, s):
-		(n, d) = xtest.size()
-		kernel = stpy.kernels.KernelFunction(kernel_name="ard", gamma=torch.ones(d, dtype=torch.float64) * 0.1,
-											 groups=groups)
-		GP = stpy.continuous_processes.gauss_procc.GaussianProcess(kernel_custom=kernel, s=s, d=d)
-		GP.fit_gp(xtest, ytest)
-		GP.optimize_params(type="bandwidth")
-		print("Optimized")
-		return torch.min(kernel.gamma)
-
-	## Branin Function
-	def branin(self, X, sigma=0.1):
-		if X.shape[1] != 2:
-			raise AssertionError("Invalid dimension of grid with Branin Function")
-		else:
-			xx = X[:, 0]
-			yy = X[:, 1]
-			y = ((yy - (5.1 / (4. * np.pi)) * (xx ** 2) + 5. / np.pi - 6.) ** 2 + 10. * (
-						1. - 1. / (8. * np.pi)) * np.cos(xx) + 10.) / 150
-			y = -y.reshape(X.shape[0], 1)
-			return y
-
-	def branin_bounds(self, N, n):
-		x = np.random.uniform(0, 10, size=(N, 2))
-		# grid
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(-5, 10, n).reshape(n, 1), np.linspace(0, 15, n).reshape(n, 1)]
-			xtest = cartesian(arrays)
-		return (2, xtest, x, 2.5)
-
-	def branin_opt_bounds(self):
-		b = tuple([(-5, 10), (0, 15)])
-		return b
-
-	## Camelback Function 
-	def camelback(self, X, sigma=0.1):
-		if X.shape[1] != 2:
-			raise AssertionError("Invalid dimension of grid with Branin Function")
-		else:
-			xx = X[:, 0] * 4
-			yy = X[:, 1] * 2
-			y = (4. - 2.1 * xx ** 2 + (xx ** 4) / 3.) * (xx ** 2) + xx * yy + (-4. + 4 * (yy ** 2)) * (yy ** 2)
-			y = -y.reshape(X.shape[0], 1)
-			# y = np.tanh(y)
-			y = y / 5.
-			return y / self.scale + sigma * np.random.randn(X.shape[0], 1)
-
-	def camelback_bounds(self, N, n, adv_inv=False):
-		if adv_inv == False:
-			x = np.random.uniform(-0.5, 0.5, size=(N, 2))
-		else:
-			x = np.random.uniform(-0.5, -0.4, size=(N, 2))
-		# grid
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1), np.linspace(-0.5, 0.5, n).reshape(n, 1)]
-			xtest = cartesian(arrays)
-		return (2, xtest, x, 0.1)
-
-	def camelback_opt_bounds(self):
-		b = tuple([(-0.5, 0.5), (-0.5, 0.5)])
-		return b
-
-	def camelback_scale(self, xtest):
-		self.scale = np.max((self.camelback(xtest, sigma=0)))
-		print("Scaling:", self.scale)
-
-	## Hartmann 6
-	def hartmann6(self, X, sigma=0.1):
-		if X.shape[1] != 6:
-			raise AssertionError("Invalid dimension of grid with Branin Function")
-		else:
-			# opt = np.array([[0.20169, 0.150011, 0.476874, 0.275332, 0.311652, 0.6573]])
-			# fopt = np.array([[-3.32237]])
-
-			alpha = [1.00, 1.20, 3.00, 3.20]
-			A = np.array([[10.00, 3.00, 17.00, 3.50, 1.70, 8.00],
-						  [0.05, 10.00, 17.00, 0.10, 8.00, 14.00],
-						  [3.00, 3.50, 1.70, 10.00, 17.00, 8.00],
-						  [17.00, 8.00, 0.05, 10.00, 0.10, 14.00]])
-			P = 0.0001 * np.array([[1312, 1696, 5569, 124, 8283, 5886],
-								   [2329, 4135, 8307, 3736, 1004, 9991],
-								   [2348, 1451, 3522, 2883, 3047, 6650],
-								   [4047, 8828, 8732, 5743, 1091, 381]])
-
-			"""6d Hartmann test function
+    def __init__(self):
+        "nothing"
+        self.sampled = False
+        self.init = False
+        self.scale = 1.0
+
+    ## General F
+    def f(self, X, sigma=0.00001, a=0.5):
+        # in X rows are points, cols are features
+        X = X * 8
+        y = -np.sin(a * np.sum(X**2, axis=1)).reshape(X.shape[0], 1)
+        y = y + sigma * np.random.randn(X.shape[0], 1)
+        return y
+
+    def f_bounds(self, N, n, d=1, L_infinity_ball=1.0):
+        x = np.random.uniform(-L_infinity_ball, L_infinity_ball, size=(N, d))
+        # grid
+        if n == None:
+            xtest = None
+        else:
+            arrays = [
+                np.linspace(-L_infinity_ball, L_infinity_ball, n).reshape(n, 1)
+                for i in range(d)
+            ]
+            xtest = cartesian(arrays)
+        return (d, xtest, x, 0.15)
+
+    def f_opt_bounds(self, d=1, L_infinity_ball=1):
+        b = tuple([(-L_infinity_ball, L_infinity_ball) for i in range(d)])
+        return b
+
+    def optimize_f(self, d=1, a=0.5, L_infinity_ball=1):
+        from scipy.optimize import minimize
+
+        grad = lambda x: np.cos(np.sum(x**2) / 2) * x
+        fun = lambda x: np.sin(np.sum(x**2) / 2) + 1
+
+        bounds = self.f_opt_bounds(d=d, L_infinity_ball=L_infinity_ball)
+        r = []
+        for _ in range(500):
+            (d, _, x, _) = self.f_bounds(1, None, d=d, L_infinity_ball=L_infinity_ball)
+            x0 = x[0, :]
+            res = minimize(fun, x0, method="SLSQP", jac=grad, tol=0.0001, bounds=bounds)
+            r.append(fun(res.x))
+
+        print(d, max(r))
+
+    def sample_ss(self, X, sigma=0.001, gamma=1.0, GP=None):
+        """Return noisy values of a cached GP sample path at the rows of X.
+
+        X has points as rows and features as columns. Rows are matched to
+        the grid ``self.xtest`` (set by ``sample_ss_bounds``) with ``isin``.
+        The path is drawn on the first call, from ``GP`` or, when ``GP`` is
+        None, from a GaussianProcess(s=sigma, gamma=gamma), and is reused on
+        later calls for as long as ``self.sampled`` is True.
+
+        Gaussian noise scaled by ``sigma`` is added, one draw per row of X.
+        """
+        if not self.sampled:
+            if GP is None:
+                GP = stpy.continuous_processes.gauss_procc.GaussianProcess(
+                    s=sigma, gamma=gamma
+                )
+            # kept as a NumPy array, so it is indexed below without .numpy()
+            self.sample = GP.sample(torch.from_numpy(self.xtest)).numpy()
+            self.sampled = True
+
+        # boolean mask over the grid rows that are close to some row of X
+        mask = isin(self.xtest, X)
+        return self.sample[mask, :] + np.random.randn(X.shape[0], 1) * sigma
+
+    def sample_ss_bounds(self, N, n, d=1, L_infinity_ball=1.0, gamma=1.0):
+        # self.sampled = False
+        # grid
+        arrays = [
+            np.linspace(-L_infinity_ball, L_infinity_ball, n).reshape(n, 1)
+            for i in range(d)
+        ]
+        xtest = cartesian(arrays)
+        self.xtest = xtest
+        self.n = n
+        # x = self.xtest[np.random.randint(0,n,size = N),:]
+        x = self.xtest[
+            np.random.permutation(np.arange(0, self.xtest.shape[0], 1))[0:N], :
+        ]
+        x = np.sort(x, axis=0)
+        return (d, xtest, x, gamma)
+
+    def sample_ss_reset(self):
+        self.sampled = False  # flag read by sample_ss; forces a fresh draw
+
+    def optimize(self, xtest, ytest, groups, s):
+        (n, d) = xtest.size()
+        kernel = stpy.kernels.KernelFunction(
+            kernel_name="ard",
+            gamma=torch.ones(d, dtype=torch.float64) * 0.1,
+            groups=groups,
+        )
+        GP = stpy.continuous_processes.gauss_procc.GaussianProcess(
+            kernel_custom=kernel, s=s, d=d
+        )
+        GP.fit_gp(xtest, ytest)
+        GP.optimize_params(type="bandwidth")
+        print("Optimized")
+        return torch.min(kernel.gamma)
+
+    ## Branin Function
+    def branin(self, X, sigma=0.1):
+        if X.shape[1] != 2:
+            raise AssertionError("Invalid dimension of grid with Branin Function")
+        else:
+            xx = X[:, 0]
+            yy = X[:, 1]
+            y = (
+                (yy - (5.1 / (4.0 * np.pi)) * (xx**2) + 5.0 / np.pi - 6.0) ** 2
+                + 10.0 * (1.0 - 1.0 / (8.0 * np.pi)) * np.cos(xx)
+                + 10.0
+            ) / 150
+            y = -y.reshape(X.shape[0], 1)
+            return y
+
+    def branin_bounds(self, N, n):
+        x = np.random.uniform(0, 10, size=(N, 2))
+        # grid
+        if n == None:
+            xtest = None
+        else:
+            arrays = [
+                np.linspace(-5, 10, n).reshape(n, 1),
+                np.linspace(0, 15, n).reshape(n, 1),
+            ]
+            xtest = cartesian(arrays)
+        return (2, xtest, x, 2.5)
+
+    def branin_opt_bounds(self):
+        b = tuple([(-5, 10), (0, 15)])
+        return b
+
+    ## Camelback Function
+    def camelback(self, X, sigma=0.1):
+        """Negated six-hump camelback of the rows of X, with Gaussian noise."""
+        if X.shape[1] != 2:
+            raise AssertionError("Invalid dimension of grid with Camelback Function")
+        # stretch the two inputs by 4 and 2 before evaluating the polynomial
+        xx = X[:, 0] * 4
+        yy = X[:, 1] * 2
+        y = (
+            (4.0 - 2.1 * xx**2 + (xx**4) / 3.0) * (xx**2)
+            + xx * yy
+            + (-4.0 + 4 * (yy**2)) * (yy**2)
+        )
+        # negate, then shrink by 5 and by self.scale (set in camelback_scale)
+        y = -y.reshape(X.shape[0], 1) / 5.0
+        return y / self.scale + sigma * np.random.randn(X.shape[0], 1)
+
+    def camelback_bounds(self, N, n, adv_inv=False):
+        if adv_inv == False:
+            x = np.random.uniform(-0.5, 0.5, size=(N, 2))
+        else:
+            x = np.random.uniform(-0.5, -0.4, size=(N, 2))
+        # grid
+        if n == None:
+            xtest = None
+        else:
+            arrays = [
+                np.linspace(-0.5, 0.5, n).reshape(n, 1),
+                np.linspace(-0.5, 0.5, n).reshape(n, 1),
+            ]
+            xtest = cartesian(arrays)
+        return (2, xtest, x, 0.1)
+
+    def camelback_opt_bounds(self):
+        b = tuple([(-0.5, 0.5), (-0.5, 0.5)])
+        return b
+
+    def camelback_scale(self, xtest):
+        self.scale = np.max((self.camelback(xtest, sigma=0)))
+        print("Scaling:", self.scale)
+
+    ## Hartmann 6
+    def hartmann6(self, X, sigma=0.1):
+        if X.shape[1] != 6:
+            raise AssertionError("Invalid dimension of grid with Branin Function")
+        else:
+            # opt = np.array([[0.20169, 0.150011, 0.476874, 0.275332, 0.311652, 0.6573]])
+            # fopt = np.array([[-3.32237]])
+
+            alpha = [1.00, 1.20, 3.00, 3.20]
+            A = np.array(
+                [
+                    [10.00, 3.00, 17.00, 3.50, 1.70, 8.00],
+                    [0.05, 10.00, 17.00, 0.10, 8.00, 14.00],
+                    [3.00, 3.50, 1.70, 10.00, 17.00, 8.00],
+                    [17.00, 8.00, 0.05, 10.00, 0.10, 14.00],
+                ]
+            )
+            P = 0.0001 * np.array(
+                [
+                    [1312, 1696, 5569, 124, 8283, 5886],
+                    [2329, 4135, 8307, 3736, 1004, 9991],
+                    [2348, 1451, 3522, 2883, 3047, 6650],
+                    [4047, 8828, 8732, 5743, 1091, 381],
+                ]
+            )
+
+            """6d Hartmann test function
 				input bounds:  0 <= xi <= 1, i = 1..6
 				global optimum: (0.20169, 0.150011, 0.476874, 0.275332, 0.311652, 0.6573),
 				min function value = -3.32237
 			"""
 
-			external_sum = 0
-			for i in range(4):
-				internal_sum = 0
-				for j in range(6):
-					internal_sum = internal_sum + A[i, j] * (X[:, j] - P[i, j]) ** 2
-				external_sum = external_sum + alpha[i] * np.exp(-internal_sum)
-
-			return external_sum[:, np.newaxis]
-
-	def hartmann6_bounds(self, N, n):
-		x = np.random.uniform(0, 1, size=(N, 6))
-		# grid
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(0, 1, n).reshape(n, 1) for i in range(6)]
-			xtest = cartesian(arrays)
-		return (6, xtest, x, 0.5)
-
-	def hartmann6_opt_bounds(self):
-		b = tuple([(0, 1) for i in range(6)])
-		return b
-
-	## Hartmann 4
-	def hartmann4(self, X, sigma=0.1):
-		if X.shape[1] != 4:
-			raise AssertionError("Invalid dimension of grid with Branin Function")
-		else:
-
-			alpha = [1.00, 1.20, 3.00, 3.20]
-
-			A = np.array([[10.00, 3.00, 17.00, 3.50, 1.70, 8.00],
-						  [0.05, 10.00, 17.00, 0.10, 8.00, 14.00],
-						  [3.00, 3.50, 1.70, 10.00, 17.00, 8.00],
-						  [17.00, 8.00, 0.05, 10.00, 0.10, 14.00]])
-
-			P = 0.0001 * np.array([[1312, 1696, 5569, 124, 8283, 5886],
-								   [2329, 4135, 8307, 3736, 1004, 9991],
-								   [2348, 1451, 3522, 2883, 3047, 6650],
-								   [4047, 8828, 8732, 5743, 1091, 381]])
-
-			"""6d Hartmann test function
+            external_sum = 0
+            for i in range(4):
+                internal_sum = 0
+                for j in range(6):
+                    internal_sum = internal_sum + A[i, j] * (X[:, j] - P[i, j]) ** 2
+                external_sum = external_sum + alpha[i] * np.exp(-internal_sum)
+
+            return external_sum[:, np.newaxis]
+
+    def hartmann6_bounds(self, N, n):
+        x = np.random.uniform(0, 1, size=(N, 6))
+        # grid
+        if n == None:
+            xtest = None
+        else:
+            arrays = [np.linspace(0, 1, n).reshape(n, 1) for i in range(6)]
+            xtest = cartesian(arrays)
+        return (6, xtest, x, 0.5)
+
+    def hartmann6_opt_bounds(self):
+        b = tuple([(0, 1) for i in range(6)])
+        return b
+
+    ## Hartmann 4
+    def hartmann4(self, X, sigma=0.1):
+        if X.shape[1] != 4:
+            raise AssertionError("Invalid dimension of grid with Branin Function")
+        else:
+
+            alpha = [1.00, 1.20, 3.00, 3.20]
+
+            A = np.array(
+                [
+                    [10.00, 3.00, 17.00, 3.50, 1.70, 8.00],
+                    [0.05, 10.00, 17.00, 0.10, 8.00, 14.00],
+                    [3.00, 3.50, 1.70, 10.00, 17.00, 8.00],
+                    [17.00, 8.00, 0.05, 10.00, 0.10, 14.00],
+                ]
+            )
+
+            P = 0.0001 * np.array(
+                [
+                    [1312, 1696, 5569, 124, 8283, 5886],
+                    [2329, 4135, 8307, 3736, 1004, 9991],
+                    [2348, 1451, 3522, 2883, 3047, 6650],
+                    [4047, 8828, 8732, 5743, 1091, 381],
+                ]
+            )
+
+            """6d Hartmann test function
 				input bounds:  0 <= xi <= 1, i = 1..6
 				global optimum: (0.20169, 0.150011, 0.476874, 0.275332, 0.311652, 0.6573),
 				min function value = -3.32237
 			"""
 
-			external_sum = 0
-			for i in range(4):
-				internal_sum = 0
-				for j in range(4):
-					internal_sum = internal_sum + A[i, j] * (X[:, j] - P[i, j]) ** 2
-				external_sum = external_sum + alpha[i] * np.exp(-internal_sum)
-
-			return external_sum[:, np.newaxis]
-
-	def hartmann4_bounds(self, N, n):
-		x = np.random.uniform(0, 1, size=(N, 4))
-		# grid
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(0, 1, n).reshape(n, 1) for i in range(4)]
-			xtest = cartesian(arrays)
-		return (4, xtest, x, 0.5)
-
-	def hartmann4_opt_bounds(self):
-		b = tuple([(0, 1) for i in range(4)])
-		return b
-
-	def hartmann3(self, X, sigma=0.1):
-
-		X_lower = np.array([0, 0, 0])
-		X_upper = np.array([1, 1, 1])
-		# opt = np.array([[0.114614, 0.555649, 0.852547]])
-		# fopt = np.array([[-3.86278]])
-		alpha = [1.0, 1.2, 3.0, 3.2]
-		A = np.array([[3.0, 10.0, 30.0],
-					  [0.1, 10.0, 35.0],
-					  [3.0, 10.0, 30.0],
-					  [0.1, 10.0, 35.0]])
-		P = 0.0001 * np.array([[3689, 1170, 2673],
-							   [4699, 4387, 7470],
-							   [1090, 8732, 5547],
-							   [381, 5743, 8828]])
-
-		external_sum = 0
-		for i in range(4):
-			internal_sum = 0
-			for j in range(3):
-				internal_sum = internal_sum + A[i, j] * (X[:, j] - P[i, j]) ** 2
-
-			external_sum = external_sum + alpha[i] * np.exp(-internal_sum)
-
-		return external_sum[:, np.newaxis]
-
-	def hartmann3_bounds(self, N, n):
-		x = np.random.uniform(0, 1, size=(N, 3))
-		# grid
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(0, 1, n).reshape(n, 1) for i in range(3)]
-			xtest = cartesian(arrays)
-
-		return (3, xtest, x, 0.4)
-
-	def hartmann3_opt_bounds(self):
-		b = tuple([(0, 1) for i in range(3)])
-		return b
-
-	def michal_old(self, X, sigma=0.1):
-		(n, d) = X.shape
-		sum_ = np.zeros(shape=(X.shape[0], 1))
-
-		for ii in range(d):
-			xi = X[:, ii]
-			# print ("xi",xi)
-			i = ii + 1
-			new = np.sin(xi) * np.power((np.sin(i * np.power(xi, 2) / np.pi)), (2 * d))
-			sum_ += new.reshape(n, 1)
-		return -0.5 * sum_ + np.random.randn(X.shape[0], 1) * sigma
-
-	def stang_old(self, X, sigma=0.1):
-		(n, d) = X.shape
-		sum_ = np.zeros(shape=(X.shape[0], 1))
-
-		for ii in range(d):
-			xi = X[:, ii]
-			new = xi ** 4 - 16. * xi ** 2 + 5 * xi
-			sum_ += new.reshape(n, 1)
-
-		sum_ = sum_ / (38.7122 * d)
-		# sum_ = sum_/d
-
-		return -0.5 * sum_ + np.random.randn(X.shape[0], 1) * sigma
-
-	def michal_un(self, X, sigma=0.1):
-		(n, d) = X.shape
-		X = (X + 0.5) * np.pi
-		ar = np.arange(1, d + 1, 1)
-		sum_ = np.sin(X) * np.power((np.sin(ar * X / np.pi)), (2 * d))
-		sum_ = np.sum(sum_, axis=1).reshape(-1, 1)
-		return sum_ + np.random.randn(X.shape[0], 1) * sigma
-
-	def michal(self, X, sigma=0.1):
-		(n, d) = X.shape
-		X = (X + 0.5) * np.pi
-		ar = np.arange(1, d + 1, 1)
-		sum_ = np.sin(X) * np.power((np.sin(ar * X / np.pi)), (2 * d))
-		sum_ = np.sum(sum_, axis=1).reshape(-1, 1)
-		sum_ = sum_ / self.michal_optimum(d)[1]
-		return sum_ + np.random.randn(X.shape[0], 1) * sigma
-
-	def michal_bounds(self, N, n, d=1, adv_inv=False):
-		if adv_inv == False:
-			x = np.random.uniform(-0.5, 0.5, size=(N, d))
-		else:
-			x = np.random.uniform(-0.5, 0., size=(N, d))
-
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1) for i in range(d)]
-			xtest = cartesian(arrays)
-
-		return (d, xtest, x, 0.3)
-
-	def michal_opt_bounds(self, d):
-		b = tuple([(-0.5, 0.5) for i in range(d)])
-		return b
-
-	def michal_optimum(self, d):
-		q = 20
-		opt = np.ones(shape=(q))
-		opt[0] = 2.93254
-		opt[1] = 2.34661
-		opt[2] = 1.64107
-		opt[3] = 1.24415
-		opt[4] = 0.999643
-		opt[5] = 0.834879
-		opt[6] = 2.1089
-		opt[7] = 1.84835
-		opt[8] = 1.64448
-		opt[9] = 1.48089
-		opt[10] = 1.34678
-		opt[11] = 1.2349
-		opt[12] = 1.89701
-		opt[13] = 1.76194
-		opt[14] = 1.64477
-		opt[15] = 1.54218
-		opt[16] = 1.45162
-		opt[17] = 1.37109
-		opt[18] = 1.81774
-		opt = opt[0:d].reshape(1, -1)
-		opt = (opt / np.pi) - 0.5
-		value = self.michal_un(opt, sigma=0)
-		return (opt, value[0][0])
-
-	def stang_un(self, X, sigma=0.1):
-		(n, d) = X.shape
-		X = X * 8
-		Y = X ** 2
-		sum_ = np.sum(Y ** 2 - 16. * Y + 5 * X, axis=1).reshape(-1, 1)
-		sum_ = sum_
-		return -0.5 * sum_ + np.random.randn(X.shape[0], 1) * sigma
-
-	def stang(self, X, sigma=0.1):
-		(n, d) = X.shape
-		X = X * 8
-		Y = X ** 2
-		sum_ = np.sum(Y ** 2 - 16. * Y + 5 * X, axis=1).reshape(-1, 1)
-		sum_ = sum_ / self.stang_optimum(d)[1]
-		return -0.5 * sum_ + np.random.randn(X.shape[0], 1) * sigma
-
-	def stang_bounds(self, N, n, d=1, adv_inv=False):
-		if adv_inv == False:
-			x = np.random.uniform(-0.5, 0.5, size=(N, d))
-		else:
-			print("Adversarially initiallized")
-			x = np.random.uniform(0.4, 0.5, size=(N, d))
-
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1) for i in range(d)]
-			xtest = cartesian(arrays)
-
-		return (d, xtest, x, 0.6)
-
-	def stang_opt_bounds(self, d):
-		b = tuple([(-0.5, 0.5) for i in range(d)])
-		return b
-
-	def stang_optimum(self, d):
-		opt = np.ones(shape=(d)) * (-2.9035)
-		opt = opt / 8
-		opt = opt.reshape(1, -1)
-
-		value = self.stang_un(opt, sigma=0.0)
-		return (opt, value[0][0])
-
-	def double_group_un(self, X, sigma=0.1):
-		sum_ = np.sum(np.exp(-(np.diff(X, axis=1) / 0.25) ** 2), axis=1).reshape(-1, 1)
-		return 0.5 * sum_ + np.random.randn(X.shape[0], 1) * sigma
-
-	def double_group(self, X, sigma=0.1):
-		(n, d) = X.shape
-		sum_ = np.sum(np.exp(-(np.diff(X, axis=1) / 0.25) ** 2), axis=1).reshape(-1, 1)
-		sum_ = sum_ / self.double_group_optimum(d)[1]
-		return 0.5 * sum_ + np.random.randn(X.shape[0], 1) * sigma
-
-	def double_group_bounds(self, N, n, d=1, adv_inv=False):
-		if adv_inv == False:
-			x = np.random.uniform(-0.5, 0.5, size=(N, d))
-		else:
-			print("Adversarially initiallized")
-			x = np.random.uniform(-0.5, -0.4, size=(N, d))
-
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1) for i in range(d)]
-			xtest = cartesian(arrays)
-
-		return (d, xtest, x, 0.6)
-
-	def double_group_opt_bounds(self, d):
-		b = tuple([(-0.5, 0.5) for i in range(d)])
-		return b
-
-	def double_group_optimum(self, d):
-		opt = np.zeros(shape=(1, d))
-		value = self.double_group_un(opt, 0)[0][0]
-		return (opt, value)
-
-	def swissfel(self, X, sigma=0.1):
-		if self.init == False:
-			raise AssertionError("Need to run bounds first.")
-		else:
-			if sigma == 0.0:
-				return self.model.predict(X)[0]
-			else:
-				return self.model.predict(X)[0] + np.random.randn(X.shape[0], 1) * self.noise
-
-	def swissfel_bounds(self, N, n):
-		if self.init == False:
-			import os.path
-			fname = "/home/mojko/Documents/PhD/RFFinBO/code/test_problems/swissfel_model.p"
-			if not os.path.isfile(fname):
-				f = File('/home/mojko/Documents/PhD/RFFinBO/code/test_problems/evaluations.hdf5')
-				dset = f['1']
-				X = dset["x"][:].reshape(-1, 5)
-
-				# y response and scale
-				Y = dset["y"][:].reshape(-1, 1)
-				Y = Y / np.max(np.abs(Y))
-
-				# noise structure
-				Yerr = dset["y_std"] / np.max(np.abs(Y))
-				self.noise = np.std(Yerr)
-				print("Estimated noise level", self.noise)
-
-				# data scale to [-0.5,0.5]
-				X = dset["x"][:].reshape(-1, 5)
-				for j in range(5):
-					a = np.min(X[:, j])
-					b = np.max(X[:, j])
-					X[:, j] = (X[:, j] / (b - a)) - 0.5 - a / (b - a)
-
-				## fully additive kernel s
-				self.kernel = GPy.kern.RBF(1, active_dims=[0]) + GPy.kern.RBF(1, active_dims=[1]) \
-							  + GPy.kern.RBF(1, active_dims=[2]) + GPy.kern.RBF(1, active_dims=[3]) \
-							  + GPy.kern.RBF(1, active_dims=[4])
-				self.model = GPy.models.GPRegression(X, Y, self.kernel)
-				print("Model fit")
-				self.model.optimize(messages=True)
-				print("ML likelihood fit")
-				self.init = True
-				# save pickle
-				pickle.dump(self.model,
-							open("/home/mojko/Documents/PhD/RFFinBO/code/test_problems/swissfel_model.p", "wb"))
-				pickle.dump(self.noise,
-							open("/home/mojko/Documents/PhD/RFFinBO/code/test_problems/swissfel_noise.p", "wb"))
-			else:
-				self.init = True
-				self.model = pickle.load(
-					open("/home/mojko/Documents/PhD/RFFinBO/code/test_problems/swissfel_model.p", "rb"))
-				self.noise = pickle.load(
-					open("/home/mojko/Documents/PhD/RFFinBO/code/test_problems/swissfel_noise.p", "rb"))
-
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1) for i in range(5)]
-			xtest = cartesian(arrays)
-
-		# bw = np.min(self.kernel.lengthscale)
-		x = np.random.uniform(-0.5, 0.5, size=(N, 5))
-		return (5, xtest, x, 0.1)
-
-	def swissfel_opt_bounds(self):
-		b = tuple([(-0.5, 0.5) for i in range(5)])
-		return b
-
-	def swissfel_optimum(self):
-		from scipy.optimize import minimize
-		# maximize the function
-		mybounds = self.swissfel_opt_bounds()
-		fun = lambda x: -self.swissfel(x.reshape(1, -1), sigma=0.0)[0][0]
-
-		best = -10.
-		repeats = 10
-		for i in range(repeats):
-			x0 = np.random.uniform(-0.5, 0.5, size=(5,))
-			res = minimize(fun, x0, method="L-BFGS-B", tol=0.0001, bounds=mybounds)
-			value = self.swissfel(res.x.reshape(1, -1), sigma=0)
-			if value > best:
-				best = value
-				self.opt_loc = res.x.reshape(1, -1)
-		return (self.opt_loc, best)
-
-	def neural_net(self, X, sigma=0.1):
-		(n, d) = X.shape
-		res = []
-		val_size = 400
-		if self.sampled == False:
-			self.sampled = True
-			try:
-				self.mnist = input_data.read_data_sets("~/.", one_hot=True, validation_size=val_size)
-			except:
-				self.mnist = input_data.read_data_sets("~/.", one_hot=True)
-
-		for x in X:
-			(it, acc) = train_network(self.mnist, dropout=x[0], verbose=False,
-									  val_size=val_size, maxiter=300, initialization_params=x[1:], no_filters_1=self.NN,
-									  no_filters_2=self.NN2, val_count=30)
-			res.append(acc)
-
-		return np.array(acc).reshape(n, 1)
-
-	def neural_net_bounds(self, N, n, NN=16, NN2=22):
-		self.NN = NN
-		self.NN2 = NN2
-		d = self.NN + self.NN2
-
-		x = np.random.uniform(0, 10, size=(N, d))
-		dropout = np.random.uniform(0, 1, size=(N, 1))
-		x = np.concatenate((x, dropout), axis=1)
-
-		if n == None:
-			xtest = None
-		else:
-			arrays = [np.linspace(0, 1, n).reshape(n, 1)] + [np.linspace(0, 10, n).reshape(n, 1) for i in range(d)]
-			xtest = cartesian(arrays)
-
-		return (d + 1, xtest, x, 0.9)
-
-	def neural_net_opt_bounds(self):
-		d = self.NN + self.NN2
-		b = tuple([(0, 1)] + [(0, 10) for i in range(d)])
-		return b
+            external_sum = 0
+            for i in range(4):
+                internal_sum = 0
+                for j in range(4):
+                    internal_sum = internal_sum + A[i, j] * (X[:, j] - P[i, j]) ** 2
+                external_sum = external_sum + alpha[i] * np.exp(-internal_sum)
+
+            return external_sum[:, np.newaxis]
+
+    def hartmann4_bounds(self, N, n):
+        x = np.random.uniform(0, 1, size=(N, 4))
+        # grid
+        if n == None:
+            xtest = None
+        else:
+            arrays = [np.linspace(0, 1, n).reshape(n, 1) for i in range(4)]
+            xtest = cartesian(arrays)
+        return (4, xtest, x, 0.5)
+
+    def hartmann4_opt_bounds(self):
+        b = tuple([(0, 1) for i in range(4)])
+        return b
+
+    def hartmann3(self, X, sigma=0.1):
+
+        X_lower = np.array([0, 0, 0])
+        X_upper = np.array([1, 1, 1])
+        # opt = np.array([[0.114614, 0.555649, 0.852547]])
+        # fopt = np.array([[-3.86278]])
+        alpha = [1.0, 1.2, 3.0, 3.2]
+        A = np.array(
+            [[3.0, 10.0, 30.0], [0.1, 10.0, 35.0], [3.0, 10.0, 30.0], [0.1, 10.0, 35.0]]
+        )
+        P = 0.0001 * np.array(
+            [
+                [3689, 1170, 2673],
+                [4699, 4387, 7470],
+                [1090, 8732, 5547],
+                [381, 5743, 8828],
+            ]
+        )
+
+        external_sum = 0
+        for i in range(4):
+            internal_sum = 0
+            for j in range(3):
+                internal_sum = internal_sum + A[i, j] * (X[:, j] - P[i, j]) ** 2
+
+            external_sum = external_sum + alpha[i] * np.exp(-internal_sum)
+
+        return external_sum[:, np.newaxis]
+
+    def hartmann3_bounds(self, N, n):
+        x = np.random.uniform(0, 1, size=(N, 3))
+        # grid
+        if n == None:
+            xtest = None
+        else:
+            arrays = [np.linspace(0, 1, n).reshape(n, 1) for i in range(3)]
+            xtest = cartesian(arrays)
+
+        return (3, xtest, x, 0.4)
+
+    def hartmann3_opt_bounds(self):
+        b = tuple([(0, 1) for i in range(3)])
+        return b
+
+    def michal_old(self, X, sigma=0.1):
+        (n, d) = X.shape
+        sum_ = np.zeros(shape=(X.shape[0], 1))
+
+        for ii in range(d):
+            xi = X[:, ii]
+            # print ("xi",xi)
+            i = ii + 1
+            new = np.sin(xi) * np.power((np.sin(i * np.power(xi, 2) / np.pi)), (2 * d))
+            sum_ += new.reshape(n, 1)
+        return -0.5 * sum_ + np.random.randn(X.shape[0], 1) * sigma
+
+    def stang_old(self, X, sigma=0.1):
+        (n, d) = X.shape
+        sum_ = np.zeros(shape=(X.shape[0], 1))
+
+        for ii in range(d):
+            xi = X[:, ii]
+            new = xi**4 - 16.0 * xi**2 + 5 * xi
+            sum_ += new.reshape(n, 1)
+
+        sum_ = sum_ / (38.7122 * d)
+        # sum_ = sum_/d
+
+        return -0.5 * sum_ + np.random.randn(X.shape[0], 1) * sigma
+
+    def michal_un(self, X, sigma=0.1):
+        (n, d) = X.shape
+        X = (X + 0.5) * np.pi
+        ar = np.arange(1, d + 1, 1)
+        sum_ = np.sin(X) * np.power((np.sin(ar * X / np.pi)), (2 * d))
+        sum_ = np.sum(sum_, axis=1).reshape(-1, 1)
+        return sum_ + np.random.randn(X.shape[0], 1) * sigma
+
+    def michal(self, X, sigma=0.1):
+        (n, d) = X.shape
+        X = (X + 0.5) * np.pi
+        ar = np.arange(1, d + 1, 1)
+        sum_ = np.sin(X) * np.power((np.sin(ar * X / np.pi)), (2 * d))
+        sum_ = np.sum(sum_, axis=1).reshape(-1, 1)
+        sum_ = sum_ / self.michal_optimum(d)[1]
+        return sum_ + np.random.randn(X.shape[0], 1) * sigma
+
+    def michal_bounds(self, N, n, d=1, adv_inv=False):
+        """Sample N uniform starting points and optionally build an n-per-axis test grid.
+
+        Returns (d, xtest, x, 0.3); xtest is None when n is None. With adv_inv set,
+        the starting points are drawn from [-0.5, 0.0] instead of [-0.5, 0.5].
+        """
+        upper = 0.0 if adv_inv else 0.5
+        x = np.random.uniform(-0.5, upper, size=(N, d))
+        xtest = None
+        if n is not None:  # identity test instead of `== None`
+            arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1) for _ in range(d)]
+            xtest = cartesian(arrays)
+        return (d, xtest, x, 0.3)
+
+    def michal_opt_bounds(self, d):
+        """Return d copies of the (-0.5, 0.5) interval as a tuple."""
+        return tuple((-0.5, 0.5) for _ in range(d))
+
+    def michal_optimum(self, d):
+        """Return (location, value) of the tabulated optimum of `michal_un` in d dimensions.
+
+        The hard-coded coordinates are given in the pi-scaled domain that
+        `michal_un` works in and are mapped back with x / pi - 0.5; the value is
+        the noise-free `michal_un` output at that location.
+
+        d: number of leading table entries to use.
+        """
+        # Slots 0-18 are tabulated; slot 19 keeps the filler value 1.0.
+        # NOTE: only 20 slots exist, so d > 20 still yields 20 coordinates.
+        table = [
+            2.93254, 2.34661, 1.64107, 1.24415, 0.999643, 0.834879,
+            2.1089, 1.84835, 1.64448, 1.48089, 1.34678, 1.2349,
+            1.89701, 1.76194, 1.64477, 1.54218, 1.45162, 1.37109,
+            1.81774, 1.0,
+        ]
+        opt = np.array(table, dtype=float)
+
+        # First d coordinates as a single row, then undo the (X + 0.5) * pi scaling.
+        opt = opt[0:d].reshape(1, -1)
+        opt = (opt / np.pi) - 0.5
+
+        # sigma=0 multiplies the noise term by zero.
+        value = self.michal_un(opt, sigma=0)
+        return (opt, value[0][0])
+
+    def stang_un(self, X, sigma=0.1):
+        """Unnormalised `stang`: -0.5 * sum_i(z_i^4 - 16 z_i^2 + 5 z_i) with z = 8 * X, plus noise."""
+        (_, _) = X.shape  # same requirement as before: X must be two-dimensional
+        Z = X * 8
+        sq = Z**2
+        total = np.sum(sq**2 - 16.0 * sq + 5 * Z, axis=1).reshape(-1, 1)
+        return -0.5 * total + np.random.randn(Z.shape[0], 1) * sigma
+
+    def stang(self, X, sigma=0.1):
+        """`stang_un` with the summed polynomial divided by the `stang_optimum(d)` value, plus noise."""
+        (_, dims) = X.shape
+        Z = X * 8
+        total = np.sum((Z**2) ** 2 - 16.0 * Z**2 + 5 * Z, axis=1).reshape(-1, 1)
+        total = total / self.stang_optimum(dims)[1]
+        return -0.5 * total + np.random.randn(Z.shape[0], 1) * sigma
+
+    def stang_bounds(self, N, n, d=1, adv_inv=False):
+        """Sample N uniform starting points and optionally build an n-per-axis test grid.
+
+        Returns (d, xtest, x, 0.6); xtest is None when n is None. With adv_inv set,
+        a notice is printed and the points are drawn from [0.4, 0.5] only.
+        """
+        if adv_inv:
+            print("Adversarially initiallized")
+        x = np.random.uniform(0.4 if adv_inv else -0.5, 0.5, size=(N, d))
+        xtest = None
+        if n is not None:  # identity test instead of `== None`
+            arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1) for _ in range(d)]
+            xtest = cartesian(arrays)
+        return (d, xtest, x, 0.6)
+
+    def stang_opt_bounds(self, d):
+        """Return d copies of the (-0.5, 0.5) interval as a tuple."""
+        return tuple((-0.5, 0.5) for _ in range(d))
+
+    def stang_optimum(self, d):
+        """Return (location, value): every coordinate at -2.9035 / 8, scored by `stang_un` at sigma 0."""
+        loc = (np.ones(shape=(d)) * (-2.9035)) / 8
+        loc = loc.reshape(1, -1)
+
+        # sigma=0.0 multiplies the noise term by zero.
+        return (loc, self.stang_un(loc, sigma=0.0)[0][0])
+
+    def double_group_un(self, X, sigma=0.1):
+        """Unnormalised `double_group`: 0.5 * sum of exp(-(dx / 0.25)^2) over adjacent-column gaps dx, plus noise."""
+        gaps = np.diff(X, axis=1)
+        total = np.sum(np.exp(-((gaps / 0.25) ** 2)), axis=1).reshape(-1, 1)
+        return 0.5 * total + np.random.randn(X.shape[0], 1) * sigma
+
+    def double_group(self, X, sigma=0.1):
+        """`double_group_un` with the summed terms divided by the `double_group_optimum(d)` value, plus noise."""
+        (_, dims) = X.shape
+        gaps = np.diff(X, axis=1)
+        total = np.sum(np.exp(-((gaps / 0.25) ** 2)), axis=1).reshape(-1, 1)
+        total = total / self.double_group_optimum(dims)[1]
+        return 0.5 * total + np.random.randn(X.shape[0], 1) * sigma
+
+    def double_group_bounds(self, N, n, d=1, adv_inv=False):
+        """Sample N uniform starting points and optionally build an n-per-axis test grid.
+
+        Returns (d, xtest, x, 0.6); xtest is None when n is None. With adv_inv set,
+        a notice is printed and the points are drawn from [-0.5, -0.4] only.
+        """
+        if adv_inv:
+            print("Adversarially initiallized")
+        x = np.random.uniform(-0.5, -0.4 if adv_inv else 0.5, size=(N, d))
+        xtest = None
+        if n is not None:  # identity test instead of `== None`
+            arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1) for _ in range(d)]
+            xtest = cartesian(arrays)
+        return (d, xtest, x, 0.6)
+
+    def double_group_opt_bounds(self, d):
+        """Return d copies of the (-0.5, 0.5) interval as a tuple."""
+        return tuple((-0.5, 0.5) for _ in range(d))
+
+    def double_group_optimum(self, d):
+        """Return (location, value) with the origin as location, scored by `double_group_un` at sigma 0."""
+        origin = np.zeros(shape=(1, d))
+        return (origin, self.double_group_un(origin, 0)[0][0])
+
+    def swissfel(self, X, sigma=0.1):
+        """Predict with the fitted surrogate; `swissfel_bounds` must have run first.
+
+        sigma is only compared with 0.0: zero returns the bare prediction, any other
+        value adds Gaussian noise scaled by self.noise (not by sigma).
+        """
+        if self.init == False:
+            raise AssertionError("Need to run bounds first.")
+        mean = self.model.predict(X)[0]
+        # random numbers are drawn only on the noisy path
+        return mean if sigma == 0.0 else mean + np.random.randn(X.shape[0], 1) * self.noise
+
+    def swissfel_bounds(self, N, n):
+        """Fit or load the SwissFEL surrogate model, then return (5, xtest, x, 0.1).
+
+        While self.init is False the model is fitted from the HDF5 evaluations and
+        pickled, or, when the model pickle already exists, loaded from it together
+        with the stored noise level; self.init is then set to True.
+
+        Args:
+            N: number of starting points drawn uniformly from [-0.5, 0.5]^5.
+            n: grid points per axis for xtest, or None to skip the grid.
+
+        Returns:
+            Tuple (5, xtest, x, 0.1).
+
+        NOTE(review): the paths are absolute and machine-specific, and the cache is
+        read with pickle, which is only safe for files this code wrote itself.
+        """
+        if self.init == False:
+            import os.path
+
+            # NOTE(review): hard-coded location of the data and cache files
+            root = "/home/mojko/Documents/PhD/RFFinBO/code/test_problems/"
+            fname = root + "swissfel_model.p"
+            noise_fname = root + "swissfel_noise.p"
+            # fit from the raw evaluations only when no cached model exists
+            if not os.path.isfile(fname):
+                f = File(root + "evaluations.hdf5")
+                dset = f["1"]
+
+                # y response and scale
+                Y = dset["y"][:].reshape(-1, 1)
+                Y = Y / np.max(np.abs(Y))
+
+                # noise structure
+                # NOTE(review): Y was divided by max |Y| above, so this divisor is 1 - confirm intent.
+                Yerr = dset["y_std"] / np.max(np.abs(Y))
+                self.noise = np.std(Yerr)
+                print("Estimated noise level", self.noise)
+
+                # data scale to [-0.5,0.5]
+                X = dset["x"][:].reshape(-1, 5)
+                for j in range(5):
+                    a = np.min(X[:, j])
+                    b = np.max(X[:, j])
+                    X[:, j] = (X[:, j] / (b - a)) - 0.5 - a / (b - a)
+
+                ## fully additive kernel: one 1-D RBF per input dimension
+                self.kernel = (
+                    GPy.kern.RBF(1, active_dims=[0])
+                    + GPy.kern.RBF(1, active_dims=[1])
+                    + GPy.kern.RBF(1, active_dims=[2])
+                    + GPy.kern.RBF(1, active_dims=[3])
+                    + GPy.kern.RBF(1, active_dims=[4])
+                )
+                self.model = GPy.models.GPRegression(X, Y, self.kernel)
+                print("Model fit")
+                self.model.optimize(messages=True)
+                print("ML likelihood fit")
+                self.init = True
+                # save pickle; `with` closes each handle even if dumping fails
+                with open(fname, "wb") as model_file:
+                    pickle.dump(self.model, model_file)
+                with open(noise_fname, "wb") as noise_file:
+                    pickle.dump(self.noise, noise_file)
+            else:
+                # reuse the cached model and noise level
+                self.init = True
+                with open(fname, "rb") as model_file:
+                    self.model = pickle.load(model_file)
+                with open(noise_fname, "rb") as noise_file:
+                    self.noise = pickle.load(noise_file)
+
+        # optional evaluation grid over [-0.5, 0.5]^5
+        xtest = None
+        if n is not None:
+            arrays = [np.linspace(-0.5, 0.5, n).reshape(n, 1) for _ in range(5)]
+            xtest = cartesian(arrays)
+
+        # bw = np.min(self.kernel.lengthscale)
+        # random starting points
+        x = np.random.uniform(-0.5, 0.5, size=(N, 5))
+        return (5, xtest, x, 0.1)
+
+    def swissfel_opt_bounds(self):
+        """Return five copies of the (-0.5, 0.5) interval as a tuple."""
+        return tuple((-0.5, 0.5) for _ in range(5))
+
+    def swissfel_optimum(self):
+        """Maximise the noise-free `swissfel` with 10 random L-BFGS-B restarts; return (self.opt_loc, best)."""
+        from scipy.optimize import minimize
+
+        box = self.swissfel_opt_bounds()
+        # minimising the negated prediction maximises the function
+        neg_f = lambda x: -self.swissfel(x.reshape(1, -1), sigma=0.0)[0][0]
+
+        best = -10.0
+        for _ in range(10):
+            start = np.random.uniform(-0.5, 0.5, size=(5,))
+            sol = minimize(neg_f, start, method="L-BFGS-B", tol=0.0001, bounds=box)
+            candidate = self.swissfel(sol.x.reshape(1, -1), sigma=0)
+            if candidate > best:
+                best = candidate
+                self.opt_loc = sol.x.reshape(1, -1)
+        return (self.opt_loc, best)
+
+    def neural_net(self, X, sigma=0.1):
+        """Train one network per row of X and return the accuracies as an (n, 1) array.
+
+        Entry 0 of each row is passed as dropout and the rest as
+        initialization_params. The dataset is loaded into self.mnist on the first
+        call. sigma is not used by this method.
+        """
+        (n, d) = X.shape
+        val_size = 400
+        if self.sampled == False:
+            self.sampled = True
+            try:
+                self.mnist = input_data.read_data_sets(
+                    "~/.", one_hot=True, validation_size=val_size
+                )
+            except Exception:  # was a bare except, which also traps KeyboardInterrupt
+                self.mnist = input_data.read_data_sets("~/.", one_hot=True)
+
+        res = []
+        for row in X:
+            (it, acc) = train_network(
+                self.mnist, dropout=row[0], verbose=False, val_size=val_size,
+                maxiter=300, initialization_params=row[1:], val_count=30,
+                no_filters_1=self.NN, no_filters_2=self.NN2,
+            )
+            res.append(acc)
+        # Stack every accuracy; previously only the last one was reshaped, which fails for n > 1.
+        return np.array(res).reshape(n, 1)
+
+    def neural_net_bounds(self, N, n, NN=16, NN2=22):
+        """Store NN/NN2 and return (d + 1, xtest, x, 0.9), where d = NN + NN2; dropout is column 0."""
+        self.NN = NN
+        self.NN2 = NN2
+        d = self.NN + self.NN2
+
+        params = np.random.uniform(0, 10, size=(N, d))
+        dropout = np.random.uniform(0, 1, size=(N, 1))
+        # Dropout goes first, as `neural_net`, `neural_net_opt_bounds` and the xtest
+        # grid expect; it used to be appended as the last column.
+        x = np.concatenate((dropout, params), axis=1)
+
+        xtest = None
+        if n is not None:  # identity test instead of `== None`
+            axes = [np.linspace(0, 1, n).reshape(n, 1)]
+            axes += [np.linspace(0, 10, n).reshape(n, 1) for _ in range(d)]
+            xtest = cartesian(axes)
+        return (d + 1, xtest, x, 0.9)
+
+    def neural_net_opt_bounds(self):
+        """Return box bounds: (0, 1) first, then NN + NN2 pairs of (0, 10)."""
+        count = self.NN + self.NN2
+        return ((0, 1),) + tuple((0, 10) for _ in range(count))
 
 
 if __name__ == "__main__":
-	s = 0
-	TT = test_function()
-	Fs = [lambda x: TT.f(x, sigma=s), lambda x: TT.branin(x, sigma=s), lambda x: TT.camelback(x, sigma=s),
-		  lambda x: TT.hartmann3(x, sigma=s), lambda x: TT.hartmann4(x, sigma=s), lambda x: TT.hartmann6(x, sigma=s)]
-	Fbounds = [lambda n: TT.f_bounds(1, n), lambda n: TT.branin_bounds(1, n), lambda n: TT.camelback_bounds(1, n),
-			   lambda n: TT.hartmann3_bounds(1, n), lambda n: TT.hartmann4_bounds(1, n),
-			   lambda n: TT.hartmann6_bounds(1, n)]
-	ns = [4000, 200, 200, 100, 50, 10]
-	tests = ["1D", "Branin", "Camelback", "Hartmann3", "Hartmann4", "Hartmann6"]
-	z = []
-	for i in range(6):
-		(d, xtest, x, _) = Fbounds[i](ns[i])
-		z.append(np.max(Fs[i](xtest)))
-		print(tests[i], np.max(Fs[i](xtest)))
-	print(z)
-
-	for d, n in zip([1, 2, 3, 4], [900, 100, 50, 3]):
-		G = lambda x: TT.stang(x, sigma=s)
-		(q, xtest, x, _) = TT.stang_bounds(1, n, d=d)
-		print(d, np.max(G(xtest)), np.max(G(xtest)) / d)
-
-	# G = lambda x: TT.michal(x, sigma = s)
-	# (d,xtest,x,_) = TT.michal_bounds(1,5, d = 10)
-	# print (d, np.max(G(xtest)), np.max(G(xtest))/d)
-
-	# for d in np.arange(1,31,1):
-	# 	TT.optimize_f(d = d)
-
-	print("==== Optimized vs Non-Optimized ==== ")
-	print("Michal")
-	multistart = 400
-	d = 10
-	G1 = lambda x: TT.michal(x, sigma=0.)
-	fun = lambda x: -TT.michal(x.reshape(-1, 1), sigma=0.)[0][0]
-	(d, xtest, x, _) = TT.michal_bounds(20, None, d=d)
-	mybounds = TT.michal_opt_bounds(d=d)
-
-	from scipy.optimize import minimize
-
-	results = []
-	for i in range(multistart):
-		x0 = np.random.randn(d)
-		for i in range(d):
-			x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
-		res = minimize(fun, x0, method="L-BFGS-B", jac=None, tol=0.00001, bounds=mybounds)
-		# res = minimize(fun, x0, method = "SLSQP", jac = None, tol = 0.00001, bounds=mybounds)
-		solution = res.x
-		results.append([solution, -fun(solution)])
-	results = np.array(results)
-	print(np.max(results[:, 1]))
-
-	print("Stybtang")
-	for d in [10, 20]:
-		multistart = 400
-		G1 = lambda x: TT.stang(x, sigma=0.)
-		fun = lambda x: -TT.stang(x.reshape(-1, 1), sigma=0.)[0][0]
-		(d, xtest, x, _) = TT.stang_bounds(20, None, d=d)
-		mybounds = TT.stang_opt_bounds(d=d)
-		from scipy.optimize import minimize
-
-		results = []
-		for i in range(multistart):
-			x0 = np.random.randn(d)
-			for i in range(d):
-				x0[i] = np.random.uniform(mybounds[i][0], mybounds[i][1])
-			res = minimize(fun, x0, method="L-BFGS-B", jac=None, tol=0.00001, bounds=mybounds)
-			# res = minimize(fun, x0, method = "SLSQP", jac = None, tol = 0.00001, bounds=mybounds)
-			solution = res.x
-			results.append([solution, -fun(solution)])
-
-		results = np.array(results)
-		print(d, np.max(results[:, 1]))
+    s = 0
+    TT = test_function()
+    Fs = [
+        lambda x: TT.f(x, sigma=s),
+        lambda x: TT.branin(x, sigma=s),
+        lambda x: TT.camelback(x, sigma=s),
+        lambda x: TT.hartmann3(x, sigma=s),
+        lambda x: TT.hartmann4(x, sigma=s),
+        lambda x: TT.hartmann6(x, sigma=s),
+    ]
+    Fbounds = [
+        lambda n: TT.f_bounds(1, n),
+        lambda n: TT.branin_bounds(1, n),
+        lambda n: TT.camelback_bounds(1, n),
+        lambda n: TT.hartmann3_bounds(1, n),
+        lambda n: TT.hartmann4_bounds(1, n),
+        lambda n: TT.hartmann6_bounds(1, n),
+    ]
+    ns = [4000, 200, 200, 100, 50, 10]
+    tests = ["1D", "Branin", "Camelback", "Hartmann3", "Hartmann4", "Hartmann6"]
+    z = []
+    for i, name in enumerate(tests):
+        (d, xtest, x, _) = Fbounds[i](ns[i])
+        z.append(np.max(Fs[i](xtest)))  # evaluate each grid once and reuse the maximum
+        print(name, z[-1])
+    print(z)
+
+    for d, n in zip([1, 2, 3, 4], [900, 100, 50, 3]):
+        (q, xtest, x, _) = TT.stang_bounds(1, n, d=d)
+        grid_max = np.max(TT.stang(xtest, sigma=s))  # one pass over the grid instead of two
+        print(d, grid_max, grid_max / d)
+
+    # G = lambda x: TT.michal(x, sigma = s)
+    # (d,xtest,x,_) = TT.michal_bounds(1,5, d = 10)
+    # print (d, np.max(G(xtest)), np.max(G(xtest))/d)
+
+    # for d in np.arange(1,31,1):
+    # 	TT.optimize_f(d = d)
+
+    print("==== Optimized vs Non-Optimized ==== ")
+    print("Michal")
+    multistart = 400
+    d = 10
+    G1 = lambda x: TT.michal(x, sigma=0.0)
+    # One candidate is a single row with d columns; reshape(-1, 1) would instead
+    # score d separate one-dimensional points and keep only the first of them.
+    fun = lambda x: -TT.michal(x.reshape(1, -1), sigma=0.0)[0][0]
+    (d, xtest, x, _) = TT.michal_bounds(20, None, d=d)
+    mybounds = TT.michal_opt_bounds(d=d)
+
+    from scipy.optimize import minimize
+
+    # Multi-start L-BFGS-B. Only the objective values are kept, because stacking
+    # [solution, value] pairs builds a ragged array that recent NumPy rejects.
+    values = []
+    for _ in range(multistart):
+        x0 = np.array([np.random.uniform(lo, hi) for (lo, hi) in mybounds])
+        res = minimize(
+            fun, x0, method="L-BFGS-B", jac=None, tol=0.00001, bounds=mybounds
+        )
+        values.append(-fun(res.x))
+    best_value = np.max(values)
+    print(best_value)
+
+    print("Stybtang")
+    for d in [10, 20]:
+        multistart = 400
+        G1 = lambda x: TT.stang(x, sigma=0.0)
+        # One candidate is a single row with d columns; reshape(-1, 1) would instead
+        # score d separate one-dimensional points and keep only the first of them.
+        fun = lambda x: -TT.stang(x.reshape(1, -1), sigma=0.0)[0][0]
+        (d, xtest, x, _) = TT.stang_bounds(20, None, d=d)
+        mybounds = TT.stang_opt_bounds(d=d)
+        from scipy.optimize import minimize
+
+        # Only the objective values are kept, because stacking [solution, value]
+        # pairs builds a ragged array that recent NumPy rejects.
+        values = []
+        for _ in range(multistart):
+            x0 = np.array([np.random.uniform(lo, hi) for (lo, hi) in mybounds])
+            res = minimize(
+                fun, x0, method="L-BFGS-B", jac=None, tol=0.00001, bounds=mybounds
+            )
+            values.append(-fun(res.x))
+
+        best_value = np.max(values)
+        print(d, best_value)
 
 # print (G1(x))
 # print (G2(x))
diff --git a/tests/SRI_test.py b/tests/SRI_test.py
index 2598ca0..e6b97ae 100755
--- a/tests/SRI_test.py
+++ b/tests/SRI_test.py
@@ -4,89 +4,93 @@
 
 
 def get_angle(R):
-	v = torch.Tensor([1.0,1.0]).double()
-	a1 =  np.arccos((torch.dot(v,R@v)/torch.dot(v,v)).numpy())
-	a2 =  np.arccos(-(torch.dot(v,R@v)/torch.dot(v,v)).numpy())
-	return np.min([a1,a2])
+    v = torch.tensor([1.0, 1.0]).double()
+    cos_angle = (torch.dot(v, R @ v) / torch.dot(v, v)).numpy()
+    # angle for the ratio and for its negation; the smaller of the two is returned
+    return np.min([np.arccos(cos_angle), np.arccos(-cos_angle)])
 
 
 if __name__ == "__main__":
-	from stpy.embeddings.embedding import HermiteEmbedding
-	N = 1
-	s = 0.0001
-	n = 20
-	L_infinity_ball = 0.5
-
-	d = 2
-
-	thetae = np.radians(35.)
-	ce, se = np.cos(thetae), np.sin(thetae)
-	R = torch.from_numpy(np.array(((ce, -se), (se, ce))))
-
-	BenchmarkFunc = MichalBenchmark(d = d, R = R)
-
-	x = BenchmarkFunc.initial_guess(N)
-	xtest = BenchmarkFunc.interval(n)
-	gamma = BenchmarkFunc.bandwidth()
-	bounds = BenchmarkFunc.bounds()
-	BenchmarkFunc.scale_max(xtest=xtest)
-
-	print ("Gamma:",gamma)
-
-	F = lambda x: BenchmarkFunc.eval(x, sigma=s)
-	F0 = lambda x: BenchmarkFunc.eval(x, sigma=0)
-
-
-	rot_out = open("rotOut.txt",'w')
-
-
-	m = 64
-	GP = GaussianProcessFF(d=d, s=s, m = torch.ones(d)*m, gamma=gamma*torch.ones(d), bounds=bounds, groups = stpy.helpers.helper.full_group(d))
-	#GP = GaussianProcess(d =d ,s = s, gamma = gamma*torch.ones(d) ,groups = stpy.helper.full_group(d))
-	#GP = GaussianProcess(d=d, s=s, gamma=gamma, groups=None)
-
-	m = 512
-	embedding = HermiteEmbedding(gamma=gamma, m=m, d=d, diameter=1, approx = "hermite")
-	Map = lambda x: embedding.embed(x)
-
-
-
-	x0 = torch.Tensor([0., 0.]).double().view(-1, d)
-#	Bandit = OPPR_TS_GP(x0, F, GP, Map, finite_dim=False, s = 10e-8)
-	Bandit = OPPR_TS_GP(x0, F, GP, Map, finite_dim=True, s = s, GPMap = True)
-
-	Rep = 2
-	Bandit.decolerate(x0,10e-9,Rep)
-
-	print ("True:",thetae)
-	print (R)
-	print("Angle:",get_angle(R))
-
-	rot_out.write(str(get_angle(R))+"\n")
-
-	print ("E design:\n",Bandit.Q)
-	print("Angle:",get_angle(Bandit.Q.detach()))
-	rot_out.write(str(get_angle(Bandit.Q.detach()))+"\n")
-
-	# Gaussian Design
-	#Design = torch.randn(size = (Nd,d),dtype = torch.float64)*0.1
-
-	Design = Bandit.design
-	y = Bandit.value_design
-	for repeats in range(5):
-		B = Bandit.inverse_sliced_regression(Design,y,slices = Rep)
-		print ("Recovered from SRI:\n",B)
-		print (get_angle(B))
-		rot_out.write(str(get_angle(B)) + " ")
-
-	rot_out.write("\n")
-	BB = Bandit.bootstrap_inverse_sliced_regression(Design,y,slices = Rep,repeats = 20)
-	print ("Bootstrap",BB)
-	rot_out.write(str(get_angle(torch.from_numpy(BB)))+"\n")
-
-	for _ in range(5):
-		Bandit.GP2.optimize_params(type="rots", restarts=1)
-		print (Bandit.GP2.Rot)
-		rot_out.write(str(get_angle(Bandit.GP2.Rot))+" ")
-	rot_out.write("\n")
-	rot_out.close()
+    from stpy.embeddings.embedding import HermiteEmbedding
+
+    N = 1
+    s = 0.0001
+    n = 20
+    L_infinity_ball = 0.5
+
+    d = 2
+    # Ground-truth rotation: the 2x2 rotation matrix for an angle of 35 degrees.
+    thetae = np.radians(35.0)
+    ce, se = np.cos(thetae), np.sin(thetae)
+    R = torch.from_numpy(np.array(((ce, -se), (se, ce))))
+
+    BenchmarkFunc = MichalBenchmark(d=d, R=R)
+
+    x = BenchmarkFunc.initial_guess(N)
+    xtest = BenchmarkFunc.interval(n)
+    gamma = BenchmarkFunc.bandwidth()
+    bounds = BenchmarkFunc.bounds()
+    BenchmarkFunc.scale_max(xtest=xtest)
+
+    print("Gamma:", gamma)
+
+    F = lambda x: BenchmarkFunc.eval(x, sigma=s)
+    F0 = lambda x: BenchmarkFunc.eval(x, sigma=0)
+    # Every angle computed below is also written to rotOut.txt.
+    rot_out = open("rotOut.txt", "w")
+    # NOTE(review): rot_out is closed manually at the end; an exception before that leaks the handle.
+    m = 64
+    GP = GaussianProcessFF(
+        d=d,
+        s=s,
+        m=torch.ones(d) * m,
+        gamma=gamma * torch.ones(d),
+        bounds=bounds,
+        groups=stpy.helpers.helper.full_group(d),
+    )
+    # GP = GaussianProcess(d =d ,s = s, gamma = gamma*torch.ones(d) ,groups = stpy.helper.full_group(d))
+    # GP = GaussianProcess(d=d, s=s, gamma=gamma, groups=None)
+
+    m = 512
+    embedding = HermiteEmbedding(gamma=gamma, m=m, d=d, diameter=1, approx="hermite")
+    Map = lambda x: embedding.embed(x)
+
+    x0 = torch.tensor([0.0, 0.0]).double().view(-1, d)
+    # Bandit = OPPR_TS_GP(x0, F, GP, Map, finite_dim=False, s = 10e-8)
+    Bandit = OPPR_TS_GP(x0, F, GP, Map, finite_dim=True, s=s, GPMap=True)
+
+    Rep = 2
+    Bandit.decolerate(x0, 10e-9, Rep)
+
+    print("True:", thetae)
+    print(R)
+    print("Angle:", get_angle(R))
+
+    rot_out.write(str(get_angle(R)) + "\n")
+
+    print("E design:\n", Bandit.Q)
+    print("Angle:", get_angle(Bandit.Q.detach()))
+    rot_out.write(str(get_angle(Bandit.Q.detach())) + "\n")
+
+    # Gaussian Design
+    # Design = torch.randn(size = (Nd,d),dtype = torch.float64)*0.1
+    # Run inverse_sliced_regression five times on the design stored by the bandit.
+    Design = Bandit.design
+    y = Bandit.value_design
+    for repeats in range(5):
+        B = Bandit.inverse_sliced_regression(Design, y, slices=Rep)
+        print("Recovered from SRI:\n", B)
+        print(get_angle(B))
+        rot_out.write(str(get_angle(B)) + " ")
+
+    rot_out.write("\n")
+    BB = Bandit.bootstrap_inverse_sliced_regression(Design, y, slices=Rep, repeats=20)
+    print("Bootstrap", BB)
+    rot_out.write(str(get_angle(torch.from_numpy(BB))) + "\n")
+    # Five single-restart "rots" fits of Bandit.GP2; each resulting angle is logged.
+    for _ in range(5):
+        Bandit.GP2.optimize_params(type="rots", restarts=1)
+        print(Bandit.GP2.Rot)
+        rot_out.write(str(get_angle(Bandit.GP2.Rot)) + " ")
+    rot_out.write("\n")
+    rot_out.close()
diff --git a/tests/clenshaw_curtis_test.py b/tests/clenshaw_curtis_test.py
index b3f96d1..c684070 100644
--- a/tests/clenshaw_curtis_test.py
+++ b/tests/clenshaw_curtis_test.py
@@ -3,35 +3,40 @@
 
 if __name__ == "__main__":
 
-	### Generate data - a sample from a Gaussian process
-	n = 1024
-	N = 5
-	gamma = 0.09
-	#gamma = 1.
-	s = 0.2
-	# benchmark = stpy.test_functions.benchmarks.GaussianProcessSample(d =1, gamma = gamma, sigma = s, n = n)
-	benchmark = stpy.test_functions.benchmarks.Simple1DFunction(d=1, sigma=s)
-	for j in range(10):
-		m = (2*(j+1)) ** 2
-		#m = 64
-		x = benchmark.initial_guess(N, adv_inv=False)
-		y = benchmark.eval(x)
-		xtest = benchmark.interval(1024)
+    ### Generate data - a sample from a Gaussian process
+    n = 1024
+    N = 5
+    gamma = 0.09
+    # gamma = 1.
+    s = 0.2
+    # benchmark = stpy.test_functions.benchmarks.GaussianProcessSample(d =1, gamma = gamma, sigma = s, n = n)
+    benchmark = stpy.test_functions.benchmarks.Simple1DFunction(d=1, sigma=s)
+    for j in range(10):
+        m = (2 * (j + 1)) ** 2
+        # m = 64
+        x = benchmark.initial_guess(N, adv_inv=False)
+        y = benchmark.eval(x)
+        xtest = benchmark.interval(1024)
 
-		#print (x)
-		CFF = stpy.continuous_processes.fourier_fea.GaussianProcessFF(gamma=gamma, approx="ccff", m=m, s=s)
-		QFF = stpy.continuous_processes.fourier_fea.GaussianProcessFF(gamma=gamma, approx="hermite", m=m, s=s)
-		TFF = stpy.continuous_processes.fourier_fea.GaussianProcessFF(gamma=gamma, approx="trapezoidal", m=m, s=s)
+        # print (x)
+        CFF = stpy.continuous_processes.fourier_fea.GaussianProcessFF(
+            gamma=gamma, approx="ccff", m=m, s=s
+        )
+        QFF = stpy.continuous_processes.fourier_fea.GaussianProcessFF(
+            gamma=gamma, approx="hermite", m=m, s=s
+        )
+        TFF = stpy.continuous_processes.fourier_fea.GaussianProcessFF(
+            gamma=gamma, approx="trapezoidal", m=m, s=s
+        )
 
-		K1 = TFF.embed(x)@TFF.embed(x).T
-		K2 = QFF.embed(x) @ QFF.embed(x).T
-		K3 = CFF.embed(x) @ CFF.embed(x).T
-		#	print(K2)
-		# print("----------------")
-		#print(K3)
-		# print("----------------")
-		print(m, torch.norm(K1 - K2), torch.norm(K2 -K3))
-
-	#CFF.fit_gp(x,y)
-	#CFF.visualize(xtest)
+        K1 = TFF.embed(x) @ TFF.embed(x).T
+        K2 = QFF.embed(x) @ QFF.embed(x).T
+        K3 = CFF.embed(x) @ CFF.embed(x).T
+        # 	print(K2)
+        # print("----------------")
+        # print(K3)
+        # print("----------------")
+        print(m, torch.norm(K1 - K2), torch.norm(K2 - K3))
 
+    # CFF.fit_gp(x,y)
+    # CFF.visualize(xtest)
diff --git a/tests/constrained_mean.py b/tests/constrained_mean.py
index 1bec7ee..7491a48 100644
--- a/tests/constrained_mean.py
+++ b/tests/constrained_mean.py
@@ -5,19 +5,19 @@
 import matplotlib.pyplot as plt
 
 if __name__ == "__main__":
-	d = 1
-	p = 4
-	embed_p = ChebyschevEmbedding(d=d, p=p)
-	m = embed_p.size
-	GP = KernelizedFeatures(embeding=embed_p, m=m, d=d)
+    d = 1
+    p = 4
+    embed_p = ChebyschevEmbedding(d=d, p=p)
+    m = embed_p.size
+    GP = KernelizedFeatures(embeding=embed_p, m=m, d=d)
 
-	x = torch.from_numpy(interval(10,d))
-	xtest = torch.from_numpy(interval(1024, d))
-	GP.fit_gp(x, x**8)
+    x = torch.from_numpy(interval(10, d))
+    xtest = torch.from_numpy(interval(1024, d))
+    GP.fit_gp(x, x**8)
 
-	mu = GP.mean_constrained(xtest, B = 0.5)
+    mu = GP.mean_constrained(xtest, B=0.5)
 
-	GP.visualize(xtest, show = False)
-	#plt.plot(x, x**8,'o')
-	plt.plot(xtest,mu)
-	plt.show()
\ No newline at end of file
+    GP.visualize(xtest, show=False)
+    # plt.plot(x, x**8,'o')
+    plt.plot(xtest, mu)
+    plt.show()
diff --git a/tests/continous_processes/psd_minimization/eigenvector_constraint.py b/tests/continous_processes/psd_minimization/eigenvector_constraint.py
index 5cadc87..cad3c1f 100644
--- a/tests/continous_processes/psd_minimization/eigenvector_constraint.py
+++ b/tests/continous_processes/psd_minimization/eigenvector_constraint.py
@@ -4,7 +4,12 @@
 import torch
 
 
-from stpy.embeddings.embedding import HermiteEmbedding, RFFEmbedding, ConcatEmbedding, MaskedEmbedding
+from stpy.embeddings.embedding import (
+    HermiteEmbedding,
+    RFFEmbedding,
+    ConcatEmbedding,
+    MaskedEmbedding,
+)
 from stpy.kernels import KernelFunction
 from stpy.helpers.helper import interval, interval_torch
 from stpy.probability.gaussian_likelihood import GaussianLikelihood
@@ -21,12 +26,14 @@
     m = 32
 
     def stable_rank(A):
-        return np.trace(A)/np.max(np.linalg.eigh(A)[0])
-
+        return np.trace(A) / np.max(np.linalg.eigh(A)[0])
 
     V = torch.linalg.qr(torch.randn(size=(m, m)).double())[0]
 
-    f = lambda x: 0.5*torch.sin(x * 20) * (x > 0).double() + 0.5*torch.sin(x * 30) * (x > 0).double()
+    f = (
+        lambda x: 0.5 * torch.sin(x * 20) * (x > 0).double()
+        + 0.5 * torch.sin(x * 30) * (x > 0).double()
+    )
     Xtrain = interval_torch(n=N, d=1)
     ytrain = f(Xtrain)
 
@@ -45,8 +52,8 @@ def stable_rank(A):
     A1 = cp.Variable((m // 2, m // 2), PSD=True)
     A2 = cp.Variable((m // 2, m // 2), PSD=True)
     A3 = cp.Variable((m // 2, m // 2))
-    l = cp.Variable((1,1))
-    s = cp.Parameter((1, 1), nonneg = True)
+    l = cp.Variable((1, 1))
+    s = cp.Parameter((1, 1), nonneg=True)
 
     likelihood = GaussianLikelihood(sigma=s)
     estimator = RegularizedDictionary(embedding, likelihood)
@@ -55,43 +62,61 @@ def stable_rank(A):
     likelihood = estimator.likelihood
     likelihood.load_data(data)
 
-    total_trace = 2.
+    total_trace = 2.0
     objective = likelihood.get_objective_cvxpy()(theta)
     A = cp.bmat([[A1, A3], [A3, A2]])
-    s.value = np.array([[1.]])
-    constraints = [cp.matrix_frac(theta, A) <= 1, cp.trace(A) <= total_trace*l, A >> 0,cp.lambda_max(A)<=l]
+    s.value = np.array([[1.0]])
+    constraints = [
+        cp.matrix_frac(theta, A) <= 1,
+        cp.trace(A) <= total_trace * l,
+        A >> 0,
+        cp.lambda_max(A) <= l,
+    ]
     prob = cp.Problem(cp.Minimize(objective), constraints)
     prob.solve(solver=cp.MOSEK, verbose=True)
 
     estimator.theta_fit = theta.value
     estimator.fitted = True
-    print (prob.value)
-    print (np.max(np.linalg.eigh(A.value)[0]))
-    print (l.value)
+    print(prob.value)
+    print(np.max(np.linalg.eigh(A.value)[0]))
+    print(l.value)
     print("--------------")
 
     if theta.value is not None:
         mu = estimator.mean(xtest)
-        plt.plot(xtest,mu, 'b', lw = 3, label = 'opt')
-
-    plt.plot(Xtrain,ytrain,'ko', lw = 3)
-    plt.plot(xtest,f(xtest),'k--', lw = 3)
-
-    constraints = [cp.matrix_frac(theta, A) <= 1, cp.trace(A) <= total_trace*l, A >> 0,cp.lambda_max(A)<=l, l<=s]
+        plt.plot(xtest, mu, "b", lw=3, label="opt")
+
+    plt.plot(Xtrain, ytrain, "ko", lw=3)
+    plt.plot(xtest, f(xtest), "k--", lw=3)
+
+    constraints = [
+        cp.matrix_frac(theta, A) <= 1,
+        cp.trace(A) <= total_trace * l,
+        A >> 0,
+        cp.lambda_max(A) <= l,
+        l <= s,
+    ]
     prob = cp.Problem(cp.Minimize(objective), constraints)
     prob.solve(solver=cp.MOSEK, verbose=True)
 
     def cost(z):
         s.value = z
         prob.solve()
-        return prob.value, total_trace * l.value, l.value,  (np.max(np.linalg.eigh(A.value)[0])), np.trace(A.value), stable_rank(A.value)
-
-    z_vals = np.logspace(-5,5,20, base = 2)
+        return (
+            prob.value,
+            total_trace * l.value,
+            l.value,
+            (np.max(np.linalg.eigh(A.value)[0])),
+            np.trace(A.value),
+            stable_rank(A.value),
+        )
+
+    z_vals = np.logspace(-5, 5, 20, base=2)
     l_vals = []
     eigvals = []
     differences = []
     for z in z_vals:
-        prob_val, _, l_val, eigv, _ , _  = cost(np.array([[z]]))
+        prob_val, _, l_val, eigv, _, _ = cost(np.array([[z]]))
         estimator.theta_fit = theta.value
         estimator.fitted = True
         mu = estimator.mean(xtest)
@@ -99,19 +124,18 @@ def cost(z):
         eigvals.append(float(eigv))
         differences.append(float(l_val) - float(eigv))
 
-        print (z, float(l_val) - float(eigv))
+        print(z, float(l_val) - float(eigv))
 
-        if float(l_val) - float(eigv) <= 1e-2 and float(l_val) - float(eigv)>=0:
-            plt.plot(xtest,mu, 'g--', lw = 3, label = 'stable-rank')
+        if float(l_val) - float(eigv) <= 1e-2 and float(l_val) - float(eigv) >= 0:
+            plt.plot(xtest, mu, "g--", lw=3, label="stable-rank")
     plt.show()
 
-    plt.plot(z_vals.reshape(-1),l_vals, label = 'lvals')
-    plt.plot(z_vals.reshape(-1),eigvals, label = 'eig')
+    plt.plot(z_vals.reshape(-1), l_vals, label="lvals")
+    plt.plot(z_vals.reshape(-1), eigvals, label="eig")
     # plt.plot(z_vals.reshape(-1), differences, label='diff')
     plt.legend()
     plt.show()
 
-
     #
     # # Fix an eigenvector
     # v_init = np.zeros(shape=(m, 1))
@@ -173,4 +197,4 @@ def cost(z):
     # #     grad = euclidean_gradient(w)
     # #     w = w - eta * grad
     # #     w = proj(w)
-    # #     print (i, value(w))
\ No newline at end of file
+    # #     print (i, value(w))
diff --git a/tests/continous_processes/psd_minimization/psd_minimization.py b/tests/continous_processes/psd_minimization/psd_minimization.py
index 80c9e70..686ccdf 100644
--- a/tests/continous_processes/psd_minimization/psd_minimization.py
+++ b/tests/continous_processes/psd_minimization/psd_minimization.py
@@ -1,11 +1,21 @@
-from stpy.embeddings.embedding import HermiteEmbedding, RFFEmbedding, ConcatEmbedding, MaskedEmbedding
+from stpy.embeddings.embedding import (
+    HermiteEmbedding,
+    RFFEmbedding,
+    ConcatEmbedding,
+    MaskedEmbedding,
+)
 import pymanopt
 import cvxpy as cp
 import numpy as np
 import torch
 from cvxpylayers.torch import CvxpyLayer
 
-from stpy.embeddings.embedding import HermiteEmbedding, RFFEmbedding, ConcatEmbedding, MaskedEmbedding
+from stpy.embeddings.embedding import (
+    HermiteEmbedding,
+    RFFEmbedding,
+    ConcatEmbedding,
+    MaskedEmbedding,
+)
 from stpy.kernels import KernelFunction
 from stpy.helpers.helper import interval, interval_torch
 from stpy.probability.gaussian_likelihood import GaussianLikelihood
@@ -15,7 +25,6 @@
 
 if __name__ == "__main__":
 
-
     N = 10
     n = 256
     d = 1
@@ -32,71 +41,71 @@
     xtest = torch.from_numpy(interval(n, d, L_infinity_ball=1))
     kernel_object = KernelFunction(gamma=0.05, d=1)
 
-    #embedding = HermiteEmbedding(m=m, gamma = 1.)
+    # embedding = HermiteEmbedding(m=m, gamma = 1.)
 
-    embedding1 = NystromFeatures(kernel_object=kernel_object, m=m//2)
+    embedding1 = NystromFeatures(kernel_object=kernel_object, m=m // 2)
     embedding1.fit_gp(xtest / 2 - 0.5, None)
-    embedding2 = NystromFeatures(kernel_object=kernel_object, m=m//2)
+    embedding2 = NystromFeatures(kernel_object=kernel_object, m=m // 2)
     embedding2.fit_gp(xtest / 2 + 0.5, None)
     embedding = ConcatEmbedding([embedding1, embedding2])
 
-
-    theta = cp.Variable((m,1))
-    A1 = cp.Variable((m//2,m//2), PSD = True)
-    A2 = cp.Variable((m//2, m//2), PSD=True)
-    A3 = cp.Variable((m//2, m//2))
+    theta = cp.Variable((m, 1))
+    A1 = cp.Variable((m // 2, m // 2), PSD=True)
+    A2 = cp.Variable((m // 2, m // 2), PSD=True)
+    A3 = cp.Variable((m // 2, m // 2))
     t = cp.Variable()
 
     likelihood = GaussianLikelihood(sigma=s)
     estimator = RegularizedDictionary(embedding, likelihood)
-    data = (embedding.embed(Xtrain),ytrain)
+    data = (embedding.embed(Xtrain), ytrain)
     estimator.load_data(data)
     likelihood = estimator.likelihood
     likelihood.load_data(data)
 
-    total_trace = 5.
+    total_trace = 5.0
 
     objective = likelihood.get_objective_cvxpy()(theta)
-    A = cp.bmat([[A1,A3],[A3,A2]])
-    constraints = [cp.matrix_frac(theta, A) <= 1,  cp.trace(A) <= total_trace, A >> 0]
+    A = cp.bmat([[A1, A3], [A3, A2]])
+    constraints = [cp.matrix_frac(theta, A) <= 1, cp.trace(A) <= total_trace, A >> 0]
     prob = cp.Problem(cp.Minimize(objective), constraints)
-    prob.solve(solver = cp.MOSEK, verbose = True)
+    prob.solve(solver=cp.MOSEK, verbose=True)
 
     estimator.theta_fit = theta.value
     estimator.fitted = True
-    print (prob.value)
-    #plt.imshow(A.value)
-    #plt.show()
+    print(prob.value)
+    # plt.imshow(A.value)
+    # plt.show()
     if theta.value is not None:
         mu = estimator.mean(xtest)
-        plt.plot(xtest,mu, 'b', lw = 3, label = 'opt')
-
-    plt.plot(Xtrain,ytrain,'ko', lw = 3)
-    plt.plot(xtest,f(xtest),'k--', lw = 3)
+        plt.plot(xtest, mu, "b", lw=3, label="opt")
 
+    plt.plot(Xtrain, ytrain, "ko", lw=3)
+    plt.plot(xtest, f(xtest), "k--", lw=3)
 
-    theta = cp.Variable((m,1))
+    theta = cp.Variable((m, 1))
     V = cp.Parameter((m, m))
     objective = likelihood.get_objective_cvxpy()(theta)
     a = cp.Variable(m)
-    A = cp.Variable((m,m))
+    A = cp.Variable((m, m))
 
-    constraints = [cp.matrix_frac(V.T@theta, cp.diag(a)) <= 1., a>=0, cp.sum(a)<=total_trace]
+    constraints = [
+        cp.matrix_frac(V.T @ theta, cp.diag(a)) <= 1.0,
+        a >= 0,
+        cp.sum(a) <= total_trace,
+    ]
     prob = cp.Problem(cp.Minimize(objective), constraints)
 
-
-
-    manifold = pymanopt.manifolds.Stiefel(m,m)
+    manifold = pymanopt.manifolds.Stiefel(m, m)
 
     def opt(V_val):
         V.value = V_val
-        prob.solve(solver = cp.MOSEK, verbose = False)
+        prob.solve(solver=cp.MOSEK, verbose=False)
         return theta.value
 
     @pymanopt.function.numpy(manifold)
     def cost(V_val):
         V.value = V_val
-        prob.solve(requires_grad=True, solver = cp.SCS)
+        prob.solve(requires_grad=True, solver=cp.SCS)
         return prob.value
 
     @pymanopt.function.numpy(manifold)
@@ -106,56 +115,53 @@ def euclidean_gradient(V_val):
         prob.backward()
         return V.gradient
 
-    print ("INITIAL COST:", cost(np.eye(m)))
+    print("INITIAL COST:", cost(np.eye(m)))
     problem = pymanopt.Problem(manifold, cost, euclidean_gradient=euclidean_gradient)
     optimizer = pymanopt.optimizers.SteepestDescent(min_step_size=1e-15)
-    result = optimizer.run(problem, initial_point = np.eye(m))
+    result = optimizer.run(problem, initial_point=np.eye(m))
     V_val = result.point
-    #V_val = np.eye(m)
-    #print (result)
-    print (V_val@V_val.T)
-    print ("END COST:", cost(V_val))
+    # V_val = np.eye(m)
+    # print (result)
+    print(V_val @ V_val.T)
+    print("END COST:", cost(V_val))
 
     estimator.theta_fit = opt(V_val)
     estimator.fitted = True
 
     mu = estimator.mean(xtest)
-    plt.plot(xtest,mu, 'r--', lw = 3, label = 'ortho opt')
-
-
-
-
-
-
+    plt.plot(xtest, mu, "r--", lw=3, label="ortho opt")
 
     estimator.theta_fit = opt(np.eye(m))
     mu = estimator.mean(xtest)
-    plt.plot(xtest,mu, 'g--', lw = 3, label = 'A identity')
-
+    plt.plot(xtest, mu, "g--", lw=3, label="A identity")
 
     # simplified objective
-    theta = cp.Variable((m,1))
+    theta = cp.Variable((m, 1))
     objective = likelihood.get_objective_cvxpy()(theta)
-    constraints = [cp.sum_squares(theta) <= total_trace/m]
+    constraints = [cp.sum_squares(theta) <= total_trace / m]
     prob_simple = cp.Problem(cp.Minimize(objective), constraints)
     prob_simple.solve()
-    print ("SIMPLE COST:",prob_simple.value)
+    print("SIMPLE COST:", prob_simple.value)
     estimator.theta_fit = theta.value
     mu = estimator.mean(xtest)
-    plt.plot(xtest,mu, 'tab:purple', lw = 3, label = 'simple solution')
+    plt.plot(xtest, mu, "tab:purple", lw=3, label="simple solution")
 
-    theta = cp.Variable((m,1))
+    theta = cp.Variable((m, 1))
     V = cp.Parameter((m, m))
     objective = likelihood.get_objective_cvxpy()(theta)
     a = cp.Variable(m)
-    A = cp.Variable((m,m), PSD=True)
-    constraints = [cp.matrix_frac(theta, cp.diag(a)) <= 1., a>=0, cp.sum(a)<=total_trace]
+    A = cp.Variable((m, m), PSD=True)
+    constraints = [
+        cp.matrix_frac(theta, cp.diag(a)) <= 1.0,
+        a >= 0,
+        cp.sum(a) <= total_trace,
+    ]
     prob_complicated = cp.Problem(cp.Minimize(objective), constraints)
-    prob_complicated.solve(solver = cp.MOSEK , verbose = True)
+    prob_complicated.solve(solver=cp.MOSEK, verbose=True)
     estimator.theta_fit = theta.value
     mu = estimator.mean(xtest)
 
-    plt.plot(xtest,mu, 'tab:brown', lw = 3, label = 'soln')
+    plt.plot(xtest, mu, "tab:brown", lw=3, label="soln")
 
     plt.legend()
-    plt.show()
\ No newline at end of file
+    plt.show()
diff --git a/tests/continous_processes/test_estimators/domain_non_stationarity.py b/tests/continous_processes/test_estimators/domain_non_stationarity.py
index 035d8be..b772dfb 100644
--- a/tests/continous_processes/test_estimators/domain_non_stationarity.py
+++ b/tests/continous_processes/test_estimators/domain_non_stationarity.py
@@ -13,19 +13,23 @@
 m = 200
 d = 1
 sigma = 0.005
-lam = 1.
+lam = 1.0
 n = 256
 
 I = torch.eye(m).double()
-budget = m*1
-kernel_object = KernelFunction(gamma = 0.1, d = 1)
+budget = m * 1
+kernel_object = KernelFunction(gamma=0.1, d=1)
 
-embedding1 = TriangleEmbedding(m = m, d = 1, kernel_object=kernel_object, interval=[-1,0], offset=0.0)
-embedding2 = TriangleEmbedding(m = m, d = 1, kernel_object=kernel_object, interval=[0,1], offset=0.0)
+embedding1 = TriangleEmbedding(
+    m=m, d=1, kernel_object=kernel_object, interval=[-1, 0], offset=0.0
+)
+embedding2 = TriangleEmbedding(
+    m=m, d=1, kernel_object=kernel_object, interval=[0, 1], offset=0.0
+)
 
-embedding = ConcatEmbedding([embedding1,embedding2])
+embedding = ConcatEmbedding([embedding1, embedding2])
 
-likelihood_base = GaussianLikelihood(sigma = sigma)
+likelihood_base = GaussianLikelihood(sigma=sigma)
 
 
 # for w,g in zip(weights,new_groups):
@@ -38,17 +42,19 @@
 N = 20
 torch.manual_seed(2)
 
+
 def zeroing(X):
     Y = X.clone()
-    Y[ X < 0.] = 0.
+    Y[X < 0.0] = 0.0
     return Y
 
-F = lambda X: (np.cos(X*10.)+np.sin(X*10.))*zeroing(X)
+
+F = lambda X: (np.cos(X * 10.0) + np.sin(X * 10.0)) * zeroing(X)
 # X = torch.rand(size = (N,d)).double()*0.25+0.5
 # y = F(X)
 #
-# Xpoint = torch.Tensor([[0.],[0.5]]).double()
-# ypoint = torch.Tensor([[0.],[0.]]).double()
+# Xpoint = torch.tensor([[0.],[0.5]]).double()
+# ypoint = torch.tensor([[0.],[0.]]).double()
 #
 # X = torch.vstack([X,Xpoint])
 # y = torch.vstack([y,ypoint])
@@ -58,73 +64,99 @@ def zeroing(X):
 # F = lambda X: estimator.mean(X)
 
 
-Xtrain = torch.rand(size=(10, d)).double()/2
+Xtrain = torch.rand(size=(10, d)).double() / 2
 ytrain = F(Xtrain) + sigma * torch.randn(size=(Xtrain.size()[0], 1))
 
 
-
 def update():
     pass
-alphas = [5,10]#,0.01,0.001]
-lams_uns = [0.01,0.05,0.1]
+
+
+alphas = [5, 10]  # ,0.01,0.001]
+lams_uns = [0.01, 0.05, 0.1]
 # alphas = [0.01]
 # lams_uns = [0.1]
 
 fig, axs = plt.subplots(len(alphas), len(lams_uns))
 
 for index1, alpha in enumerate(alphas):
-    lams = [la/alpha for la in lams_uns]#, 0.01/alpha]#,16.,32.,64.,128.]
+    lams = [la / alpha for la in lams_uns]  # , 0.01/alpha]#,16.,32.,64.,128.]
 
     for index2, lam in enumerate(lams):
-        print ("Regularizer:", alpha, lam)
+        print("Regularizer:", alpha, lam)
 
-        xtest = interval_torch(n = n,d = 1)
+        xtest = interval_torch(n=n, d=1)
         groups = [list(range(m)), list(range(m, 2 * m, 1))]
         new_groups = groups.copy()
         weights = [alpha**2 for g in groups]
         for j in range(len(groups)):
             for i in range(j + 1, len(groups), 1):
                 new_groups.append(groups[j] + groups[i])
-                weights.append(1.)
+                weights.append(1.0)
 
-        regularizer = NestedGroupL1L2Regularizer(lam = lam, groups = new_groups, weights = weights)
+        regularizer = NestedGroupL1L2Regularizer(
+            lam=lam, groups=new_groups, weights=weights
+        )
         constraint = regularizer.get_constraint_object(budget)
         likelihood = GaussianLikelihood(sigma=sigma)
-        estimator_train = RegularizedDictionary(embedding, likelihood, regularizer, constraints = constraint, use_constraint=True)
-
-        estimator_train.load_data((Xtrain,ytrain))
+        estimator_train = RegularizedDictionary(
+            embedding,
+            likelihood,
+            regularizer,
+            constraints=constraint,
+            use_constraint=True,
+        )
+
+        estimator_train.load_data((Xtrain, ytrain))
         estimator_train.fit()
         mean = estimator_train.mean(xtest)
 
-
-
-
-        if max(len(alphas),len(lams_uns))>1:
-            #axs[index1,index2].subplot(len(lams),len(alphas),index1+1, index2+1)
-            axs[index1,index2].plot(Xtrain, ytrain, 'ro', ms=15)
-            axs[index1,index2].plot(xtest, F(xtest), lw = 4)
-            p = axs[index1,index2].plot(xtest, mean, lw = 4, label = "$\\lambda = "+str(lam)+", \\alpha ="+str(alpha)+" $")
+        if max(len(alphas), len(lams_uns)) > 1:
+            # axs[index1,index2].subplot(len(lams),len(alphas),index1+1, index2+1)
+            axs[index1, index2].plot(Xtrain, ytrain, "ro", ms=15)
+            axs[index1, index2].plot(xtest, F(xtest), lw=4)
+            p = axs[index1, index2].plot(
+                xtest,
+                mean,
+                lw=4,
+                label="$\\lambda = " + str(lam) + ", \\alpha =" + str(alpha) + " $",
+            )
 
             # xtest1 = torch.linspace(0.0,0.5,n//4).double().view(-1,1)
             # xtest2 = torch.linspace(-1.0,-0.5,n//4).double().view(-1,1)
             # conf_xtest = torch.vstack([xtest1,xtest2])
-            ucb = estimator_train.ucb(xtest, type = "LR_static")
-            lcb = estimator_train.lcb(xtest, type = "LR_static")
-            axs[index1,index2].fill_between(xtest.view(-1), lcb.view(-1), ucb.view(-1), alpha = 0.1, color = p[0].get_color())
-            #axs[index1,index2].legend(fontsize = 15)
+            ucb = estimator_train.ucb(xtest, type="LR_static")
+            lcb = estimator_train.lcb(xtest, type="LR_static")
+            axs[index1, index2].fill_between(
+                xtest.view(-1),
+                lcb.view(-1),
+                ucb.view(-1),
+                alpha=0.1,
+                color=p[0].get_color(),
+            )
+            # axs[index1,index2].legend(fontsize = 15)
         else:
-            axs.plot(Xtrain, ytrain, 'ro', ms=15)
+            axs.plot(Xtrain, ytrain, "ro", ms=15)
             axs.plot(xtest, F(xtest), lw=4)
-            p = axs.plot(xtest, mean, lw=4,
-                                     label="$\\lambda = " + str(lam) + ", \\alpha =" + str(alpha) + " $")
+            p = axs.plot(
+                xtest,
+                mean,
+                lw=4,
+                label="$\\lambda = " + str(lam) + ", \\alpha =" + str(alpha) + " $",
+            )
 
             # xtest1 = torch.linspace(0.0,0.5,n//4).double().view(-1,1)
             # xtest2 = torch.linspace(-1.0,-0.5,n//4).double().view(-1,1)
             # conf_xtest = torch.vstack([xtest1,xtest2])
             ucb = estimator_train.ucb(xtest, type="LR_static")
             lcb = estimator_train.lcb(xtest, type="LR_static")
-            axs.fill_between(xtest.view(-1), lcb.view(-1), ucb.view(-1), alpha=0.1,
-                                             color=p[0].get_color())
-            #axs.legend(fontsize=15)
-plt.savefig("image.png", dpi = 300)
-plt.show()
\ No newline at end of file
+            axs.fill_between(
+                xtest.view(-1),
+                lcb.view(-1),
+                ucb.view(-1),
+                alpha=0.1,
+                color=p[0].get_color(),
+            )
+            # axs.legend(fontsize=15)
+plt.savefig("image.png", dpi=300)
+plt.show()
diff --git a/tests/continous_processes/test_estimators/group l_q_estimator.py b/tests/continous_processes/test_estimators/group l_q_estimator.py
index 86a72e1..d431303 100644
--- a/tests/continous_processes/test_estimators/group l_q_estimator.py	
+++ b/tests/continous_processes/test_estimators/group l_q_estimator.py	
@@ -16,48 +16,48 @@
 m = 128
 d = 1
 sigma = 0.01
-lam = 1.
+lam = 1.0
 n = 256
 N = 10
 
-kernel_object = KernelFunction(gamma = 0.05, d = 1)
-#embedding = HermiteEmbedding(m = m, d = 1)
-xtest = interval_torch(n = n,d = 1)
+kernel_object = KernelFunction(gamma=0.05, d=1)
+# embedding = HermiteEmbedding(m = m, d = 1)
+xtest = interval_torch(n=n, d=1)
 
-embedding1 = NystromFeatures(kernel_object = kernel_object, m = m )
-embedding1.fit_gp(xtest/2-0.5,None)
-embedding2 = NystromFeatures(kernel_object = kernel_object, m = m )
-embedding2.fit_gp(xtest/2+0.5,None)
-embedding = ConcatEmbedding([embedding1,embedding2])
+embedding1 = NystromFeatures(kernel_object=kernel_object, m=m)
+embedding1.fit_gp(xtest / 2 - 0.5, None)
+embedding2 = NystromFeatures(kernel_object=kernel_object, m=m)
+embedding2.fit_gp(xtest / 2 + 0.5, None)
+embedding = ConcatEmbedding([embedding1, embedding2])
 
-qs = [0.01, 0.2,0.5,0.8]
+qs = [0.01, 0.2, 0.5, 0.8]
 groups = [list(range(m)), list(range(m, 2 * m, 1))]
-print (groups)
+print(groups)
 
 regularizers = []
-#regularizers += [L1Regularizer(lam = lam), L2Regularizer(lam = lam)]
-#regularizers += [NonConvexLqRegularizer(lam = lam, q = q) for q in qs]
-regularizers += [GroupNonCovexLqRegularizer(lam = lam, q = q, groups=groups) for q in qs]
+# regularizers += [L1Regularizer(lam = lam), L2Regularizer(lam = lam)]
+# regularizers += [NonConvexLqRegularizer(lam = lam, q = q) for q in qs]
+regularizers += [GroupNonCovexLqRegularizer(lam=lam, q=q, groups=groups) for q in qs]
 
 likelihood = GaussianLikelihood(sigma=sigma)
 names = []
-#names += ["L1", "L2"]
-#names += ["L"+str(q) for q in qs]
-names += ["group L"+str(q) for q in qs]
+# names += ["L1", "L2"]
+# names += ["L"+str(q) for q in qs]
+names += ["group L" + str(q) for q in qs]
 
-f = lambda x: torch.sin(x*20)*(x>0).double()
-Xtrain = interval_torch(n = N, d= 1)
+f = lambda x: torch.sin(x * 20) * (x > 0).double()
+Xtrain = interval_torch(n=N, d=1)
 ytrain = f(Xtrain)
 
-for name,regularizer in zip(names,regularizers):
+for name, regularizer in zip(names, regularizers):
     estimator = RegularizedDictionary(embedding, likelihood, regularizer)
-    estimator.load_data((Xtrain,ytrain))
+    estimator.load_data((Xtrain, ytrain))
     estimator.fit()
     mean = estimator.mean(xtest)
     print(name, "support:", torch.sum(estimator.theta_fit > 1e-8))
-    plt.plot(xtest, mean, label = name, lw = 3, alpha = 0.5)
+    plt.plot(xtest, mean, label=name, lw=3, alpha=0.5)
 
-plt.plot(Xtrain,ytrain,'ko', lw = 3)
-plt.plot(xtest,f(xtest),'k--', lw = 3)
+plt.plot(Xtrain, ytrain, "ko", lw=3)
+plt.plot(xtest, f(xtest), "k--", lw=3)
 plt.legend()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/tests/continous_processes/test_estimators/group_l_q_estimator_budget.py b/tests/continous_processes/test_estimators/group_l_q_estimator_budget.py
index c22ffc5..0798487 100644
--- a/tests/continous_processes/test_estimators/group_l_q_estimator_budget.py
+++ b/tests/continous_processes/test_estimators/group_l_q_estimator_budget.py
@@ -6,6 +6,7 @@
 from stpy.kernels import KernelFunction
 from stpy.regularization.simplex_regularizer import SupRegularizer
 from stpy.continuous_processes.mkl_estimator import MultipleKernelLearner
+
 """
 This script test and compares Lq estimators 
  compare L1, L2 and Lq estimators
@@ -14,35 +15,41 @@
 m = 128
 d = 1
 sigma = 0.01
-lam = 1.
+lam = 1.0
 n = 128
 N = 10
 
-kernel_object = KernelFunction(gamma = 0.05, d = 1)
-#embedding = HermiteEmbedding(m = m, d = 1)
-xtest = interval_torch(n = n,d = 1)
+kernel_object = KernelFunction(gamma=0.05, d=1)
+# embedding = HermiteEmbedding(m = m, d = 1)
+xtest = interval_torch(n=n, d=1)
+
+embedding1 = NystromFeatures(kernel_object=kernel_object, m=m)
+embedding1.fit_gp(xtest / 2 - 0.7, None)
+embedding2 = NystromFeatures(kernel_object=kernel_object, m=m)
+embedding2.fit_gp(xtest / 2 + 0.7, None)
+embedding = ConcatEmbedding([embedding1, embedding2])
+
+
+def k1(x, y, **kwagrs):
+    return (embedding1.embed(x) @ embedding1.embed(y).T).T
 
-embedding1 = NystromFeatures(kernel_object = kernel_object, m = m )
-embedding1.fit_gp(xtest/2-0.7,None)
-embedding2 = NystromFeatures(kernel_object = kernel_object, m = m )
-embedding2.fit_gp(xtest/2+0.7,None)
-embedding = ConcatEmbedding([embedding1,embedding2])
 
-def k1(x,y,**kwagrs):
-    return (embedding1.embed(x)@embedding1.embed(y).T).T
+def k2(x, y, **kwagrs):
+    return (embedding2.embed(x) @ embedding2.embed(y).T).T
 
-def k2(x,y,**kwagrs):
-    return (embedding2.embed(x)@embedding2.embed(y).T).T
 
-kernel_object_1 = KernelFunction(kernel_function = k1)
-kernel_object_2 = KernelFunction(kernel_function = k2)
+kernel_object_1 = KernelFunction(kernel_function=k1)
+kernel_object_2 = KernelFunction(kernel_function=k2)
 
 kernels = [kernel_object_1, kernel_object_2]
 regularizer = SupRegularizer(d=len(kernels), lam=0.99, constrained=True)
 mkl = MultipleKernelLearner(kernels, regularizer=regularizer)
 
-f = lambda x: torch.sin(x*20)*(x<0).double() + (1e-5)*torch.sin(x*20)*(x>0).double()
-Xtrain = interval_torch(n = N, d= 1, L_infinity_ball=0.25) - 0.75
+f = (
+    lambda x: torch.sin(x * 20) * (x < 0).double()
+    + (1e-5) * torch.sin(x * 20) * (x > 0).double()
+)
+Xtrain = interval_torch(n=N, d=1, L_infinity_ball=0.25) - 0.75
 ytrain = f(Xtrain)
 
 #
@@ -85,9 +92,9 @@ def k2(x,y,**kwagrs):
 mkl.load_data((Xtrain, ytrain))
 mkl.fit()
 mean = mkl.mean(xtest)
-p = plt.plot(xtest, mean, label="MKL", linestyle="-", lw=3, color='tab:purple')
+p = plt.plot(xtest, mean, label="MKL", linestyle="-", lw=3, color="tab:purple")
 
-plt.plot(Xtrain,ytrain,'ko', lw = 3)
-plt.plot(xtest,f(xtest),'k--', lw = 3)
+plt.plot(Xtrain, ytrain, "ko", lw=3)
+plt.plot(xtest, f(xtest), "k--", lw=3)
 plt.legend()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/tests/continous_processes/test_estimators/l_q_estimator.py b/tests/continous_processes/test_estimators/l_q_estimator.py
index f04484d..931aed5 100644
--- a/tests/continous_processes/test_estimators/l_q_estimator.py
+++ b/tests/continous_processes/test_estimators/l_q_estimator.py
@@ -16,54 +16,69 @@
 m = 64
 d = 1
 sigma = 0.01
-lam = 1.
+lam = 1.0
 n = 4
 N = 3
-total_norm = 1.
-xtest = interval_torch(n = n,d = 1)
-kernel_object = KernelFunction(gamma = 0.05, d = 1)
-embedding = HermiteEmbedding(m = m, d = 1)
+total_norm = 1.0
+xtest = interval_torch(n=n, d=1)
+kernel_object = KernelFunction(gamma=0.05, d=1)
+embedding = HermiteEmbedding(m=m, d=1)
 
 total_norms = [1]
-for pos,total_norm in enumerate(total_norms):
-    lasso_regularizer = L1Regularizer(lam = lam)
-    l2_regularizer = L2Regularizer(lam = lam)
+for pos, total_norm in enumerate(total_norms):
+    lasso_regularizer = L1Regularizer(lam=lam)
+    l2_regularizer = L2Regularizer(lam=lam)
     qs = [0.1]
 
-    regularizers = [l2_regularizer,l2_regularizer]
-    #regularizers +=  [NonConvexLqRegularizer(lam = lam, q = q) for q in qs]
-    constraints = [lasso_regularizer.get_constraint_object(total_norm), l2_regularizer.get_constraint_object(total_norm)]
-    #constraints=+ [None for q in qs]
+    regularizers = [l2_regularizer, l2_regularizer]
+    # regularizers +=  [NonConvexLqRegularizer(lam = lam, q = q) for q in qs]
+    constraints = [
+        lasso_regularizer.get_constraint_object(total_norm),
+        l2_regularizer.get_constraint_object(total_norm),
+    ]
+    # constraints=+ [None for q in qs]
 
-
-    constraints += [ NonConvexNormConstraint(0.5, total_norm, m)]
-    regularizers += [L2Regularizer(lam = lam)]
+    constraints += [NonConvexNormConstraint(0.5, total_norm, m)]
+    regularizers += [L2Regularizer(lam=lam)]
 
     likelihood = GaussianLikelihood(sigma=sigma)
     names = ["L1", "L2"]
-    #names +=  ["L"+str(q) for q in qs]
+    # names +=  ["L"+str(q) for q in qs]
     names += ["Lspecial"]
 
-    f = lambda x: torch.sin(x*20)
-    Xtrain = interval_torch(n = N, d= 1)
+    f = lambda x: torch.sin(x * 20)
+    Xtrain = interval_torch(n=N, d=1)
     ytrain = f(Xtrain)
-    linestyles = ['-.','-','--']
-    #plt.subplot(2,len(total_norms)//2,pos+1)
-    for name,regularizer,constraint, linestyle  in zip(names,regularizers,constraints,linestyles):
-        print (name)
-        estimator = RegularizedDictionary(embedding, likelihood, regularizer, constraints=constraint, use_constraint=True)
-        estimator.load_data((Xtrain,ytrain))
+    linestyles = ["-.", "-", "--"]
+    # plt.subplot(2,len(total_norms)//2,pos+1)
+    for name, regularizer, constraint, linestyle in zip(
+        names, regularizers, constraints, linestyles
+    ):
+        print(name)
+        estimator = RegularizedDictionary(
+            embedding,
+            likelihood,
+            regularizer,
+            constraints=constraint,
+            use_constraint=True,
+        )
+        estimator.load_data((Xtrain, ytrain))
         estimator.fit()
         mean = estimator.mean(xtest)
         lcb = estimator.lcb(xtest)
         ucb = estimator.ucb(xtest)
-        p = plt.plot(xtest, mean, label=name, linestyle = linestyle)
-        plt.fill_between(xtest.view(-1),lcb.view(-1),ucb.view(-1), alpha = 0.1, color = p[0].get_color())
+        p = plt.plot(xtest, mean, label=name, linestyle=linestyle)
+        plt.fill_between(
+            xtest.view(-1),
+            lcb.view(-1),
+            ucb.view(-1),
+            alpha=0.1,
+            color=p[0].get_color(),
+        )
         print(name, "support:", torch.sum(estimator.theta_fit > 0.01))
-        print (estimator.theta_fit.T)
-
+        print(estimator.theta_fit.T)
 
-    plt.plot(Xtrain,ytrain,'o')
-    plt.plot(xtest,f(xtest),'k--')
+    plt.plot(Xtrain, ytrain, "o")
+    plt.plot(xtest, f(xtest), "k--")
     plt.legend()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/tests/continous_processes/test_estimators/qff_nonstationary.py b/tests/continous_processes/test_estimators/qff_nonstationary.py
index 19407fc..97f0fb3 100644
--- a/tests/continous_processes/test_estimators/qff_nonstationary.py
+++ b/tests/continous_processes/test_estimators/qff_nonstationary.py
@@ -12,92 +12,105 @@
 m = 128
 d = 1
 sigma = 0.01
-lam = 1.
+lam = 1.0
 n = 256
 n_small = 16
 
 
 I = torch.eye(m).double()
 budget = 100
-kernel_object = KernelFunction(gamma = 0.05, d = 1)
+kernel_object = KernelFunction(gamma=0.05, d=1)
 
-embedding_base = HermiteEmbedding(m = m, d = 1)
+embedding_base = HermiteEmbedding(m=m, d=1)
 
-def zero_out_interval(x,interval):
-    mask1 = x[:,0] > interval[0]
-    mask2 = x[:,0] < interval[1]
-    #return torch.from_numpy(gaussian_filter(torch.logical_and(mask1,mask2).double(),sigma=10))
-    return torch.logical_and(mask1,mask2).double()
 
+def zero_out_interval(x, interval):
+    mask1 = x[:, 0] > interval[0]
+    mask2 = x[:, 0] < interval[1]
+    # return torch.from_numpy(gaussian_filter(torch.logical_and(mask1,mask2).double(),sigma=10))
+    return torch.logical_and(mask1, mask2).double()
 
-xtest = interval_torch(n = n,d = 1)
 
+xtest = interval_torch(n=n, d=1)
 
-embedding1 = NystromFeatures(kernel_object = kernel_object, m = m )
-embedding1.fit_gp((xtest-1)/2-0.5,None)
-embedding2 = NystromFeatures(kernel_object = kernel_object, m = m )
-embedding2.fit_gp((xtest-1)/2,None)
-embedding3 = NystromFeatures(kernel_object = kernel_object, m = m )
-embedding3.fit_gp((xtest+1)/2,None)
-embedding4 = NystromFeatures(kernel_object = kernel_object, m = m )
-embedding4.fit_gp((xtest+1)/2+0.5,None)
 
-embedding = ConcatEmbedding([embedding1,embedding2,embedding3,embedding4])
+embedding1 = NystromFeatures(kernel_object=kernel_object, m=m)
+embedding1.fit_gp((xtest - 1) / 2 - 0.5, None)
+embedding2 = NystromFeatures(kernel_object=kernel_object, m=m)
+embedding2.fit_gp((xtest - 1) / 2, None)
+embedding3 = NystromFeatures(kernel_object=kernel_object, m=m)
+embedding3.fit_gp((xtest + 1) / 2, None)
+embedding4 = NystromFeatures(kernel_object=kernel_object, m=m)
+embedding4.fit_gp((xtest + 1) / 2 + 0.5, None)
 
-likelihood_base = GaussianLikelihood(sigma = sigma)
-groups = [list(range(m)),list(range(m,2*m,1)),list(range(2*m,3*m,1)),list(range(3*m,4*m,1))]
+embedding = ConcatEmbedding([embedding1, embedding2, embedding3, embedding4])
 
-regularizer_base = GroupL1L2Regularizer(lam=1., groups=groups)
+likelihood_base = GaussianLikelihood(sigma=sigma)
+groups = [
+    list(range(m)),
+    list(range(m, 2 * m, 1)),
+    list(range(2 * m, 3 * m, 1)),
+    list(range(3 * m, 4 * m, 1)),
+]
+
+regularizer_base = GroupL1L2Regularizer(lam=1.0, groups=groups)
 constraint_base = regularizer_base.get_constraint_object(budget)
 
-estimator = RegularizedDictionary(embedding, likelihood_base, regularizer_base, constraints=constraint_base, use_constraint=False)
+estimator = RegularizedDictionary(
+    embedding,
+    likelihood_base,
+    regularizer_base,
+    constraints=constraint_base,
+    use_constraint=False,
+)
 
-lams = [1.]#,16.,32.,64.,128.]
+lams = [1.0]  # ,16.,32.,64.,128.]
 N = 3
-v = torch.randn(size = (embedding.get_m(),1)).double()
-for i in [0,1,3]:
-    v[groups[i]] = 0.
-v = (v/np.sqrt(regularizer_base.eval(v)))
+v = torch.randn(size=(embedding.get_m(), 1)).double()
+for i in [0, 1, 3]:
+    v[groups[i]] = 0.0
+v = v / np.sqrt(regularizer_base.eval(v))
 
-F = lambda X: embedding.embed(X)@v*np.sqrt(budget)
-X = torch.rand(size = (10,d)).double()*0.25+0.1
+F = lambda X: embedding.embed(X) @ v * np.sqrt(budget)
+X = torch.rand(size=(10, d)).double() * 0.25 + 0.1
 y = F(X)
 
-#Xpoint = torch.Tensor([[0.],[0.5]]).double()
-#ypoint = torch.Tensor([[0.],[0.]]).double()
+# Xpoint = torch.tensor([[0.],[0.5]]).double()
+# ypoint = torch.tensor([[0.],[0.]]).double()
 
-#X = torch.vstack([X,Xpoint])
-#y = torch.vstack([y,ypoint])
-estimator.load_data((X,y))
+# X = torch.vstack([X,Xpoint])
+# y = torch.vstack([y,ypoint])
+estimator.load_data((X, y))
 estimator.fit()
 
 F = lambda X: estimator.mean(X)
 Xtrain = torch.rand(size=(N, d)).double() * 0.5
 ytrain = F(Xtrain) + sigma * torch.randn(size=(Xtrain.size()[0], 1))
 
-lams = [8.,16.,32.]#,16.,32.,64.,128.]
+lams = [8.0, 16.0, 32.0]  # ,16.,32.,64.,128.]
 ##lams = [1.,128.]
 epsilon = 1e-1
-#lams = [1.]
+# lams = [1.]
 for index, lam in enumerate(lams):
 
-    print (index,':',lam)
-    print ("budget:",budget)
+    print(index, ":", lam)
+    print("budget:", budget)
 
-    plt.subplot(len(lams),1,index+1)
-    plt.plot(Xtrain, ytrain, 'ro', ms=25)
-    plt.ylim([-3,3])
-    regularizer = GroupL1L2Regularizer(lam = lam, groups = groups)
+    plt.subplot(len(lams), 1, index + 1)
+    plt.plot(Xtrain, ytrain, "ro", ms=25)
+    plt.ylim([-3, 3])
+    regularizer = GroupL1L2Regularizer(lam=lam, groups=groups)
     constraint = regularizer.get_constraint_object(budget)
     likelihood = GaussianLikelihood(sigma=sigma)
-    estimator_train = RegularizedDictionary(embedding, likelihood, regularizer, constraints = constraint, use_constraint=True)
-
+    estimator_train = RegularizedDictionary(
+        embedding, likelihood, regularizer, constraints=constraint, use_constraint=True
+    )
 
-    xtest = interval_torch(n = n,d = 1)
-    xtest_small = interval_torch(n = n_small, d = 1)
-    plt.plot(xtest, F(xtest), lw = 5)
+    xtest = interval_torch(n=n, d=1)
+    xtest_small = interval_torch(n=n_small, d=1)
+    plt.plot(xtest, F(xtest), lw=5)
 
-    estimator_train.load_data((Xtrain,ytrain))
+    estimator_train.load_data((Xtrain, ytrain))
     estimator_train.fit()
 
     mean = estimator_train.mean(xtest)
@@ -107,15 +120,21 @@ def zero_out_interval(x,interval):
     print(regularizer.eval(estimator_train.theta_fit))
     print(regularizer_base.eval(estimator_train.theta_fit))
 
-    p = plt.plot(xtest, mean, lw = 4, label = "$||f|| \leq "+str(budget/lam)+"$")
-    #p2 = plt.plot(xtest_small, mean_small,'o-', ms = 25, lw = 4, label = "$||f|| \leq "+str(budget/lam)+"$")
+    p = plt.plot(xtest, mean, lw=4, label="$||f|| \\leq " + str(budget / lam) + "$")
+    # p2 = plt.plot(xtest_small, mean_small,'o-', ms = 25, lw = 4, label = "$||f|| \leq "+str(budget/lam)+"$")
 
     #
-    ucb = estimator_train.ucb(xtest_small, type = "LR_static")
-    lcb = estimator_train.lcb(xtest_small, type = "LR_static")
+    ucb = estimator_train.ucb(xtest_small, type="LR_static")
+    lcb = estimator_train.lcb(xtest_small, type="LR_static")
     #
-    #plt.errorbar(xtest_small.view(-1), mean_small.view(-1),yerr = ucb.view(-1), ms = 25,alpha = 1., color = p[0].get_color(), lw=5)
-    plt.fill_between(xtest_small.view(-1),lcb.view(-1), ucb.view(-1),alpha = 0.1, color = p[0].get_color())
-    plt.plot(xtest, xtest*0 + epsilon, 'k--')
-    plt.legend(fontsize = 35)
+    # plt.errorbar(xtest_small.view(-1), mean_small.view(-1),yerr = ucb.view(-1), ms = 25,alpha = 1., color = p[0].get_color(), lw=5)
+    plt.fill_between(
+        xtest_small.view(-1),
+        lcb.view(-1),
+        ucb.view(-1),
+        alpha=0.1,
+        color=p[0].get_color(),
+    )
+    plt.plot(xtest, xtest * 0 + epsilon, "k--")
+    plt.legend(fontsize=35)
 plt.show()
diff --git a/tests/continous_processes/test_estimators/test_regularized_dictionary_l2.py b/tests/continous_processes/test_estimators/test_regularized_dictionary_l2.py
index 82467ad..cdcbe48 100644
--- a/tests/continous_processes/test_estimators/test_regularized_dictionary_l2.py
+++ b/tests/continous_processes/test_estimators/test_regularized_dictionary_l2.py
@@ -8,73 +8,99 @@
 from stpy.probability.gaussian_likelihood import GaussianLikelihood
 from stpy.regularization.regularizer import L2Regularizer, L1Regularizer
 from stpy.helpers.helper import interval_torch
-from stpy.regularization.constraints import QuadraticInequalityConstraint, AbsoluteValueConstraint
+from stpy.regularization.constraints import (
+    QuadraticInequalityConstraint,
+    AbsoluteValueConstraint,
+)
 from stpy.kernels import KernelFunction
 
 m = 64
 d = 1
 sigma = 0.1
-lam = 1.
+lam = 1.0
 n = 256
 
 I = torch.eye(m).double()
-budget = m*10e10
-kernel_object = KernelFunction(gamma = 0.1, d = 1)
-#embedding = TriangleEmbedding(m = m, d = 1, B = 10, b = -10, kernel_object=kernel_object)
+budget = m * 10e10
+kernel_object = KernelFunction(gamma=0.1, d=1)
+# embedding = TriangleEmbedding(m = m, d = 1, B = 10, b = -10, kernel_object=kernel_object)
 
-embedding_base = FaberSchauderEmbedding(m = m, d = 1, kernel_object=None, offset=0)
-# this defines the decay of the functions
-def decay_function(emb):
-    return (emb.hierarchical_mask()+1)**(-15)
+embedding_base = FaberSchauderEmbedding(m=m, d=1, kernel_object=None, offset=0)
 
-print (decay_function(embedding_base))
 
-embedding = WeightedEmbedding(embedding_base,weight_function=decay_function)
-
-# embedding = RFFEmbeddQing(m = m, d=1, gamma = 0.1)
+# this defines the decay of the functions
+def decay_function(emb):
+    return (emb.hierarchical_mask() + 1) ** (-15)
 
-likelihood = GaussianLikelihood(sigma = sigma)
-regularizer_L2 = L2Regularizer(lam = lam)
-regularizer_L1 = L1Regularizer(lam = lam)
 
+print(decay_function(embedding_base))
 
-constraint_L2 = QuadraticInequalityConstraint(Q = I, c = budget)
-constraint_L1 = AbsoluteValueConstraint(c = np.sqrt(budget))
+embedding = WeightedEmbedding(embedding_base, weight_function=decay_function)
 
-estimator_L2_L2 = RegularizedDictionary(embedding, likelihood, regularizer_L2,
-                                        constraints = constraint_L2, use_constraint=False)
-estimator_L1_L2 = RegularizedDictionary(embedding, likelihood, regularizer_L1,
-                                        constraints = constraint_L2, use_constraint=False)
-estimator_L2_L1 = RegularizedDictionary(embedding, likelihood, regularizer_L2,
-                                        constraints = constraint_L1, use_constraint=False)
-estimator_L1_L1 = RegularizedDictionary(embedding, likelihood, regularizer_L1,
-                                        constraints = constraint_L1, use_constraint=False)
+# embedding = RFFEmbeddQing(m = m, d=1, gamma = 0.1)
 
-estimators = [estimator_L2_L2,estimator_L2_L1,estimator_L1_L2,estimator_L1_L1]
+likelihood = GaussianLikelihood(sigma=sigma)
+regularizer_L2 = L2Regularizer(lam=lam)
+regularizer_L1 = L1Regularizer(lam=lam)
+
+
+constraint_L2 = QuadraticInequalityConstraint(Q=I, c=budget)
+constraint_L1 = AbsoluteValueConstraint(c=np.sqrt(budget))
+
+estimator_L2_L2 = RegularizedDictionary(
+    embedding,
+    likelihood,
+    regularizer_L2,
+    constraints=constraint_L2,
+    use_constraint=False,
+)
+estimator_L1_L2 = RegularizedDictionary(
+    embedding,
+    likelihood,
+    regularizer_L1,
+    constraints=constraint_L2,
+    use_constraint=False,
+)
+estimator_L2_L1 = RegularizedDictionary(
+    embedding,
+    likelihood,
+    regularizer_L2,
+    constraints=constraint_L1,
+    use_constraint=False,
+)
+estimator_L1_L1 = RegularizedDictionary(
+    embedding,
+    likelihood,
+    regularizer_L1,
+    constraints=constraint_L1,
+    use_constraint=False,
+)
+
+estimators = [estimator_L2_L2, estimator_L2_L1, estimator_L1_L2, estimator_L1_L1]
 names = ["reg:L2 con:L2", "reg:L2 con:L1", "reg:L1 con:L2", "reg:L1 con:L1"]
-styles = ["-","--","-","--"]
+styles = ["-", "--", "-", "--"]
 N = 1
-v = torch.randn(size = (m,1)).double()
-F = lambda X: embedding.embed(X)@v
-X = torch.Tensor([[0.5]]).double()
+v = torch.randn(size=(m, 1)).double()
+F = lambda X: embedding.embed(X) @ v
+X = torch.tensor([[0.5]]).double()
 y = F(X)
-xtest = interval_torch(n = n,d = 1)
+xtest = interval_torch(n=n, d=1)
 
-plt.plot(xtest, F(xtest), lw = 5)
-plt.plot(X, y, 'ro', ms = 25)
+plt.plot(xtest, F(xtest), lw=5)
+plt.plot(X, y, "ro", ms=25)
 
-for j,estimator in enumerate(estimators):
-    print ("Calculating:",names[j])
-    estimator.load_data((X,y))
+for j, estimator in enumerate(estimators):
+    print("Calculating:", names[j])
+    estimator.load_data((X, y))
     estimator.fit()
     mean = estimator.mean(xtest)
 
-    #ucb = estimator.ucb(xtest, type = "LR_static")
-    #lcb = estimator.lcb(xtest, type = "LR_static")
+    # ucb = estimator.ucb(xtest, type = "LR_static")
+    # lcb = estimator.lcb(xtest, type = "LR_static")
 
-    #plt.title("Norm: "+str(torch.norm(estimator.theta_fit)**2))
-    plt.plot(xtest, mean, label = names[j], lw = 4, linestyle = styles[j])
-    #plt.fill_between(xtest.view(-1), lcb.view(-1), ucb.view(-1), alpha = 0.1)
+    # plt.title("Norm: "+str(torch.norm(estimator.theta_fit)**2))
+    plt.plot(xtest, mean, label=names[j], lw=4, linestyle=styles[j])
+    # plt.fill_between(xtest.view(-1), lcb.view(-1), ucb.view(-1), alpha = 0.1)
 
-plt.legend(fontsize = 35)
+plt.legend(fontsize=35)
 plt.show()
diff --git a/tests/continous_processes/test_huber_loss.py b/tests/continous_processes/test_huber_loss.py
index c76de41..edf105e 100644
--- a/tests/continous_processes/test_huber_loss.py
+++ b/tests/continous_processes/test_huber_loss.py
@@ -10,77 +10,97 @@
 d = 1
 eps = 0.01
 s = 1
-x = torch.rand(N,d).double()*2 - 1
-xtest = torch.from_numpy(interval(n,d,L_infinity_ball=1))
+x = torch.rand(N, d).double() * 2 - 1
+xtest = torch.from_numpy(interval(n, d, L_infinity_ball=1))
 
 # true
 GP_true = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d)
 ytest = GP_true.sample(xtest)
-GP_true.fit_gp(xtest,ytest)
+GP_true.fit_gp(xtest, ytest)
 
-plt.plot(xtest,GP_true.mean(xtest),'b-')
+plt.plot(xtest, GP_true.mean(xtest), "b-")
 
 y = GP_true.mean(x).clone()
-GP = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, loss = 'huber', huber_delta=0.01, s = s)
+GP = GaussianProcess(
+    gamma=0.1,
+    kernel_name="squared_exponential",
+    d=d,
+    loss="huber",
+    huber_delta=0.01,
+    s=s,
+)
 
-xnew = x[0,:].view(1,1) + eps
-ynew = y[0,0].view(1,1) + 1
+xnew = x[0, :].view(1, 1) + eps
+ynew = y[0, 0].view(1, 1) + 1
 
-y2 = torch.vstack([y,ynew])
-x2 = torch.vstack([x,xnew])
+y2 = torch.vstack([y, ynew])
+x2 = torch.vstack([x, xnew])
 
-GP.fit_gp(x2,y2)
+GP.fit_gp(x2, y2)
 
 GP2 = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d)
 GP3 = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d)
 
-GP2.fit_gp(x2,y2)
-#GP2.visualize(xtest, show = False, fig = False)
-#plt.show()
-
+GP2.fit_gp(x2, y2)
+# GP2.visualize(xtest, show = False, fig = False)
+# plt.show()
 
 
 ### marginalized likelihood with normal loss_two_ways
 # plot true function
-plt.plot(xtest,GP_true.mean(xtest),'b--',label = "truth", lw = 3)
+plt.plot(xtest, GP_true.mean(xtest), "b--", label="truth", lw=3)
 
 # with noise optimize
-GP2.fit_gp(x2,y2)
-GP2.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
+GP2.fit_gp(x2, y2)
+GP2.optimize_params(
+    type="bandwidth", restarts=5, verbose=False, optimizer="pytorch-minimize", scale=1.0
+)
 mu = GP2.mean(xtest)
-plt.plot(xtest,mu, 'r-', label = "squared-corupted", lw = 3)
-#GP2.visualize(xtest, show = False, fig = False, size = 0)
+plt.plot(xtest, mu, "r-", label="squared-corupted", lw=3)
+# GP2.visualize(xtest, show = False, fig = False, size = 0)
 
 # no noise optimize
-GP2.fit_gp(x,y)
-GP2.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
+GP2.fit_gp(x, y)
+GP2.optimize_params(
+    type="bandwidth", restarts=5, verbose=False, optimizer="pytorch-minimize", scale=1.0
+)
 mu = GP2.mean(xtest)
-plt.plot(xtest,mu, '--x', color ="tab:brown" , label = 'squared-uncorrupted', lw = 3)
+plt.plot(xtest, mu, "--x", color="tab:brown", label="squared-uncorrupted", lw=3)
 
 # with huber optimize
-GP = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, loss = 'huber', huber_delta=1.3)
-GP.fit_gp(x2,y2)
-GP.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1., weight=1.)
+GP = GaussianProcess(
+    gamma=0.1, kernel_name="squared_exponential", d=d, loss="huber", huber_delta=1.3
+)
+GP.fit_gp(x2, y2)
+GP.optimize_params(
+    type="bandwidth",
+    restarts=5,
+    verbose=False,
+    optimizer="pytorch-minimize",
+    scale=1.0,
+    weight=1.0,
+)
 mu = GP2.mean(xtest)
-plt.plot(xtest,mu, color = "tab:green", label = 'huber-corupted', lw = 3)
+plt.plot(xtest, mu, color="tab:green", label="huber-corupted", lw=3)
 
 # GP = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, loss = 'huber', huber_delta=1.3)
 # GP.fit_gp(x2,y2)
 # mu = GP2.mean(xtest)
 # plt.plot(xtest,mu, 'r-', label = 'huber-true-model-corupted')
 
-GP = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, loss = 'huber', huber_delta=1.3)
-GP.fit_gp(x,y)
+GP = GaussianProcess(
+    gamma=0.1, kernel_name="squared_exponential", d=d, loss="huber", huber_delta=1.3
+)
+GP.fit_gp(x, y)
 mu = GP.mean(xtest)
-plt.plot(xtest,mu, '--', color = "tab:orange", label = 'huber-uncorrupted', lw = 3)
+plt.plot(xtest, mu, "--", color="tab:orange", label="huber-uncorrupted", lw=3)
 plt.legend()
 
-plt.plot(x,y, 'ro', ms = 5)
+plt.plot(x, y, "ro", ms=5)
 
-plt.plot(xnew,ynew, 'ko', ms = 10)
+plt.plot(xnew, ynew, "ko", ms=10)
 plt.show()
 # GP.fit_gp(x,y2)
 # GP.optimize_params(type="bandwidth", restarts=10, verbose = False, optimizer = 'pytorch-minimize', scale = 10.)
 # GP.visualize(xtest, show = True, fig = False, color = 'yellow')
 #
-
diff --git a/tests/continous_processes/test_marginalized_pytorch_minimize.py b/tests/continous_processes/test_marginalized_pytorch_minimize.py
index 8e97e47..beaeea0 100644
--- a/tests/continous_processes/test_marginalized_pytorch_minimize.py
+++ b/tests/continous_processes/test_marginalized_pytorch_minimize.py
@@ -3,27 +3,29 @@
 from stpy.kernels import KernelFunction
 from stpy.helpers.helper import interval
 
-#%%
+# %%
 
 n = 100
 d = 2
-x = torch.rand(n,d).double()*2 - 1
-xtest = torch.from_numpy(interval(50,2,L_infinity_ball=1))
+x = torch.rand(n, d).double() * 2 - 1
+xtest = torch.from_numpy(interval(50, 2, L_infinity_ball=1))
 
-#%%
+# %%
 
 GP = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=2)
 y = GP.sample(x)
-GP.fit_gp(x,y)
-GP.visualize_contour(xtest, ms = 10)
+GP.fit_gp(x, y)
+GP.visualize_contour(xtest, ms=10)
 
-#%%
+# %%
 
 ## Kernels can be defined as via kernel object
 # 2 dimensional additive kernel with groups [0] and [1]
-k = KernelFunction(kernel_name = "ard", d = 2, groups = [[0,1]] )
+k = KernelFunction(kernel_name="ard", d=2, groups=[[0, 1]])
 GP = GaussianProcess(kernel=k)
 
-GP.fit_gp(x,y)
-GP.optimize_params(type="bandwidth", restarts = 2, verbose = False, optimizer = 'pytorch-minimize')
-GP.visualize_contour(xtest, ms = 10)
+GP.fit_gp(x, y)
+GP.optimize_params(
+    type="bandwidth", restarts=2, verbose=False, optimizer="pytorch-minimize"
+)
+GP.visualize_contour(xtest, ms=10)
diff --git a/tests/continous_processes/test_svr_loss.py b/tests/continous_processes/test_svr_loss.py
index 4844b98..9d9d124 100644
--- a/tests/continous_processes/test_svr_loss.py
+++ b/tests/continous_processes/test_svr_loss.py
@@ -11,87 +11,102 @@
 eps = 0.01
 s = 0.1
 B = 0.001
-x = torch.rand(N,d).double()*2 - 1
-xtest = torch.from_numpy(interval(n,d,L_infinity_ball=1))
+x = torch.rand(N, d).double() * 2 - 1
+xtest = torch.from_numpy(interval(n, d, L_infinity_ball=1))
 
 # true
 GP_true = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d)
 ytest = GP_true.sample(xtest)
-GP_true.fit_gp(xtest,ytest)
-
+GP_true.fit_gp(xtest, ytest)
 
 
 y = GP_true.mean(x).clone()
-xnew = x[0,:].view(1,1) + eps
-ynew = torch.rand(size = (1,1))*B
-y2 = torch.vstack([y,ynew])
-x2 = torch.vstack([x,xnew])
-
-GP  = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, loss = 'svr', svr_eps=0.1, s = s)
-GP2 = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, s = s, loss = "squared")
-GP3 = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, s = s, loss = 'unif', B = B)
-GP4 = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, s = s, loss = 'huber', huber_delta=1.35)
-
-GP.fit_gp(x,y)
-GP2.fit_gp(x,y)
-GP3.fit_gp(x,y)
-GP4.fit_gp(x,y)
-
-plt.plot(x,y, 'ro', label = 'points')
-plt.plot(xtest, ytest, 'b-', label = "truth")
-
-plt.plot(xtest, GP.mean_std(xtest)[0], 'g-', label = "svr")
-plt.plot(xtest, GP2.mean_std(xtest)[0], 'r-', label = "squared")
-plt.plot(xtest, GP3.mean_std(xtest)[0], 'y-', label = "unif")
-
-plt.plot(xtest, GP4.mean_std(xtest)[0], '--', color = "orange", label = "huber")
+xnew = x[0, :].view(1, 1) + eps
+ynew = torch.rand(size=(1, 1)) * B
+y2 = torch.vstack([y, ynew])
+x2 = torch.vstack([x, xnew])
+
+GP = GaussianProcess(
+    gamma=0.1, kernel_name="squared_exponential", d=d, loss="svr", svr_eps=0.1, s=s
+)
+GP2 = GaussianProcess(
+    gamma=0.1, kernel_name="squared_exponential", d=d, s=s, loss="squared"
+)
+GP3 = GaussianProcess(
+    gamma=0.1, kernel_name="squared_exponential", d=d, s=s, loss="unif", B=B
+)
+GP4 = GaussianProcess(
+    gamma=0.1,
+    kernel_name="squared_exponential",
+    d=d,
+    s=s,
+    loss="huber",
+    huber_delta=1.35,
+)
+
+GP.fit_gp(x, y)
+GP2.fit_gp(x, y)
+GP3.fit_gp(x, y)
+GP4.fit_gp(x, y)
+
+plt.plot(x, y, "ro", label="points")
+plt.plot(xtest, ytest, "b-", label="truth")
+
+plt.plot(xtest, GP.mean_std(xtest)[0], "g-", label="svr")
+plt.plot(xtest, GP2.mean_std(xtest)[0], "r-", label="squared")
+plt.plot(xtest, GP3.mean_std(xtest)[0], "y-", label="unif")
+
+plt.plot(xtest, GP4.mean_std(xtest)[0], "--", color="orange", label="huber")
 plt.legend()
 plt.show()
 
 
-GP.fit_gp(x2,y2)
-GP2.fit_gp(x2,y2)
-GP3.fit_gp(x2,y2)
-GP4.fit_gp(x2,y2)
+GP.fit_gp(x2, y2)
+GP2.fit_gp(x2, y2)
+GP3.fit_gp(x2, y2)
+GP4.fit_gp(x2, y2)
 
-plt.plot(x,y, 'ro', label = 'points')
-plt.plot(xnew,ynew, 'ko', label = 'corrupted')
-plt.plot(xtest, ytest, 'b-', label = "truth")
+plt.plot(x, y, "ro", label="points")
+plt.plot(xnew, ynew, "ko", label="corrupted")
+plt.plot(xtest, ytest, "b-", label="truth")
 
-plt.plot(xtest, GP.mean_std(xtest)[0], 'g-', label = "svr")
-plt.plot(xtest, GP2.mean_std(xtest)[0], 'r-', label = "squared")
-plt.plot(xtest, GP3.mean_std(xtest)[0], 'y-', label = "unif")
-plt.plot(xtest, GP4.mean_std(xtest)[0], '--', color = "orange", label = "huber")
+plt.plot(xtest, GP.mean_std(xtest)[0], "g-", label="svr")
+plt.plot(xtest, GP2.mean_std(xtest)[0], "r-", label="squared")
+plt.plot(xtest, GP3.mean_std(xtest)[0], "y-", label="unif")
+plt.plot(xtest, GP4.mean_std(xtest)[0], "--", color="orange", label="huber")
 
 plt.legend()
 plt.show()
 
 
-
-
-GP.fit_gp(x2,y2)
-GP2.fit_gp(x2,y2)
-GP3.fit_gp(x2,y2)
-GP4.fit_gp(x2,y2)
-
-plt.plot(x,y, 'ro', label = 'points')
-plt.plot(xnew,ynew, 'ko', label = 'corrupted')
-plt.plot(xtest, ytest, 'b-', label = "truth")
-
-GP.fit_gp(x2,y2)
-GP.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
-GP2.fit_gp(x2,y2)
-GP2.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
-GP3.fit_gp(x2,y2)
-#GP3.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
-GP4.fit_gp(x2,y2)
-GP4.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
-
-plt.plot(xtest, GP.mean_std(xtest)[0], 'g-', label = "svr")
-plt.plot(xtest, GP2.mean_std(xtest)[0], 'r-', label = "squared")
-plt.plot(xtest, GP3.mean_std(xtest)[0], 'y-', label = "unif")
-plt.plot(xtest, GP4.mean_std(xtest)[0], '--', color = "orange", label = "huber")
-
+GP.fit_gp(x2, y2)
+GP2.fit_gp(x2, y2)
+GP3.fit_gp(x2, y2)
+GP4.fit_gp(x2, y2)
+
+plt.plot(x, y, "ro", label="points")
+plt.plot(xnew, ynew, "ko", label="corrupted")
+plt.plot(xtest, ytest, "b-", label="truth")
+
+GP.fit_gp(x2, y2)
+GP.optimize_params(
+    type="bandwidth", restarts=5, verbose=False, optimizer="pytorch-minimize", scale=1.0
+)
+GP2.fit_gp(x2, y2)
+GP2.optimize_params(
+    type="bandwidth", restarts=5, verbose=False, optimizer="pytorch-minimize", scale=1.0
+)
+GP3.fit_gp(x2, y2)
+# GP3.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
+GP4.fit_gp(x2, y2)
+GP4.optimize_params(
+    type="bandwidth", restarts=5, verbose=False, optimizer="pytorch-minimize", scale=1.0
+)
+
+plt.plot(xtest, GP.mean_std(xtest)[0], "g-", label="svr")
+plt.plot(xtest, GP2.mean_std(xtest)[0], "r-", label="squared")
+plt.plot(xtest, GP3.mean_std(xtest)[0], "y-", label="unif")
+plt.plot(xtest, GP4.mean_std(xtest)[0], "--", color="orange", label="huber")
 
 
 plt.legend()
diff --git a/tests/continous_processes/test_unif_marginalized.py b/tests/continous_processes/test_unif_marginalized.py
index a27c508..0369049 100644
--- a/tests/continous_processes/test_unif_marginalized.py
+++ b/tests/continous_processes/test_unif_marginalized.py
@@ -12,37 +12,45 @@
 eps = 0.01
 s = 0.1
 B = 0.001
-x = torch.rand(N,d).double()*2 - 1
-xtest = torch.from_numpy(interval(n,d,L_infinity_ball=1))
+x = torch.rand(N, d).double() * 2 - 1
+xtest = torch.from_numpy(interval(n, d, L_infinity_ball=1))
 
 # true
 GP_true = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d)
 ytest = GP_true.sample(xtest)
-GP_true.fit_gp(xtest,ytest)
-
+GP_true.fit_gp(xtest, ytest)
 
 
 y = GP_true.mean(x).clone()
-xnew = x[0,:].view(1,1) + eps
-ynew = torch.rand(size = (1,1))*B
-y2 = torch.vstack([y,ynew])
-x2 = torch.vstack([x,xnew])
-
-GP  = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, s = s, loss = 'huber')
-GP2 = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, s = s, loss = "squared")
-GP3 = GaussianProcess(gamma=0.1, kernel_name="squared_exponential", d=d, s = s, loss = "unif_new")
-
-
-
-GP.fit_gp(x2,y2)
-GP.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
-GP2.fit_gp(x2,y2)
-#GP2.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
-GP3.fit_gp(x2,y2)
-GP3.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
-
-plt.plot(xtest, GP.mean_std(xtest)[0], 'g-', label = "huber")
-plt.plot(xtest, GP2.mean_std(xtest)[0], 'r-', label = "squared")
-plt.plot(xtest, GP3.mean_std(xtest)[0], 'y-', label = "unif")
+xnew = x[0, :].view(1, 1) + eps
+ynew = torch.rand(size=(1, 1)) * B
+y2 = torch.vstack([y, ynew])
+x2 = torch.vstack([x, xnew])
+
+GP = GaussianProcess(
+    gamma=0.1, kernel_name="squared_exponential", d=d, s=s, loss="huber"
+)
+GP2 = GaussianProcess(
+    gamma=0.1, kernel_name="squared_exponential", d=d, s=s, loss="squared"
+)
+GP3 = GaussianProcess(
+    gamma=0.1, kernel_name="squared_exponential", d=d, s=s, loss="unif_new"
+)
+
+
+GP.fit_gp(x2, y2)
+GP.optimize_params(
+    type="bandwidth", restarts=5, verbose=False, optimizer="pytorch-minimize", scale=1.0
+)
+GP2.fit_gp(x2, y2)
+# GP2.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1.)
+GP3.fit_gp(x2, y2)
+GP3.optimize_params(
+    type="bandwidth", restarts=5, verbose=False, optimizer="pytorch-minimize", scale=1.0
+)
+
+plt.plot(xtest, GP.mean_std(xtest)[0], "g-", label="huber")
+plt.plot(xtest, GP2.mean_std(xtest)[0], "r-", label="squared")
+plt.plot(xtest, GP3.mean_std(xtest)[0], "y-", label="unif")
 plt.legend()
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/tests/convergence_test.py b/tests/convergence_test.py
index ae65243..b630a63 100755
--- a/tests/convergence_test.py
+++ b/tests/convergence_test.py
@@ -15,30 +15,53 @@
 # number of intial points
 N = 3
 # smoothness
-gamma = torch.ones(d, dtype= torch.float64)*1
+gamma = torch.ones(d, dtype=torch.float64) * 1
 # test problem
 
 xtest = torch.from_numpy(interval(n, d))
 x = torch.from_numpy(np.random.uniform(-L_infinity_ball, L_infinity_ball, size=(N, d)))
 
 f_no_noise = lambda q: torch.sin(torch.sum(q * 4, dim=1)).view(-1, 1)
-f = lambda q: f_no_noise(q) + torch.normal(mean=torch.zeros(q.size()[0], 1, dtype=torch.float64), std=1.,
-										   out=None) * s
+f = (
+    lambda q: f_no_noise(q)
+    + torch.normal(
+        mean=torch.zeros(q.size()[0], 1, dtype=torch.float64), std=1.0, out=None
+    )
+    * s
+)
 # targets
 y = f(x)
 
 # GP model with squared exponential
 m = 12
 groups = None
-GP = GaussianProcess(kernel = "squared_exponential", s=s, gamma = gamma[0], d=d, groups = groups)
-GP_KL = GaussianProcessFF(kernel="squared_exponential", s=s, m=m, d=d, gamma=gamma[0], groups=groups, approx="kl")
-GP_He = GaussianProcessFF(kernel="squared_exponential", s=s, m=m, d=d, gamma=gamma[0], groups=groups, approx="hermite")
+GP = GaussianProcess(
+    kernel="squared_exponential", s=s, gamma=gamma[0], d=d, groups=groups
+)
+GP_KL = GaussianProcessFF(
+    kernel="squared_exponential",
+    s=s,
+    m=m,
+    d=d,
+    gamma=gamma[0],
+    groups=groups,
+    approx="kl",
+)
+GP_He = GaussianProcessFF(
+    kernel="squared_exponential",
+    s=s,
+    m=m,
+    d=d,
+    gamma=gamma[0],
+    groups=groups,
+    approx="hermite",
+)
 
 # fit GP
 GP.fit_gp(x, y)
 GP_KL.fit_gp(x, y)
 GP_He.fit_gp(x, y)
 
-print (GP.K)
-print (GP_KL.right_kernel())
-print (GP_He.right_kernel())
\ No newline at end of file
+print(GP.K)
+print(GP_KL.right_kernel())
+print(GP_He.right_kernel())
diff --git a/tests/cvxopt_integer_test.py b/tests/cvxopt_integer_test.py
index 977bd52..8591649 100755
--- a/tests/cvxopt_integer_test.py
+++ b/tests/cvxopt_integer_test.py
@@ -1,94 +1,99 @@
 import numpy as np
 import cvxopt
 import torch
-from cvxopt import glpk,solvers
+from cvxopt import glpk, solvers
 from stpy.continuous_processes.gauss_procc import GaussianProcess
 import matplotlib.pyplot as plt
+
 N = 128
 d = 30
 
 # Rotation
-theta = np.radians(45.)
+theta = np.radians(45.0)
 thetainv = np.pi - theta
 c, s = np.cos(theta), np.sin(theta)
 Q = torch.from_numpy(np.array(((c, -s), (s, c))))
-M = torch.randn(size = (d,d), dtype = torch.float64)
-[Q,R] = torch.qr(M)
-
-
-def solve(Q,c,n = 10, verbose = True, up = None, low = None):
-	print ("Starting Acq. Fucn solver...")
-	print ("Resolution: ", n)
-
-	# Grid
-
-	tau = torch.from_numpy(np.arange(-n,n+1,1).astype(np.double))/n
-	s = torch.ones(2*n+1)
-	Tau = torch.zeros(size = (d,d*(2*n+1)), dtype = torch.float64)
-	S = torch.zeros(size = (d,d*(2*n+1)), dtype = torch.float64)
-	for j in range(d):
-		Tau[j,j*(2*n+1):(j+1)*(2*n+1)] = tau
-		S[j, j * (2 * n + 1):(j + 1) * (2 * n + 1)] = s
-
-	B = Q @ Tau
-
-	if (up is not None) or (low is not None):
-		G = torch.cat((B, -B, S, -S, torch.t(c),-torch.t(c)))
-		h = torch.ones(4 * d + 2)
-		h[0:2 * d] = 1
-		h[3 * d:4 * d] = -1
-		h[4 * d ] = up
-		h[4 * d + 1] = -low
-	else:
-		G = torch.cat((B, -B, S, -S))
-		h = torch.ones(4 * d)
-		h[0:2 * d] = 1
-		h[3 * d:4 * d] = -1
-	# Indicator variables
-	x = torch.zeros(size = (d*(2*n+1),1),dtype = torch.float64)
-	print (h)
-	cc = cvxopt.matrix(c.view(-1).numpy().astype(np.double))
-	Gc = cvxopt.matrix(G.numpy().astype(np.double))
-	hc = cvxopt.matrix(h.numpy().astype(np.double))
-
-	glpk.options['it_lim'] = 10
-	
-	solvers.solve(solver=cp.CBC)
-	(status, x)= glpk.ilp(cc,Gc,hc,B=set(range(d*(2*n+1)))  )
-
-	return x
+M = torch.randn(size=(d, d), dtype=torch.float64)
+[Q, R] = torch.qr(M)
+
+
+def solve(Q, c, n=10, verbose=True, up=None, low=None):
+    print("Starting Acq. Fucn solver...")
+    print("Resolution: ", n)
+
+    # Grid
+
+    tau = torch.from_numpy(np.arange(-n, n + 1, 1).astype(np.double)) / n
+    s = torch.ones(2 * n + 1)
+    Tau = torch.zeros(size=(d, d * (2 * n + 1)), dtype=torch.float64)
+    S = torch.zeros(size=(d, d * (2 * n + 1)), dtype=torch.float64)
+    for j in range(d):
+        Tau[j, j * (2 * n + 1) : (j + 1) * (2 * n + 1)] = tau
+        S[j, j * (2 * n + 1) : (j + 1) * (2 * n + 1)] = s
+
+    B = Q @ Tau
+
+    if (up is not None) or (low is not None):
+        G = torch.cat((B, -B, S, -S, torch.t(c), -torch.t(c)))
+        h = torch.ones(4 * d + 2)
+        h[0 : 2 * d] = 1
+        h[3 * d : 4 * d] = -1
+        h[4 * d] = up
+        h[4 * d + 1] = -low
+    else:
+        G = torch.cat((B, -B, S, -S))
+        h = torch.ones(4 * d)
+        h[0 : 2 * d] = 1
+        h[3 * d : 4 * d] = -1
+    # Indicator variables
+    x = torch.zeros(size=(d * (2 * n + 1), 1), dtype=torch.float64)
+    print(h)
+    cc = cvxopt.matrix(c.view(-1).numpy().astype(np.double))
+    Gc = cvxopt.matrix(G.numpy().astype(np.double))
+    hc = cvxopt.matrix(h.numpy().astype(np.double))
+
+    glpk.options["it_lim"] = 10
+
+    solvers.solve(solver=cp.CBC)
+    (status, x) = glpk.ilp(cc, Gc, hc, B=set(range(d * (2 * n + 1))))
+
+    return x
+
 
 # def N is the desired resolution
-tau = torch.from_numpy(np.arange(-N,N+1,1).astype(np.double))/N
-gp = GaussianProcess(gamma = 0.5, s = 0.001)
-c = torch.randn(size = (d*(2*N+1),1), dtype = torch.float64)
+tau = torch.from_numpy(np.arange(-N, N + 1, 1).astype(np.double)) / N
+gp = GaussianProcess(gamma=0.5, s=0.001)
+c = torch.randn(size=(d * (2 * N + 1), 1), dtype=torch.float64)
 for i in range(d):
-	plt.plot(gp.sample(tau.view(-1,1)).numpy())
-	c[i*(2*N+1):(i+1)*(2*N+1)] = gp.sample(tau.view(-1,1))
+    plt.plot(gp.sample(tau.view(-1, 1)).numpy())
+    c[i * (2 * N + 1) : (i + 1) * (2 * N + 1)] = gp.sample(tau.view(-1, 1))
 plt.show()
 
-def select(c,N,n, low, up):
-	plt.subplot(211)
-	cs = torch.randn(size = (d*(2*n+1),1), dtype = torch.float64)
-	step = N//n
-	plt.plot(c.numpy())
-	for i in range(d):
-		for j in range(2*n+1):
-			cs[i*(2*n+1)+j] = c[i*(2*N+1)+(j*step)]
-			plt.plot(i*(2*N+1)+(j*step),cs[i*(2*n+1)+j].numpy(),"ro")
-
-	sum_c = c[0*(2*N+1):(0+1)*(2*N+1)] *0
-	for i in range(d):
-		sum_c = sum_c+ c[i*(2*N+1):(i+1)*(2*N+1)]
-	if low is not None:
-		plt.subplot(2, 1, 2)
-		plt.plot(sum_c.numpy())
-		plt.plot(sum_c.numpy()*0+low,"--", label = "low")
-		plt.plot(sum_c.numpy() * 0 + up, "--", label = "up")
-	plt.legend()
-
-	plt.show()
-	return cs
+
+def select(c, N, n, low, up):
+    plt.subplot(211)
+    cs = torch.randn(size=(d * (2 * n + 1), 1), dtype=torch.float64)
+    step = N // n
+    plt.plot(c.numpy())
+    for i in range(d):
+        for j in range(2 * n + 1):
+            cs[i * (2 * n + 1) + j] = c[i * (2 * N + 1) + (j * step)]
+            plt.plot(
+                i * (2 * N + 1) + (j * step), cs[i * (2 * n + 1) + j].numpy(), "ro"
+            )
+
+    sum_c = c[0 * (2 * N + 1) : (0 + 1) * (2 * N + 1)] * 0
+    for i in range(d):
+        sum_c = sum_c + c[i * (2 * N + 1) : (i + 1) * (2 * N + 1)]
+    if low is not None:
+        plt.subplot(2, 1, 2)
+        plt.plot(sum_c.numpy())
+        plt.plot(sum_c.numpy() * 0 + low, "--", label="low")
+        plt.plot(sum_c.numpy() * 0 + up, "--", label="up")
+    plt.legend()
+
+    plt.show()
+    return cs
 
 
 up = None
@@ -98,13 +103,13 @@ def select(c,N,n, low, up):
 x = solve(Q, c, n=N, up=up, low=low)
 
 for j in range(int(np.log2(N))):
-	n = np.power(2,j)
+    n = np.power(2, j)
 
-	print(N, n)
-	cs = select(c,N,n, low,up )
-	x = solve(Q,cs,n = n, up=up, low = low)
-	up = float(torch.mm(torch.t(cs),torch.from_numpy(np.array(x))))
-	low = float(torch.mm(torch.t(cs),torch.from_numpy(np.array(x)))) - L/n
+    print(N, n)
+    cs = select(c, N, n, low, up)
+    x = solve(Q, cs, n=n, up=up, low=low)
+    up = float(torch.mm(torch.t(cs), torch.from_numpy(np.array(x))))
+    low = float(torch.mm(torch.t(cs), torch.from_numpy(np.array(x)))) - L / n
 
 """
 m_value = 0
@@ -133,4 +138,4 @@ def select(c,N,n, low, up):
 #print (res['x'])
 #print (x)
 
-"""
\ No newline at end of file
+"""
diff --git a/tests/cvxpy_integer_test.py b/tests/cvxpy_integer_test.py
index 083d921..6048abf 100755
--- a/tests/cvxpy_integer_test.py
+++ b/tests/cvxpy_integer_test.py
@@ -8,7 +8,7 @@
 d = 20
 
 # Rotation
-theta = np.radians(45.)
+theta = np.radians(45.0)
 thetainv = np.pi - theta
 c, s = np.cos(theta), np.sin(theta)
 Q = torch.from_numpy(np.array(((c, -s), (s, c))))
@@ -16,48 +16,47 @@
 [Q, R] = torch.qr(M)
 
 
-def solve(Q, c, n=10, verbose=True, up=None, low=None, xwarm = None):
-	if verbose == True:
-		print("Starting Acq. Fucn solver...")
-		print("Resolution: ", n)
-	# Grid
-
-	tau = torch.from_numpy(np.arange(-n, n + 1, 1).astype(np.double)) / n
-	s = torch.ones(2 * n + 1)
-	Tau = torch.zeros(size=(d, d * (2 * n + 1)), dtype=torch.float64)
-	S = torch.zeros(size=(d, d * (2 * n + 1)), dtype=torch.float64)
-	for j in range(d):
-		Tau[j, j * (2 * n + 1):(j + 1) * (2 * n + 1)] = tau
-		S[j, j * (2 * n + 1):(j + 1) * (2 * n + 1)] = s
-
-	B = Q @ Tau
-
-	if (up is not None) or (low is not None):
-		G = torch.cat((B, -B, S, -S, torch.t(c), -torch.t(c)))
-		h = torch.ones(4 * d + 2)
-		h[0:2 * d] = 1
-		h[3 * d:4 * d] = -1
-		h[4 * d] = up
-		h[4 * d + 1] = -low
-	else:
-		G = torch.cat((B, -B, S, -S))
-		h = torch.ones(4 * d)
-		h[0:2 * d] = 1
-		h[3 * d:4 * d] = -1
-	# Indicator variables
-
-	x = cp.Variable(d * (2 * n + 1), boolean=True)
-	if xwarm is not None:
-		x.value = xwarm.numpy()
-	c = c.view(-1).numpy()
-
-	objective = cp.Maximize(c * x)
-	constraints = [0 <= x, x <= 1, G.numpy()*x <= h.view(-1).numpy()]
-	prob = cp.Problem(objective, constraints)
-	prob.solve(solver=cp.MOSEK,verbose=verbose, warm_start=True)
-
-
-	return (x.value,Tau.numpy() @ x.value, np.dot(c,x.value))
+def solve(Q, c, n=10, verbose=True, up=None, low=None, xwarm=None):
+    if verbose == True:
+        print("Starting Acq. Fucn solver...")
+        print("Resolution: ", n)
+    # Grid
+
+    tau = torch.from_numpy(np.arange(-n, n + 1, 1).astype(np.double)) / n
+    s = torch.ones(2 * n + 1)
+    Tau = torch.zeros(size=(d, d * (2 * n + 1)), dtype=torch.float64)
+    S = torch.zeros(size=(d, d * (2 * n + 1)), dtype=torch.float64)
+    for j in range(d):
+        Tau[j, j * (2 * n + 1) : (j + 1) * (2 * n + 1)] = tau
+        S[j, j * (2 * n + 1) : (j + 1) * (2 * n + 1)] = s
+
+    B = Q @ Tau
+
+    if (up is not None) or (low is not None):
+        G = torch.cat((B, -B, S, -S, torch.t(c), -torch.t(c)))
+        h = torch.ones(4 * d + 2)
+        h[0 : 2 * d] = 1
+        h[3 * d : 4 * d] = -1
+        h[4 * d] = up
+        h[4 * d + 1] = -low
+    else:
+        G = torch.cat((B, -B, S, -S))
+        h = torch.ones(4 * d)
+        h[0 : 2 * d] = 1
+        h[3 * d : 4 * d] = -1
+    # Indicator variables
+
+    x = cp.Variable(d * (2 * n + 1), boolean=True)
+    if xwarm is not None:
+        x.value = xwarm.numpy()
+    c = c.view(-1).numpy()
+
+    objective = cp.Maximize(c * x)
+    constraints = [0 <= x, x <= 1, G.numpy() * x <= h.view(-1).numpy()]
+    prob = cp.Problem(objective, constraints)
+    prob.solve(solver=cp.MOSEK, verbose=verbose, warm_start=True)
+
+    return (x.value, Tau.numpy() @ x.value, np.dot(c, x.value))
 
 
 # def N is the desired resolution
@@ -65,58 +64,58 @@ def solve(Q, c, n=10, verbose=True, up=None, low=None, xwarm = None):
 gp = GaussianProcess(gamma=0.5, s=0.001)
 c = torch.randn(size=(d * (2 * N + 1), 1), dtype=torch.float64)
 for i in range(d):
-	z = gp.sample(tau.view(-1, 1))
-	plt.plot(z.numpy())
-	c[i * (2 * N + 1):(i + 1) * (2 * N + 1)] = z
+    z = gp.sample(tau.view(-1, 1))
+    plt.plot(z.numpy())
+    c[i * (2 * N + 1) : (i + 1) * (2 * N + 1)] = z
 plt.show()
 
 
 def select(c, N, n, val):
-	cs = torch.randn(size=(d * (2 * n + 1), 1), dtype=torch.float64)
-	if val is not None:
-		sol = torch.randn(size=(d * (2 * n + 1), 1), dtype=torch.float64).view(-1)*0
-	else:
-		sol = None
-	step = N // n
-
-	for i in range(d):
-		#plt.plot(c[i * (2 * n + 1):(i+1) * (2 * n + 1)].numpy())
-		for j in range(2 * n + 1):
-			cs[i * (2 * n + 1) + j] = c[i * (2 * N + 1) + (j * step)]
-			if val is not None:
-				if (c[i * (2 * N + 1) + (j * step)] - val[i])**2 < 10e-10:
-					sol[i * (2 * N + 1) + (j * step)] = 1.0
-			#plt.plot((i * (2 * N + 1) + (j * step))/((i+1)*N), cs[i * (2 * n + 1) + j].numpy(), "ro")
-	#plt.show()
-	return cs,sol
+    """Subsample ``c`` from the (2N+1)-point grid onto the (2n+1)-point grid.
+
+    Returns ``(cs, sol)``; ``sol`` is None when ``val`` is None, otherwise a 0/1
+    vector flagging coarse entries whose value matches ``val[i]`` (sq. err < 10e-10).
+    """
+    cs = torch.randn(size=(d * (2 * n + 1), 1), dtype=torch.float64)
+    sol = None if val is None else torch.zeros(d * (2 * n + 1), dtype=torch.float64)
+    step = N // n
+
+    for i in range(d):
+        for j in range(2 * n + 1):
+            cs[i * (2 * n + 1) + j] = c[i * (2 * N + 1) + (j * step)]
+            if val is not None:
+                if (c[i * (2 * N + 1) + (j * step)] - val[i]) ** 2 < 10e-10:
+                    # sol has d * (2n + 1) entries: use the coarse index, like cs
+                    sol[i * (2 * n + 1) + j] = 1.0
+    return cs, sol
 
 
 up = None
 low = None
 L = 10e20
 
-#x = solve(Q, c, n=N, up=up, low=low)
+# x = solve(Q, c, n=N, up=up, low=low)
 sol = None
 val = None
 
-for j in range(int(np.log2(N))+1):
-	n = np.power(2, j)
+for j in range(int(np.log2(N)) + 1):
+    n = np.power(2, j)
 
-	print(N, n)
-	cs, sol = select(c, N, n, val)
-	x , val = solve(Q, cs, n=n, up=up, low=low, xwarm = sol)
-	print (x, val)
-	#up = float( torch.dot(cs.view(-1),torch.from_numpy(x)))
-	#low = float( torch.dot(cs.view(-1),torch.from_numpy(x))) - L/n
-	sol = x
+    print(N, n)
+    cs, sol = select(c, N, n, val)
+    x, val, _ = solve(Q, cs, n=n, up=up, low=low, xwarm=sol)  # 3rd item: objective
+    print(x, val)
+    # up = float( torch.dot(cs.view(-1),torch.from_numpy(x)))
+    # low = float( torch.dot(cs.view(-1),torch.from_numpy(x))) - L/n
+    sol = x
 
 plt.figure()
-colors = ['b','k','r','g','y']
+colors = ["b", "k", "r", "g", "y"]
 for i in range(d):
-	z = c[i * (2 * N + 1):(i + 1) * (2 * N + 1)].view(-1).numpy()
-	x = np.linspace(-1,1,2*N+1)
-	plt.plot(x,z, color = colors[i % 5], label = str(i))
-	index = np.argmin(z)
-	plt.plot(val[i],z[index],'o', color = colors[i % 5],label = str(i), ms = 10)
-#plt.legend()
-plt.show()
\ No newline at end of file
+    z = c[i * (2 * N + 1) : (i + 1) * (2 * N + 1)].view(-1).numpy()
+    x = np.linspace(-1, 1, 2 * N + 1)
+    plt.plot(x, z, color=colors[i % 5], label=str(i))
+    index = np.argmin(z)
+    plt.plot(val[i], z[index], "o", color=colors[i % 5], label=str(i), ms=10)
+# plt.legend()
+plt.show()
diff --git a/tests/dpps_tests.py b/tests/dpps_tests.py
index 64689ae..ceb5444 100644
--- a/tests/dpps_tests.py
+++ b/tests/dpps_tests.py
@@ -1,7 +1,7 @@
-from stpy.helpers.helper import select_subset,select_subset_inv
+from stpy.helpers.helper import select_subset, select_subset_inv
 import numpy as np
 
 
 if __name__ == "__main__":
-	n = 50
-	M = np.random.randn(n,n)
+    n = 50
+    M = np.random.randn(n, n)
diff --git a/tests/efficient_triangle_basis.py b/tests/efficient_triangle_basis.py
new file mode 100644
index 0000000..305a5a2
--- /dev/null
+++ b/tests/efficient_triangle_basis.py
@@ -0,0 +1,35 @@
+from stpy.borel_set import BorelSet, HierarchicalBorelSets
+from stpy.embeddings.bump_bases import TriangleEmbedding
+from stpy.embeddings.triangle_base import EfficientTriangleEmbedding
+import torch
+
+
+if __name__ == "__main__":
+    d = 1
+    m = 2
+    S = BorelSet(1, torch.tensor([[-1, 1]]))
+
+    inefficient = TriangleEmbedding(d=d, m=m, interval=(-1, 1))
+    efficient = EfficientTriangleEmbedding(d, m, interval=(-1, 1))
+    # d = 1: basis function j must agree between the two implementations
+    for x, j in [(0.5, 1), (0.1, 0)]:
+        x = torch.tensor(x, dtype=torch.float64)
+        assert torch.allclose(
+            inefficient.basis_fun(x, j).double(), efficient.basis_fun(x, j)
+        )
+    # d = 1: integrals over each set must agree as well
+    for bounds in [[-1, 1], [-0.25, 1], [-2, 2]]:
+        s = BorelSet(1, torch.tensor([bounds]))
+        assert torch.allclose(inefficient.integral(s), efficient.integral(s))
+
+    d = 2
+    m = 2
+
+    inefficient = TriangleEmbedding(d=d, m=m, interval=(-1, 1))
+    efficient = EfficientTriangleEmbedding(d, m, interval=(-1, 1))
+    # d = 2: same basis-function agreement check
+    for x, j in [([0.5, 0.1], 1), ([0.7, 0.1], 0)]:
+        x = torch.tensor(x, dtype=torch.float64)
+        assert torch.allclose(
+            inefficient.basis_fun(x, j).double(), efficient.basis_fun(x, j)
+        )
diff --git a/tests/embedding/faber_schauder_embedding.py b/tests/embedding/faber_schauder_embedding.py
index 7fb9fb3..7ebd1d5 100644
--- a/tests/embedding/faber_schauder_embedding.py
+++ b/tests/embedding/faber_schauder_embedding.py
@@ -10,15 +10,15 @@
 from stpy.helpers.helper import interval
 
 m = 16
-B4 = FaberSchauderEmbedding(m = m, d = 1)
+B4 = FaberSchauderEmbedding(m=m, d=1)
 
-plt.figure(figsize = (20,20))
-basis = lambda x,j: B4.basis_fun(x,j)
-x = torch.from_numpy(np.linspace(-1,1,1024)).view(-1,1)
-print (B4.hierarchical_mask())
+plt.figure(figsize=(20, 20))
+basis = lambda x, j: B4.basis_fun(x, j)
+x = torch.from_numpy(np.linspace(-1, 1, 1024)).view(-1, 1)
+print(B4.hierarchical_mask())
 for j in range(m):
-	plt.plot(x,basis(x,j), lw = 6)
-	plt.grid(ls = '--', lw = 4)
-	plt.xlim((-1,1))
+    plt.plot(x, basis(x, j), lw=6)
+    plt.grid(ls="--", lw=4)
+    plt.xlim((-1, 1))
 
 plt.show()
diff --git a/tests/fourier-features-multidimensional.py b/tests/fourier-features-multidimensional.py
index b1cfdd3..c46254b 100644
--- a/tests/fourier-features-multidimensional.py
+++ b/tests/fourier-features-multidimensional.py
@@ -2,25 +2,27 @@
 from stpy.continuous_processes.fourier_fea import GaussianProcessFF
 from stpy.embeddings.embedding import QuadratureEmbedding
 from stpy.helpers.helper import interval
-if __name__ == "__main__":
 
-	m = 128
+if __name__ == "__main__":
 
-	def cost_function():
-		gamma = 0.2
-		torch.manual_seed(245)
-		z2 = QuadratureEmbedding(gamma=gamma, m=m, d=2)
-		theta2d = torch.randn(m, 1).double()
-		F = lambda x: z2.embed_one(x[1, 0:2].view(1,-1)) @ theta2d
-		print (torch.norm(theta2d))
-		return F
+    m = 128
 
-	F = cost_function()
-	xtest = torch.from_numpy(interval(50,2))
-	ytest = F(xtest)
+    def cost_function():
+        gamma = 0.2
+        torch.manual_seed(245)
+        z2 = QuadratureEmbedding(gamma=gamma, m=m, d=2)
+        theta2d = torch.randn(m, 1).double()
+        F = lambda x: z2.embed_one(x[1, 0:2].view(1, -1)) @ theta2d
+        print(torch.norm(theta2d))
+        return F
 
-	GP = GaussianProcessFF(d = 2, groups=[[0,1]], m = torch.Tensor([m,64]), gamma = torch.Tensor([0.2]))
-	GP.fit_gp(xtest,ytest)
+    F = cost_function()
+    xtest = torch.from_numpy(interval(50, 2))
+    ytest = F(xtest)
 
-	GP.visualize_contour(xtest,f_true=F)
+    GP = GaussianProcessFF(
+        d=2, groups=[[0, 1]], m=torch.tensor([m, 64]), gamma=torch.tensor([0.2])
+    )
+    GP.fit_gp(xtest, ytest)
 
+    GP.visualize_contour(xtest, f_true=F)
diff --git a/tests/gibbs_kernel.py b/tests/gibbs_kernel.py
index 0e8cf49..c647570 100644
--- a/tests/gibbs_kernel.py
+++ b/tests/gibbs_kernel.py
@@ -1,38 +1,40 @@
 from stpy.kernels import KernelFunction
-from stpy.continuous_processes.gauss_procc import  GaussianProcess
+from stpy.continuous_processes.gauss_procc import GaussianProcess
 from stpy.helpers.helper import interval
 from stpy.embeddings.optimal_positive_basis import OptimalPositiveBasis
 import matplotlib.pyplot as plt
 import torch
 import numpy as np
+
 n = 1024
 d = 1
 
 
 def gamma(x):
-	out = x[:,0].view(-1,1)*0
-	small = x <= - 0.5
-	mid = torch.logical_and(x >= -0.5,x <= 0.5)
-	large = x > 0.5
-	gamma1 = 0.1
-	gamma2 = 1.
-	out[small] = (gamma2-gamma1)/(torch.exp(-25*(x[small]+0.5)) + 1) + gamma1
-	out[mid] = gamma2
-	out[large] = (gamma2-gamma1)/(torch.exp(-25*(-x[large]+0.5)) + 1) + gamma1
-	return out
-
-gamma = lambda x: x[:,0].view(-1,1)*0 + 0.05 + 0.3*(x+1)**4
-
-#gamma = lambda x: x[x<-0.5]*0 +0 + 0.05 + 0.2*(x+1)**2#*torch.abs(torch.cos(x*np.pi)) + 0.5
-xtest = torch.from_numpy(interval(n,d))
-
-vals = gamma(xtest).T**2 + gamma(xtest)**2
+    out = x[:, 0].view(-1, 1) * 0
+    small = x <= -0.5
+    mid = torch.logical_and(x >= -0.5, x <= 0.5)
+    large = x > 0.5
+    gamma1 = 0.1
+    gamma2 = 1.0
+    out[small] = (gamma2 - gamma1) / (torch.exp(-25 * (x[small] + 0.5)) + 1) + gamma1
+    out[mid] = gamma2
+    out[large] = (gamma2 - gamma1) / (torch.exp(-25 * (-x[large] + 0.5)) + 1) + gamma1
+    return out
+
+
+gamma = lambda x: x[:, 0].view(-1, 1) * 0 + 0.05 + 0.3 * (x + 1) ** 4
+
+# gamma = lambda x: x[x<-0.5]*0 +0 + 0.05 + 0.2*(x+1)**2#*torch.abs(torch.cos(x*np.pi)) + 0.5
+xtest = torch.from_numpy(interval(n, d))
+
+vals = gamma(xtest).T ** 2 + gamma(xtest) ** 2
 plt.imshow(vals)
 plt.colorbar()
 plt.show()
 
-k = KernelFunction(kernel_name="gibbs", params={'gamma_fun':gamma})
-plt.imshow(k.kernel(xtest,xtest))
+k = KernelFunction(kernel_name="gibbs", params={"gamma_fun": gamma})
+plt.imshow(k.kernel(xtest, xtest))
 plt.colorbar()
 plt.show()
 
@@ -45,13 +47,23 @@ def gamma(x):
 s = 0.01
 b = 0
 
-Emb = OptimalPositiveBasis(d, m, offset=0.0, s=s, b=b, discretization_size=n, B=1000., kernel_object=k, samples = 1000)
+Emb = OptimalPositiveBasis(
+    d,
+    m,
+    offset=0.0,
+    s=s,
+    b=b,
+    discretization_size=n,
+    B=1000.0,
+    kernel_object=k,
+    samples=1000,
+)
 for i in range(m):
-	f_i = Emb.basis_fun(xtest, i)  ## basis function
-	plt.plot(xtest,f_i)
+    f_i = Emb.basis_fun(xtest, i)  ## basis function
+    plt.plot(xtest, f_i)
 
 plt.show()
 
 # ytest = GP.sample(xtest)
 # plt.plot(xtest,ytest)
-# plt.show()
\ No newline at end of file
+# plt.show()
diff --git a/tests/gradient_confidence_test.py b/tests/gradient_confidence_test.py
index 88288f7..19db4bd 100755
--- a/tests/gradient_confidence_test.py
+++ b/tests/gradient_confidence_test.py
@@ -2,39 +2,45 @@
 import torch
 from stpy.continuous_processes.gauss_procc import GaussianProcess
 from stpy.helpers.helper import interval
-#%matplotlib notebook
+
+# %matplotlib notebook
 
 
 # 2D Grid
 n = 20
 n_vis = 50
 d = 2
-xtest_vis = torch.from_numpy(interval(n_vis,d))
-xtest = torch.from_numpy(interval(n,d,L_infinity_ball=1.))
+xtest_vis = torch.from_numpy(interval(n_vis, d))
+xtest = torch.from_numpy(interval(n, d, L_infinity_ball=1.0))
 noise_s = 0.001
 bw = 0.4
 
-GP_true = GaussianProcess(groups = [[0],[1]], gamma = bw*torch.ones(2,dtype = torch.float64), kernel = "ard", s = noise_s)
+GP_true = GaussianProcess(
+    groups=[[0], [1]],
+    gamma=bw * torch.ones(2, dtype=torch.float64),
+    kernel="ard",
+    s=noise_s,
+)
 y = GP_true.sample(xtest)
-GP_true.fit_gp(xtest,y)
+GP_true.fit_gp(xtest, y)
 
-zero = torch.from_numpy(np.array([[0.,0.]]))
-gradient, hessian = GP_true.mean_gradient_hessian(zero, hessian = True)
+zero = torch.from_numpy(np.array([[0.0, 0.0]]))
+gradient, hessian = GP_true.mean_gradient_hessian(zero, hessian=True)
 
 
-GP_fit = GaussianProcess(gamma = bw, kernel = "squared_exponential", s = noise_s)
-GP_fit.fit_gp(xtest ,y)
-#GP_fit.visualize(xtest_vis)
+GP_fit = GaussianProcess(gamma=bw, kernel="squared_exponential", s=noise_s)
+GP_fit.fit_gp(xtest, y)
+# GP_fit.visualize(xtest_vis)
 GP_fit.log_marginal_likelihood_self()
 
 GP_fit.visualize_quiver(xtest_vis)
 
 
-print ("Zero:" ,zero)
+print("Zero:", zero)
 g, V = GP_fit.gradient_mean_var(zero)
 
-print (gradient)
+print(gradient)
 
-print (V)
+print(V)
 
-print ("------------------")
\ No newline at end of file
+print("------------------")
diff --git a/tests/gradients_test.py b/tests/gradients_test.py
index 67b97b3..ef28e49 100755
--- a/tests/gradients_test.py
+++ b/tests/gradients_test.py
@@ -3,87 +3,100 @@
 import torch
 from stpy.continuous_processes.gauss_procc import GaussianProcess
 from stpy.helpers.helper import interval
-#%matplotlib notebook
+
+# %matplotlib notebook
 
 
 # 2D Grid
-for n in np.arange(50,60,10):
+for n in np.arange(50, 60, 10):
     n_vis = 50
     d = 2
-    xtest_vis = torch.from_numpy(interval(n_vis,d))
-    xtest = torch.from_numpy(interval(n,d,L_infinity_ball=0.01))
+    xtest_vis = torch.from_numpy(interval(n_vis, d))
+    xtest = torch.from_numpy(interval(n, d, L_infinity_ball=0.01))
     noise_s = 0.001
     bw = 0.4
 
-    GP_true = GaussianProcess(groups = [[0],[1]], gamma = bw*torch.ones(2,dtype = torch.float64), kernel = "ard", s = noise_s)
+    GP_true = GaussianProcess(
+        groups=[[0], [1]],
+        gamma=bw * torch.ones(2, dtype=torch.float64),
+        kernel="ard",
+        s=noise_s,
+    )
     y = GP_true.sample(xtest)
-    GP_true.fit_gp(xtest,y)
-
-    zero = torch.from_numpy(np.array([[0.,0.]]))
-    gradient, hessian = GP_true.mean_gradient_hessian(zero, hessian = True)
+    GP_true.fit_gp(xtest, y)
 
-   # print ("gradient:",gradient)
-   # print ("hessian:",hessian)
+    zero = torch.from_numpy(np.array([[0.0, 0.0]]))
+    gradient, hessian = GP_true.mean_gradient_hessian(zero, hessian=True)
 
+    # print ("gradient:",gradient)
+    # print ("hessian:",hessian)
 
     # [mu, _] = GP_true.get_lambdas(2, mean=True)
     # for z in [10e-1, 10e-2, 10e-3, 10e-4, 10e-5, 10e-6, 10e-7]:
     #     print(z, stpy.helper.finite_differences(mu,z,xtest[0].view(1,-1)))
 
-    theta = np.radians(12.)
+    theta = np.radians(12.0)
     thetainv = np.pi - theta
     c, s = np.cos(theta), np.sin(theta)
-    RandRot = torch.from_numpy(np.array(((c,-s), (s, c))))
-    #print (RandRot)
+    RandRot = torch.from_numpy(np.array(((c, -s), (s, c))))
+    # print (RandRot)
 
     def eval(x):
         xprime = x.mm(RandRot)
         f = GP_true.mean_std(xprime)[0]
         return f
 
-
     y_prime = eval(xtest)
-    GP_fit = GaussianProcess(groups = [[0,1]], gamma = bw*torch.ones(2,dtype = torch.float64), kernel = "ard", s = noise_s)
-    GP_fit.fit_gp(xtest,y_prime)
+    GP_fit = GaussianProcess(
+        groups=[[0, 1]],
+        gamma=bw * torch.ones(2, dtype=torch.float64),
+        kernel="ard",
+        s=noise_s,
+    )
+    GP_fit.fit_gp(xtest, y_prime)
     GP_fit.visualize(xtest_vis)
     GP_fit.log_marginal_likelihood_self()
 
-    print ("Zero:",zero)
+    print("Zero:", zero)
     g, V = GP_fit.gradient_mean_var(zero)
 
-    print (gradient)
+    print(gradient)
 
-    print (V)
+    print(V)
 
-    print ("------------------")
-
-    gradient, hessian = GP_fit.mean_gradient_hessian(zero, hessian = True)
-    Q = torch.symeig(hessian, eigenvectors = True)[1]
+    print("------------------")
 
+    gradient, hessian = GP_fit.mean_gradient_hessian(zero, hessian=True)
+    Q = torch.linalg.eigh(hessian, UPLO="U")[1]  # replaces removed torch.symeig
 
     print(GP_fit.mean_std(zero))
-    #print ("Estimated:",Q)
-    #print ("True:", RandRot)
+    # print ("Estimated:",Q)
+    # print ("True:", RandRot)
     P = torch.t(Q) @ RandRot
-    I = torch.eye(GP_fit.d, dtype = torch.float64)
-    Noise = s*I*s
+    I = torch.eye(GP_fit.d, dtype=torch.float64)
+    Noise = s * I * s
     Perm = torch.clamp(torch.abs(P), min=10e-3)
-    print (n, P,torch.norm(torch.abs(P)-Perm))
-
+    print(n, P, torch.norm(torch.abs(P) - Perm))
 
     no = 100
-    thetas = np.linspace(0.,np.pi,no)
+    thetas = np.linspace(0.0, np.pi, no)
     res = []
     for theta in thetas:
         c, s = np.cos(theta), np.sin(theta)
-        Rot = np.array(((c,-s), (s, c)))
+        Rot = np.array(((c, -s), (s, c)))
         Rot = torch.from_numpy(Rot)
-        res.append(float(GP_fit.log_marginal_likelihood(GP_fit.kernel_object.gamma,Rot,GP_fit.kernel_object.kappa)))
-    plt.plot(thetas,res)
-    plt.plot([thetainv],np.average(np.array(res)),'ro')
+        res.append(
+            float(
+                GP_fit.log_marginal_likelihood(
+                    GP_fit.kernel_object.gamma, Rot, GP_fit.kernel_object.kappa
+                )
+            )
+        )
+    plt.plot(thetas, res)
+    plt.plot([thetainv], np.average(np.array(res)), "ro")
     plt.show()
 
-    GP_fit.optimize_params(type = "rots", restarts = 10)
+    GP_fit.optimize_params(type="rots", restarts=10)
     GP_fit.log_marginal_likelihood_self()
 
     print(GP_fit.Rot)
diff --git a/tests/hessian-estimation-test.py b/tests/hessian-estimation-test.py
index af8d033..6c287c6 100755
--- a/tests/hessian-estimation-test.py
+++ b/tests/hessian-estimation-test.py
@@ -10,14 +10,14 @@
 L_infinity_ball = 0.5
 d = 2
 
-thetae = np.radians(35.)
+thetae = np.radians(35.0)
 ce, se = np.cos(thetae), np.sin(thetae)
 R = torch.from_numpy(np.array(((ce, -se), (se, ce))))
-D = torch.diag(torch.Tensor([0.8, 1.1]).double())
-#D = torch.diag(torch.Tensor([1, 1]).double())
+D = torch.diag(torch.tensor([0.8, 1.1]).double())
+# D = torch.diag(torch.tensor([1, 1]).double())
 
 W = R.T @ D @ R
-print (W)
+print(W)
 BenchmarkFunc = QuadraticBenchmark(d=d, R=W)
 
 x = BenchmarkFunc.initial_guess(N)
@@ -31,27 +31,36 @@
 F0 = lambda x: BenchmarkFunc.eval(x, sigma=0)
 
 
-def plot_contour(xtest,ytest,lim=None):
+def plot_contour(xtest, ytest, lim=None):
     from scipy.interpolate import griddata
+
     xx = xtest[:, 0].numpy()
     yy = xtest[:, 1].numpy()
-    grid_x, grid_y = np.mgrid[min(xx):max(xx):100j, min(yy):max(yy):100j]
-    grid_z_mu = griddata((xx, yy), ytest[:, 0].numpy(), (grid_x, grid_y), method='linear')
+    grid_x, grid_y = np.mgrid[min(xx) : max(xx) : 100j, min(yy) : max(yy) : 100j]
+    grid_z_mu = griddata(
+        (xx, yy), ytest[:, 0].numpy(), (grid_x, grid_y), method="linear"
+    )
     fig, ax = plt.subplots(figsize=(10, 9))
     cs = ax.contourf(grid_x, grid_y, grid_z_mu)
-    ax.contour(cs, colors='k')
+    ax.contour(cs, colors="k")
     if lim is not None:
-        plt.xlim([-lim,lim])
-        plt.ylim([-lim,lim])
+        plt.xlim([-lim, lim])
+        plt.ylim([-lim, lim])
     plt.colorbar(cs)
     # Plot grid.
-    ax.grid(c='k', ls='-', alpha=0.1)
-
+    ax.grid(c="k", ls="-", alpha=0.1)
 
 
 ## Additive Model
 m = 64
-GP = GaussianProcessFF(d=d, s=s, m = torch.ones(d)*m, gamma=gamma*torch.ones(d), bounds=bounds, groups = stpy.helpers.helper.full_group(d))
+GP = GaussianProcessFF(
+    d=d,
+    s=s,
+    m=torch.ones(d) * m,
+    gamma=gamma * torch.ones(d),
+    bounds=bounds,
+    groups=stpy.helpers.helper.full_group(d),
+)
 
 ## Global Model
 # m = 512
@@ -61,23 +70,22 @@ def plot_contour(xtest,ytest,lim=None):
 
 p = 5
 d = 2
-embedding = PolynomialEmbedding(d,p)
+embedding = PolynomialEmbedding(d, p)
 Map = lambda x: embedding.embed(x)
 
 # Starting points
-x0_1 = torch.Tensor([0.1, 0.1]).double().view(-1, d)
+x0_1 = torch.tensor([0.1, 0.1]).double().view(-1, d)
 
-#x0_1 = torch.Tensor([-0.1, 0.]).double().view(-1, d)
-x0_2 = torch.Tensor([0.1, 0.1]).double().view(-1, d)
+# x0_1 = torch.tensor([-0.1, 0.]).double().view(-1, d)
+x0_2 = torch.tensor([0.1, 0.1]).double().view(-1, d)
 
 print("Embeding size:", Map(x0_1).size())
 
 
-Bandit = OPPR_TS_GP(x0_1, F, GP, Map, finite_dim=True, s = s, GPMap = True)
-#Bandit.decolerate(x0_1,10e-5,1)
-Bandit.decolerate_AJD([x0_1,x0_2],10e-5,1)
-
-print (Bandit.Q)
-print (W@Bandit.Q)
-print (W@torch.inverse(Bandit.Q))
+Bandit = OPPR_TS_GP(x0_1, F, GP, Map, finite_dim=True, s=s, GPMap=True)
+# Bandit.decolerate(x0_1,10e-5,1)
+Bandit.decolerate_AJD([x0_1, x0_2], 10e-5, 1)
 
+print(Bandit.Q)
+print(W @ Bandit.Q)
+print(W @ torch.inverse(Bandit.Q))
diff --git a/tests/interval_groups_test.py b/tests/interval_groups_test.py
index 0c20a87..bb566e0 100644
--- a/tests/interval_groups_test.py
+++ b/tests/interval_groups_test.py
@@ -1,10 +1,15 @@
-from stpy.helpers.helper import interval_groups, get_hierarchy, hierarchical_distance, valid_enlargement
+from stpy.helpers.helper import (
+    interval_groups,
+    get_hierarchy,
+    hierarchical_distance,
+    valid_enlargement,
+)
 
 if __name__ == "__main__":
 
-	out = get_hierarchy(start = 0,new_elements=[1,2,3])
-	curr =  [[0], [1], [2], [3]]
-	print(hierarchical_distance(curr, [[0,1],[2],[3]]))
-	enlargements = valid_enlargement(curr, out)
-	for enlargement in enlargements:
-		print (curr,"->",out[enlargement])
\ No newline at end of file
+    out = get_hierarchy(start=0, new_elements=[1, 2, 3])
+    curr = [[0], [1], [2], [3]]
+    print(hierarchical_distance(curr, [[0, 1], [2], [3]]))
+    enlargements = valid_enlargement(curr, out)
+    for enlargement in enlargements:
+        print(curr, "->", out[enlargement])
diff --git a/tests/kernelized-features-test.py b/tests/kernelized-features-test.py
index 3da0af7..e63d65e 100644
--- a/tests/kernelized-features-test.py
+++ b/tests/kernelized-features-test.py
@@ -5,27 +5,27 @@
 import numpy as np
 
 if __name__ == "__main__":
-	m = 16
-	gamma = 1.
-	s = 0.0001
-	n = 40
+    m = 16
+    gamma = 1.0
+    s = 0.0001
+    n = 40
 
-	embedding = HermiteEmbedding(m = m, gamma = gamma)
-	GP = KernelizedFeatures(embedding=embedding,s = s,m = m)
+    embedding = HermiteEmbedding(m=m, gamma=gamma)
+    GP = KernelizedFeatures(embedding=embedding, s=s, m=m)
 
-	x = torch.from_numpy(interval(n,1))
-	xtest = torch.from_numpy(interval(2048,1))
-	F = lambda x: torch.sin(10*x)
-	y = F(x)
+    x = torch.from_numpy(interval(n, 1))
+    xtest = torch.from_numpy(interval(2048, 1))
+    F = lambda x: torch.sin(10 * x)
+    y = F(x)
 
-	GP.fit_gp(x,y)
-	mu, std = GP.mean_std(xtest)
-	print (mu.size())
-	print (std.size())
-	GP.visualize(xtest)
+    GP.fit_gp(x, y)
+    mu, std = GP.mean_std(xtest)
+    print(mu.size())
+    print(std.size())
+    GP.visualize(xtest)
 
-	for _ in range(30):
-		x = torch.from_numpy(np.random.uniform(-1,1,1)).view(1,1)
-		GP.add_data_point(x,F(x))
+    for _ in range(30):
+        x = torch.from_numpy(np.random.uniform(-1, 1, 1)).view(1, 1)
+        GP.add_data_point(x, F(x))
 
-	GP.visualize(xtest)
+    GP.visualize(xtest)
diff --git a/tests/kernels/ard_matern_kernel_test.py b/tests/kernels/ard_matern_kernel_test.py
index 56379b3..fd4b05b 100644
--- a/tests/kernels/ard_matern_kernel_test.py
+++ b/tests/kernels/ard_matern_kernel_test.py
@@ -10,14 +10,20 @@
 d = 2
 eps = 0.01
 s = 1
-x = torch.rand(N,d).double()*2 - 1
-xtest = torch.from_numpy(interval(n,d,L_infinity_ball=1))
+x = torch.rand(N, d).double() * 2 - 1
+xtest = torch.from_numpy(interval(n, d, L_infinity_ball=1))
 
 # true
 GP = GaussianProcess(kernel_name="ard_matern", d=d)
 y = GP.sample(x)
-GP.fit_gp(x,y)
-GP.optimize_params(type="bandwidth", restarts=5, verbose = False, optimizer = 'pytorch-minimize', scale = 1., weight=1.)
+GP.fit_gp(x, y)
+GP.optimize_params(
+    type="bandwidth",
+    restarts=5,
+    verbose=False,
+    optimizer="pytorch-minimize",
+    scale=1.0,
+    weight=1.0,
+)
 GP.visualize_contour(xtest)
 #
-
diff --git a/tests/marginalized_likelihood_test.py b/tests/marginalized_likelihood_test.py
index 8dc1a6c..c1551c7 100644
--- a/tests/marginalized_likelihood_test.py
+++ b/tests/marginalized_likelihood_test.py
@@ -7,102 +7,98 @@
 from pymanopt.manifolds import Euclidean
 
 if __name__ == "__main__":
-	d = 2
-	n = 3
-
-
-	## Squared exponential with single parameter
-	GP = GaussianProcess(gamma=1., kernel_name="ard", d=2)
-	x = torch.rand(n,d).double()*2 - 1
-	y = GP.sample(x)
-	GP.fit_gp(x,y)
-	xtest = torch.from_numpy(interval(50,2,L_infinity_ball=1))
-
-	#
-	# init_val = None
-	# manifold = Euclidean(2)
-	# bounds = None
-	#
-	# params = {"0":{"kappa":(1.,Euclidean(1),None),"ard_gamma":(init_val, manifold, bounds)}}
-	#GP.optimize_params_general(params = params, maxiter = 100)
-
-	#GP.optimize_params(type = "bandwidth", restarts=2)
-
-
-#
-	## Additive quick
-	k = KernelFunction(kernel_name = "ard", d = 2, groups = [[0],[1]] )
-	GP = GaussianProcess(kernel=k)
-	x = torch.rand(n,d).double()*2 - 1
-	y = GP.sample(x)
-	GP.fit_gp(x,y)
-
-	#GP.optimize_params(type="bandwidth", restarts=2)
-
-
-
-
-	# ## Additive via algebra
-	k1 = KernelFunction(kernel_name="ard" ,ard_gamma = 0.1, d = 1, group=[0])
-	k2 = KernelFunction(kernel_name="polynomial" ,ard_gamma = 0.5, power = 2, d = 1, group=[1])
-	k = k1 + k2
-	#
-	# print (k.params_dict)
-	GP = GaussianProcess(kernel=k, d=2)
-	#
-	x = torch.rand(n, d).double() * 2 - 1
-	y = GP.sample(x)
-	GP.fit_gp(x, y)
-	#GP.optimize_params(type="bandwidth", restarts=2)
-
-
-	## Additive two the same
-	k1 = KernelFunction(kernel_name="ard" ,ard_gamma = 0.1, d = 1, group=[0])
-	k2 = KernelFunction(kernel_name="ard" ,ard_gamma = 0.5, power = 2, d = 1, group=[1])
-	GP = GaussianProcess(kernel=k, d=2)
-	#
-	x = torch.rand(n, d).double() * 2 - 1
-	y = GP.sample(x)
-	GP.fit_gp(x, y)
-	#GP.optimize_params(type="bandwidth", restarts=2)
-
-
-	## Optimize groups
-	k = KernelFunction(kernel_name="ard", d=2, groups = [[0,1]])
-	GP = GaussianProcess(kernel=k, d=2)
-	#
-	x = torch.rand(n, d).double() * 2 - 1
-	y = GP.sample(x)
-	GP.fit_gp(x, y)
-	#print(k.params_dict)
-	#GP.optimize_params(type="groups", restarts=2)
-
-	## Optimize power in polynomial kernel
-	k = KernelFunction(kernel_name="polynomial", d=2, power = 3)
-	GP = GaussianProcess(kernel=k, d=2)
-	#
-	x = torch.rand(n, d).double() * 2 - 1
-	y = GP.sample(x)
-	GP.fit_gp(x, y)
-	#print(k.params_dict)
-	params = {"0":{"power":(1.,[1,2,3,4,5],None)}}
-	#GP.optimize_params_general(params = params, optimizer="discrete")
-
-
-	## Covar
-	k = KernelFunction(kernel_name="full_covariance_se", d=2)
-	GP = GaussianProcess(kernel=k, d=2)
-	#
-	x = torch.rand(n, d).double() * 2 - 1
-	y = GP.sample(x)
-	GP.fit_gp(x, y)
-	#GP.optimize_params(type="covariance", restarts=2)
-
-	## cova with regularizer
-	k = KernelFunction(kernel_name="full_covariance_se", d=2)
-	GP = GaussianProcess(kernel=k, d=2)
-	#
-	x = torch.rand(n, d).double() * 2 - 1
-	y = GP.sample(x)
-	GP.fit_gp(x, y)
-	GP.optimize_params(type="covariance", restarts=2, regularizer=["spectral_norm",0.1])
\ No newline at end of file
+    d = 2
+    n = 3
+
+    ## Squared exponential with single parameter
+    GP = GaussianProcess(gamma=1.0, kernel_name="ard", d=2)
+    x = torch.rand(n, d).double() * 2 - 1
+    y = GP.sample(x)
+    GP.fit_gp(x, y)
+    xtest = torch.from_numpy(interval(50, 2, L_infinity_ball=1))
+
+    #
+    # init_val = None
+    # manifold = Euclidean(2)
+    # bounds = None
+    #
+    # params = {"0":{"kappa":(1.,Euclidean(1),None),"ard_gamma":(init_val, manifold, bounds)}}
+    # GP.optimize_params_general(params = params, maxiter = 100)
+
+    # GP.optimize_params(type = "bandwidth", restarts=2)
+
+    #
+    ## Additive quick
+    k = KernelFunction(kernel_name="ard", d=2, groups=[[0], [1]])
+    GP = GaussianProcess(kernel=k)
+    x = torch.rand(n, d).double() * 2 - 1
+    y = GP.sample(x)
+    GP.fit_gp(x, y)
+
+    # GP.optimize_params(type="bandwidth", restarts=2)
+
+    # ## Additive via algebra
+    k1 = KernelFunction(kernel_name="ard", ard_gamma=0.1, d=1, group=[0])
+    k2 = KernelFunction(
+        kernel_name="polynomial", ard_gamma=0.5, power=2, d=1, group=[1]
+    )
+    k = k1 + k2
+    #
+    # print (k.params_dict)
+    GP = GaussianProcess(kernel=k, d=2)
+    #
+    x = torch.rand(n, d).double() * 2 - 1
+    y = GP.sample(x)
+    GP.fit_gp(x, y)
+    # GP.optimize_params(type="bandwidth", restarts=2)
+
+    ## Additive two the same
+    k1 = KernelFunction(kernel_name="ard", ard_gamma=0.1, d=1, group=[0])
+    k2 = KernelFunction(kernel_name="ard", ard_gamma=0.5, power=2, d=1, group=[1])
+    k = k1 + k2  # combine; otherwise the previously built k is reused
+    GP = GaussianProcess(kernel=k, d=2)
+    x = torch.rand(n, d).double() * 2 - 1
+    y = GP.sample(x)
+    GP.fit_gp(x, y)
+    # GP.optimize_params(type="bandwidth", restarts=2)
+
+    ## Optimize groups
+    k = KernelFunction(kernel_name="ard", d=2, groups=[[0, 1]])
+    GP = GaussianProcess(kernel=k, d=2)
+    #
+    x = torch.rand(n, d).double() * 2 - 1
+    y = GP.sample(x)
+    GP.fit_gp(x, y)
+    # print(k.params_dict)
+    # GP.optimize_params(type="groups", restarts=2)
+
+    ## Optimize power in polynomial kernel
+    k = KernelFunction(kernel_name="polynomial", d=2, power=3)
+    GP = GaussianProcess(kernel=k, d=2)
+    #
+    x = torch.rand(n, d).double() * 2 - 1
+    y = GP.sample(x)
+    GP.fit_gp(x, y)
+    # print(k.params_dict)
+    params = {"0": {"power": (1.0, [1, 2, 3, 4, 5], None)}}
+    # GP.optimize_params_general(params = params, optimizer="discrete")
+
+    ## Covar
+    k = KernelFunction(kernel_name="full_covariance_se", d=2)
+    GP = GaussianProcess(kernel=k, d=2)
+    #
+    x = torch.rand(n, d).double() * 2 - 1
+    y = GP.sample(x)
+    GP.fit_gp(x, y)
+    # GP.optimize_params(type="covariance", restarts=2)
+
+    ## cova with regularizer
+    k = KernelFunction(kernel_name="full_covariance_se", d=2)
+    GP = GaussianProcess(kernel=k, d=2)
+    #
+    x = torch.rand(n, d).double() * 2 - 1
+    y = GP.sample(x)
+    GP.fit_gp(x, y)
+    GP.optimize_params(
+        type="covariance", restarts=2, regularizer=["spectral_norm", 0.1]
+    )
diff --git a/tests/orthogonal_map_test.py b/tests/orthogonal_map_test.py
index d173cc0..4be4589 100755
--- a/tests/orthogonal_map_test.py
+++ b/tests/orthogonal_map_test.py
@@ -5,45 +5,47 @@
 
 if __name__ == "__main__":
 
+    dim = 4
 
+    Benchmark = ProteinBenchmark(
+        "/home/mojko/Documents/PhD/stpy/stpy/test_functions/protein_data_gb1.h5",
+        dim=dim,
+        ref=["A", "B", "C", "D"],
+    )
+    Benchmark.self_translate()
 
-	dim = 4
+    X = Benchmark.data.values[:, 0:dim].astype(int)
+    Y = Benchmark.data.values[:, 5].astype(float).reshape(-1, 1)
 
-	Benchmark = ProteinBenchmark("/home/mojko/Documents/PhD/stpy/stpy/test_functions/protein_data_gb1.h5", dim=dim, ref=['A', 'B', 'C', 'D'])
-	Benchmark.self_translate()
+    X_one_hot = Benchmark.translate_one_hot(X)
 
-	X = Benchmark.data.values[:,0:dim].astype(int)
-	Y = Benchmark.data.values[:,5].astype(float).reshape(-1,1)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X_one_hot, Y, test_size=0.20, random_state=42
+    )
 
+    X_train = torch.from_numpy(X_train)
+    X_test = torch.from_numpy(X_test)
+    y_train = torch.from_numpy(y_train)
+    y_test = torch.from_numpy(y_test)
 
-	X_one_hot = Benchmark.translate_one_hot(X)
+    print(X_train.size())
+    print(y_train.size())
 
-	X_train, X_test, y_train, y_test = train_test_split(X_one_hot, Y, test_size = 0.20, random_state = 42)
+    print(X_test.size())
+    print(y_test.size())
 
-	X_train = torch.from_numpy(X_train)
-	X_test = torch.from_numpy(X_test)
-	y_train = torch.from_numpy(y_train)
-	y_test = torch.from_numpy(y_test)
+    d = dim * 26
+    m = dim * 26
 
-	print(X_train.size())
-	print(y_train.size())
+    ridge = lambda x: torch.relu(x)
+    Net = RandomMap(d, m, ridge, output=1)
 
+    print("Loss before training: ", Net.loss(X_test, y_test))
 
-	print(X_test.size())
-	print(y_test.size())
+    Net.fit_map(X_train, y_train, verbose=1, lr=10e-1, epochs=100)
 
-	d = dim*26
-	m = dim*26
+    print("Net:", Net.forward(X_test[1, :].view(1, -1)))
 
-	ridge = lambda x: torch.relu(x)
-	Net = RandomMap(d,m,ridge, output = 1)
+    print("Truth:", y_test[1, :])
 
-	print ("Loss before training: ",Net.loss(X_test,y_test))
-
-	Net.fit_map(X_train,y_train, verbose=1, lr = 10e-1, epochs = 100)
-
-	print ("Net:",Net.forward(X_test[1,:].view(1,-1)))
-
-	print ("Truth:",y_test[1,:])
-
-	print (Net.loss(X_test,y_test))
\ No newline at end of file
+    print(Net.loss(X_test, y_test))
diff --git a/tests/regularization_basis.py b/tests/regularization_basis.py
index 8d4cea1..2ccc6b9 100644
--- a/tests/regularization_basis.py
+++ b/tests/regularization_basis.py
@@ -6,67 +6,83 @@
 from stpy.helpers.helper import interval
 import matplotlib.pyplot as plt
 from stpy.kernels import KernelFunction
-from stpy.embeddings.bernstein_embedding import BernsteinEmbedding, BernsteinSplinesEmbedding, BernsteinSplinesOverlapping
-from stpy.embeddings.bump_bases import TriangleEmbedding,PositiveNystromEmbeddingBump
+from stpy.embeddings.bernstein_embedding import (
+    BernsteinEmbedding,
+    BernsteinSplinesEmbedding,
+    BernsteinSplinesOverlapping,
+)
+from stpy.embeddings.bump_bases import TriangleEmbedding, PositiveNystromEmbeddingBump
 
 if __name__ == "__main__":
 
-	d = 1
-	m = 32
-	n = 256
-	N = 20
+    d = 1
+    m = 32
+    n = 256
+    N = 20
 
-	s = 0.01
-	b = 0.1
-	B = 0.5
+    s = 0.01
+    b = 0.1
+    B = 0.5
 
-	gamma = 0.1
-	kernel_object = KernelFunction(gamma = gamma)
-	kernel_object_poly = KernelFunction(kernel_name="polynomial", power = N)
+    gamma = 0.1
+    kernel_object = KernelFunction(gamma=gamma)
+    kernel_object_poly = KernelFunction(kernel_name="polynomial", power=N)
 
-	EmbBern = BernsteinEmbedding(d,m,kernel_object=kernel_object,offset=0.5,b=b,B=B,s = s)
-	EmbSplines = BernsteinSplinesEmbedding(d,m,kernel_object=kernel_object,offset=0.5,b=b,B=B,s = s)
-	EmbSplinesOverlap = BernsteinSplinesOverlapping(d,m,kernel_object=kernel_object,offset=0.5,b=b,B=B,s = s)
-	Emb = TriangleEmbedding(d,m,kernel_object=kernel_object,offset=0.5,b=b,B=B,s = s)
-	Embpoly = TriangleEmbedding(d,m,kernel_object=kernel_object_poly,offset=0.5,b=b,B=B,s = s)
-	Embnys = PositiveNystromEmbeddingBump(d, m, kernel_object=kernel_object, offset=0.5, b=0, B=1000, s = s)
+    EmbBern = BernsteinEmbedding(
+        d, m, kernel_object=kernel_object, offset=0.5, b=b, B=B, s=s
+    )
+    EmbSplines = BernsteinSplinesEmbedding(
+        d, m, kernel_object=kernel_object, offset=0.5, b=b, B=B, s=s
+    )
+    EmbSplinesOverlap = BernsteinSplinesOverlapping(
+        d, m, kernel_object=kernel_object, offset=0.5, b=b, B=B, s=s
+    )
+    Emb = TriangleEmbedding(
+        d, m, kernel_object=kernel_object, offset=0.5, b=b, B=B, s=s
+    )
+    Embpoly = TriangleEmbedding(
+        d, m, kernel_object=kernel_object_poly, offset=0.5, b=b, B=B, s=s
+    )
+    Embnys = PositiveNystromEmbeddingBump(
+        d, m, kernel_object=kernel_object, offset=0.5, b=0, B=1000, s=s
+    )
 
-	GP = GaussianProcess(d = d, s = s, kernel=kernel_object)
+    GP = GaussianProcess(d=d, s=s, kernel=kernel_object)
 
-	xtest = torch.from_numpy(interval(n,d,L_infinity_ball=1.1))
-	x = torch.from_numpy(np.random.uniform(-1,1,N)).view(-1,1)
+    xtest = torch.from_numpy(interval(n, d, L_infinity_ball=1.1))
+    x = torch.from_numpy(np.random.uniform(-1, 1, N)).view(-1, 1)
 
-	F_true = lambda x: torch.sin(5*x)**2-0.1
-	F = lambda x: F_true(x) + s*torch.randn(x.size()[0]).view(-1,1).double()
-	y = F(x)
+    F_true = lambda x: torch.sin(5 * x) ** 2 - 0.1
+    F = lambda x: F_true(x) + s * torch.randn(x.size()[0]).view(-1, 1).double()
+    y = F(x)
 
-	Emb.fit(x, y)
-	EmbBern.fit(x, y)
-	Embpoly.fit(x, y)
-	EmbSplines.fit(x, y)
-	EmbSplinesOverlap.fit(x, y)
-	Embnys.fit(x, y)
-	GP.fit_gp(x,y)
+    Emb.fit(x, y)
+    EmbBern.fit(x, y)
+    Embpoly.fit(x, y)
+    EmbSplines.fit(x, y)
+    EmbSplinesOverlap.fit(x, y)
+    Embnys.fit(x, y)
+    GP.fit_gp(x, y)
 
-	mu = Emb.mean_std(xtest)
-	mu_spline = EmbSplines.mean_std(xtest)
-	mu_spline_overlap = EmbSplinesOverlap.mean_std(xtest)
-	mu_true,_ = GP.mean_std(xtest)
-	mu_bern = EmbBern.mean_std(xtest)
-	mu_poly = Embpoly.mean_std(xtest)
-	mu_pos = Embnys.mean_std(xtest)
+    mu = Emb.mean_std(xtest)
+    mu_spline = EmbSplines.mean_std(xtest)
+    mu_spline_overlap = EmbSplinesOverlap.mean_std(xtest)
+    mu_true, _ = GP.mean_std(xtest)
+    mu_bern = EmbBern.mean_std(xtest)
+    mu_poly = Embpoly.mean_std(xtest)
+    mu_pos = Embnys.mean_std(xtest)
 
-	plt.plot(xtest, xtest*0+b, 'k--')
-	plt.plot(xtest, xtest * 0 + B, 'k--')
+    plt.plot(xtest, xtest * 0 + b, "k--")
+    plt.plot(xtest, xtest * 0 + B, "k--")
 
-	plt.plot(xtest,F_true(xtest),'r', label = 'true')
-	plt.plot(xtest,mu_true,'b--', label = 'no-constraints')
-	plt.plot(xtest,mu_pos)
-	plt.plot(x,y,'ro')
-	plt.plot(xtest, mu, 'g-x', label = 'Triangles')
-	#plt.plot(xtest, mu_bern, 'y-o',label = 'Bernstein basis')
-	#plt.plot(xtest, mu_poly, color = 'orange', label='triangles polynomial kernel')
-	#plt.plot(xtest, mu_spline, color='purple', label='splines')
-	#plt.plot(xtest, mu_spline_overlap, color='brown', label='splines_overlap')
-	plt.legend()
-	plt.show()
\ No newline at end of file
+    plt.plot(xtest, F_true(xtest), "r", label="true")
+    plt.plot(xtest, mu_true, "b--", label="no-constraints")
+    plt.plot(xtest, mu_pos)
+    plt.plot(x, y, "ro")
+    plt.plot(xtest, mu, "g-x", label="Triangles")
+    # plt.plot(xtest, mu_bern, 'y-o',label = 'Bernstein basis')
+    # plt.plot(xtest, mu_poly, color = 'orange', label='triangles polynomial kernel')
+    # plt.plot(xtest, mu_spline, color='purple', label='splines')
+    # plt.plot(xtest, mu_spline_overlap, color='brown', label='splines_overlap')
+    plt.legend()
+    plt.show()
diff --git a/tests/spike-basis-general.py b/tests/spike-basis-general.py
index 51bc283..fff3113 100644
--- a/tests/spike-basis-general.py
+++ b/tests/spike-basis-general.py
@@ -4,21 +4,22 @@
 import torch
 import matplotlib.pyplot as plt
 from stpy.borel_set import BorelSet
+
 if __name__ == "__main__":
-	d = 1
-	m = 100
-	S = BorelSet(1,[-1,1])
+    d = 1
+    m = 100
+    S = BorelSet(1, [-1, 1])
 
-	embed_p = FaberSchauderEmbedding(d=d, m=p)
-	print (torch.sum(embed_p.integral(S)))
+    embed_p = FaberSchauderEmbedding(d=d, m=p)
+    print(torch.sum(embed_p.integral(S)))
 
-	m = embed_p.size
-	GP = KernelizedFeatures(embeding=embed_p, m=m, d=d)
-	F = lambda x: torch.sin(x)
-	x = torch.from_numpy(interval(2,d))
-	xtest = torch.from_numpy(interval(1024, d))
-	GP.fit_gp(x, F(x))
-	GP.visualize(xtest, f_true=F, show = False)
-	for j in range(p):
-		plt.plot(xtest,embed_p.basis_fun(xtest,j+1))
-	plt.show()
\ No newline at end of file
+    m = embed_p.size
+    GP = KernelizedFeatures(embeding=embed_p, m=m, d=d)
+    F = lambda x: torch.sin(x)
+    x = torch.from_numpy(interval(2, d))
+    xtest = torch.from_numpy(interval(1024, d))
+    GP.fit_gp(x, F(x))
+    GP.visualize(xtest, f_true=F, show=False)
+    for j in range(p):
+        plt.plot(xtest, embed_p.basis_fun(xtest, j + 1))
+    plt.show()
diff --git a/tests/test-absolute-deviation.py b/tests/test-absolute-deviation.py
index f3ec4ea..db2c30e 100644
--- a/tests/test-absolute-deviation.py
+++ b/tests/test-absolute-deviation.py
@@ -5,25 +5,25 @@
 import matplotlib.pyplot as plt
 
 if __name__ == "__main__":
-	d = 1
-	p = 4
-	embed_p = ChebyschevEmbedding(d=d, p=p)
-	m = embed_p.size
-	GP = KernelizedFeatures(embeding=embed_p, m=m, d=d)
+    d = 1
+    p = 4
+    embed_p = ChebyschevEmbedding(d=d, p=p)
+    m = embed_p.size
+    GP = KernelizedFeatures(embeding=embed_p, m=m, d=d)
 
-	x = torch.from_numpy(interval(10,d))
-	xtest = torch.from_numpy(interval(1024, d))
-	GP.fit_gp(x, x**4)
+    x = torch.from_numpy(interval(10, d))
+    xtest = torch.from_numpy(interval(1024, d))
+    GP.fit_gp(x, x**4)
 
-	mu = GP.mean_aboslute_deviation(xtest, B = None)
-	mu2 = GP.mean_aboslute_deviation(xtest, B = 0.1)
-	mu3 = GP.mean_std(xtest)[0]
-	mu4 = GP.mean_constrained(xtest, B = 0.1)
-	#GP.visualize(xtest, show = False)
+    mu = GP.mean_aboslute_deviation(xtest, B=None)
+    mu2 = GP.mean_aboslute_deviation(xtest, B=0.1)
+    mu3 = GP.mean_std(xtest)[0]
+    mu4 = GP.mean_constrained(xtest, B=0.1)
+    # GP.visualize(xtest, show = False)
 
-	plt.plot(xtest,mu, "--",label = 'l1 unconstrained', alpha = 0.5)
-	plt.plot(xtest, mu2,"--",label =  'l1 constrained', alpha = 0.5)
-	plt.plot(xtest, mu3, label = 'l2 unconstrained', alpha = 0.5)
-	plt.plot(xtest, mu4,label =  'l2 constrained', alpha = 0.5)
-	plt.legend()
-	plt.show()
\ No newline at end of file
+    plt.plot(xtest, mu, "--", label="l1 unconstrained", alpha=0.5)
+    plt.plot(xtest, mu2, "--", label="l1 constrained", alpha=0.5)
+    plt.plot(xtest, mu3, label="l2 unconstrained", alpha=0.5)
+    plt.plot(xtest, mu4, label="l2 constrained", alpha=0.5)
+    plt.legend()
+    plt.show()
diff --git a/tests/test-positive-basis.py b/tests/test-positive-basis.py
index 71634ba..d07d575 100644
--- a/tests/test-positive-basis.py
+++ b/tests/test-positive-basis.py
@@ -9,16 +9,16 @@
 from stpy.helpers.helper import interval
 
 m = 32
-kernel = KernelFunction(gamma = 0.1,kernel_name="squared_exponential", power = 5)
-B4 = PositiveNystromEmbeddingBump(kernel_object=kernel, m = m, d = 1, samples = 100)
+kernel = KernelFunction(gamma=0.1, kernel_name="squared_exponential", power=5)
+B4 = PositiveNystromEmbeddingBump(kernel_object=kernel, m=m, d=1, samples=100)
 
-plt.figure(figsize = (20,20))
-basis = lambda x,j: B4.basis_fun(x,j)
-x = torch.from_numpy(np.linspace(-1,1,100)).view(-1,1)
+plt.figure(figsize=(20, 20))
+basis = lambda x, j: B4.basis_fun(x, j)
+x = torch.from_numpy(np.linspace(-1, 1, 100)).view(-1, 1)
 
 for j in range(m):
-	plt.plot(x,basis(x,j), lw = 6)
-	plt.grid(ls = '--', lw = 4)
-	plt.xlim((-1,1))
+    plt.plot(x, basis(x, j), lw=6)
+    plt.grid(ls="--", lw=4)
+    plt.xlim((-1, 1))
 
 plt.show()
diff --git a/tests/test_functions/felsimulator_test.py b/tests/test_functions/felsimulator_test.py
index 91b8cb2..e12b571 100644
--- a/tests/test_functions/felsimulator_test.py
+++ b/tests/test_functions/felsimulator_test.py
@@ -7,8 +7,6 @@
 
 if __name__ == "__main__":
     sigma = 0.1
-    xtest = interval_torch(30, d= 2, L_infinity_ball=0.5)
-    F = SwissFEL(d =2, dts = 'evaluations_bpm.hdf5')
+    xtest = interval_torch(30, d=2, L_infinity_ball=0.5)
+    F = SwissFEL(d=2, dts="evaluations_bpm.hdf5")
     F.Simulator.GP.visualize_contour(xtest)
-
-
diff --git a/tests/triangle-integration-test.py b/tests/triangle-integration-test.py
index 3c74e3d..591faff 100644
--- a/tests/triangle-integration-test.py
+++ b/tests/triangle-integration-test.py
@@ -10,42 +10,41 @@
 
 
 if __name__ == "__main__":
-	d = 1
-	m = 64
-	S = BorelSet(1,[-1,1])
-
-	embedding = TriangleEmbedding(d=d, m=m, s = 10e-8)
-
-	levels = 5
-	hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
-	basic_sets = hierarchical_structure.get_sets_level(hierarchical_structure.levels)
-
-	xtest = hierarchical_structure.top_node.return_discretization(512)
-
-	for set in basic_sets:
-		print (set.bounds, set.volume())
-		x = torch.linspace(set.bounds[0, 0], set.bounds[0, 1], 2)
-		Gamma_half = embedding.cov()
-		val = torch.sum(torch.pinverse(Gamma_half)@embedding.integral(set))
-
-
-		plt.plot(x, x * 0 + float(val)/set.volume(), '-o', color="green", lw=5)
-	for i in range(m):
-		plt.plot(xtest, embedding.basis_fun(xtest,i), 'k')
-	plt.show()
-
-	plt.subplot(1,2,1)
-	plt.imshow(embedding.M)
-	plt.subplot(1,2,2)
-	plt.imshow(embedding.Gamma_half)
-	plt.show()
-	# m = embed_p.size
-	# GP = KernelizedFeatures(embeding=embed_p, m=m, d=d)
-	# F = lambda x: torch.sin(x)
-	# x = torch.from_numpy(interval(2,d))
-	# xtest = torch.from_numpy(interval(1024, d))
-	# GP.fit_gp(x, F(x))
-	# GP.visualize(xtest, f_true=F, show = False)
-	# for j in range(p):
-	# 	plt.plot(xtest,embed_p.basis_fun(xtest,j+1))
-	# plt.show()
\ No newline at end of file
+    d = 1
+    m = 64
+    S = BorelSet(1, [-1, 1])
+
+    embedding = TriangleEmbedding(d=d, m=m, s=10e-8)
+
+    levels = 5
+    hierarchical_structure = HierarchicalBorelSets(d=1, interval=(-1, 1), levels=levels)
+    basic_sets = hierarchical_structure.get_sets_level(hierarchical_structure.levels)
+
+    xtest = hierarchical_structure.top_node.return_discretization(512)
+
+    for set in basic_sets:
+        print(set.bounds, set.volume())
+        x = torch.linspace(set.bounds[0, 0], set.bounds[0, 1], 2)
+        Gamma_half = embedding.cov()
+        val = torch.sum(torch.pinverse(Gamma_half) @ embedding.integral(set))
+
+        plt.plot(x, x * 0 + float(val) / set.volume(), "-o", color="green", lw=5)
+    for i in range(m):
+        plt.plot(xtest, embedding.basis_fun(xtest, i), "k")
+    plt.show()
+
+    plt.subplot(1, 2, 1)
+    plt.imshow(embedding.M)
+    plt.subplot(1, 2, 2)
+    plt.imshow(embedding.Gamma_half)
+    plt.show()
+    # m = embed_p.size
+    # GP = KernelizedFeatures(embeding=embed_p, m=m, d=d)
+    # F = lambda x: torch.sin(x)
+    # x = torch.from_numpy(interval(2,d))
+    # xtest = torch.from_numpy(interval(1024, d))
+    # GP.fit_gp(x, F(x))
+    # GP.visualize(xtest, f_true=F, show = False)
+    # for j in range(p):
+    # 	plt.plot(xtest,embed_p.basis_fun(xtest,j+1))
+    # plt.show()