"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "krr.plot(y_known=np.concatenate(property_values[test_idx]),\n",
+ " y=y, property_name=property_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "celltoolbar": "Raw Cell Format",
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {
+ "height": "12px",
+ "width": "252px"
+ },
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": "block",
+ "toc_window_display": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/source/tutorials/TutorialEn.rst b/docs/source/tutorials/TutorialEn.rst
new file mode 100644
index 000000000..b00852ae7
--- /dev/null
+++ b/docs/source/tutorials/TutorialEn.rst
@@ -0,0 +1,342 @@
+.. _TutorialEn:
+
+.. role:: raw-html(raw)
+ :format: html
+
+Energy fitting
+==============
+
+Approach used in Rascal can be used for calculating any of the rotationally
+invariant properties (such as formation energy), as well as rotationally
+covariant properties (such as dipole moment). This example is about fitting the
+formation energies of some small molecules. The dataset is located in the
+`librascal/examples/data/ <../../../../examples/data>`_ folder, the python
+notebook doing the calculation is `librascal/examples/SOAP_example.ipynb
+(outbound)
+`_.
+
+
+Let's take a look at the code and describe what it do step-by-step.
+
+Imports
+*******
+
+Just import of the necessary modules, including Rascal itself.
+
+.. code-block:: python
+
+ from matplotlib import pylab as plt
+ import os, sys
+ sys.path.insert(0,"../build/")
+ import time
+ import rascal
+ import json
+ import ase
+ from ase.io import read, writex
+ from ase.build import make_supercell
+ from ase.visualize import view
+ import numpy as np
+ from rascal.representations import SOAP
+
+Dataset
+*******
+
+Here ASE is used to read the file with information about molecules, then the
+size of the feature matrix shown. The number of columns is the number of atoms
+in the set of molecules, the number of rows is depend on the max_radial and
+max_angular parameters. If the number of columns is big, sparsification needs to
+be done to reduce it.
+
+.. code-block:: python
+
+ frames = read('./data/small_molecules-1000.xyz',':')
+ hypers = dict(soap_type="PowerSpectrum",
+ interaction_cutoff=3.5,
+ max_radial=6,
+ max_angular=6,
+ gaussian_sigma_constant=0.4,
+ gaussian_sigma_type="Constant",
+ cutoff_smooth_width=0.5,
+ )
+ soap = SOAP(**hypers)
+ %time representation = soap.transform(frames)
+ X = representation.get_feature_matrix().T
+ X.shape
+
+
+Functions
+*********
+
+Here we define the functions, which are needed for the further computations.
+Let's describe some of them. compute_representation - creates a feature manager
+class using information about structure compute_kernel - compute the matrix of
+global similarity using global cosine kernel with a power of zeta to itself
+KRR.predict - predicting the energy of the molecules in the set, using trained
+model train_krr_model - train the model basing on the given dataset and using
+global cosine kernel
+
+.. code-block:: python
+
+ # Load the small molecules
+ frames = read('./data/small_molecules-1000.xyz',':600')
+
+ def compute_representation(representation,frames):
+ expansions = soap.transform(frames)
+ return expansions
+
+ def compute_kernel(zeta, rep1, rep2=None):
+ if rep2 is None:
+ kernel = rep1.cosine_kernel_global(zeta)
+ else:
+ kernel = rep1.cosine_kernel_global(rep2,zeta)
+ return kernel
+
+ def extract_energy(frames):
+ prop = [[]]*len(frames)
+ for ii,cc in enumerate(frames):
+ prop[ii] = cc.info['dft_formation_energy_per_atom_in_eV']
+ y = np.array(prop)
+ return y
+
+ def split_dataset(frames, test_fraction, seed=10):
+ N = len(frames)
+ ids = np.arange(N)
+ np.random.seed(seed)
+ np.random.shuffle(ids)
+ Ntrain = int(N*test_fraction)
+ train = ids[:Ntrain]
+ test = ids[Ntrain:]
+ targets = extract_energy(frames)
+ return [frames[ii] for ii in train],targets[train],[frames[ii] for ii in test],targets[test]
+
+ def get_mae(ypred,y):
+ return np.mean(np.abs(ypred-y))
+ def get_rmse(ypred,y):
+ return np.sqrt(np.mean((ypred-y)**2))
+ def get_sup(ypred,y):
+ return np.amax(np.abs((ypred-y)))
+ def get_r2(y_pred,y_true):
+ weight = 1
+ sample_weight = None
+ numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,dtype=np.float64)
+ denominator = (weight * (y_true - np.average(y_true, axis=0, weights=sample_weight)) ** 2).sum(axis=0,dtype=np.float64)
+ output_scores = 1 - (numerator / denominator)
+ return np.mean(output_scores)
+
+ score_func = dict(
+ MAE=get_mae,
+ RMSE=get_rmse,
+ SUP=get_sup,
+ R2=get_r2,
+ )
+
+ def get_score(ypred,y):
+ scores = {}
+ for k,func in score_func.items():
+ scores[k] = func(ypred,y)
+ return scores
+
+ class KRR(object):
+ def __init__(self,zeta,weights,representation,X):
+ self.weights = weights
+ self.representation = representation
+ self.zeta = zeta
+ self.X = X
+
+ def predict(self,frames):
+ features = compute_representation(self.representation,frames)
+ kernel = compute_kernel(self.zeta , self.X, features)
+ return np.dot(self.weights, kernel)
+
+ def train_krr_model(zeta,Lambda,representation,frames,y,jitter=1e-8):
+ features = compute_representation(representation,frames)
+ kernel = compute_kernel(zeta,features)
+ # adjust the kernel so that it is properly scaled
+ delta = np.std(y) / np.mean(kernel.diagonal())
+ kernel[np.diag_indices_from(kernel)] += Lambda**2 / delta **2 + jitter
+ # train the krr model
+ weights = np.linalg.solve(kernel,y)
+ model = KRR(zeta, weights,representation, features)
+ return model,kernel
+
+
+Full spectrum
+*************
+
+Here the full (with radial and angular parts) energies are computed. Let's
+describe the parameters of the soap descriptor, defined in the ``hypers``
+dictionary.
+
+- **interaction_cutoff**: Maximum pairwise distance for atoms to be considered
+ in expansion
+- **max_radial**: number of radial basis functions
+- **max_angular**: highest angular momentum number in the expansion
+- **gaussian_sigma_constant**: specifies the atomic Gaussian widths, in the
+ case where they're fixed.
+- **gaussian_sigma_type**: how the Gaussian atom sigmas (smearing widths) are
+ allowed to vary -- fixed (``Constant``), by species (``PerSpecies``), or by
+ distance from the central atom (``Radial``)
+- **cutoff_smooth_width**: the distance over which the the interaction is
+ smoothed to zero
+
+.. code-block:: python
+
+ hypers = dict(soap_type="PowerSpectrum",
+ interaction_cutoff=3.5,
+ max_radial=6,
+ max_angular=6,
+ gaussian_sigma_constant=0.4,
+ gaussian_sigma_type="Constant",
+ cutoff_smooth_width=0.5,
+ )
+ soap = SOAP(**hypers)
+
+ frames_train, y_train, frames_test, y_test = split_dataset(frames,0.8)
+
+ zeta = 2
+ Lambda = 5e-3
+ krr,k = train_krr_model(zeta, Lambda, soap, frames_train, y_train)
+
+ y_pred = krr.predict(frames_test)
+ get_score(y_pred, y_test)
+
+ plt.scatter(y_pred, y_test, s=3)
+ plt.axis('scaled')
+ plt.xlabel('DFT energy / (eV/atom)')
+ plt.ylabel('Predicted energy / (eV/atom)')
+
+The result of this block is:
+
+.. image:: ../resources/images/R1.png
+
+The result is quite good. One can try to change the train dataset to see how it
+affects the precision of the result.
+
+Radial spectrum
+***************
+
+Here we compute the energy, supposing the angular component to be zero.
+
+.. code-block:: python
+
+ hypers = dict(soap_type="RadialSpectrum",
+ interaction_cutoff=3.5,
+ max_radial=6,
+ max_angular=0,
+ gaussian_sigma_constant=0.4,
+ gaussian_sigma_type="Constant",
+ cutoff_smooth_width=0.5,
+ )
+ soap = SOAP(**hypers)
+
+ frames_train, y_train, frames_test, y_test = split_dataset(frames,0.8)
+
+ zeta = 2
+ Lambda = 5e-4
+ krr,k = train_krr_model(zeta, Lambda, soap, frames_train, y_train)
+
+ y_pred = krr.predict(frames_test)
+ get_score(y_pred, y_test)
+
+ plt.scatter(y_pred, y_test, s=3)
+ plt.axis('scaled')
+ plt.xlabel('DFT energy / (eV/atom)')
+ plt.ylabel('Predicted energy / (eV/atom)')
+
+Comparison of full and radial spectrum:
+
+.. image:: ../resources/images/Comps.png
+
+It can be seen that the two spectres are quite similar, but the radial spectrum is much more simple to compute (as feature matrix is much smaller and the set of spherical harmonics doesn't have to be computed). It is quite an inteseting fact, but, unfortunately, this feature is probably not generalizable and should be just the feature of this particular dataset.
+
+Map of the dataset
+*******************
+Here we use sklearn to do `kernel principal component analysis (outbound) `_.
+
+.. code-block:: python
+
+ def compute_representation(representation,frames):
+ expansions = soap.transform(frames)
+ return expansions
+
+ def compute_kernel(zeta, rep1, rep2=None):
+ if rep2 is None:
+ kernel = rep1.cosine_kernel_global(zeta)
+ else:
+ kernel = rep1.cosine_kernel_global(rep2,zeta)
+ return kernel
+
+ def link_ngl_wdgt_to_ax_pos(ax, pos, ngl_widget):
+ from matplotlib.widgets import AxesWidget
+ from scipy.spatial import cKDTree
+ r"""
+ Initial idea for this function comes from @arose, the rest is @gph82 and @clonker
+ """
+
+ kdtree = cKDTree(pos)
+ #assert ngl_widget.trajectory_0.n_frames == pos.shape[0]
+ x, y = pos.T
+
+ lineh = ax.axhline(ax.get_ybound()[0], c="black", ls='--')
+ linev = ax.axvline(ax.get_xbound()[0], c="black", ls='--')
+ dot, = ax.plot(pos[0,0],pos[0,1], 'o', c='red', ms=7)
+
+ ngl_widget.isClick = False
+
+ def onclick(event):
+ linev.set_xdata((event.xdata, event.xdata))
+ lineh.set_ydata((event.ydata, event.ydata))
+ data = [event.xdata, event.ydata]
+ _, index = kdtree.query(x=data, k=1)
+ dot.set_xdata((x[index]))
+ dot.set_ydata((y[index]))
+ ngl_widget.isClick = True
+ ngl_widget.frame = index
+
+ def my_observer(change):
+ r"""Here comes the code that you want to execute
+ """
+ ngl_widget.isClick = False
+ _idx = change["new"]
+ try:
+ dot.set_xdata((x[_idx]))
+ dot.set_ydata((y[_idx]))
+ except IndexError as e:
+ dot.set_xdata((x[0]))
+ dot.set_ydata((y[0]))
+ print("caught index error with index %s (new=%s, old=%s)" % (_idx, change["new"], change["old"]))
+
+ # Connect axes to widget
+ axes_widget = AxesWidget(ax)
+ axes_widget.connect_event('button_release_event', onclick)
+
+ # Connect widget to axes
+ ngl_widget.observe(my_observer, "frame", "change")
+
+ # Load the small molecules
+ frames = read('./data/small_molecules-1000.xyz',':600')
+ hypers = dict(soap_type="PowerSpectrum",
+ interaction_cutoff=3.5,
+ max_radial=6,
+ max_angular=6,
+ gaussian_sigma_constant=0.4,
+ gaussian_sigma_type="Constant",
+ cutoff_smooth_width=0.5,
+ )
+ soap = SOAP(**hypers)
+ zeta = 2
+ features = compute_representation(soap, frames)
+ kernel = compute_kernel(zeta,features)
+ from sklearn.decomposition import KernelPCA
+ kpca = KernelPCA(n_components=2,kernel='precomputed')
+ kpca.fit(kernel)
+ X = kpca.transform(kernel)
+ plt.scatter(X[:,0],X[:,1],s=3)
+
+The result of this block is:
+
+.. image:: ../resources/images/PCAs.png
+
+It shows how the structures is located in the abstract 2D map, where similar
+structures are located near to each other, and the very different ones far from
+each other.
diff --git a/docs/source/tutorials/Tutorial_Template.ipynb b/docs/source/tutorials/Tutorial_Template.ipynb
new file mode 100644
index 000000000..816c06a5c
--- /dev/null
+++ b/docs/source/tutorials/Tutorial_Template.ipynb
@@ -0,0 +1,93 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "toc": true
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Title of the Tutorial\n",
+ "\n",
+ "This notebook is intended to _purpose of tutorial_.\n",
+ "For more information on _nuances of tutorial_, please refer to (among others): \n",
+ "- [Title (Surname Year)](url) \n",
+ "- [Title (Surname Year)](url)\n",
+ "\n",
+ "Beyond libRascal, the packages used in this tutorial are: [pkg](url), [pkg](url), and [pkg](url)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import time\n",
+ "\n",
+ "import numpy as np\n",
+ "import ase\n",
+ "from ase.io import read\n",
+ "from matplotlib import pyplot as plt\n",
+ "import json\n",
+ "from IPython.display import display, Markdown\n",
+ "\n",
+ "from rascal.representations import SphericalInvariants as SOAP\n",
+ "from rascal.models import Kernel\n",
+ "\n",
+ "sys.path.append('./utilities')\n",
+ "from general_utils import *"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": true,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst
new file mode 100644
index 000000000..6c6da0757
--- /dev/null
+++ b/docs/source/tutorials/index.rst
@@ -0,0 +1,38 @@
+.. _tutorials:
+
+Tutorials
+=========
+
+Here you will find some tutorial examples for getting started with librascal,
+both as a new user of the library and for getting started developing a new
+representation.
+
+
+Introductory Tutorials
+~~~~~~~~~~~~~~~
+
+.. toctree::
+ :maxdepth: 1
+
+ Introduction_to_Atomic_Descriptors.ipynb
+ Hyperparameters.ipynb
+ Property_Prediction_Tutorial.ipynb
+ Kernels.ipynb
+ Benefits_Performance.ipynb
+ Advanced_SOAP.ipynb
+
+Converting Your Workflow to Rascal
+~~~~~~~~~~~~~~~~~
+
+.. toctree::
+ :maxdepth: 1
+
+ Converting_Quip.ipynb
+
+Rascal Cookbook
+~~~~~~~~~~~~~~~~~~~~
+
+.. toctree::
+ :maxdepth: 1
+
+ nl-for-user
diff --git a/docs/source/tutorials/nl-for-user.rst b/docs/source/tutorials/nl-for-user.rst
new file mode 100644
index 000000000..dff3e8e2b
--- /dev/null
+++ b/docs/source/tutorials/nl-for-user.rst
@@ -0,0 +1,109 @@
+.. _nl-for-user:
+
+How to build a neighbor list?
+=============================
+
+Overview
+~~~~~~~~
+
+
+Here we provide a tutorial on how to use the C++ interface for building neighbor lists.
+
+Rascal comes with a built-in neighborhood manager. The role of a neighborhood manager is to minimize manual handeling of building the neighborhood lists.
+The current implementation avoids the explicit use of embedded lists to describe neighborhoods and makes iterations over them more trivial.
+The neighborhood manager interface is universal to all the representations of Rascal. It can also be very easily interfaced with MD/MC simulation packages.
+We prefer using `Eigen `_ library for the linear algebra operations because of the versatility and flexibility allowed by the library.
+
+`Eigen` library
+~~~~~~~~~~~~~~~
+
+`Eigen` is an open source library for linear algebra operatins in C++. It is implemented using expression templates techniques.
+We usually use the following templates to describe the positions, the atomic numbers array,..etc
+
+.. code-block:: cpp
+
+ using Vector_t = Eigen::Matrix; // dim = 1, 2 or 3
+ Eigen::MatrixXd MATRIX(NROWS,NCOLUMNS); //
+ using rascal::VecXi = Eigen::Matrix
+
+Creating a neighborhood manager
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+First, the neighborhood manager is created and initialized using the :cpp:class:`rascal::NeighborhoodManagerCell`.
+
+.. code-block:: cpp
+
+ using Manager_t = rascal::NeighbourhoodManagerCell;
+ Manager_t manager; // initialize the neighborhood manager
+
+While the neighborhood manager can be built for the first time using :cpp:class:`rascal::NeighborhoodManagerCell::build`,
+we recommend updating it using :class:`rascal::NeighborhoodManagerCell::update`. The following inputs should be provided in order:
+
+- the atomic positions matrix column wise as an `Eigen` matrix object
+- an array of the atomic numbers of the species
+- an array of centers ids
+- the cell vectors column wise as an `Eigen` matrix object
+- a boolean array of the periodic boundary conditions over the 3 axes
+- the maximum radial cutoff (in the units of the cell vectors and atomic positions)
+
+The folllowing code is an example of the inputs that need to be customized according to the user's use, as in to provide
+cell vectors, atomic positions and numbers and the ids of the centers.
+
+.. code-block:: cpp
+
+ Eigen::MatrixXd cell(3,3); // the cell vectors matrix
+
+ int number_of_atoms{100}; // the number of atoms in the structure
+ Eigen::MatrixXd positions(3,number_of_atoms); // atomic postions matrix
+ rascal::VecXi numbers(22); // atomic numbers array
+
+ std::vector center_ids{}; // a vector containing the atomic centers ids
+
+ std::array pbc{true, false, false}; // for periodic boundary conditions over the x axis only
+
+ double cutoff{3.5}; // a radial cutoff distance of 3.5 angstrom
+
+ manager.update(positions, numbers, center_ids, cell, pbc, cutoff);
+
+- .. note:: A neighborhood manager can be updated as much as necessary by providing all the upmentioned inputs.
+
+
+Commands of the neighborhood manager:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In the current implementation of the neighborhood manager, the centers and their neighbors are :cpp:class:`rascal::Neighborhood Cell::ClusterRef` objects of order 1 and 2, respectively.
+In order to access objects of order 2, we need to iterate over the corresponding order 1 object (the corresponding center).
+This implementation has the advantage of not using explicit arrays to deal with center and neighbor properties.
+
+Several methods are implemented for the :cpp:class:`rascal::NeighborhoodCell::ClusterRef` objects (centers and their neighbors), such as retrieving
+the index of the object, its position and its atom type.
+
+- .. note:: The positions of order 2 are given with a certain offset relative to the position of the corresponding center of order 1. If one atom is included in the neighborhood of two different centers, it will have different positions depending on
+ the center being iterated over.
+
+- .. warning:: The relative positions of the neighbors of a center are calculated on the fly and are not stored. If needed they have to be stored manually.
+
+This is an example of code than can be used to print to the screen the positions, types and indices of the centers and their neighbors. It also calculates the relative shift in the positions of every center's neighbors
+ and its norm.
+
+.. code-block:: cpp
+
+ for (auto center : manager) {
+
+ int center_index{center.get_atom_index()}; // get the index of the center atomic
+ Eigen::MatrixXd position{center.get_position()}; // get the position vector of the center
+ auto center_type{center.get_atom_type()}; // get the atome type of the center
+ std::cout << "Neighbors properties : " << endl;
+
+ for (auto neigh : center){
+ int neigh_index{center.get_atom_index()}; // get the index of the neighbor
+ Eigen::MatrixXd position{neigh.get_position()}; // get the position vector of the neighbor
+ auto neigh_type{center.get_atom_type()}; // get the atome type of the neighbor
+ auto relative_shift{position - center.get_position()}; // compute the position offset
+
+ std::cout << "This is the position of atom " << neigh_index << " of a type " << neigh_type << endl;
+ std::cout << "The relative position is : " << neigh_position << endl;
+ std::cout << "The relative shift is : " << relative_shift << endl;
+ std::cout << "The norm of the shif is : " << relative_shift.norm() << " ang" << endl;
+ }
+ }
diff --git a/docs/source/tutorials/utilities/general_utils.py b/docs/source/tutorials/utilities/general_utils.py
new file mode 100644
index 000000000..25f7bd2d5
--- /dev/null
+++ b/docs/source/tutorials/utilities/general_utils.py
@@ -0,0 +1,123 @@
+import os
+import inspect
+import time
+
+from IPython.display import Markdown, display, clear_output, Image, HTML
+import ipywidgets as widgets
+import numpy as np
+import ase
+from ase.io import read
+from matplotlib import pyplot as plt
+from tqdm import tqdm_notebook as tqdm
+import pandas as pd
+import json
+
+# This is for backwards compatibility -- old versions of RASCAL will follow
+# the latter schema, newer versions the former.
+try:
+ from rascal.representations import SphericalInvariants as SOAP
+except:
+ from rascal.representations import SOAP
+
+# These are some imports/helpers for all of the tutorials
+style = json.load(open('./utilities/widget_style.json'))
+hyper_dict = json.load(open("./utilities/hyperparameter_presets.json"))
+hyper_vals = json.load(open("./utilities/hyperparameter_bounds.json"))
+
+
+def mask_body_order(hyperparams):
+ """ This is for backwards compatibility, as we will soon be moving to using
+ "body order" in place of "soap_type"
+
+ Author: R. Cersonsky
+
+ Keywords:
+ hyperparams - full or partial dictionary of hyperparameters
+
+ Returns:
+ dictionary of hyperparameters suitable for a SOAP object
+ """
+
+ soap_types = ["RadialSpectrum", "PowerSpectrum", "BiSpectrum"]
+ return {"soap_type": soap_types[hyperparams["body_order"]-1],
+ **{h: hyperparams[h] for h in hyperparams if h != 'body_order'}}
+
+
+def markdown_table_from_dict(d, headers):
+ """ Function to generate a markdown table from a dictionary
+
+ Author: R. Cersonsky
+
+ Keywords:
+ d - dictionary of type {parameter: value}
+ headers - optional headers for the table
+
+ Returns:
+ markdown string
+ """
+ return '
{}
'.format(''.join(['
{}
'.format(h) for h in headers]))+''.join(['
{}
{}
'.format(key.replace('_',' ').title(), ''.join(['
{}
'.format(round(v,4)) for v in d[key]])) for key in d])+'
'
+
+
+def compute_kernel(calculator, features1, features2=None, kernel_type='Structure', **kwargs):
+ """ Function to calculate similarity kernel
+
+ Author: M. Veit or F. Musil, modified by R. Cersonsky
+
+ Keywords:
+ calculator - soap calculator
+ features1 - ASE-Atoms-like object to compute kernel of
+ features2 - ASE-Atoms-like object to compute kernel of.
+ Default is features1
+ kernel_type - "atom" or "structure"
+ kwargs - hyperparameters of soap calculator
+
+ Returns:
+ kernel function
+ """
+ from rascal.models import Kernel
+
+ kernel = Kernel(representation=calculator, name='Cosine', target_type=kernel_type, zeta=2, **kwargs)
+ return kernel(X=features1, Y=features2)
+
+def readme_button():
+ """ Function to generate and display a button to show the package README
+
+ Author: R. Cersonsky
+
+ """
+ def show_readme_for_button(b):
+ clear_output()
+ display(Markdown(str(''.join(open('../README.rst')))))
+
+ button = widgets.Button(description="Show README", style=style)
+ output = widgets.Output()
+ display(button, output)
+
+ button.on_click(show_readme_for_button)
+
+
+def _button_template_(options, description, disp_func, default=None):
+ """ Function to generate and display a ToggleButton instance for a set of
+ options
+
+ Author: R. Cersonsky
+
+ Keywords:
+ options - list of options
+ descriptions - string to describe button
+ disp_func - function object to be called when a button is clicked
+ default - value of the button, needs to be in list of options
+
+ Returns:
+ widgets.ToggleButtons Object
+ """
+ button = widgets.ToggleButtons(
+ options=options,
+ description=description,
+ button_style='',
+ style=style,
+ value=options[0] if default == None else default
+ )
+ display(button)
+ button.observe(disp_func, 'value')
+ return button
diff --git a/docs/source/tutorials/utilities/hyperparameter_bounds.json b/docs/source/tutorials/utilities/hyperparameter_bounds.json
new file mode 100644
index 000000000..3ca1f73fb
--- /dev/null
+++ b/docs/source/tutorials/utilities/hyperparameter_bounds.json
@@ -0,0 +1 @@
+{"body_order": {"options": [1, 2, 3], "name": "Body Order"}, "interaction_cutoff": {"options": [1.75, 5.0], "name": "$r_{cut}$"}, "max_radial": {"options": [0, 10], "name": "$n_{max}$"}, "max_angular": {"options": [0, 10], "name": "$l_{max}$"}, "gaussian_sigma_constant": {"options": [0.01, 1.0], "name": "$\\sigma$"}, "gaussian_sigma_type": {"fixed": "Constant"}, "cutoff_smooth_width": {"fixed": 0.5}}
\ No newline at end of file
diff --git a/docs/source/tutorials/utilities/hyperparameter_presets.json b/docs/source/tutorials/utilities/hyperparameter_presets.json
new file mode 100644
index 000000000..c3200ef85
--- /dev/null
+++ b/docs/source/tutorials/utilities/hyperparameter_presets.json
@@ -0,0 +1 @@
+{"Minimal Power Spectrum": {"body_order": 2, "interaction_cutoff": 3.5, "max_radial": 2, "max_angular": 1, "gaussian_sigma_constant": 0.5, "gaussian_sigma_type": "Constant", "cutoff_smooth_width": 0.0}, "Power Spectrum": {"body_order": 2, "interaction_cutoff": 3.5, "max_radial": 6, "max_angular": 6, "gaussian_sigma_constant": 0.4, "gaussian_sigma_type": "Constant", "cutoff_smooth_width": 0.5}, "Radial Spectrum": {"body_order": 1, "interaction_cutoff": 1.75, "max_radial": 6, "max_angular": 0, "gaussian_sigma_constant": 0.25, "gaussian_sigma_type": "Constant", "cutoff_smooth_width": 0.5}}
\ No newline at end of file
diff --git a/docs/source/tutorials/utilities/learning_utils.py b/docs/source/tutorials/utilities/learning_utils.py
new file mode 100644
index 000000000..4bb4c5985
--- /dev/null
+++ b/docs/source/tutorials/utilities/learning_utils.py
@@ -0,0 +1,644 @@
+import os
+import inspect
+import time
+
+from IPython.display import Markdown, display, clear_output, Image, HTML
+import ipywidgets as widgets
+import numpy as np
+import ase
+from ase.io import read
+from matplotlib import pyplot as plt
+from tqdm import tqdm_notebook as tqdm
+import pandas as pd
+import json
+
+# This is for backwards compatibility -- old versions of RASCAL will follow
+# the latter schema, newer versions the former.
+try:
+ from rascal.representations import SphericalInvariants as SOAP
+except:
+ from rascal.representations import SOAP
+
+from general_utils import (markdown_table_from_dict, \
+ readme_button, _button_template_,
+ compute_kernel, mask_body_order)
+
+# These are some imports/helpers for all of the tutorials
+style = json.load(open('./utilities/widget_style.json'))
+hyper_dict = json.load(open("./utilities/hyperparameter_presets.json"))
+hyper_vals = json.load(open("./utilities/hyperparameter_bounds.json"))
+
+# Properties preset for the learning tutorial
+known_properties = dict(CS="Atom",
+ dft_formation_energy_per_atom_in_eV="Structure")
+ignore = ['Natoms', 'numbers', 'Name',
+ 'positions', 'cutoff', 'nneightol', "NAME"]
+
+def split_dataset(frames, train_fraction, seed=10):
+ """ Function to split a dataset into testing and training portions.
+
+ Author: M. Veit or F. Musil
+
+ Keywords:
+ frames - ASE-Atoms like
+ train_fraction - float in [0.0,1.0], percentage of frames to train
+
+ Returns:
+ indices of training and testing frames
+ """
+
+ N = len(frames)
+ ids = np.arange(N)
+ np.random.seed(seed)
+ np.random.shuffle(ids)
+ Ntrain = int(N*train_fraction)
+ train = ids[:Ntrain]
+ test = ids[Ntrain:]
+ return np.array(train), np.array(test)
+
+
+def get_r2(y_pred, y_true):
+ """ Function to calculate R^2
+ Author: M. Veit or F. Musil
+
+ Keywords:
+ y_pred - list of predicted y values
+ y_true - list of known y values
+
+ Returns:
+ float
+ """
+ numerator = ((y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
+ denominator = ((y_true - np.average(y_true,
+ axis=0)) ** 2).sum(axis=0,
+ dtype=np.float64)
+ output_scores = 1 - numerator / denominator
+ return np.mean(output_scores)
+
+
+def get_score(y_pred, y_true):
+ """ Function to calculate accuracy statistics
+
+ Author: M. Veit or F. Musil
+
+ Keywords:
+ y_pred - list of predicted y values
+ y_true - list of known y values
+
+ Returns:
+ dictionary containing SUP (supremum), MAE (mean squared error),
+ RMSD (root mean squared displacement),
+ and R^2
+ """
+ return {
+ "SUP": [np.amax(np.abs((y_pred-y_true)))],
+ "MAE": [np.mean(np.abs(y_pred-y_true))],
+ "RMSD": [np.sqrt(np.mean((y_pred-y_true)**2))],
+ r"$R^2$": [get_r2(y_pred, y_true)]
+ }
+
+
+class KRR(object):
+ """ Class for Kernel Ridge Regression
+
+ Author: M. Veit or F. Musil
+
+ Keywords:
+ zeta - integer
+ weights - weights of the given by the kernel and the targets given by
+ the training set
+ representation - feature set given by soap vectors
+ kernel_type - "atom" or "Structure"
+ X - training dataset
+
+ Functions:
+ predict - given the KRR, predicts the properties for a set of ASE frames
+ """
+
+ def __init__(self, weights, features, kernel_type, calculator=None, **kwargs):
+ self.weights = weights
+ self.hypers = dict(**kwargs)
+ self.calculator = calculator if calculator!=None else SOAP(**mask_body_order(kwargs))
+ self.X = features
+ self.kernel_type = kernel_type
+
+ def predict(self, frames):
+ features = self.calculator.transform(frames)
+ kernel = compute_kernel(calculator=self.calculator, features1=self.X, \
+ features2=features, kernel_type=self.kernel_type, \
+ **self.hypers)
+ return np.dot(self.weights, kernel)
+
+
+def extract_property(frames, property='energy'):
+ """ Function to read properties from ASE frames object
+
+ Author: M. Veit or F. Musil
+
+ Keywords:
+ frames - ASE Atoms-like object
+ property - property to extract
+
+ Returns:
+ list of the property from the given frames
+
+ Raises error when property is not found in the ASE object
+ """
+ if(property in frames[0].info):
+ return np.array([cc.info[property] for cc in frames])
+ elif(property in frames[0].arrays):
+ return np.array([cc.arrays[property] for cc in frames])
+ else:
+ print(frames[0].info)
+ raise KeyError(
+ "{} is not a property in the given frames".format(property))
+
+
+class learning_tutorial(object):
+ """ Class to wrap the learning tutorial
+
+ Author: R. Cersonsky
+
+ Constructor Keywords:
+ input_file - ASE Atoms-like object containing molecules or
+ crystals with some property to learn
+ training_percentage - float, percentage of frames to include in the
+ training set
+ interactive - boolean, whether to load the tutorial interactive-
+ ly in the jupyter notebook
+ verbose - boolean, whether or not to narrate the actions of
+ the tutorial
+ hyperparameters - dictionary, hyperparameters for SOAP calculation
+ number_of_frames - int, how many frames to include in the dataset
+ property - string, which property to train the dataset on
+
+ """
+
+ def __init__(self, input_file='./data/learning/small_molecules-1000.xyz',
+ training_percentage=0.8, interactive=False, verbose=True,
+ hyperparameters=dict(**hyper_dict['Power Spectrum']),
+ number_of_frames=None, property=None):
+
+ # presets given by M. Veit and F. Musil
+ self.zeta = 2
+ self.Lambda = 5e-3
+
+ self.hyperparameters = hyperparameters
+ self.verbose = verbose
+
+ # The verbosity wrap serves to limit the amount of text output in the
+ # jupyter notebook
+ self.verbosity_wrap = lambda s: (None if not verbose
+ else display(Markdown(s)))
+
+ # Looks in the examples folder for files suitable for the learning
+ # tutorial
+ file_options = ['./data/learning/{}'.format(f) for f in
+ os.listdir('./data/learning/') if f.endswith('xyz')]
+
+ # If the supplied file is not in the examples folder, includes it in the
+ # list of suitable files
+ if(input_file != None):
+ file_options.insert(0, input_file)
+ if(input_file in file_options[1:]):
+ file_options.pop(file_options[1:].index(input_file)+1)
+
+ # Sets the file to the default if not supplied
+ self.input_file = file_options[0] if input_file == None else input_file
+
+ # Reads input file
+ self.frames = np.array(read(self.input_file, ":"))
+ self.train_idx, self.test_idx = None, None
+
+ # This next section is for the jupyter interface. Even if the user has
+ # specified zero interactivity, the sliders serve as containers for the
+ # hyperparameters to change when set
+ self.sliders = {val:
+ widgets.FloatSlider(
+ value=self.hyperparameters.get(val,
+ hyper_vals[val]['options'][0]),
+ min=hyper_vals[val]['options'][0],
+ max=hyper_vals[val]['options'][1],
+ description=hyper_vals[val]['name'],
+ continuous_update=True,
+ step=(hyper_vals[val]['options'][1] -
+ hyper_vals[val]['options'][0])/20.,
+ style=style,
+ )
+ if isinstance(hyper_vals[val]['options'][0], float) else
+ widgets.IntSlider(
+ value=self.hyperparameters.get(val,
+ hyper_vals[val]['options'][0]),
+ min=hyper_vals[val]['options'][0],
+ max=hyper_vals[val]['options'][1],
+ description=hyper_vals[val]['name'],
+ continuous_update=True,
+ step=1,
+ style=style,
+ )
+ if isinstance(hyper_vals[val]['options'][0], int)
+ and hyper_vals[val]['options'][0] != True
+ else widgets.Dropdown(
+ options=hyper_vals[val]['options'],
+ style=style,
+ value=self.hyperparameters.get(val,
+ hyper_vals[val]['options'][0]),
+ description=hyper_vals[val]['name'],
+ )
+ for val in hyper_vals
+ if 'fixed' not in hyper_vals[val]}
+ self.properties = {prop:
+ extract_property(self.frames, prop)
+ for prop in sorted([*list(self.frames[0].info.keys()),
+ *list(self.frames[0].arrays.keys())],
+ key=lambda p: -int(p in known_properties))
+ if prop not in ignore}
+
+ self.sliders['number_of_frames'] = widgets.IntSlider(
+ value=int(len(self.frames)*0.2)
+ if number_of_frames == None
+ else number_of_frames,
+ min=1,
+ max=len(self.frames),
+ description="Number of Frames",
+ step=1,
+ style=style)
+ self.sliders['property_to_ml'] = widgets.Dropdown(
+ value=list(
+ self.properties.keys())[0]
+ if property == None
+ else property,
+ options=list(
+ self.properties.keys()),
+ description="Property to ML",
+ style=style)
+ if(self.sliders['property_to_ml'].value not in known_properties):
+ self.sliders['kernel_type'] = widgets.Dropdown(
+ value='Atom',
+ options=['Atom', 'Structure'],
+ description="Kernel Type",
+ style=style)
+ else:
+ self.sliders['kernel_type'] = widgets.Dropdown(
+ value=known_properties[self.sliders['property_to_ml'].value],
+ options=[
+ known_properties[self.sliders['property_to_ml'].value]],
+ description="Kernel Type",
+ style=style)
+ self.sliders['training_percentage'] = widgets.FloatSlider(
+ value=training_percentage,
+ min=0,
+ max=1,
+ description="Training Percentage",
+ continuous_update=True,
+ step=0.05,
+ style=style,
+ )
+
+ # Show and enable the widgets if interactive==True
+ if(interactive):
+ self.input_button = widgets.Dropdown(options=[*file_options], # , "Other"],
+ style=style,\
+ value=self.input_file,
+ description="Input File: ",
+ )
+ self.input_button.observe(self.get_input, names='value')
+ display(self.input_button)
+
+ self.preset_button = _button_template_(list(hyper_dict.keys()),
+ "SOAP Presets: ", self.preset_func)
+
+ slider_order = ['property_to_ml', 'kernel_type',
+ 'number_of_frames', 'training_percentage',
+ *list(hyper_vals.keys())]
+ for s in slider_order:
+ if(s in self.sliders):
+ self.sliders[s].observe(lambda change:
+ self.change_func(change),
+ names='value')
+ display(self.sliders[s])
+
+ # Set up one KRR for each possible property for caching reasons
+ self.krr = {prop: None for prop in self.properties}
+ self.trained = {prop: False for prop in self.properties}
+
+ # These serve as placeholders for time estimates until any calculation
+ # is run
+ self.est_frames, self.est_times = [], []
+ self.pred_frames, self.pred_times = [], []
+
+ def reset_ML(self, inp_change=False):
+ """ Function to reset the properties, number of frames, and kernel type
+ if any values affecting the machine learning have changed
+
+ Author: R. Cersonsky
+ """
+ if(inp_change):
+ self.properties = {prop:
+ extract_property(self.frames, prop)
+ for prop in sorted([*list(self.frames[0].info.keys()),
+ *list(self.frames[0].arrays.keys())],
+ key=lambda p: -int(p in known_properties))
+ if prop not in ignore}
+
+ self.sliders['number_of_frames'].max = len(self.frames)
+ self.sliders['number_of_frames'].value = int(len(self.frames)*0.2)
+
+ self.sliders['property_to_ml'].options = list(
+ self.properties.keys())
+ self.sliders['property_to_ml'].value = list(
+ self.properties.keys())[0]
+
+ if(self.sliders['property_to_ml'].value in known_properties):
+ self.sliders['kernel_type'].options = [
+ known_properties[self.sliders['property_to_ml'].value]]
+ self.sliders['kernel_type'].value = self.sliders['kernel_type'].options[0]
+ else:
+ self.sliders['kernel_type'].options = ['Atom', 'Structure']
+ self.sliders['kernel_type'].value = 'Atom'
+
+ self.krr = {prop: None for prop in self.properties}
+ self.trained = {prop: False for prop in self.properties}
+
+ def change_func(self, change):
+ """ Function for button events
+
+ Author: R. Cersonsky
+ """
+ change['owner'].value = change['new']
+ for s in self.hyperparameters:
+ if(s in self.sliders):
+ if(self.hyperparameters[s] != self.sliders[s].value):
+ self.est_frames, self.est_times = [], []
+ self.hyperparameters[s] = self.sliders[s].value
+
+ if(change['owner'] == self.sliders['property_to_ml']):
+ if(self.sliders['property_to_ml'].value in known_properties):
+ self.sliders['kernel_type'].options = [
+ known_properties[self.sliders['property_to_ml'].value]]
+ self.sliders['kernel_type'].value = self.sliders['kernel_type'].options[0]
+ else:
+ self.sliders['kernel_type'].options = ['Atom', 'Structure']
+ self.sliders['kernel_type'].value = 'Atom'
+
+ self.reset_ML()
+
+ def preset_func(self, a):
+ """ Function to change hyperparameters based upon presets
+
+ Author: R. Cersonsky
+ """
+ for s in self.sliders:
+ if(s in self.hyperparameters):
+ self.hyperparameters[s] = hyper_dict[self.preset_button.value][s]
+ self.sliders[s].value = self.hyperparameters[s]
+ self.reset_ML()
+
+ def get_input(self, a):
+ """ Function to change input file
+
+ Author: R. Cersonsky
+ """
+ inp = self.input_button.value
+ self.input_file = inp
+ self.frames = np.array(read(self.input_file, ":"))
+ self.reset_ML(True)
+
+ def train_krr_model_func(self, frame_idx, jitter=1e-8, pretend=False):
+ """ Function to trains the model on the given SOAP representation and
+ properties. It is used for both the time estimation and the eventual
+ training. The flag pretend is set to True when it is being used for
+ estimation, false otherwise.
+
+ Author: R. Cersonsky
+
+ Keywords:
+ frame_idx - list of indices of the frames to train on
+ jitter - float, anticipated error to adjust kernel diagonals
+ pretend - boolean, whether to not show the results
+
+ """
+
+ if(pretend == False):
+ self.output_params()
+ verbosity_wrap = lambda s: self.verbosity_wrap(s)
+ else:
+ verbosity_wrap = lambda s: None
+
+ verbosity_wrap(" We will now train a model on {}.".format(
+ self.sliders['property_to_ml'].value))
+
+ representation = SOAP(**mask_body_order(self.hyperparameters))
+
+ if(known_properties[self.sliders['property_to_ml'].value] == 'Atom'):
+ props = np.concatenate(self.properties[self.sliders['property_to_ml'].value][frame_idx])[:,0]
+ else:
+ props = self.properties[self.sliders['property_to_ml'].value][frame_idx]
+
+ training_properties=props
+
+ verbosity_wrap("First, I am going to separate my dataset:")
+ verbosity_wrap(" Now we will compute the SOAP representation of \
+ our training frames.")
+
+ t=time.time()
+ features=representation.transform(self.frames[frame_idx])
+
+ verbosity_wrap('This took {} seconds/frame.'.format(
+ round((time.time()-t)/len(frame_idx), 8)))
+
+ if(pretend == False):
+ self.estimate_time(N=max(0, 20-len(self.est_frames)),
+ p="Estimating time to compute kernel...")
+ est=int(np.poly1d(np.polyfit(self.est_frames,
+ self.est_times,
+ deg=2))(len(frame_idx)))+1
+ else:
+ est=0
+
+ verbosity_wrap(" Next we find the kernel for our training model.\
+ (This step will take approximately {} minutes and \
+ {} seconds.)".format(int(est/60), int(est % 60)))
+
+ time.sleep(0.5)
+ t=time.time()
+ kernel = compute_kernel(calculator=representation, \
+ features1=features, \
+ kernel_type=self.sliders['kernel_type'].value, \
+ **self.hyperparameters)
+
+ self.est_frames.append(len(frame_idx))
+ self.est_times.append(time.time()-t)
+
+ delta=np.std(training_properties) / np.mean(kernel.diagonal())
+ kernel[np.diag_indices_from(
+ kernel)] += self.Lambda**2 / delta ** 2 + jitter
+
+ verbosity_wrap(" We will adjust the our kernel with the tolerance \
+ matrix $\\Lambda = ({})I$.".format(\
+ round(self.Lambda**2 / delta ** 2 + jitter, 8)))
+ verbosity_wrap(" Now we can take this kernel to compute the weights \
+ of our KRR.")
+
+ weights=np.linalg.solve(kernel, training_properties)
+ model=KRR(weights, features, \
+ kernel_type=self.sliders['kernel_type'].value,
+ **self.hyperparameters
+ )
+ if(pretend == False):
+ self.krr[self.sliders['property_to_ml'].value], k=model, kernel
+ self.trained[self.sliders['property_to_ml'].value]=True
+
+ def train_krr_model(self, jitter= 1e-8):
+ """ Wrapper to the train_krr_model_func function for use in the tutorial
+
+ Author: R. Cersonsky
+
+ Keywords:
+ jitter - float, anticipated error to adjust kernel diagonals
+
+ """
+
+ training_dict={"Training Set": [int(self.sliders['number_of_frames'].value*(self.sliders['training_percentage'].value)),\
+ round(100*(self.sliders['training_percentage'].value))],
+ "Testing Set": [int(self.sliders['number_of_frames'].value*(1-self.sliders['training_percentage'].value)),\
+ round(100*(1-self.sliders['training_percentage'].value))]}
+ headers=["Partition", "Number of Frames", "Percentage"]
+ self.verbosity_wrap(markdown_table_from_dict(training_dict, headers))
+
+ self.train_idx, self.test_idx=split_dataset(self.frames[: self.sliders['number_of_frames'].value],
+ self.sliders['training_percentage'].value)
+ self.train_krr_model_func(self.train_idx, jitter= jitter)
+
+ def plot_prediction_func(self, y_known= None, frames = None,
+ frame_idx= [], pretend = False):
+ """ Function to predict properties from the given SOAP representation and
+ KRR model. It is used for both the time estimation and the tutorial
+ prediction. The flag pretend is set to True when it is being used for
+ estimation, false otherwise.
+
+ Author: R. Cersonsky
+
+ Keywords:
+ y_known - list, target properties to train on
+ frames - frames to predict properties of
+ frame_idx - list of indices of the frames to train on
+ pretend - boolean, whether to not show the results
+ """
+ if(len(frame_idx) > 0):
+ frames=self.frames[frame_idx]
+ y_known = np.concatenate(self.properties[self.sliders['property_to_ml'].value][frame_idx])[: , 0] if self.sliders['kernel_type'].value == 'Atom' else self.properties[self.sliders['property_to_ml'].value][frame_idx]
+ if(pretend == False):
+ verbosity_wrap=lambda s: self.verbosity_wrap(s)
+ else:
+ verbosity_wrap = lambda s: None
+ if(self.trained[self.sliders['property_to_ml'].value] == False):
+ verbosity_wrap("Model has not yet been trained, training now...")
+ self.train_krr_model()
+ # np.concatenate(self.properties[self.sliders['property_to_ml'].value][frame_idx])[:,0] if self.sliders['kernel_type'].value=='Atom' else self.properties[self.sliders['property_to_ml'].value][frame_idx]
+ testing_properties=y_known
+
+ if(pretend == False):
+ self.estimate_time(x=self.pred_frames, y=self.pred_times, f=self.plot_prediction_func, N=max(
+ 0, 20-len(self.pred_frames)), ref=len(frames), p="Estimating time to compute prediction...")
+ est=int(np.poly1d(np.polyfit(self.pred_frames,
+ self.pred_times, deg=3))(len(frames)))+1
+ else:
+ est=0
+
+ t=time.time()
+ verbosity_wrap("Predicting the properties of our data set will take approximately {} minutes and {} seconds.".format(
+ int(est/60), int(est % 60)))
+ y_pred=self.krr[self.sliders['property_to_ml'].value].predict(frames)
+ self.pred_frames.append(len(frame_idx))
+ self.pred_times.append(time.time()-t)
+ verbosity_wrap(markdown_table_from_dict(
+ get_score(y_pred, testing_properties), headers = ["Statistic", "Value"]))
+
+ if(not pretend):
+ plt.scatter(y_pred, testing_properties, s=3)
+ plt.axis('scaled')
+ plt.xlabel(self.sliders['property_to_ml'].value)
+ plt.ylabel('Predicted '+self.sliders['property_to_ml'].value)
+ plt.gca().set_aspect('equal')
+ plt.show()
+
+ def predict_test_set(self):
+ """ Wrapper for plot_prediction_func that is used within the tutorial
+
+ Author: R. Cersonsky
+ """
+ self.plot_prediction_func(frame_idx=self.test_idx)
+
+ def predict_new_set(self, filename='./data/small_molecules-1000.xyz',
+ num_frames=''):
+ """ Wrapper predicts a new set of data beyond the training set, loaded
+ from a different file
+
+ Author: R. Cersonsky
+
+ Keywords:
+ filename - string corresponding to an ASE-like file
+ num_frames - number of frames to predict
+ """
+ frames=np.array(read(filename, ":{}".format(num_frames)))
+ properties=extract_property(
+ frames, self.sliders['property_to_ml'].value)
+ self.plot_prediction_func(y_known=properties, frames=frames)
+
+ def output_params(self):
+ """ Function to output the hyperparameters that are currently set
+
+ Author: R. Cersonsky
+ """
+ self.verbosity_wrap('Our input file is {}, of which we are using {} \
+ frames.'.format(self.input_file, \
+ self.sliders['number_of_frames'].value))
+
+ hdict={hyper_vals[k]['name']: [self.hyperparameters[k]]
+ for k in self.hyperparameters
+ if 'fixed' not in hyper_vals[k]}
+ headers=["Parameter", "Value"]
+
+ self.verbosity_wrap(" Our hyperparameters are {}".format(
+ markdown_table_from_dict(hdict, headers)))
+
+ def set(self, value_name, value):
+ """ Public Setter function for tutorial interface
+
+ Author: R. Cersonsky
+
+ Keywords:
+ value_name - string, must be in self.sliders
+ value - value to set
+ """
+ assert value_name in self.sliders
+ self.sliders[value_name].value=value
+
+ def estimate_time(self, x=None, y=None, f=None, N=20, ref=None, p=None):
+ """ Helper function for estimating time
+
+ Author: R. Cersonsky
+
+ Keywords:
+ x - list of numbers of frames computed
+ y - list, size=size(x), of times to computed frames in x
+ f - function to estimate the run time of
+ N - number of trials to run
+ ref - int, upper limit of trials to run.
+ Defaults to (number_of_frames in the ultimate computation) / 4
+ p - print statement
+ """
+ if(N <= 0):
+ return
+ if(x == None or y == None or f == None or ref == None):
+ x=self.est_frames
+ y=self.est_times
+ f=self.train_krr_model_func
+ ref=int(0.25*self.sliders['number_of_frames'].value *
+ self.sliders['training_percentage'].value)
+
+ if(p != None and min(N, ref) > 0):
+ self.verbosity_wrap(p)
+ for nf in tqdm(np.random.randint(2, ref, size = min(N, ref))):
+ f(frame_idx=np.random.randint(len(self.frames), size = nf), pretend=True)
diff --git a/docs/source/tutorials/utilities/tutorial_utils.py b/docs/source/tutorials/utilities/tutorial_utils.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/source/tutorials/utilities/vectors_utils.py b/docs/source/tutorials/utilities/vectors_utils.py
new file mode 100644
index 000000000..eff2088ea
--- /dev/null
+++ b/docs/source/tutorials/utilities/vectors_utils.py
@@ -0,0 +1,429 @@
+import os
+import inspect
+import time
+
+from IPython.display import Markdown, display, clear_output, Image, HTML
+import ipywidgets as widgets
+import numpy as np
+import ase
+from ase.io import read
+from matplotlib import pyplot as plt
+from tqdm import tqdm_notebook as tqdm
+import pandas as pd
+import json
+
+# This is for backwards compatibility -- old versions of RASCAL will follow
+# the latter schema, newer versions the former.
+try:
+ from rascal.representations import SphericalInvariants as SOAP
+except:
+ from rascal.representations import SOAP
+
+from general_utils import (markdown_table_from_dict, \
+ readme_button, _button_template_,
+ compute_kernel, mask_body_order)
+
+# These are some imports/helpers for all of the tutorials
+style = json.load(open('./utilities/widget_style.json'))
+hyper_dict = json.load(open("./utilities/hyperparameter_presets.json"))
+hyper_vals = json.load(open("./utilities/hyperparameter_bounds.json"))
+
+
+def format_positions(positions):
+ """ Function to format positions for best viewing angle. Does a simple PCA
+ to find the best direction to view the molecule and rotates position
+ along these axes
+
+ Author: R. Cersonsky
+
+ Keywords:
+ positions - list of 3-vectors
+
+ Returns:
+ positions - list of 3-vectors
+ """
+ from sklearn.decomposition import PCA
+ pca = PCA(n_components=3)
+ return pca.fit_transform(positions)
+
+
+class SOAP_tutorial(object):
+ """ Class to wrap the SOAP vectors tutorial
+
+ Author: R. Cersonsky
+
+ Constructor Keywords:
+ molecule_file - filename of ASE Atoms-like object
+ containing molecule
+ interactive - boolean, whether to load the tutorial interactive-
+ ly in the jupyter notebook
+ verbose - boolean, whether or not to narrate the actions of
+ the tutorial
+ hyperparameters - dictionary, hyperparameters for SOAP calculation
+ or string corresponding to one of the SOAP presets
+
+ Functions:
+ predict - given the KRR, predicts the properties for a set of ASE frames
+ """
+
+ def __init__(self,
+ molecule_file=None,
+ interactive=False,
+ verbose=True,
+ hyperparameters=dict(**hyper_dict['Power Spectrum']),
+ ):
+
+ # Check if hyperparameters are of type string or dictionary
+ if(isinstance(hyperparameters, str)):
+ self.hyperparameters = hyper_dict[hyperparameters]
+ else:
+ self.hyperparameters = {h: hyperparameters[h]
+ if h in hyperparameters
+ else hyper_dict['Power Spectrum'][h]
+ for h in hyper_dict['Power Spectrum']}
+ self.verbose = verbose
+ self.verbosity_wrap = lambda s: (
+ None if not verbose else display(Markdown(s)))
+
+ # Look for and load available molecules
+ file_options = ['./data/molecules/{}'.format(f)
+ for f in os.listdir('./data/molecules/')
+ if f.endswith('xyz')]
+
+ if(molecule_file != None and molecule_file not in file_options):
+ file_options.append(molecule_file)
+
+ self.frames, self.positions, self.symbols, self.inds, self.labels = {}, {}, {}, {}, {}
+ self.import_molecules(file_options)
+
+ self.vectors = None
+ self.interactive = interactive
+
+ # This next section is for the jupyter interface. Even if the user has
+ # specified zero interactivity, the sliders serve as containers for the
+ # hyperparameters to change when set
+ self.sliders = {val:
+ widgets.FloatSlider(
+ value=self.hyperparameters.get(val,
+ hyper_vals[val]['options'][0]),
+ min=hyper_vals[val]['options'][0],
+ max=hyper_vals[val]['options'][1],
+ description=hyper_vals[val]['name'],
+ continuous_update=True,
+ step=(hyper_vals[val]['options'][1] -
+ hyper_vals[val]['options'][0])/20.,
+ style=style,
+ )
+ if isinstance(hyper_vals[val]['options'][0], float) else
+ widgets.IntSlider(
+ value=self.hyperparameters.get(val,
+ hyper_vals[val]['options'][0]),
+ min=hyper_vals[val]['options'][0],
+ max=hyper_vals[val]['options'][1],
+ description=hyper_vals[val]['name'],
+ continuous_update=True,
+ step=1,
+ style=style,
+ )
+ if isinstance(hyper_vals[val]['options'][0], int) and
+ hyper_vals[val]['options'][0] != True else
+ widgets.Dropdown(options=hyper_vals[val]['options'],
+ style=style,
+ value=self.hyperparameters.get(val,
+ hyper_vals[val]['options'][0]),
+ description=hyper_vals[val]['name'],
+ )
+ for val in hyper_vals if 'fixed' not in hyper_vals[val]}
+
+ # Which elements to center smooth gaussians on. Defaults to all elements
+ # present except for hydrogen.
+ constituents = list(set([s for k in self.symbols for s in self.symbols[k]]))
+ self.sliders['center_select']=widgets.SelectMultiple(
+ options=constituents,
+ value=[s for s in constituents if s != "H"],
+ description="Where to Place SOAP Centers",
+ style=style)
+
+ # Whether to compute the global kernel (average of the environments) or
+ # atomic kernel.
+ self.sliders['average'] = widgets.Dropdown(
+ options=["Environment-Centered", "Average"],
+ value="Environment-Centered",
+ description="Type of SOAP Vectors",
+ style=style)
+
+ # Show and enable the widgets if interactive==True
+ if(interactive):
+ self.preset_button = _button_template_(list(hyper_dict.keys()),
+ "SOAP Presets: ",
+ self.preset_func)
+ if(isinstance(hyperparameters, str)):
+ self.preset_button.value = hyperparameters
+
+ slider_order = ['center_select',
+ 'average',
+ *list(hyper_vals.keys())]
+
+ for s in slider_order:
+ if(s in self.sliders):
+ display(self.sliders[s])
+ self.sliders[s].observe(
+ lambda change: self.change_func(change), names='value')
+
+ # These buttons allow users to choose up to two molecules to compute
+ # SOAP vectors and compare.
+ self.molecule_buttons = [widgets.Dropdown(options=[*self.frames.keys()], # , "Other"],
+ style=style,\
+ value=list(
+ self.frames.keys())[0],
+ description="\tMolecule: "
+ ) for i in range(2)]
+
+ def import_molecules(self, file_list):
+ """ Function to import ASE-like files and include them in the dictionaries
+ of available molecules.
+
+ Author: R. Cersonsky
+
+ Keywords:
+ file_list - list of strings corresponding to ASE-like files
+ """
+ for file in file_list:
+ for frame in read(file, ":"):
+ key = str(frame.symbols)
+
+ if(key in self.frames):
+ print("Overwriting previous entry for {}".format(key))
+
+ self.frames[key] = frame
+ self.positions[key] = format_positions(frame.get_positions())
+ self.inds[key] = sorted(range(len(self.positions[key])),
+ key=lambda j: self.positions[key][j][0])
+ self.positions[key] = self.positions[key][self.inds[key]]
+ self.symbols[key] = frame.symbols[self.inds[key]]
+ super_scripts = ['' if list(self.symbols[key]).count(s) == 1
+ else '^{{({})}}'.format(list(self.symbols[key])[:i].count(s)+1)
+ for i, s in enumerate(list(self.symbols[key]))]
+ self.labels[key] = np.array([r'${}{}$'.format(self.symbols[key][i], s)
+ for i, s in enumerate(super_scripts)])
+
+ def change_func(self, change):
+ """ Function to import ASE-like files and include them in the dictionaries
+ of available molecules.
+
+ Author: R. Cersonsky
+
+ Keywords:
+ file_list - list of strings corresponding to ASE-like files
+ """
+ change['owner'].value = change['new']
+ for s in self.hyperparameters:
+ if(s in self.sliders):
+ if(self.hyperparameters[s] != self.sliders[s].value):
+ self.hyperparameters[s] = self.sliders[s].value
+ self.vectors = None
+
+ def preset_func(self, a):
+ """ Function to change hyperparameters based upon presets
+
+ Author: R. Cersonsky
+ """
+ for s in self.sliders:
+ if(s in self.hyperparameters):
+ self.hyperparameters[s] = hyper_dict[self.preset_button.value][s]
+ self.sliders[s].value = self.hyperparameters[s]
+
+ def get_soap_vectors(self, key=None, average=False):
+ """ Get the SOAP vectors for the given molecule, denoted by key
+
+ Author: R. Cersonsky
+
+ Keywords:
+ key - string corresponding to molecule built by \
+ import_molecules
+ average - whether to compute the average SOAP vector for the
+ molecule
+ """
+ if(key == None):
+ key = self.input_file.replace('./data/molecules/', '')[:-4]
+
+ representation = SOAP(**mask_body_order(self.hyperparameters))
+ all_frames = [self.frames[k] for k in sorted(self.frames.keys())]
+ features = representation.transform(
+ all_frames).get_dense_feature_matrix(representation)
+
+ lengths = [len(self.inds[k]) for k in sorted(self.frames.keys())]
+
+ self.vectors = {k: features[sum(lengths[0:i]):sum(lengths[0:i])+lengths[i]]
+ for i, k in enumerate(sorted(self.frames.keys()))}
+ vectors = self.vectors[key]
+
+ # if(average):
+ # return np.mean(vectors, axis=0)
+ return vectors[self.inds[key]]
+
+ def make_figure(self, key, img_name):
+ """ Function to generate an image of the given molecule and cache it
+ under img_name
+
+ Author: R. Cersonsky
+
+ Keywords:
+ key - string corresponding to molecule built by
+ import_molecules
+ img_name - filename under which to cache the image
+ """
+ from ase.data import covalent_radii, atomic_numbers
+ from matplotlib import pyplot, patches
+ from ase.data.colors import jmol_colors
+ style = {
+ "horizontalalignment": "center",
+ "verticalalignment": "center",
+ "fontsize": 12
+ }
+ fig, ax = pyplot.subplots(1)
+
+ for i, p in enumerate(self.positions[key]):
+ an = atomic_numbers[self.symbols[key][i]]
+ ax.text(*p[:2],
+ s=self.labels[key][i],
+ **style,
+ zorder=p[-1]+0.01
+ )
+ ax.add_artist(patches.Circle(p[:2],
+ covalent_radii[an],
+ facecolor=jmol_colors[an],
+ edgecolor='k',
+ alpha=0.95,
+ zorder=p[-1]))
+ ax.axis('off')
+ bounds = [*np.min(self.positions[key], axis=0), *
+ np.max(self.positions[key], axis=0)]
+ ax.set_xlim([bounds[0]-1.5, bounds[3]+1.5])
+ ax.set_ylim([bounds[1]-1.5, bounds[5]+1.5])
+ ax.set_aspect('equal')
+ fig.savefig(img_name)
+ fig.clf()
+
+ def show_kernel(self, key1, key2=None, show=True, average=False):
+ from rascal.models import Kernel
+ from ase.data import chemical_symbols
+ """ Function to show pandas dataframe for the comparison kernel
+
+ Author: R. Cersonsky
+
+ Keywords:
+ key1 - string corresponding to molecule built by
+ import_molecules
+ key2 - string corresponding to molecule built by
+ import_molecules, defaults to key1
+ show - boolean, whether to show or return the dataframe
+ average - boolean, whether to compute the average SOAP vector
+ for the molecule
+ """
+ key2 = key2 if key2 != None else key1
+ average = self.sliders['average'].value == "Average"
+
+ soap=SOAP(**mask_body_order(self.hyperparameters))
+ features1=soap.transform(self.frames[key1])
+ features2=soap.transform(self.frames[key2])
+
+ kernel = Kernel(soap,
+ target_type="Structure" if average else "Atom",
+ zeta = 2,
+ **self.hyperparameters)
+ data = kernel(features2, features1)
+
+ if (average):
+
+ vec1 = np.mean(features1.get_dense_feature_matrix(soap),axis=0)
+ vec2 = np.mean(features2.get_dense_feature_matrix(soap), axis=0)
+ if(vec1.shape==vec2.shape):
+ data = np.dot(vec1, vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2))
+ df=pd.DataFrame(data=[['%1.3f' %(data)]],
+ columns =[key2],
+ index =[key1]
+ )
+ else:
+ df=pd.DataFrame(data=[['%1.3f' %(data[i][j])
+ for i,v in enumerate(features2._frames.numbers) if v!=1]
+ for j,w in enumerate(features1._frames.numbers) if w!=1],
+ columns =[chemical_symbols[v] for i,v in enumerate(features2._frames.numbers) if v!=1],
+ index =[chemical_symbols[v] for i,v in enumerate(features1._frames.numbers) if v!=1]
+ )
+ #
+ df.name = ("Self-" if key1 == key2 else '') + \
+ "Similarity " + ("Kernel" if not average else "")
+ return df
+
+ def show_molecule(self, key, show=True):
+ """ Function to either generate or show the image for the given molecule
+
+ Author: R. Cersonsky
+
+ Keywords:
+ key - string corresponding to molecule built by
+ import_molecules
+ show - boolean, whether to show or return the dataframe
+ """
+ import os
+ img_name = './images/cache/{}.png'.format(key)
+ if(not os.path.exists(img_name)):
+ self.make_figure(key, img_name)
+ if(show):
+ display(Image(img_name))
+ return img_name
+
+ def compute(self, a=None):
+ """ Function to compute the SOAP vectors for the set hyperparameters
+
+ Author: R. Cersonsky
+ """
+ def wrap_compute(a):
+ self.get_soap_vectors(
+ key=self.molecule_buttons[0].value, average=self.sliders['average'].value == "Average")
+ print("SOAP Vectors for {}: \n\t".format(
+ self.molecule_buttons[0].value), self.vectors[self.molecule_buttons[0].value])
+
+ compute_button = widgets.Button(description="Compute",
+ style=style,
+ value=False,
+ )
+ compute_button.on_click(wrap_compute)
+ clear_button = widgets.Button(description="Clear Output",
+ style=style,
+ value=False,
+ )
+ clear_button.on_click(lambda a: self.compute(a))
+ display(self.molecule_buttons[0])
+ display(widgets.widgets.HBox((compute_button, clear_button)))
+
+ def compare(self, a=None):
+ """ Function to compare the SOAP vectors of two molecules
+ for the set hyperparameters
+
+ Author: R. Cersonsky
+ """
+ clear_output()
+
+ def wrap_compare(a):
+ df = self.show_kernel(
+ key1=self.molecule_buttons[0].value, key2=self.molecule_buttons[1].value, show=False)
+ m1 = self.show_molecule(self.molecule_buttons[0].value, show=False)
+ m2 = self.show_molecule(self.molecule_buttons[1].value, show=False)
+ display(HTML("