diff --git a/docs/api.rst b/docs/api.rst index 2a9f345..1b6eaa9 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,2 +1,161 @@ PySke API -========= \ No newline at end of file +######### + +The PySke API offers applications implemented with list and tree skeletons. +The user can use the sequential or the parallel version. +The parallel version allows a faster execution time when it is launched on several processors, cores or computers. + +Run examples with parallel computing: + + .. code-block:: console + + mpirun -np NB_CORES python3 PROGRAM_NAME [OPTIONS] + +Examples without the :code:`--data` option are only runnable in parallel. + +List Examples +============= + +Dot Product +----------- + +.. py:module:: pyske.examples.list.dot_product + + +Dot Product functions +^^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: opt_dot_product + +.. autofunction:: dot_product + +Running Example +^^^^^^^^^^^^^^^ + +.. autoprogram:: pyske.examples.list.util:dot_product_parser() + :prog: dot_product_main.py + + +Discrete Fast Fourier Transform +------------------------------- +.. py:module:: pyske.examples.list.fft + +Fast Fourier Transform function +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: fft + +Running Example +^^^^^^^^^^^^^^^ + +.. autoprogram:: pyske.examples.list.util:standard_parser(data_arg=False) + :prog: fft_main.py + + +K-means Clustering +------------------ + +K-means clustering is an unsupervised algorithm that aims to partition a group of points into k clusters. + +K-means function +^^^^^^^^^^^^^^^^ + +.. py:module:: pyske.examples.list.k_means + +.. autofunction:: k_means + +Initialization functions +^^^^^^^^^^^^^^^^^^^^^^^^ + +This is the standard method that initializes the centroids. This method chooses the centroids so that each new centroid is as far as possible from the centroids already chosen. + +.. autofunction:: k_means_init + + +Point Interface +^^^^^^^^^^^^^^^ + +The K-means algorithm takes a list of points as its parameter. 
For now, two classes implement this interface: one for two-dimensional points and another for three-dimensional points. + +Point 2D class implementation: + +.. autoclass:: pyske.core.util.point_2D.Point_2D + :members: + :special-members: + :member-order: bysource + +Running Example +^^^^^^^^^^^^^^^ + +.. autoprogram:: pyske.examples.list.util:k_means_parser() + :prog: k_means_main.py + + +Maximum Prefix Sum +------------------ + +.. py:module:: pyske.examples.list.maximum_prefix_sum + +Maximum Prefix Sum function +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: mps + +Running Example +^^^^^^^^^^^^^^^ + +.. autoprogram:: pyske.examples.list.util:standard_parser() + :prog: maximum_prefix_sum_main.py + +Maximum Segment Sum +------------------- + +Running Example +^^^^^^^^^^^^^^^ + +.. autoprogram:: pyske.examples.list.util:standard_parser() + :prog: maximum_segment_sum_main.py + +Parallel Regular Sampling Sort +------------------------------ + +.. py:module:: pyske.examples.list.regular_sampling_sort + + +Broadcast function +^^^^^^^^^^^^^^^^^^ + +.. autofunction:: bcast + +Sort function +^^^^^^^^^^^^^ + +.. autofunction:: pssr + + +Running Example +^^^^^^^^^^^^^^^ + +.. autoprogram:: pyske.examples.list.util:standard_parser() + :prog: regular_sampling_sort_main.py + +Variance Example +---------------- + +.. py:module:: pyske.examples.list.variance + +Variance function +^^^^^^^^^^^^^^^^^ + +.. autofunction:: variance + +Running Example +^^^^^^^^^^^^^^^ + +.. autoprogram:: pyske.examples.list.util:standard_parser() + :prog: variance_main.py + + +Tree Examples +============= + diff --git a/docs/conf.py b/docs/conf.py index 8e83820..b6fa59b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -10,9 +10,9 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os +import sys +sys.path.insert(0, os.path.abspath('../')) # -- Project information ----------------------------------------------------- @@ -31,6 +31,8 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + "sphinx.ext.autodoc", + "sphinxcontrib.autoprogram" ] # Add any paths that contain templates here, relative to this directory. @@ -52,4 +54,4 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst index 8115248..552c990 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,7 +7,7 @@ Welcome to PySke's documentation! ================================= .. 
from abc import ABC
from math import sqrt


# --- pyske/core/util/point_Interface.py: a module to represent a point -------
# (In the repository each class below lives in its own module; point_2D.py and
# point_3D.py import Point_Interface from pyske.core.util.point_Interface.)

class Point_Interface(ABC):
    """
    Point interface to represent a point of n dimensions.

    The methods here only name the operations a point class is expected to
    provide.  None of them is implemented: each returns ``None`` unless a
    subclass overrides it.
    """

    def __repr__(self):
        """Textual representation of the point."""

    def __eq__(self, other):
        """Equality between two points."""

    def __add__(self, other):
        """Addition of two points."""

    def __mul__(self, other):
        """Multiplication by a point or a scalar."""

    def __truediv__(self, other):
        """Division by a scalar."""

    def distance(self, other):
        """Distance from another point."""


# --- pyske/core/util/point_2D.py: a module to represent a 2D point -----------

class Point_2D(Point_Interface):
    """A class to represent a 2D point"""

    def __init__(self, x=0, y=0):
        self.__x = x
        self.__y = y

    def __repr__(self):
        return "(%s, %s)" % (self.__x, self.__y)

    def __eq__(self, other):
        """
        Equality between two points

        :param other: any object
        :return: True only when ``other`` is a Point_2D with equal coordinates
        """
        if isinstance(other, Point_2D):
            return self.__x == other.x and self.__y == other.y
        return False

    def __hash__(self):
        """Hash consistent with ``__eq__`` so points can be set members or dict keys."""
        return hash((self.__x, self.__y))

    def __add__(self, other):
        """
        Addition of two points

        Examples::

            >>> p1 = Point_2D(5,5)
            >>> p2 = Point_2D(5,7)
            >>> p1 + p2
            (10, 12)

        :param other: a 2D point
        :return: the coordinate-wise sum, or ``NotImplemented`` for any other operand
        """
        if isinstance(other, Point_2D):
            return Point_2D(self.x + other.x, self.y + other.y)
        # Let Python raise the usual TypeError instead of silently yielding None.
        return NotImplemented

    def __mul__(self, other):
        """
        Multiplication by a point or a scalar

        Examples::

            >>> p1 = Point_2D(5,5)
            >>> p2 = Point_2D(5,7)
            >>> p1 * 5
            (25, 25)
            >>> p1 * p2
            (25, 35)

        :param other: a 2D point (coordinate-wise product) or an int/float (scaling)
        :return: a new point, or ``NotImplemented`` for any other operand
        """
        if isinstance(other, Point_2D):
            return Point_2D(self.x * other.x, self.y * other.y)
        if isinstance(other, (int, float)):
            return Point_2D(self.x * other, self.y * other)
        return NotImplemented

    def __truediv__(self, other):
        """
        Division by a scalar

        Examples::

            >>> Point_2D(5,5) / 2
            (2.5, 2.5)

        :param other: an int or a float
        :return: a new point, or ``NotImplemented`` for any other operand
        """
        # Floats are accepted as well, for symmetry with __mul__.
        if isinstance(other, (int, float)):
            return Point_2D(self.x / other, self.y / other)
        return NotImplemented

    @property
    def x(self):
        """X getter"""
        return self.__x

    @property
    def y(self):
        """Y getter"""
        return self.__y

    def distance(self, other: 'Point_2D'):
        """
        Returns the distance from another point.

        Examples::

            >>> from pyske.core.util.point_2D import Point_2D
            >>> p1 = Point_2D(5,5)
            >>> p2 = Point_2D(5,7)
            >>> p1.distance(p2)
            2.0

        :param other: a point
        :return: distance from other point

        """
        dx = self.__x - other.x
        dy = self.__y - other.y
        return sqrt(dx ** 2 + dy ** 2)


# --- pyske/core/util/point_3D.py: a module to represent a 3D point -----------

class Point_3D(Point_Interface):
    """A class to represent a 3D point"""

    def __init__(self, x=0, y=0, z=0):
        self.__x = x
        self.__y = y
        self.__z = z

    def __repr__(self):
        return "(%s, %s, %s)" % (self.__x, self.__y, self.__z)

    def __eq__(self, other):
        """
        Equality between two points

        :param other: any object
        :return: True only when ``other`` is a Point_3D with equal coordinates
        """
        if isinstance(other, Point_3D):
            return self.__x == other.x and self.__y == other.y and self.__z == other.z
        return False

    def __hash__(self):
        """Hash consistent with ``__eq__`` so points can be set members or dict keys."""
        return hash((self.__x, self.__y, self.__z))

    def __add__(self, other):
        """
        Addition of two points

        Examples::

            >>> p1 = Point_3D(5,5,2)
            >>> p2 = Point_3D(5,7,1)
            >>> p1 + p2
            (10, 12, 3)

        :param other: a 3D point
        :return: the coordinate-wise sum, or ``NotImplemented`` for any other operand
        """
        if isinstance(other, Point_3D):
            return Point_3D(self.x + other.x, self.y + other.y, self.z + other.z)
        return NotImplemented

    def __mul__(self, other):
        """
        Multiplication by a point or a scalar

        Examples::

            >>> p1 = Point_3D(5,5,2)
            >>> p2 = Point_3D(5,7,1)
            >>> p1 * 5
            (25, 25, 10)
            >>> p1 * p2
            (25, 35, 2)

        :param other: a 3D point (coordinate-wise product) or an int/float (scaling)
        :return: a new point, or ``NotImplemented`` for any other operand
        """
        if isinstance(other, Point_3D):
            return Point_3D(self.x * other.x, self.y * other.y, self.z * other.z)
        if isinstance(other, (int, float)):
            # Scale z by the scalar itself: a number has no ``.z`` attribute.
            return Point_3D(self.x * other, self.y * other, self.z * other)
        return NotImplemented

    def __truediv__(self, other):
        """
        Division by a scalar

        Examples::

            >>> Point_3D(5,5,2) / 2
            (2.5, 2.5, 1.0)

        :param other: an int or a float
        :return: a new point, or ``NotImplemented`` for any other operand
        """
        if isinstance(other, (int, float)):
            return Point_3D(self.x / other, self.y / other, self.z / other)
        return NotImplemented

    @property
    def x(self):
        """X getter"""
        return self.__x

    @property
    def y(self):
        """Y getter"""
        return self.__y

    @property
    def z(self):
        """Z getter"""
        return self.__z

    def distance(self, other):
        """
        Returns the distance from another 3D point.

        Examples::

            >>> from pyske.core.util.point_3D import Point_3D
            >>> p1 = Point_3D(5,5,2)
            >>> p2 = Point_3D(5,7,1)
            >>> round(p1.distance(p2), 2)
            2.24

        :param other: a point
        :return: distance from other point

        """
        dx = self.__x - other.x
        dy = self.__y - other.y
        dz = self.__z - other.z
        return sqrt(dx ** 2 + dy ** 2 + dz ** 2)
# --- pyske/examples/list/k_means.py: K-Means ---------------------------------
import random
from typing import Callable, Tuple

from pyske.core.interface import List
from pyske.core.list import SList
from pyske.core.util.point_Interface import Point_Interface
from pyske.core.util.par import procs


def cluster_index(point: Point_Interface, centroids: SList[Point_Interface]) -> \
        Tuple[Point_Interface, int]:
    """
    Get the index of the centroid closest to a point.

    Ties are resolved in favour of the centroid with the smallest index.

    :param point: the point to classify
    :param centroids: a non-empty list of centroids
    :return: the pair (point, index of the closest centroid)
    :raises IndexError: if ``centroids`` is empty
    """
    if len(centroids) == 0:
        raise IndexError("cluster_index requires at least one centroid")
    best_index = 0
    best_dist = float("inf")
    # Track the index while scanning: each distance is computed once and no
    # second pass over the centroids is needed to recover the position.
    for index, centroid in enumerate(centroids):
        dist = point.distance(centroid)
        if dist < best_dist:
            best_dist = dist
            best_index = index
    return point, best_index


def assign_clusters(input_list: List[Point_Interface], centroids: SList[Point_Interface]) -> \
        List[Tuple[Point_Interface, int]]:
    """
    Assign each point to a cluster.

    :param input_list: a list of points
    :param centroids: the current centroids
    :return: a list of pairs (point, index of its closest centroid)
    """
    return input_list.map(lambda point: cluster_index(point, centroids))


def update_centroids(clusters: List[Tuple[Point_Interface, int]],
                     centroids: SList[Point_Interface]):
    """
    Update centroids of clusters.

    Each new centroid is the sum of the points assigned to the cluster divided
    by their count.  A cluster that received no point keeps its previous
    centroid (dividing by a zero count would otherwise raise).

    :param clusters: a list of pairs (point, cluster index)
    :param centroids: the current centroids; must be non-empty
    :return: the list of updated centroids
    """

    def centroids_list_update(list_to_update, item):
        if isinstance(item, SList):
            # The right operand is itself a list of (sum, count) pairs:
            # combine the two lists element-wise.
            return list_to_update.map2(
                lambda a_pair, b_pair: (a_pair[0] + b_pair[0], a_pair[1] + b_pair[1]),
                item)
        # Otherwise it is a single (point, cluster index) pair to accumulate.
        point = item[0]
        index = item[1]
        total, count = list_to_update[index]
        list_to_update[index] = (total + point, count + 1)
        return list_to_update

    # A point built with no argument serves as the zero of the sum.
    point_class = type(centroids[0])
    neutral_list = SList.init(lambda _: (point_class(), 0), len(centroids))
    sums = clusters.reduce(centroids_list_update, neutral_list)
    return sums.map2(
        lambda pair, previous: pair[0] / pair[1] if pair[1] else previous,
        centroids)


def max_dist(pair_a: Tuple[Point_Interface, float], pair_b: Tuple[Point_Interface, float]):
    """
    Return the tuple with the maximum distance.

    :param pair_a: a pair (point, distance)
    :param pair_b: a pair (point, distance)
    :return: ``pair_a`` if its distance is strictly greater, ``pair_b`` otherwise
    """
    if pair_a[1] > pair_b[1]:
        return pair_a
    return pair_b


def k_means_init(input_list: List[Point_Interface], n_cluster: int) -> SList[Point_Interface]:
    """
    K-means++ initialization

    The first centroid is drawn at random; each following centroid is the
    point whose distance to its closest already chosen centroid is the largest.

    :param input_list: a list of points
    :param n_cluster: number of clusters

    :return: list of centroids
    """
    centroids = SList([])
    # Draw one random point per partition, then pick one of those candidates.
    candidates = input_list.get_partition() \
        .map(lambda part: part[random.randint(0, part.length() - 1)]) \
        .to_seq()
    last_pid = list(procs())[-1]
    centroids.append(candidates[random.randint(0, last_pid)])

    for _ in range(n_cluster - 1):
        # dist holds, for every point, the distance to its closest centroid.
        dist = input_list.map(lambda x: x.distance(centroids[0]))
        for i in range(1, len(centroids)):
            # index=i binds the current value (avoids the late-binding closure pitfall).
            temp_dist = input_list.map(lambda x, index=i: x.distance(centroids[index]))
            dist = dist.map2(lambda x, y: y if y < x else x, temp_dist)

        next_centroid = input_list.zip(dist).reduce(max_dist)[0]
        centroids.append(next_centroid)

    return centroids


def k_means(input_list: List[Point_Interface], init_function: Callable[[List, int], List],
            n_cluster: int,
            max_iter: int = 10) -> List[Tuple[Point_Interface, int]]:
    """
    K-means algorithm on a list of points

    :param input_list: a list of points
    :param init_function: a function that initializes centroids
    :param n_cluster: number of clusters
    :param max_iter: number of assignment passes; for values below 1 the points
        are assigned once, to the initial centroids

    :return: a list of tuples with the point and its cluster index
    """
    centroids = init_function(input_list, n_cluster)
    # Assigning before the loop keeps the result defined for any max_iter.
    clusters = assign_clusters(input_list, centroids)
    # The centroids are refreshed only when another assignment follows, so no
    # centroid update is computed and then thrown away after the last pass.
    for _ in range(max_iter - 1):
        centroids = update_centroids(clusters, centroids)
        clusters = assign_clusters(input_list, centroids)
    return clusters


# --- pyske/examples/list/k_means_main.py: execution of k_means ---------------
# (In the repository this script is a separate module that imports k_means and
# k_means_init from pyske.examples.list.k_means.)

PAR = 'parallel'
SEQ = 'sequential'


if __name__ == '__main__':
    # pylint: disable=import-outside-toplevel
    from pyske.core import Timing
    from pyske.examples.list import util

    args = util.k_means_parser().parse_args()
    choice = args.data
    dimensions = args.dimensions

    pyske_list_class = util.select_pyske_list(choice)
    input_list = util.rand_point_list(pyske_list_class, args.size, args.clusters, dimensions)

    timing = Timing()
    execute = util.select_execute(choice)
    execute(lambda: print('Version:\t', choice))
    result = None
    for iteration in range(1, 1 + args.iter):
        timing.start()
        result = k_means(input_list, k_means_init, args.clusters)
        timing.stop()
        util.print_experiment("", timing.get(), execute, iteration)

    # NOTE(review): the nesting of this step was not recoverable from the
    # source; the clusters are displayed once, after the timed runs -- confirm.
    if args.show_clusters and result is not None:
        if dimensions == 2:
            util.print_2D_result(result.to_seq())
        elif dimensions == 3:
            util.print_3D_result(result.to_seq())
# --- pyske/examples/list/util.py: utility functions for PySke examples -------
# (Only the definitions that are fully visible in the change are given here.)
from typing import Tuple

import argparse
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs
from pyske.core import Distribution, SList
from pyske.core.support import parallel
from pyske.core.util.point_2D import Point_2D
from pyske.core.util.point_3D import Point_3D

PAR = 'parallel'
SEQ = 'sequential'
# NOTE(review): unlike the three values below, this one repeats the constant's
# name -- confirm it matches what dot_product_main.py compares against.
_DIRECT = '_DIRECT'
_HAND = 'hand_optimized'
_OPT = 'optimized'
_EVAL = 'evaluated'


def _add_size_arg(parser, default=1_000_000):
    """Add the --size option shared by the example parsers."""
    parser.add_argument("--size", help="size of the list to generate",
                        type=int, default=default)


def _add_iter_arg(parser):
    """Add the --iter option shared by the example parsers."""
    parser.add_argument("--iter", help="number of iterations",
                        type=int, default=30)


def _add_data_arg(parser):
    """Add the --data option shared by the example parsers."""
    parser.add_argument("--data", help="type of data structure",
                        choices=[PAR, SEQ], default=SEQ)


def standard_parser(size_arg=True, iter_arg=True, data_arg=True):
    """
    Parser for standard example.

    :param size_arg: (default True) flag to select argument --size
    :param iter_arg: (default True) flag to select argument --iter
    :param data_arg: (default True) flag to select argument --data
    :return: an argparse parser with the selected options
    """
    parser = argparse.ArgumentParser()
    if size_arg:
        _add_size_arg(parser)
    if iter_arg:
        _add_iter_arg(parser)
    if data_arg:
        _add_data_arg(parser)
    return parser


def k_means_parser():
    """
    Parser for k-means example.

    :return: an argparse parser with options --size, --iter, --data,
        --clusters, --dimensions and --show-clusters
    """
    parser = argparse.ArgumentParser()
    _add_size_arg(parser, default=5_000)
    _add_iter_arg(parser)
    _add_data_arg(parser)
    parser.add_argument("--clusters", help="number of clusters", type=int, default=3)
    parser.add_argument("--dimensions", help="point dimensions", type=int, default=2)
    parser.add_argument("--show-clusters", help="display the clusters graph of 2D or 3D points",
                        action="store_true")
    return parser


def dot_product_parser():
    """
    Parser for dot-product example.

    :return: an argparse parser with options --size, --iter and --test
    """
    parser = argparse.ArgumentParser()
    _add_size_arg(parser)
    _add_iter_arg(parser)
    parser.add_argument("--test", help="choice of the test",
                        choices=[_DIRECT, _HAND, _EVAL, _OPT],
                        default=_DIRECT)
    return parser


def select_point_dimensions(dimensions):
    """
    Return a point class.

    :param dimensions: point dimensions
        Precondition: dimensions >= 2
    :return: Point_3D when ``dimensions`` is 3, Point_2D for any other value
    """
    # Both classes are already imported at module level.
    return Point_3D if dimensions == 3 else Point_2D


def rand_point_list(cls, size, clusters, dimensions):
    """
    Return a randomly generated list of points.

    :param cls: the class of the generated list.
    :param size: a positive number
        Precondition: size >= 0
    :param clusters: number of clusters
    :param dimensions: point dimensions
        Precondition: dimensions >= 2
    :return: a list of the given class
    """
    samples, _ = make_blobs(n_samples=size, centers=clusters, n_features=dimensions)
    point_class = select_point_dimensions(dimensions)
    points = [point_class(*coordinates) for coordinates in samples.tolist()]
    distr = Distribution().balanced(size)
    return cls.from_seq(points).distribute(distr)


def print_2D_result(clusters_list: SList[Tuple[Point_2D, int]]):
    """
    Display the result of k-means clustering on 2 dimension points.

    The scatter plot is only drawn when ``parallel.PID`` is 0.

    :param clusters_list: a list of pairs (point, cluster index); the cluster
        index is used as the colour of the point
    """
    if parallel.PID == 0:
        x = clusters_list.map(lambda pair: pair[0].x)
        y = clusters_list.map(lambda pair: pair[0].y)
        colors = clusters_list.map(lambda pair: pair[1])
        plt.scatter(x, y, c=colors)
        plt.show()


def print_3D_result(clusters_list: SList[Tuple[Point_3D, int]]):
    """
    Display the result of k-means clustering on 3 dimension points.

    The scatter plot is only drawn when ``parallel.PID`` is 0.

    :param clusters_list: a list of pairs (point, cluster index); the cluster
        index is used as the colour of the point
    """
    if parallel.PID == 0:
        x = clusters_list.map(lambda pair: pair[0].x)
        y = clusters_list.map(lambda pair: pair[0].y)
        z = clusters_list.map(lambda pair: pair[0].z)
        colors = clusters_list.map(lambda pair: pair[1])

        # Plot the result in 3D
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')  # 3D axes
        ax.scatter(x, y, z, label='Courbe', marker='d', c=colors)  # draw the 3D points
        plt.title("Points 3D")
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')
        plt.tight_layout()
        plt.show()
print_experiment(result, timing, execute, iteration=None): """