From f7a64c3c47f1793a174ab1f9a48269cb78b3537f Mon Sep 17 00:00:00 2001 From: TheovanKraay Date: Mon, 13 Nov 2017 13:50:44 +0000 Subject: [PATCH 1/6] Create Optimise I propose some functions that extend capability for optimisation of markov chain modelling using clustering. These are all extensions of existing functions that work with k-means clustering in the clickstream package. --- R/Optimise | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 R/Optimise diff --git a/R/Optimise b/R/Optimise new file mode 100644 index 0000000..e4b9e73 --- /dev/null +++ b/R/Optimise @@ -0,0 +1,156 @@ +#' generates a list of markov chains from a given set of clusters +#' +#' @export +#' @description the purpose of this function is to generate pre-computed markov chain objects from clusters of clickstreams. +#' @param clusters The clusters from which to generate markov chain objects +#' @param order The order for the markov chain + +fitMarkovChains =function(clusters, order=1) { + markovchains <- NULL + for (i in clusters[[1]]){ + mc <- fitMarkovChain(i, order = order) + markovchains <- append(markovchains, mc) + } + return(markovchains) +} + + +#' generates the optimal markov chains from a list of markov chains and corresponding clusters +#' +#' @export +#' @description the purpose of this function is to predict from a pattern using pre-computed markov chains and corresponding clusters. The markov chain corresponding with the cluster that is the best fit to the prediction value is used. 
+#' @param startPattern The pattern object to be used +#' @param markovchains The pre-computed markov chains generated from a set of clusters +#' @param clusters The corresponding clusters (should be in the corresponding order as the markov chains) +#' @examples +#' +#' training <- c("User1,h,c,c,p,c,h,c,p,p,c,p,p,o", +#' "User2,i,c,i,c,c,c,d", +#' "User3,h,i,c,i,c,p,c,c,p,c,c,i,d", +#' "User4,c,c,p,c,d") +#' +#' test <- c("User1,h,c,c,p,p,h,c,p,p,c,p,p,o", +#' "User2,i,c,i,c,c,c,d", +#' "User4,c,c,c,c,d") +#' +#' csf <- tempfile() +#' writeLines(training, csf) +#' trainingCLS <- readClickstreams(csf, header = TRUE) +#' +#' csf <- tempfile() +#' writeLines(test, csf) +#' testCLS <- readClickstreams(csf, header = TRUE) +#' +#' clusters <- clusterClickstreams(trainingCLS, centers = 2) +#' markovchains <- fitMarkovChains(clusters, order = 1) +#' startPattern <- new("Pattern", sequence = c("c")) +#' mc <- getOptimalMarkovChain(startPattern, markovchains, clusters) +#' predict(mc, startPattern) + +getOptimalMarkovChain =function(startPattern, markovchains, clusters) { + markovchainIndex <- predict(clusters, startPattern) + optimalPreComputedChain <- markovchains[[markovchainIndex]] + return(optimalPreComputedChain) +} + +#' generates an optimal set of clusters for a clickstream based on certain constraints. +#' +#' @export +#' @description this is an experimental function which allows clustering based on targetting an average figure for next click probabilities derived when fitting each cluster to a markov chain. The user can either +#' @param cls The clickstream +#' @param maxIterations number of times to iterate (repeat) through the k-means clustering. +#' @param optimalProbMean The target average probability of each next page click prediction in a 1st order markov chain +#' @param range the range above the optimal probability to target. 
+#' @param clusterCentresRange the additional cluster centres to evaluate +#' @param divisor the number by which to divide the total number of clicks (determines how many cluster centres to start with) +#' @param takeHighest determines whether to default to the highest mean next click probability, or error if the target is not reached after the given number of k-means iterations +#' @param order The order for markov chains that will be used to evaluate each cluster +#' +#' @examples +#' clickstreams <- c("User1,h,c,c,p,c,h,c,p,p,c,p,p,o", +#' "User2,i,c,i,c,c,c,d", +#' "User3,h,i,c,i,c,p,c,c,p,c,c,i,d", +#' "User4,c,c,p,c,d", +#' "User5,h,c,c,p,p,c,p,p,p,i,p,o", +#' "User7,i,h,c,c,p,p,c,p,c,d", +#' "User8,i,h,c,c,p,p,c,p,c,d", +#' "User9,i,h,c,c,p,p,c,p,c,d", +#' "User10,i,h,c,c,p,p,c,p,c,d", +#' "User11,i,h,c,c,p,p,c,p,c,d,z") +#' +#' csf <- tempfile() +#' writeLines(clickstreams, csf) +#' cls <- readClickstreams(csf, header = TRUE) +#' cls +#' +#' cluster <- getOptimalClusters(cls, centresMax = 3, order = 1) +#' markovchains <- fitMarkovChains(cluster) +#' startPattern <- new("Pattern", sequence = c("H")) +#' predictFromMarkovChains(startPattern,markovchains,cluster) + +getOptimalClusters = function(cls, maxIterations=10, optimalProbMean=0.60, range=0.055, divisor=1000, order=1, clusterCentresRange=0, takeHighest=TRUE){ + vectorOfClickstreamLengths <- NULL + for(i in cls){ + vectorOfClickstreamLengths <- append(vectorOfClickstreamLengths, length(i)) + } + clicks <- sum(vectorOfClickstreamLengths) + clicks <- clicks/divisor + clicks <- floor(clicks) + centresMin <- clicks + vec<-unlist(cls) + dedupe <- vec[which(!duplicated(vec))] + if (centresMin > length(dedupe)){ + centresMin = length(dedupe) + } + centresMax <- centresMin + clusterCentresRange + clusterOfClusters <- list() + clusterCentres <- centresMin:centresMax + iterations <- 1:maxIterations + vectorOfAllProbsMeans <-NULL + limit <- optimalProbMean + range + print(optimalProbMean) + print(limit) + for (i 
in iterations){ + for (c in clusterCentres){ + print(c) + clusters <- clusterClickstreams(cls, centers = c) + markovchains <- fitMarkovChains(clusters, order = order) + vectorOfProbs <-NULL + for (d in dedupe){ + if(d !="Defer"){ + value <- d[[1]] + startPattern <- new("Pattern", sequence = c(value)) + mc <- getOptimalMarkovChain(startPattern,markovchains,clusters) + prob <- predict(mc, startPattern) + vectorOfProbs <- append(vectorOfProbs, prob@probability) + } + } + vectorOfAllProbsMeans <- append(vectorOfAllProbsMeans, mean(vectorOfProbs)) + clusterOfClusters <- list.append(clusterOfClusters, clusters) + } + print(vectorOfAllProbsMeans) + indexOfHighestProbs <- which(vectorOfAllProbsMeans>optimalProbMean & vectorOfAllProbsMeans < limit) + print(indexOfHighestProbs) + #return the best set of clusters + if (takeHighest != TRUE){ + if (length(indexOfHighestProbs) > 0){ + break + } + } + } + if (takeHighest != TRUE){ + if (length(indexOfHighestProbs) > 0){ + return(clusterOfClusters[[indexOfHighestProbs]]) + } + else{ + stop(("target prediction accuracy was not reached with the given number of iterations")) + } + } + else{ + if (length(indexOfHighestProbs) == 0){ + warning("target prediction accuracy was not reached with the given number of iterations. 
Taking highest probability mean") + } + indexOfHighestProbs <- which(vectorOfAllProbsMeans==max(vectorOfAllProbsMeans)) + return(clusterOfClusters[[indexOfHighestProbs]]) + } +} From 9d62810f4548429e013dd0c23651fdcdc6b80a9a Mon Sep 17 00:00:00 2001 From: TheovanKraay Date: Wed, 15 Nov 2017 18:05:03 +0000 Subject: [PATCH 2/6] Update Optimise --- R/Optimise | 70 +++++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/R/Optimise b/R/Optimise index e4b9e73..c73373a 100644 --- a/R/Optimise +++ b/R/Optimise @@ -56,7 +56,7 @@ getOptimalMarkovChain =function(startPattern, markovchains, clusters) { #' generates an optimal set of clusters for a clickstream based on certain constraints. #' #' @export -#' @description this is an experimental function which allows clustering based on targetting an average figure for next click probabilities derived when fitting each cluster to a markov chain. The user can either +#' @description this is an experimental function which allows consensus clustering based on targetting an average figure for next click probabilities derived when fitting each cluster to a markov chain. The user can either #' @param cls The clickstream #' @param maxIterations number of times to iterate (repeat) through the k-means clustering. 
#' @param optimalProbMean The target average probability of each next page click prediction in a 1st order markov chain @@ -83,27 +83,18 @@ getOptimalMarkovChain =function(startPattern, markovchains, clusters) { #' cls <- readClickstreams(csf, header = TRUE) #' cls #' -#' cluster <- getOptimalClusters(cls, centresMax = 3, order = 1) -#' markovchains <- fitMarkovChains(cluster) -#' startPattern <- new("Pattern", sequence = c("H")) -#' predictFromMarkovChains(startPattern,markovchains,cluster) +#' clusters <- getConsensusClusters(cls, centresMax = 3, order = 1) +#' markovchains <- fitMarkovChains(clusters) +#' startPattern <- new("Pattern", sequence = c("h")) +#' mc <- getOptimalMarkovChain(startPattern, markovchains, clusters) +#' predict(mc, startPattern) -getOptimalClusters = function(cls, maxIterations=10, optimalProbMean=0.60, range=0.055, divisor=1000, order=1, clusterCentresRange=0, takeHighest=TRUE){ - vectorOfClickstreamLengths <- NULL - for(i in cls){ - vectorOfClickstreamLengths <- append(vectorOfClickstreamLengths, length(i)) - } - clicks <- sum(vectorOfClickstreamLengths) - clicks <- clicks/divisor - clicks <- floor(clicks) - centresMin <- clicks +getConsensusClusters = function(trainingCLS, testCLS, maxIterations=10, optimalProbMean=0.40, range=0.10, centresMin=2, order=1, clusterCentresRange=0, takeHighest=TRUE){ + cls <- trainingCLS vec<-unlist(cls) dedupe <- vec[which(!duplicated(vec))] - if (centresMin > length(dedupe)){ - centresMin = length(dedupe) - } centresMax <- centresMin + clusterCentresRange - clusterOfClusters <- list() + listOfClusters <- list() clusterCentres <- centresMin:centresMax iterations <- 1:maxIterations vectorOfAllProbsMeans <-NULL @@ -116,6 +107,7 @@ getOptimalClusters = function(cls, maxIterations=10, optimalProbMean=0.60, range clusters <- clusterClickstreams(cls, centers = c) markovchains <- fitMarkovChains(clusters, order = order) vectorOfProbs <-NULL + print("starting next page probability aggregation....") for (d in 
dedupe){ if(d !="Defer"){ value <- d[[1]] @@ -126,31 +118,43 @@ getOptimalClusters = function(cls, maxIterations=10, optimalProbMean=0.60, range } } vectorOfAllProbsMeans <- append(vectorOfAllProbsMeans, mean(vectorOfProbs)) - clusterOfClusters <- list.append(clusterOfClusters, clusters) + listOfClusters <- list.append(listOfClusters, clusters) } print(vectorOfAllProbsMeans) - indexOfHighestProbs <- which(vectorOfAllProbsMeans>optimalProbMean & vectorOfAllProbsMeans < limit) - print(indexOfHighestProbs) - #return the best set of clusters - if (takeHighest != TRUE){ - if (length(indexOfHighestProbs) > 0){ - break - } - } + candidates <- which(vectorOfAllProbsMeans>optimalProbMean & vectorOfAllProbsMeans < limit) + cat("candidates are: ",candidates,"\n") } if (takeHighest != TRUE){ - if (length(indexOfHighestProbs) > 0){ - return(clusterOfClusters[[indexOfHighestProbs]]) + if (length(candidates) > 0){ + #get the candidate clusters into a vector + candidateClusters <- list() + for (i in candidates){ + cat("i is: ",i,"\n") + clusters <- listOfClusters[[i]] + candidateClusters <- list.append(candidateClusters,clusters) + } + print("Evaluating candidate models.....") + vec_variances <- NULL + for(c in candidateClusters){ + markovchains <- fitMarkovChains(c) + variance <- mcEvaluateAllClusters(markovchains,c,testCLS,trainingCLS,returnChiSquareOnly = TRUE) + cat("variance is....",variance,"\n") + vec_variances <- append(vec_variances,variance) + } + cat("vec_variances is: ",vec_variances,"\n") + winner <- which(vec_variances==min(vec_variances)) + cat("winner is: ",winner,"\n") + return(candidateClusters[[winner]]) } else{ - stop(("target prediction accuracy was not reached with the given number of iterations")) + stop(("target range was not reached with the given number of iterations")) } } else{ - if (length(indexOfHighestProbs) == 0){ + if (length(candidates) == 0){ warning("target prediction accuracy was not reached with the given number of iterations. 
Taking highest probability mean") } - indexOfHighestProbs <- which(vectorOfAllProbsMeans==max(vectorOfAllProbsMeans)) - return(clusterOfClusters[[indexOfHighestProbs]]) + candidates <- which(vectorOfAllProbsMeans==max(vectorOfAllProbsMeans)) + return(listOfClusters[[candidates]]) } } From 9d24f504b026f71e76eae02f55078d16667e8ba8 Mon Sep 17 00:00:00 2001 From: TheovanKraay Date: Wed, 15 Nov 2017 18:50:30 +0000 Subject: [PATCH 3/6] Update Optimise --- R/Optimise | 1 - 1 file changed, 1 deletion(-) diff --git a/R/Optimise b/R/Optimise index c73373a..6597040 100644 --- a/R/Optimise +++ b/R/Optimise @@ -129,7 +129,6 @@ getConsensusClusters = function(trainingCLS, testCLS, maxIterations=10, optimalP #get the candidate clusters into a vector candidateClusters <- list() for (i in candidates){ - cat("i is: ",i,"\n") clusters <- listOfClusters[[i]] candidateClusters <- list.append(candidateClusters,clusters) } From 5f00c014942b0d2c7be8b01a6686e72bb87a7a43 Mon Sep 17 00:00:00 2001 From: TheovanKraay Date: Fri, 17 Nov 2017 21:36:33 +0000 Subject: [PATCH 4/6] Update Optimise --- R/Optimise | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/Optimise b/R/Optimise index 6597040..7c5fe7e 100644 --- a/R/Optimise +++ b/R/Optimise @@ -56,13 +56,13 @@ getOptimalMarkovChain =function(startPattern, markovchains, clusters) { #' generates an optimal set of clusters for a clickstream based on certain constraints. #' #' @export -#' @description this is an experimental function which allows consensus clustering based on targetting an average figure for next click probabilities derived when fitting each cluster to a markov chain. The user can either +#' @description this is an experimental function for a consensus clustering algorithm based on targetting a range of average next state probabilities derived when fitting each cluster to a markov chain. 
#' @param cls The clickstream #' @param maxIterations number of times to iterate (repeat) through the k-means clustering. #' @param optimalProbMean The target average probability of each next page click prediction in a 1st order markov chain #' @param range the range above the optimal probability to target. +#' @param centresMin the minimum cluster centres to evaluate #' @param clusterCentresRange the additional cluster centres to evaluate -#' @param divisor the number by which to divide the total number of clicks (determines how many cluster centres to start with) #' @param takeHighest determines whether to default to the highest mean next click probability, or error if the target is not reached after the given number of k-means iterations #' @param order The order for markov chains that will be used to evaluate each cluster #' @@ -83,13 +83,13 @@ getOptimalMarkovChain =function(startPattern, markovchains, clusters) { #' cls <- readClickstreams(csf, header = TRUE) #' cls #' -#' clusters <- getConsensusClusters(cls, centresMax = 3, order = 1) +#' clusters <- getConsensusClusters(trainingCLS, testCLS, maxIterations=20, optimalProbMean=0.50, range = 0.40, centresMin = 2, clusterCentresRange = 4, order = 1, takeHighest=FALSE) #' markovchains <- fitMarkovChains(clusters) #' startPattern <- new("Pattern", sequence = c("h")) #' mc <- getOptimalMarkovChain(startPattern, markovchains, clusters) #' predict(mc, startPattern) -getConsensusClusters = function(trainingCLS, testCLS, maxIterations=10, optimalProbMean=0.40, range=0.10, centresMin=2, order=1, clusterCentresRange=0, takeHighest=TRUE){ +getConsensusClusters = function(trainingCLS, testCLS, maxIterations=10, optimalProbMean=0.50, range=0.30, centresMin=2, order=1, clusterCentresRange=0, takeHighest=FALSE){ cls <- trainingCLS vec<-unlist(cls) dedupe <- vec[which(!duplicated(vec))] @@ -132,7 +132,7 @@ getConsensusClusters = function(trainingCLS, testCLS, maxIterations=10, optimalP clusters <- listOfClusters[[i]] 
candidateClusters <- list.append(candidateClusters,clusters) } - print("Evaluating candidate models.....") + print("Evaluating candidates.....") vec_variances <- NULL for(c in candidateClusters){ markovchains <- fitMarkovChains(c) @@ -140,7 +140,7 @@ getConsensusClusters = function(trainingCLS, testCLS, maxIterations=10, optimalP cat("variance is....",variance,"\n") vec_variances <- append(vec_variances,variance) } - cat("vec_variances is: ",vec_variances,"\n") + cat("vector of variances is: ",vec_variances,"\n") winner <- which(vec_variances==min(vec_variances)) cat("winner is: ",winner,"\n") return(candidateClusters[[winner]]) @@ -151,7 +151,7 @@ getConsensusClusters = function(trainingCLS, testCLS, maxIterations=10, optimalP } else{ if (length(candidates) == 0){ - warning("target prediction accuracy was not reached with the given number of iterations. Taking highest probability mean") + warning("target range was not reached with the given number of iterations. Taking highest probability mean") } candidates <- which(vectorOfAllProbsMeans==max(vectorOfAllProbsMeans)) return(listOfClusters[[candidates]]) From 893560367c9ce09c209718f282e8e89c96333664 Mon Sep 17 00:00:00 2001 From: TheovanKraay Date: Fri, 24 Nov 2017 12:31:30 +0000 Subject: [PATCH 5/6] Update Optimise Added a parallelized version of getConsensusClusters, and added function getParallelClusterSets (which performs the discrete function of clustering a clickstream, fitting markov chains to each cluster, and returning a list of the cluster-markovchain pair). Depends on the parallel package. 
--- R/Optimise | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 3 deletions(-) diff --git a/R/Optimise b/R/Optimise index 7c5fe7e..1c90527 100644 --- a/R/Optimise +++ b/R/Optimise @@ -103,7 +103,6 @@ getConsensusClusters = function(trainingCLS, testCLS, maxIterations=10, optimalP print(limit) for (i in iterations){ for (c in clusterCentres){ - print(c) clusters <- clusterClickstreams(cls, centers = c) markovchains <- fitMarkovChains(clusters, order = order) vectorOfProbs <-NULL @@ -141,7 +140,7 @@ getConsensusClusters = function(trainingCLS, testCLS, maxIterations=10, optimalP vec_variances <- append(vec_variances,variance) } cat("vector of variances is: ",vec_variances,"\n") - winner <- which(vec_variances==min(vec_variances)) + winner <- which.min(vec_variances) cat("winner is: ",winner,"\n") return(candidateClusters[[winner]]) } @@ -151,7 +150,172 @@ getConsensusClusters = function(trainingCLS, testCLS, maxIterations=10, optimalP } else{ if (length(candidates) == 0){ - warning("target range was not reached with the given number of iterations. Taking highest probability mean") + warning("target prediction accuracy was not reached with the given number of iterations. 
Taking highest probability mean") + } + candidates <- which(vectorOfAllProbsMeans==max(vectorOfAllProbsMeans)) + return(listOfClusters[[candidates]]) + } +} + +getParallelClusterSets = function(trainingCLS, maxIterations,centres){ + mkWorker <- function(centres) { + fitMarkovChains =function(clusters, order=1) { + markovchains <- NULL + for (i in clusters[[1]]){ + mc <- fitMarkovChain(i, order = order) + markovchains <- append(markovchains, mc) + } + return(markovchains) + } + force(centres) + worker <- function(cls) { + clusterChainPair <- list() + clusters <- clusterClickstreams(clickstreamList = cls,centers=centres) + clusterChainPair <- list.append(clusterChainPair, clusters) + mc <- fitMarkovChains(clusters) + clusterChainPair <- list.append(clusterChainPair, mc) + return (clusterChainPair) + } + return(worker) + } + + ListOfclickstreams <- list() + for (i in maxIterations){ + ListOfclickstreams <- list.append(ListOfclickstreams, trainingCLS) + } + parallelCluster <- parallel::makeCluster(parallel::detectCores()) + clusterEvalQ(parallelCluster, library(clickstream)) + clusterEvalQ(parallelCluster, library(stringr)) + clusterEvalQ(parallelCluster, library(rlist)) + clusterEvalQ(parallelCluster, library(stringi)) + clusterEvalQ(parallelCluster, library(plyr)) + clusterEvalQ(parallelCluster, library(methods)) + clusterEvalQ(parallelCluster, library(igraph)) + clusterEvalQ(parallelCluster, library(stats)) + clusterEvalQ(parallelCluster, library(utils)) + clusterEvalQ(parallelCluster, library(reshape2)) + clusterEvalQ(parallelCluster, library(Rsolnp)) + clusterEvalQ(parallelCluster, library(linprog)) + clusterEvalQ(parallelCluster, library(ggplot2)) + clusterEvalQ(parallelCluster, library(ClickClust)) + setOfclusterSets <- list() + print(centres) + for (c in centres){ + clusters <- parallel::parLapply(parallelCluster,ListOfclickstreams,mkWorker(c)) + setOfclusterSets <- list.append(setOfclusterSets,clusters) + } + if(!is.null(parallelCluster)) { + 
parallel::stopCluster(parallelCluster) + parallelCluster <- c() + } + return (setOfclusterSets) +} + +#' generates an optimal set of clusters for a clickstream based on certain constraints. This version parallelises k-means and fitToMarkovChain operations across computer cores, and depends on the parallel() library to function. +#' +#' @export +#' @description this is an experimental function for a consensus clustering algorithm based on targetting a range of average next state probabilities derived when fitting each cluster to a markov chain. +#' @param cls The clickstream +#' @param maxIterations number of times to iterate (repeat) through the k-means clustering. +#' @param optimalProbMean The target average probability of each next page click prediction in a 1st order markov chain +#' @param range the range above the optimal probability to target. +#' @param centresMin the minimum cluster centres to evaluate +#' @param clusterCentresRange the additional cluster centres to evaluate +#' @param takeHighest determines whether to default to the highest mean next click probability, or error if the target is not reached after the given number of k-means iterations +#' @param order The order for markov chains that will be used to evaluate each cluster +#' +#' @examples +#' clickstreams <- c("User1,h,c,c,p,c,h,c,p,p,c,p,p,o", +#' "User2,i,c,i,c,c,c,d", +#' "User3,h,i,c,i,c,p,c,c,p,c,c,i,d", +#' "User4,c,c,p,c,d", +#' "User5,h,c,c,p,p,c,p,p,p,i,p,o", +#' "User7,i,h,c,c,p,p,c,p,c,d", +#' "User8,i,h,c,c,p,p,c,p,c,d", +#' "User9,i,h,c,c,p,p,c,p,c,d", +#' "User10,i,h,c,c,p,p,c,p,c,d", +#' "User11,i,h,c,c,p,p,c,p,c,d,z") +#' +#' csf <- tempfile() +#' writeLines(clickstreams, csf) +#' cls <- readClickstreams(csf, header = TRUE) +#' cls +#' +#' clusters <- getConsensusClusters(trainingCLS, testCLS, maxIterations=20, optimalProbMean=0.50, range = 0.40, centresMin = 2, clusterCentresRange = 4, order = 1, takeHighest=FALSE) +#' markovchains <- fitMarkovChains(clusters) +#' 
startPattern <- new("Pattern", sequence = c("h")) +#' mc <- getOptimalMarkovChain(startPattern, markovchains, clusters) +#' predict(mc, startPattern) +#' +getConsensusClustersParallel = function(trainingCLS, testCLS, maxIterations=10, optimalProbMean=0.50, range=0.30, centresMin=2, order=1, clusterCentresRange=0, takeHighest=FALSE){ + cls <- trainingCLS + vec<-unlist(cls) + dedupe <- vec[which(!duplicated(vec))] + centresMax <- centresMin + clusterCentresRange + listOfClusters <- list() + clusterCentres <- centresMin:centresMax + iterations <- 1:maxIterations + vectorOfAllProbsMeans <-NULL + limit <- optimalProbMean + range + print(optimalProbMean) + print(limit) + + print("getting cluster sets in parallel....") + clusterSets <- getParallelClusterSets(trainingCLS, iterations, centres=clusterCentres) + + print("starting next page probability aggregation....") + for (i in clusterSets){ + for (c in i){ + clusters <- c[[1]] + markovchains <- c[[2]] + vectorOfProbs <-NULL + for (d in dedupe){ + if(d !="Defer"){ + value <- d[[1]] + startPattern <- new("Pattern", sequence = c(value)) + mc <- getOptimalMarkovChain(startPattern,markovchains,clusters) + prob <- predict(mc, startPattern) + vectorOfProbs <- append(vectorOfProbs, prob@probability) + } + } + vectorOfAllProbsMeans <- append(vectorOfAllProbsMeans, mean(vectorOfProbs)) + listOfClusters <- list.append(listOfClusters, clusters) + } + print(vectorOfAllProbsMeans) + candidates <- which(vectorOfAllProbsMeans>optimalProbMean & vectorOfAllProbsMeans < limit) + cat("candidates are: ",candidates,"\n") + # Shutdown cluster neatly + } + print("finished next page probability aggregation....") + if (takeHighest != TRUE){ + if (length(candidates) > 0){ + #get the candidate clusters into a vector + candidateClusters <- list() + for (i in candidates){ + clusters <- listOfClusters[[i]] + candidateClusters <- list.append(candidateClusters,clusters) + } + print("Evaluating candidates.....") + vec_variances <- NULL + for(c in 
candidateClusters){ + markovchains <- fitMarkovChains(c) + variance <- mcEvaluateAllClusters(markovchains,c,testCLS,trainingCLS,returnChiSquareOnly = TRUE) + cat("variance is....",variance,"\n") + vec_variances <- append(vec_variances,variance) + } + cat("vector of variances is: ",vec_variances,"\n") + #winner <- which(vec_variances==min(vec_variances)) + winner <- which.min(vec_variances) + cat("winner is: ",winner,"\n") + return(candidateClusters[[winner]]) + } + else{ + stop(("target range was not reached with the given number of iterations")) + } + } + else{ + if (length(candidates) == 0){ + warning("target prediction accuracy was not reached with the given number of iterations. Taking highest probability mean") } candidates <- which(vectorOfAllProbsMeans==max(vectorOfAllProbsMeans)) return(listOfClusters[[candidates]]) From cfbc671af1c2ca9ad189c99d573c3f04fd286de1 Mon Sep 17 00:00:00 2001 From: TheovanKraay Date: Wed, 29 Nov 2017 10:31:01 +0000 Subject: [PATCH 6/6] Rename Optimise to Optimise.R --- R/{Optimise => Optimise.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{Optimise => Optimise.R} (100%) diff --git a/R/Optimise b/R/Optimise.R similarity index 100% rename from R/Optimise rename to R/Optimise.R