Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
335 changes: 335 additions & 0 deletions R/Test.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,335 @@
#' evaluates the number of occurances of predicted next click vs total number of starting pattern occurances
#' in a given clickstream. The predicted next click can be from a markov chain of any order.
#' @export
#' @param mc a markovchain object (this should have been built from a set of training data)
#' @param startPattern the starting pattern we want to predict next click on, and evaluate observed occurances in test data.
#' @param testCLS clickstream object with test data
#' @examples
#' training <- c("User1,h,c,c,p,c,h,c,p,p,c,p,p,o",
#' "User2,i,c,i,c,c,c,d",
#' "User3,h,i,c,i,c,p,c,c,p,c,c,i,d",
#' "User4,c,c,p,c,d")
#'
#' test <- c("User1,h,h,h,h,c,c,p,p,h,c,p,p,c,p,p,o",
#' "User2,i,c,i,c,c,c,d",
#' "User4,c,c,c,c,d,c,c,c,c")
#'
#' csf <- tempfile()
#' writeLines(training, csf)
#' trainingCLS <- readClickstreams(csf, header = TRUE)
#'
#' csf <- tempfile()
#' writeLines(test, csf)
#' testCLS <- readClickstreams(csf, header = TRUE)
#'
#' mc <- fitMarkovChain(trainingCLS, order = 1)
#' startPattern <- new("Pattern", sequence = c("c","c"))
#' res <- mcEvaluate(mc, startPattern, testCLS)
#' res

mcEvaluate = function(mc, startPattern, testCLS){
setClass(
"Result",
representation(
totalclicks = "numeric",
observed = "numeric",
expected = "numeric",
residual = "numeric",
residualSquared = "numeric",
component = "numeric",
predictedNextClick = "character",
patternSequence = "character",
probability = "numeric"
)
)
patternMatch <- function(p,s) {
gg <- gregexpr(paste0("(?=",p,")"),s,perl=TRUE)[[1]]
if (length(gg)==1 && gg==-1) 0 else length(gg)
}
pred <- predict(mc, startPattern)
expec <- pred@probability
predicted <- as.character(pred@sequence)
vec_totalPages <- NULL
vec_observed <- NULL
for (i in testCLS){

clicks <- paste(i, collapse = ",")
pattern <- paste(startPattern@sequence, collapse = ",")
pattern <- as.character(pattern)
patternchar <- paste(pattern, ",", sep = "")
totalPages <- patternMatch(patternchar, clicks)
vec_totalPages <- append(vec_totalPages, totalPages)
expectedSeq <- paste(pattern, predicted, sep = ",")
obs <- patternMatch(expectedSeq, clicks)
vec_observed <- append(vec_observed, obs)
}
TotalExpectded <- sum(vec_totalPages) * expec
observed = sum(vec_observed)
expected = TotalExpectded
residual = observed-expected
residualSquared = residual^2
component = residualSquared/expected
result = new(
"Result", expected = expected, observed = observed, residual = residual,
residualSquared = residualSquared, component = component,
predictedNextClick = predicted, patternSequence = pattern, totalclicks = sum(vec_totalPages),
probability = expec
)
return(result)
}

getNOrderPatterns = function(trainingCLS, order){
secondOrder = function(patternlist = NULL){
if (is.null(patternlist)){
vec<-unlist(trainingCLS)
dedupe <- vec[which(!duplicated(vec))]
}
else{
dedupe <- patternlist
}
listPatterns <- list()
OuterlistPatterns <- list()
for (a in dedupe){
value <- a[[1]]
vecPatternsList <- list()
for(b in dedupe){
vecPattern <- append(value,b)
vecPatternsList <- list.append(vecPatternsList, vecPattern)
}
OuterlistPatterns <- list.append(OuterlistPatterns, vecPatternsList)
}
OuterlistPatterns <- unlist(OuterlistPatterns, recursive = FALSE)
}
for(i in 2:order){
if(i>2){
patternlist <- secondOrder(patternlist=patternlist)
}
else{
patternlist <- secondOrder()
}
}
return(patternlist)
}

#' evaluates all next page clicks in a clickstream training data set against the test data. Handles higher order by cycling through every possible pattern permutation (uses getNOrderPatterns function to get patterns). Produces a report of observed and expected values in a matrix
#'
#' @export
#' @param mc a markovchain object that corresponds to a list of clusters
#' @param trainingCLS clickstream object with training data (this should be the data used to build the markov chain object)
#' @param testCLS clickstream object with test data
#' @param mc the markov chain against which to compare the clickstream data
#' @examples
#' training <- c("User1,h,c,c,p,c,h,c,p,p,c,p,p",
#' "User2,i,c,i,c,c,c,d")
#'
#' test <- c("User1,h,c,c,p,c,h,c,d,p,c,d,p",
#' "User2,i,c,i,p,c,c,d")
#'
#' csf <- tempfile()
#' writeLines(training, csf)
#' trainingCLS <- readClickstreams(csf, header = TRUE)
#'
#' csf <- tempfile()
#' writeLines(test, csf)
#' testCLS <- readClickstreams(csf, header = TRUE)
#'
#' mc <- fitMarkovChain(trainingCLS, order = 2)
#' mcEvaluateAll(mc, testCLS, trainingCLS)


mcEvaluateAll = function(mc, testCLS, trainingCLS, includeChiSquare = TRUE, returnChiSquareOnly = FALSE){
results <- data.frame( "totalclicks" = character(), "observed" = character(), "expected" = character(), "residual" = character(), "residualSquared" = character(), "component" = numeric(), "predictedNextClick" = character(), "patternSequence" = character(), "probability" = character(), stringsAsFactors=FALSE)
if(mc@order==1){
vec<-unlist(trainingCLS)
dedupe <- vec[which(!duplicated(vec))]
}
else{
dedupe <-getNOrderPatterns(trainingCLS, order = mc@order)
}
for (d in dedupe){
if(mc@order==1){
value <- d[[1]]
}
else{
value <- d
}
startPattern <- new("Pattern", sequence = c(value))
res <- mcEvaluate(mc, startPattern, testCLS)
if (res@totalclicks != 0 && res@expected > 0){
vec_results <- c(res@totalclicks, res@observed, res@expected, res@residual, res@residualSquared, res@component, res@patternSequence, res@predictedNextClick,res@probability)
results[nrow(results) + 1, ] <- c(res@totalclicks, res@observed, res@expected, res@residual, res@residualSquared, res@component, res@predictedNextClick, res@patternSequence, res@probability)
}
}
ChiSquare <- sum(as.numeric(results$component))
if (includeChiSquare == TRUE){
results[nrow(results) + 1, ] <- c(0, 0, 0, 0, "variance:", ChiSquare, 0, 0, 0)
}
if (returnChiSquareOnly == TRUE){
results <- ChiSquare
}
return(results)
}


#' evaluates all next page clicks in a clickstream training data set against the test data on the basis of a set of pre-computed Markov chains and corresponding clusters. Handles higher order by cycling through every possible pattern permutation (uses getNOrderPatterns function to get patterns). Produces and produces a report of observed and expected values in a matrix
#'
#' @export
#' @param markovchains a list of pre-computed markovchain objects that correspond to a list of clusters
#' @param clusters a clist of clusters
#' @param trainingCLS clickstream object with training data (this should be the data used to build the markov chains objects)
#' @param testCLS clickstream object with test data
#' @param mc the markov chain against which to compare the clickstream data
#' @examples
#' training <- c("User1,h,c,c,p,c,h,c,h,o,p,p,c,p,p,o",
#' "User2,i,c,i,c,c,c,o,o,o,i,d",
#' "User3,h,i,c,i,c,o,i,p,c,c,p,c,c,i,d",
#' "User4,c,c,p,c,d,o,i,h,o,o")
#'
#' test <- c("User1,h,c,c,p,p,h,o,i,c,p,p,c,p,p,o",
#' "User2,i,c,i,c,c,c,d",
#' "User4,c,c,c,c,d")
#'
#' csf <- tempfile()
#' writeLines(training, csf)
#' trainingCLS <- readClickstreams(csf, header = TRUE)
#'
#' csf <- tempfile()
#' writeLines(test, csf)
#' testCLS <- readClickstreams(csf, header = TRUE)
#'
#' clusters <- clusterClickstreams(trainingCLS, centers = 2, order = 1)
#' markovchains <- fitMarkovChains(clusters, order = 2)
#' mcEvaluateAllClusters(markovchains, clusters, testCLS, trainingCLS)

mcEvaluateAllClusters = function(markovchains, clusters, testCLS, trainingCLS, includeChiSquare = TRUE, returnChiSquareOnly = FALSE){
results <- data.frame( "totalclicks" = character(), "observed" = character(), "expected" = character(), "residual" = character(), "residualSquared" = character(), "component" = numeric(), "predictedNextClick" = character(), "patternSequence" = character(), "probability" = character(), stringsAsFactors=FALSE)
if(markovchains[[1]]@order==1){
vec<-unlist(trainingCLS)
dedupe <- vec[which(!duplicated(vec))]
}
else{
dedupe <-getNOrderPatterns(trainingCLS, order = markovchains[[1]]@order)
}
for (d in dedupe){
if(markovchains[[1]]@order==1){
value <- d[[1]]
}
else{
value <- d
}
startPattern <- new("Pattern", sequence = c(value))
mc <- getOptimalMarkovChain(startPattern,markovchains,clusters)
res <- mcEvaluate(mc, startPattern, testCLS)
if (res@totalclicks != 0){
vec_results <- c(res@totalclicks, res@observed, res@expected, res@residual, res@residualSquared, res@component, res@patternSequence, res@predictedNextClick,res@probability)
results[nrow(results) + 1, ] <- c(res@totalclicks, res@observed, res@expected, res@residual, res@residualSquared, res@component, res@predictedNextClick, res@patternSequence, res@probability)
}
}
ChiSquare <- sum(as.numeric(results$component))
if (includeChiSquare == TRUE){
results[nrow(results) + 1, ] <- c(0, 0, 0, 0, "variance:", ChiSquare, 0, 0, 0)
}
if (returnChiSquareOnly == TRUE){
results <- ChiSquare
}
return(results)
}


getSequenceTotal = function(startPattern, nextSequence, testCLS){
setClass(
"Result",
representation(
totalclicks = "numeric",
observed = "numeric",
nextSequence = "character",
startPattern = "character"
)
)
patternMatch <- function(p,s) {
gg <- gregexpr(paste0("(?=",p,")"),s,perl=TRUE)[[1]]
if (length(gg)==1 && gg==-1) 0 else length(gg)
}
nextseq <- nextSequence
vec_totalPages <- NULL
vec_observed <- NULL
for (i in testCLS){
clicks <- paste(i, collapse = ",")
clicks <- paste(clicks, ",", sep = "")
pattern <- paste(startPattern@sequence, collapse = ",")
pattern <- as.character(pattern)
patternchar <- paste(pattern, ",", sep = "")
totalPages <- patternMatch(patternchar, clicks)
vec_totalPages <- append(vec_totalPages, totalPages)
expectedSeq <- paste(pattern, nextseq, sep = ",")
obs <- patternMatch(expectedSeq, clicks)
vec_observed <- append(vec_observed, obs)
}
result = new(
"Result", observed = sum(vec_observed),
nextSequence = nextseq, startPattern = pattern, totalclicks = sum(vec_totalPages)
)
return(result)
}

getSequenceMatrix = function(mc, testCLS){
empiricalMatrix <- mc
cols <- colnames(empiricalMatrix)
rows <- rownames(empiricalMatrix)
for (c in rows){
for(r in cols){
value <- empiricalMatrix[[c,r]]
startPattern <- new("Pattern", sequence = c(c))
nextSequence <- r
value <- getSequenceTotal(startPattern, nextSequence, testCLS)
empiricalMatrix[[c,r]] <- value@observed
}
}
return(empiricalMatrix)
}


#' calculates the Chi-Square Statistic, p-value, and degrees of freedom, for a transition matrix compared with observed state changes.
#'
#'
#' @export
#' @param cls The clickstream
#' @param mc the markov chain against which to compare the clickstream data
#' @examples
#' clickstreams <- c("User1,h,c,c,p,c,h,c,p,p,c,p,p,o",
#' "User2,i,c,i,c,c,c,d",
#' "User3,h,i,c,i,c,p,c,c,p,c,c,i,d",
#' "User4,c,c,p,c,d")
#'
#' csf <- tempfile()
#' writeLines(clickstreams, csf)
#' cls <- readClickstreams(csf, header = TRUE)
#' mc <- fitMarkovChain(cls)
#' chiSquareTest(cls,mc, verbose = TRUE)

chiSquareTest <- function(cls, mc, printValues = TRUE) {
object <- as.data.frame(t(mc@transitions[[1]]))
data <- cls
data <- getSequenceMatrix(object, data)
data <- data[match(rownames(data),names(object)),]
data <- data[,match(colnames(data),names(object))]
cols <- colSums(data)
statistic <- 0
for (i in 1:length(object)) {
for (j in 1:length(object)) {
if (data[i, j]>0&object[i, j]>0) statistic <- statistic + data[i, j]*log(data[i, j]/(cols[i]*object[i, j]))
}
}
statistic <- statistic * 2
null_elements <- sum(object == 0)
dof <- length(object) * (length(object) - 1) - null_elements
p.value <- 1 - pchisq(q = statistic,df = dof)
if(printValues==TRUE){
cat("Test data:\n");print(data);cat("Transition matrix: \n");print(object);print("...")
cat("Chi-Square Statistic:",statistic,"\n")
cat("degrees of freedom:",dof,"\n")
cat("The p-value:",p.value,"\n")
}
out <- list(statistic = statistic, dof = dof,pvalue = p.value)
return(out)
}