From b33975437c080278d1f0a48e9cb241dc27d99ed6 Mon Sep 17 00:00:00 2001 From: SARTHAK Date: Mon, 6 Feb 2023 00:34:47 +0530 Subject: [PATCH 1/2] Added multinomial nb --- CMakeLists.txt | 5 +- examples/neighbors/bernoulli_nb.cpp | 3 +- examples/neighbors/multinomial_nb.cpp | 24 +++++ src/slowmokit.hpp | 1 + .../neighbors/bernoulli_nb/bernoulli_nb.cpp | 8 +- .../neighbors/bernoulli_nb/bernoulli_nb.hpp | 5 +- .../methods/neighbors/multinomial_nb.hpp | 13 +++ .../multinomial_nb/multinomial_nb.cpp | 100 ++++++++++++++++++ .../multinomial_nb/multinomial_nb.hpp | 25 +++++ 9 files changed, 180 insertions(+), 4 deletions(-) create mode 100644 examples/neighbors/multinomial_nb.cpp create mode 100644 src/slowmokit/methods/neighbors/multinomial_nb.hpp create mode 100644 src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.cpp create mode 100644 src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b03d30..7781392 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,4 +39,7 @@ add_library(slowmokit src/slowmokit/methods/preprocessing/standardization.cpp src/slowmokit/methods/neighbors/bernoulli_nb.hpp src/slowmokit/methods/neighbors/bernoulli_nb/bernoulli_nb.hpp - src/slowmokit/methods/neighbors/bernoulli_nb/bernoulli_nb.cpp) \ No newline at end of file + src/slowmokit/methods/neighbors/bernoulli_nb/bernoulli_nb.cpp + src/slowmokit/methods/neighbors/multinomial_nb.hpp + src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.hpp + src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.cpp) \ No newline at end of file diff --git a/examples/neighbors/bernoulli_nb.cpp b/examples/neighbors/bernoulli_nb.cpp index 0d00148..69caa4d 100644 --- a/examples/neighbors/bernoulli_nb.cpp +++ b/examples/neighbors/bernoulli_nb.cpp @@ -17,6 +17,7 @@ // std::vector yTrain{0,0,1,1,0,0,1,1,1,0}; // std::vector xTest{1,0,0}; // BernoulliNB bernoulli; -// std::cout<> xTrain{ +// {0,0,0}, +// {1,0,0}, +// {1,0,1}, +// {1,0,0}, +// {0,1,1}, +// {1,0,0}, +// {1,1,0}, +// {0,1,1}, +// {1,0,1}, +// {1,1,1} +// }; +// std::vector yTrain{2,2,2,2,1,2,2,1,2,0}; +// std::vector xTest{0,0,0}; +// std::vector classes{0,1,2}; +// MultinomialNB multinomial; +// multinomial.fit(xTrain,yTrain); +// std::cout< } template - int fit(std::vector> xTrain,std::vector yTrain,std::vector xTest){ + void fit(std::vector> xTrain,std::vector yTrain){ + this->xTrain = xTrain; + this->yTrain = yTrain; + } + +template + int fit_predict(std::vector xTest){ int nFeatures = xTrain[0].size(); std::vector postProbs; diff --git a/src/slowmokit/methods/neighbors/bernoulli_nb/bernoulli_nb.hpp b/src/slowmokit/methods/neighbors/bernoulli_nb/bernoulli_nb.hpp index 0df9b4c..1ffe1bc 100644 --- a/src/slowmokit/methods/neighbors/bernoulli_nb/bernoulli_nb.hpp +++ b/src/slowmokit/methods/neighbors/bernoulli_nb/bernoulli_nb.hpp @@ -12,12 +12,15 @@ template class BernoulliNB{ private: + std::vector> xTrain; + std::vector yTrain; double prior_prob(std::vector yTrain,int label); double conditional_prob(std::vector> xTrain,std::vector yTrain,int featureCol,int featureVal,int label); public: // returns predicted value for each test according to bernoulli naive bayes model - int fit(std::vector> xTrain,std::vector yTrain,std::vector xTest); + void fit(std::vector> xTrain,std::vector yTrain) + int fit_predict(std::vector xTest); }; #endif \ No newline at end of file diff --git a/src/slowmokit/methods/neighbors/multinomial_nb.hpp b/src/slowmokit/methods/neighbors/multinomial_nb.hpp new file mode 100644 index 0000000..f0f6fc0 --- /dev/null +++ b/src/slowmokit/methods/neighbors/multinomial_nb.hpp @@ -0,0 +1,13 @@ +/** + * @file methods/neighbors/multinomial_nb.hpp + * + * Easy include for Multinomial Nb algorithm + */ + + +#ifndef SLOWMOKIT_MULTINOMIAL_NB_HPP +#define SLOWMOKIT_MULTINOMIAL_NB_HPP + +#include "multinomial_nb/multinomial_nb.hpp" + +#endif //SLOWMOKIT_MULTINOMIAL_NB_HPP \ No newline at end of file diff --git a/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.cpp b/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.cpp new file mode 100644 index 0000000..822e985 --- /dev/null +++ b/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.cpp @@ -0,0 +1,100 @@ +/** + * @file methods/neighbors/multinomial_nb/multinomial_nb.cpp + * + * Implementation of the Multinomial Naive Bayes main program + */ +#include "multinomial_nb.hpp" + +template + double priorProb(std::vector yTrain,int label){ // Prior-Probability P(y) + int sum=0; + for(int i=0;i + double conditionalProb(std::vector> xTrain,std::vector yTrain,int featureCol,int featureVal,int label){ // Conditional-Probabilty P(x=f1 / y=class) + int denominator=0,numerator=0; + std::set varies_values; + for(int i=0;i because if word do not exist in training but exist in testing + // denominator + number_of_different_values_in_feature + } + +template + void fit(std::vector> xTrain,std::vector yTrain,std::vector classes){ + this->xTrain = xTrain; + this->yTrain = yTrain; + this->classes = classes; + } + +template + int fit_predict(std::vector xTest){ + int n_features = xTrain[0].size(); + + std::vector postProbs; + + std::vector ::iterator label; // different ouput labels + for(label=classes.begin();label!=classes.end();label++){ // iterating over all labels (P(y=y1,y2,y3....)) + double likelihood=1.0; + for(int i=0;i Log10(P(x=x1 / y=label)) + } + + double prior = priorProb(yTrain,*label); + double post = prior*likelihood; + postProbs.push_back(post); + } + double sumpropProbs = 0.0; + int max=0; + for(int i=0;ipostProbs[max]){ + max = i; + } + } + return max; + }; + +signed main(){ + std::vector> x_train{ + {0,0,0}, + {1,0,0}, + {1,0,1}, + {1,0,0}, + {0,1,1}, + {1,0,0}, + {1,1,0}, + {0,1,1}, + {1,0,1}, + {1,1,1} + }; + std::vector y_train{2,2,2,2,1,2,2,1,2,0}; + std::vector x_test{0,0,0}; + std::vector classes{0,1,2}; + MultinomialNB multinomial; + multinomial.fit(x_train,y_train,classes); + std::cout< +class MultinomialNB{ +private: + std::vector> xTrain; + std::vector yTrain; + std::vector classes; + double priorProb(std::vector yTrain,int label); + double conditionalProb(std::vector> xTrain,std::vector yTrain,int featureCol,int featureVal,int label); +public: + void fit(std::vector> xTrain,std::vector yTrain,std::vector classes); + int fit_predict(std::vector xTest); +}; + +#endif \ No newline at end of file From b4caa746a48236e0eff6adbb148e6ea7bed4491c Mon Sep 17 00:00:00 2001 From: SARTHAK Date: Sun, 19 Feb 2023 15:34:10 +0530 Subject: [PATCH 2/2] csv Reader added --- docs/methods/preprocessing/csv_reader.md | 24 +++ examples/methods/preprocessing/csv_reader.cpp | 6 + .../multinomial_nb/multinomial_nb.cpp | 144 +++++++++--------- .../multinomial_nb/multinomial_nb.hpp | 71 ++++----- .../methods/preprocessing/csv_reader.cpp | 83 ++++++++++ .../methods/preprocessing/csv_reader.hpp | 15 ++ 6 files changed, 237 insertions(+), 106 deletions(-) create mode 100644 docs/methods/preprocessing/csv_reader.md create mode 100644 examples/methods/preprocessing/csv_reader.cpp create mode 100644 src/slowmokit/methods/preprocessing/csv_reader.cpp create mode 100644 src/slowmokit/methods/preprocessing/csv_reader.hpp diff --git a/docs/methods/preprocessing/csv_reader.md b/docs/methods/preprocessing/csv_reader.md new file mode 100644 index 0000000..05b5fa0 --- /dev/null +++ b/docs/methods/preprocessing/csv_reader.md @@ -0,0 +1,24 @@ +# Csv Reader + +It reads the csv File provided and accordingly set the data type in that particular column double/string + +After computing the data type whole file is converted to string vector with their data types in first row of vector. Those values can be then converted to data-type on the runtime whenever required. + +## Parameters + +| Name | Definition | Type | +|--------------|--------------------------------------------|--------------| +| file | Csv file which needs to be read | `ifstream` | + + +## Methods + +| Name | Definition | Return value | +|----------------------------------------|-----------------------------------------------|---------------| +| `readCsv(string name,bool index)` | To read csv file and allocate which column is double or string | `vector>` | + +## Example + +``` +std::vector>reader = readCsv("../../lol.csv",1); +``` \ No newline at end of file diff --git a/examples/methods/preprocessing/csv_reader.cpp b/examples/methods/preprocessing/csv_reader.cpp new file mode 100644 index 0000000..f123331 --- /dev/null +++ b/examples/methods/preprocessing/csv_reader.cpp @@ -0,0 +1,6 @@ +// #include "../../src/slowmokit/methods/preprocessing/csv_reader.hpp" + +// signed main(){ +// std::vector>reader = readCsv("../../lol.csv",1); +// return 0; +// } \ No newline at end of file diff --git a/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.cpp b/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.cpp index d646ccf..29f04bf 100644 --- a/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.cpp +++ b/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.cpp @@ -5,92 +5,94 @@ */ #include "multinomial_nb.hpp" -template +template double MultinomialNB::priorProb(std::vector yTrain, int label) { // Prior-Probability P(y) - int sum = 0; - for (int i = 0; i < yTrain.size(); i++) - { - if (yTrain[i] == label) - { - sum += 1; - } - } - return sum / double(yTrain.size()); + int sum = 0; + for (int i = 0; i < yTrain.size(); i++) + { + if (yTrain[i] == label) + { + sum += 1; + } + } + return sum / double(yTrain.size()); } -template +template double MultinomialNB::conditionalProb(std::vector> xTrain, std::vector yTrain, int featureCol, int featureVal, int label) { // Conditional-Probabilty P(x=f1 / y=class) - int denominator = 0, numerator = 0; - std::set varies_values; - for (int i = 0; i < yTrain.size(); i++) - { - if (yTrain[i] == label) - { // Check if label of feature is same as output looking for - denominator++; // All values with output_class we are looking for - varies_values.insert(xTrain[i][featureCol]); - if (xTrain[i][featureCol] == featureVal) - { - numerator++; // numerator increase only when that feature consists of - // value same as value in testing feature - } - } - } - return (numerator + 1) / - double(denominator + - varies_values - .size()); // as it is multinomial nb so to avoid overfitting - // numerator+1 -> because if word do not exist in training but exist in - // testing denominator + number_of_different_values_in_feature + int denominator = 0, numerator = 0; + std::set varies_values; + for (int i = 0; i < yTrain.size(); i++) + { + if (yTrain[i] == label) + { // Check if label of feature is same as output looking for + denominator++; // All values with output_class we are looking for + varies_values.insert(xTrain[i][featureCol]); + if (xTrain[i][featureCol] == featureVal) + { + numerator++; // numerator increase only when that feature + // consists of value same as value in testing + // feature + } + } + } + return (numerator + 1) / + double(denominator + + varies_values + .size()); // as it is multinomial nb so to avoid overfitting + // numerator+1 -> because if word do not exist in training but exist in + // testing denominator + number_of_different_values_in_feature } -template +template int MultinomialNB::fit_predict(std::vector> xTrain, std::vector yTrain, std::vector classes, std::vector xTest) { - int n_features = xTrain[0].size(); + int n_features = xTrain[0].size(); - std::vector postProbs; + std::vector postProbs; - std::vector::iterator label; // different ouput labels - for (label = classes.begin(); label != classes.end(); label++) - { // iterating over all labels (P(y=y1,y2,y3....)) - double likelihood = 1.0; - for (int i = 0; i < n_features; i++) - { - double cond = conditionalProb(xTrain, yTrain, i, xTest[i], *label); - likelihood += log10(cond); // Log10 of computed conditionl probabilty => - // Log10(P(x=x1 / y=label)) - } + std::vector::iterator label; // different ouput labels + for (label = classes.begin(); label != classes.end(); label++) + { // iterating over all labels (P(y=y1,y2,y3....)) + double likelihood = 1.0; + for (int i = 0; i < n_features; i++) + { + double cond = conditionalProb(xTrain, yTrain, i, xTest[i], *label); + likelihood += log10(cond); // Log10 of computed conditionl + // probabilty => Log10(P(x=x1 / y=label)) + } - double prior = priorProb(yTrain, *label); - double post = prior * likelihood; - postProbs.push_back(post); - } - double sumpropProbs = 0.0; - int max = 0; - for (int i = 0; i < postProbs.size(); i++) - { // Iterating over all labels computed values to check if any negative value - // exists, if so then direclty values are compared else we will compute - // value/sum - if (postProbs[i] < 0) - { - sumpropProbs = 1; - break; - } - sumpropProbs += postProbs[i]; - } - for (int i = 0; i < postProbs.size(); i++) - { - postProbs[i] /= sumpropProbs; - if (postProbs[i] > postProbs[max]) - { - max = i; - } - } - return max; + double prior = priorProb(yTrain, *label); + double post = prior * likelihood; + postProbs.push_back(post); + } + double sumpropProbs = 0.0; + int max = 0; + for (int i = 0; i < postProbs.size(); i++) + { // Iterating over all labels computed values to check if any negative + // value + // exists, if so then direclty values are compared else we will compute + // value/sum + if (postProbs[i] < 0) + { + sumpropProbs = 1; + break; + } + sumpropProbs += postProbs[i]; + } + for (int i = 0; i < postProbs.size(); i++) + { + postProbs[i] /= sumpropProbs; + if (postProbs[i] > postProbs[max]) + { + max = i; + } + } + return max; }; \ No newline at end of file diff --git a/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.hpp b/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.hpp index 04df347..b69d9d0 100644 --- a/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.hpp +++ b/src/slowmokit/methods/neighbors/multinomial_nb/multinomial_nb.hpp @@ -9,43 +9,44 @@ #include "../../../core.hpp" -template class MultinomialNB +template +class MultinomialNB { - private: - /** - * @brief Prior probability of multinomial naive bayes - * - * @param yTrain All y training values - * @param label output we are looking in prior probability - * @return double prior Probability - */ - double priorProb(std::vector yTrain, int label); - /** - * @brief Conditional Probability - * - * @param xTrain all x training int values - * @param yTrain all y training int values - * @param featureCol column for which we are computing conditional prob - * @param featureVal value in that column - * @param label y value - * @return double conditional probability - */ - double conditionalProb(std::vector> xTrain, - std::vector yTrain, int featureCol, int featureVal, - int label); + private: + /** + * @brief Prior probability of multinomial naive bayes + * + * @param yTrain All y training values + * @param label output we are looking in prior probability + * @return double prior Probability + */ + double priorProb(std::vector yTrain, int label); + /** + * @brief Conditional Probability + * + * @param xTrain all x training int values + * @param yTrain all y training int values + * @param featureCol column for which we are computing conditional prob + * @param featureVal value in that column + * @param label y value + * @return double conditional probability + */ + double conditionalProb(std::vector> xTrain, + std::vector yTrain, int featureCol, + int featureVal, int label); - public: - /** - * @brief fit_predict function - * - * @param xTrain all x training values - * @param yTrain all y training values int - * @param classes classes of y - * @param xTest testing values int - * @return int output predicted value - */ - int fit_predict(std::vector> xTrain, std::vector yTrain, - std::vector classes, std::vector xTest); + public: + /** + * @brief fit_predict function + * + * @param xTrain all x training values + * @param yTrain all y training values int + * @param classes classes of y + * @param xTest testing values int + * @return int output predicted value + */ + int fit_predict(std::vector> xTrain, std::vector yTrain, + std::vector classes, std::vector xTest); }; #endif \ No newline at end of file diff --git a/src/slowmokit/methods/preprocessing/csv_reader.cpp b/src/slowmokit/methods/preprocessing/csv_reader.cpp new file mode 100644 index 0000000..4a3f039 --- /dev/null +++ b/src/slowmokit/methods/preprocessing/csv_reader.cpp @@ -0,0 +1,83 @@ +/** + * @file methods/preprocessing/csv_reader.cpp + * + * Implementation of Csv reader + */ + +#include "csv_reader.hpp" + +template +std::vector> readCsv(std::string name, bool index) +{ + std::ifstream file; + file.open(name); + std::string output; + if (index) + { + getline(file, output); + } + getline(file, output); + bool duble = true; + std::vector isDouble; + std::string current = ""; + for (int i = 0; i < output.size(); i++) + { + if (output[i] == ',') + { + if (duble and current.size() > 0) + { + isDouble.push_back(1); + } + else + { + isDouble.push_back(0); + } + duble = true; + current = ""; + } + else + { + current += output[i]; + if ((output[i] >= '0' and output[i] <= '9') or output[i] == '.') + { + continue; + } + else + { + duble = false; + } + } + } + file.close(); + + // Storing all values as string with their data type at top of them + std::vector> csvFile; + std::vector first(isDouble.size()); + for (int i = 0; i < isDouble.size(); i++) + { + first[i] = std::to_string(isDouble[i]); + } + csvFile.push_back(first); + file.open(name); + + while (getline(file, output)) + { + first.clear(); + current = ""; + for (int i = 0; i < output.size(); i++) + { + if (output[i] == ',') + { + first.push_back(current); + current = ""; + } + else + { + current += output[i]; + } + } + csvFile.push_back(first); + } + + return csvFile; +} \ No newline at end of file diff --git a/src/slowmokit/methods/preprocessing/csv_reader.hpp b/src/slowmokit/methods/preprocessing/csv_reader.hpp new file mode 100644 index 0000000..56ab025 --- /dev/null +++ b/src/slowmokit/methods/preprocessing/csv_reader.hpp @@ -0,0 +1,15 @@ +/** + * @file methods/preprocessing/csv_reader.hpp + * + * Easy include To Return the csv read data + */ + +#ifndef SLOWMOKIT_CSV_READER_HPP +#define SLOWMOKIT_CSV_READER_HPP + +#include "../../core.hpp" + +template +std::vector> readCsv(std::string name, bool index); + +#endif // SLOWMOKIT_CSV_READER_HPP \ No newline at end of file