diff --git a/machine_learning/Data_classification/Knn_neibour.py b/machine_learning/Data_classification/Knn_neibour.py
new file mode 100644
index 0000000..aa3ddfc
--- /dev/null
+++ b/machine_learning/Data_classification/Knn_neibour.py
@@ -0,0 +1,47 @@
+
+# Separate features and labels
+def separate_features_labels(data):
+    features = [point[:-1] for point in data]
+    labels = [point[-1] for point in data]
+    return features, labels
+
+def euclidean_distance(point1, point2):
+    return sum((a - b) ** 2 for a, b in zip(point1, point2)) ** 0.5
+
+def knn_classify(test_point, data_points, k):
+    features, labels = separate_features_labels(data_points)
+    distances = []
+
+    for index, data_point in enumerate(features):
+        distance = euclidean_distance(test_point, data_point)
+        distances.append((distance, labels[index]))
+
+    distances.sort(key=lambda x: x[0])
+    k_nearest_labels = [label for _, label in distances[:k]]
+
+    prediction = max(set(k_nearest_labels), key=k_nearest_labels.count)
+    return prediction
+
+if __name__ == "__main__":
+    data_points = [
+        (5, 3, 0), (10, 15, 1), (8, 6, 0), (3, 7, 0), (12, 8, 1),
+        (7, 14, 1), (4, 5, 0), (6, 9, 0), (14, 12, 1), (9, 11, 1),
+        (11, 5, 0), (13, 7, 1), (6, 10, 0), (8, 13, 1), (5, 14, 1),
+        (7, 4, 0), (10, 6, 0), (12, 9, 1), (9, 15, 1), (11, 13, 1),
+        (3, 5, 0), (4, 10, 0), (14, 14, 1), (13, 9, 1), (8, 8, 0),
+        (7, 7, 0), (15, 11, 1), (12, 6, 0), (9, 10, 0), (5, 13, 1),
+        (6, 12, 1), (10, 10, 0), (13, 11, 1), (4, 8, 0), (3, 9, 0),
+        (15, 13, 1), (14, 9, 1), (11, 7, 0), (8, 12, 1), (7, 13, 1),
+        (5, 10, 0), (6, 11, 0), (9, 7, 0), (10, 9, 0), (12, 15, 1),
+        (14, 13, 1), (15, 10, 1), (11, 12, 1), (4, 6, 0), (3, 8, 0)
+    ]
+
+    test_points = [
+        (2, 1), (5, 5), (11, 8), (7, 12), (14, 13),
+        (10, 9), (6, 10), (12, 12), (4, 7), (13, 14)
+    ]
+
+    k = 5
+    for test_point in test_points:
+        prediction = knn_classify(test_point, data_points, k)
+        print(f"The predicted class for the test point {test_point} is: {prediction}")
diff --git a/machine_learning/Data_classification/README.md b/machine_learning/Data_classification/README.md
new file mode 100644
index 0000000..97667c6
--- /dev/null
+++ b/machine_learning/Data_classification/README.md
@@ -0,0 +1,73 @@
+# Simple KNN & Decision Tree Classifiers
+
+This project demonstrates from-scratch Python implementations of the k-Nearest Neighbors (KNN) and Decision Tree algorithms, using a sample dataset of 2D points with binary class labels.
+
+## Features
+
+- **KNN Classifier:** Predicts the class of a test point by majority vote among its k nearest neighbors.
+- **Decision Tree Classifier:** Builds a decision tree based on information gain and classifies new data points.
+- **Sample Dataset:** 50 manually defined (x, y, label) tuples.
+- **Multiple Test Points:** Easily classify a list of new data points.
+
+## Requirements
+
+- Python 3.x
+- (Optional) Libraries such as numpy, pandas, scikit-learn, and matplotlib for future enhancements.
+
+See [`requirements.txt`](requirements.txt) for details.
+
+## Usage
+
+1. Clone this repository:
+
+   ```
+   git clone https://github.com/yourusername/simple-knn-decisiontree.git
+   cd simple-knn-decisiontree
+   ```
+
+2. Run the classifiers:
+
+   ```
+   python Knn_neibour.py
+   python desiciontree.py
+   ```
+
+   You can modify the `test_points` list to classify different points.
+
+## Files
+
+- `Knn_neibour.py`: From-scratch KNN classifier plus the sample dataset.
+- `desiciontree.py`: From-scratch decision tree classifier plus the sample dataset.
+- `requirements.txt`: (Optional) Lists external libraries for extended functionality.
+- `README.md`: Project introduction and usage.
+
+## Extending the Project
+
+- Integrate scikit-learn for quick benchmarking (see the sketch below).
+- Add plotting with matplotlib to visualize the dataset and decision boundaries.
+- Use pandas for reading larger datasets from CSV files.
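+
+The snippet below is a minimal benchmarking sketch for the first idea. It assumes scikit-learn is installed and reuses the `data_points` list defined in `Knn_neibour.py`; names like `sk_knn` are illustrative, not part of the scripts above.
+
+```python
+# Illustrative benchmark (assumes scikit-learn): sanity-check the from-scratch classifiers.
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+
+X = [point[:-1] for point in data_points]  # (x, y) features
+y = [point[-1] for point in data_points]   # 0/1 labels
+
+sk_knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)
+sk_tree = DecisionTreeClassifier(max_depth=5).fit(X, y)
+
+print(sk_knn.predict([(2, 1)]))   # compare with knn_classify((2, 1), data_points, 5)
+print(sk_tree.predict([(2, 1)]))  # compare with predict(tree, (2, 1))
+```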
+
+## License
+
+MIT License
+
+---
+
+**Enjoy experimenting with classic machine learning algorithms!**
diff --git a/machine_learning/Data_classification/desiciontree.py b/machine_learning/Data_classification/desiciontree.py
new file mode 100644
index 0000000..e96a21e
--- /dev/null
+++ b/machine_learning/Data_classification/desiciontree.py
@@ -0,0 +1,80 @@
+import math
+from collections import Counter
+
+def entropy(labels):
+    total = len(labels)
+    counts = Counter(labels)
+    return -sum((count/total) * math.log2(count/total) for count in counts.values())
+
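+# A split's quality is measured by information gain:
+#   gain = entropy(parent) - p_left * entropy(left) - p_right * entropy(right)
+# best_split tries every (feature, threshold) pair and keeps the largest gain.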
+def best_split(data):
+    best_gain = 0
+    best_feature = None
+    best_threshold = None
+    base_entropy = entropy([point[-1] for point in data])
+    n_features = len(data[0]) - 1
+
+    for feature in range(n_features):
+        thresholds = set([point[feature] for point in data])
+        for threshold in thresholds:
+            left = [point for point in data if point[feature] <= threshold]
+            right = [point for point in data if point[feature] > threshold]
+            if not left or not right:
+                continue
+            p_left = len(left) / len(data)
+            p_right = len(right) / len(data)
+            gain = base_entropy - (p_left * entropy([point[-1] for point in left]) +
+                                   p_right * entropy([point[-1] for point in right]))
+            if gain > best_gain:
+                best_gain = gain
+                best_feature = feature
+                best_threshold = threshold
+    return best_feature, best_threshold
+
+def build_tree(data, depth=0, max_depth=5):
+    labels = [point[-1] for point in data]
+    if len(set(labels)) == 1 or depth >= max_depth:
+        return Counter(labels).most_common(1)[0][0]  # Leaf node
+    feature, threshold = best_split(data)
+    if feature is None:
+        return Counter(labels).most_common(1)[0][0]
+    left = [point for point in data if point[feature] <= threshold]
+    right = [point for point in data if point[feature] > threshold]
+    return {
+        "feature": feature,
+        "threshold": threshold,
+        "left": build_tree(left, depth + 1, max_depth),
+        "right": build_tree(right, depth + 1, max_depth)
+    }
+
+def predict(tree, point):
+    while isinstance(tree, dict):
+        if point[tree["feature"]] <= tree["threshold"]:
+            tree = tree["left"]
+        else:
+            tree = tree["right"]
+    return tree
+
+if __name__ == "__main__":
+    data_points = [
+        (5, 3, 0), (10, 15, 1), (8, 6, 0), (3, 7, 0), (12, 8, 1),
+        (7, 14, 1), (4, 5, 0), (6, 9, 0), (14, 12, 1), (9, 11, 1),
+        (11, 5, 0), (13, 7, 1), (6, 10, 0), (8, 13, 1), (5, 14, 1),
+        (7, 4, 0), (10, 6, 0), (12, 9, 1), (9, 15, 1), (11, 13, 1),
+        (3, 5, 0), (4, 10, 0), (14, 14, 1), (13, 9, 1), (8, 8, 0),
+        (7, 7, 0), (15, 11, 1), (12, 6, 0), (9, 10, 0), (5, 13, 1),
+        (6, 12, 1), (10, 10, 0), (13, 11, 1), (4, 8, 0), (3, 9, 0),
+        (15, 13, 1), (14, 9, 1), (11, 7, 0), (8, 12, 1), (7, 13, 1),
+        (5, 10, 0), (6, 11, 0), (9, 7, 0), (10, 9, 0), (12, 15, 1),
+        (14, 13, 1), (15, 10, 1), (11, 12, 1), (4, 6, 0), (3, 8, 0)
+    ]
+    tree = build_tree(data_points, max_depth=5)
+    test_points = [
+        (2, 1), (5, 5), (11, 8), (7, 12), (14, 13),
+        (10, 9), (6, 10), (12, 12), (4, 7), (13, 14)
+    ]
+    for test_point in test_points:
+        result = predict(tree, test_point)
+        print(f"Predicted class for {test_point}: {result}")
diff --git a/machine_learning/Data_classification/requirements.txt b/machine_learning/Data_classification/requirements.txt
new file mode 100644
index 0000000..2f9c342
--- /dev/null
+++ b/machine_learning/Data_classification/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+scikit-learn
+pandas
+matplotlib