47 changes: 47 additions & 0 deletions machine_learning/Data_classification/Knn_neibour.py
@@ -0,0 +1,47 @@

def separate_features_labels(data):
    """Split (x, y, label) tuples into feature vectors and class labels."""
    features = [point[:-1] for point in data]
    labels = [point[-1] for point in data]
    return features, labels

def euclidean_distance(point1, point2):
    """Straight-line (Euclidean) distance between two equal-length points."""
    return sum((a - b) ** 2 for a, b in zip(point1, point2)) ** 0.5

def knn_classify(test_point, data_points, k):
    """Predict the label of test_point by majority vote among its k nearest neighbors."""
    features, labels = separate_features_labels(data_points)
    distances = []

    # Distance from the test point to every training point, paired with its label.
    for index, data_point in enumerate(features):
        distance = euclidean_distance(test_point, data_point)
        distances.append((distance, labels[index]))

    # Sort by distance and keep the labels of the k closest points.
    distances.sort(key=lambda x: x[0])
    k_nearest_labels = [label for _, label in distances[:k]]

    # Majority vote; ties are broken arbitrarily by max().
    prediction = max(set(k_nearest_labels), key=k_nearest_labels.count)
    return prediction

if __name__ == "__main__":
    data_points = [
        (5, 3, 0), (10, 15, 1), (8, 6, 0), (3, 7, 0), (12, 8, 1),
        (7, 14, 1), (4, 5, 0), (6, 9, 0), (14, 12, 1), (9, 11, 1),
        (11, 5, 0), (13, 7, 1), (6, 10, 0), (8, 13, 1), (5, 14, 1),
        (7, 4, 0), (10, 6, 0), (12, 9, 1), (9, 15, 1), (11, 13, 1),
        (3, 5, 0), (4, 10, 0), (14, 14, 1), (13, 9, 1), (8, 8, 0),
        (7, 7, 0), (15, 11, 1), (12, 6, 0), (9, 10, 0), (5, 13, 1),
        (6, 12, 1), (10, 10, 0), (13, 11, 1), (4, 8, 0), (3, 9, 0),
        (15, 13, 1), (14, 9, 1), (11, 7, 0), (8, 12, 1), (7, 13, 1),
        (5, 10, 0), (6, 11, 0), (9, 7, 0), (10, 9, 0), (12, 15, 1),
        (14, 13, 1), (15, 10, 1), (11, 12, 1), (4, 6, 0), (3, 8, 0)
    ]

    test_points = [
        (2, 1), (5, 5), (11, 8), (7, 12), (14, 13),
        (10, 9), (6, 10), (12, 12), (4, 7), (13, 14)
    ]

    k = 5
    for test_point in test_points:
        prediction = knn_classify(test_point, data_points, k)
        print(f"The predicted class for the test point {test_point} is: {prediction}")
54 changes: 54 additions & 0 deletions machine_learning/Data_classification/README.md
@@ -0,0 +1,54 @@
# Simple KNN & Decision Tree Classifiers

This project implements the k-Nearest Neighbors (KNN) and Decision Tree algorithms from scratch in Python, using a sample dataset of 2D points with binary class labels.

## Features

- **KNN Classifier:** Predicts the class of a test point by majority vote among its k nearest neighbors.
- **Decision Tree Classifier:** Builds a decision tree by choosing, at each node, the split with the highest information gain, then classifies new data points (see the worked example below).
- **Sample Dataset:** 50 manually defined (x, y, label) tuples.
- **Multiple Test Points:** Easily classify a list of new data points.
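
For intuition, here is a tiny worked example of the information-gain computation that drives the tree's splits. It imports `entropy` from `desiciontree.py`, so it assumes that file is importable from the working directory:

```
# Worked example: information gain from a perfect split of a 50/50 label mix.
from desiciontree import entropy

labels = [0, 0, 1, 1]
left, right = [0, 0], [1, 1]   # a split that separates the classes exactly

base = entropy(labels)         # 1.0 bit for a 50/50 mix
gain = base - (0.5 * entropy(left) + 0.5 * entropy(right))
print(gain)                    # 1.0: pure children give the maximum possible gain
```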

## Requirements

- Python 3.x
- (Optional) Libraries like numpy, pandas, scikit-learn, matplotlib for future enhancements.

See [`requirements.txt`](requirements.txt) for details.

## Usage

1. Clone this repository:

```
git clone https://github.com/yourusername/simple-knn-decisiontree.git
cd simple-knn-decisiontree
```

2. Run the classifiers:

```
python Knn_neibour.py
python desiciontree.py
```

You can modify the `test_points` list to classify different points.
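
For example, to classify two different points, edit the list at the bottom of either script:

```
test_points = [(2, 1), (9, 9)]  # any (x, y) pairs to classify
```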

## Files

- `Knn_neibour.py`: From-scratch KNN classifier, plus the sample dataset.
- `desiciontree.py`: From-scratch decision tree classifier, using the same dataset.
- `requirements.txt`: (Optional) Lists external libraries for extended functionality.
- `README.md`: Project introduction and usage.

## Extending the Project

- Integrate scikit-learn for quick benchmarking (see the sketch after this list).
- Add plotting with matplotlib to visualize the dataset and decision boundaries.
- Use pandas for reading larger datasets from CSV files.
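
For the first item, a minimal benchmarking sketch, not a definitive harness: it assumes scikit-learn is installed and inlines only the first ten rows of the sample dataset (paste in all 50 tuples for a real comparison):

```
# Sketch: compare the from-scratch classifiers against scikit-learn equivalents.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# First ten rows of the sample dataset; extend with the remaining tuples from the scripts.
data_points = [
    (5, 3, 0), (10, 15, 1), (8, 6, 0), (3, 7, 0), (12, 8, 1),
    (7, 14, 1), (4, 5, 0), (6, 9, 0), (14, 12, 1), (9, 11, 1),
]
X = [point[:-1] for point in data_points]
y = [point[-1] for point in data_points]

# Same hyperparameters as the from-scratch versions: k=5, max_depth=5.
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)
tree = DecisionTreeClassifier(max_depth=5).fit(X, y)

test_points = [(2, 1), (5, 5), (11, 8), (7, 12), (14, 13)]
print("sklearn KNN: ", knn.predict(test_points))
print("sklearn tree:", tree.predict(test_points))
```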

## License

MIT License

---

**Enjoy experimenting with classic machine learning algorithms!**
77 changes: 77 additions & 0 deletions machine_learning/Data_classification/desiciontree.py
@@ -0,0 +1,77 @@
import math
from collections import Counter

def entropy(labels):
    """Shannon entropy (in bits) of a list of class labels."""
    total = len(labels)
    counts = Counter(labels)
    return -sum((count / total) * math.log2(count / total) for count in counts.values())

def best_split(data):
    """Return the (feature, threshold) split with the highest information gain, or (None, None)."""
    best_gain = 0
    best_feature = None
    best_threshold = None
    base_entropy = entropy([point[-1] for point in data])
    n_features = len(data[0]) - 1

    for feature in range(n_features):
        # Candidate thresholds: every distinct value this feature takes.
        thresholds = {point[feature] for point in data}
        for threshold in thresholds:
            left = [point for point in data if point[feature] <= threshold]
            right = [point for point in data if point[feature] > threshold]
            if not left or not right:
                continue
            # Gain is the reduction from the parent's entropy to the
            # size-weighted average of the children's entropies.
            p_left = len(left) / len(data)
            p_right = len(right) / len(data)
            gain = base_entropy - (p_left * entropy([point[-1] for point in left]) +
                                   p_right * entropy([point[-1] for point in right]))
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_threshold = threshold
    return best_feature, best_threshold

def build_tree(data, depth=0, max_depth=5):
    """Recursively grow a decision tree; a leaf holds the majority label of its subset."""
    labels = [point[-1] for point in data]
    # Stop when the node is pure or the depth limit is reached.
    if len(set(labels)) == 1 or depth >= max_depth:
        return Counter(labels).most_common(1)[0][0]  # leaf node
    feature, threshold = best_split(data)
    if feature is None:
        # No split yields positive information gain; fall back to the majority label.
        return Counter(labels).most_common(1)[0][0]
    left = [point for point in data if point[feature] <= threshold]
    right = [point for point in data if point[feature] > threshold]
    return {
        "feature": feature,
        "threshold": threshold,
        "left": build_tree(left, depth + 1, max_depth),
        "right": build_tree(right, depth + 1, max_depth)
    }

def predict(tree, point):
    """Walk from the root to a leaf; internal nodes are dicts, leaves are plain labels."""
    while isinstance(tree, dict):
        if point[tree["feature"]] <= tree["threshold"]:
            tree = tree["left"]
        else:
            tree = tree["right"]
    return tree

if __name__ == "__main__":
    data_points = [
        (5, 3, 0), (10, 15, 1), (8, 6, 0), (3, 7, 0), (12, 8, 1),
        (7, 14, 1), (4, 5, 0), (6, 9, 0), (14, 12, 1), (9, 11, 1),
        (11, 5, 0), (13, 7, 1), (6, 10, 0), (8, 13, 1), (5, 14, 1),
        (7, 4, 0), (10, 6, 0), (12, 9, 1), (9, 15, 1), (11, 13, 1),
        (3, 5, 0), (4, 10, 0), (14, 14, 1), (13, 9, 1), (8, 8, 0),
        (7, 7, 0), (15, 11, 1), (12, 6, 0), (9, 10, 0), (5, 13, 1),
        (6, 12, 1), (10, 10, 0), (13, 11, 1), (4, 8, 0), (3, 9, 0),
        (15, 13, 1), (14, 9, 1), (11, 7, 0), (8, 12, 1), (7, 13, 1),
        (5, 10, 0), (6, 11, 0), (9, 7, 0), (10, 9, 0), (12, 15, 1),
        (14, 13, 1), (15, 10, 1), (11, 12, 1), (4, 6, 0), (3, 8, 0)
    ]
    tree = build_tree(data_points, max_depth=5)
    test_points = [
        (2, 1), (5, 5), (11, 8), (7, 12), (14, 13),
        (10, 9), (6, 10), (12, 12), (4, 7), (13, 14)
    ]
    for test_point in test_points:
        result = predict(tree, test_point)
        print(f"Predicted class for {test_point}: {result}")
4 changes: 4 additions & 0 deletions machine_learning/Data_classification/requirements.txt
@@ -0,0 +1,4 @@
numpy
scikit-learn
pandas
matplotlib