Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 170 additions & 0 deletions one_hot_encoding.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "IwUR_lpKN6f5"
},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
"\n",
"# The LabelEncoder encodes a sequence of bases as a sequence of integers.\n",
"#Create an instance of LabelEncoder()\n",
"integer_encoder = LabelEncoder() # Height = [\"short\", \"medium\", \"tall\"] ----> Height = [1, 0, 2]\n",
"\n",
"# The OneHotEncoder converts an array of integers to a sparse matrix where \n",
"# each row corresponds to one possible value of each feature.\n",
"#Create an instance of One-hot-encoder\n",
"one_hot_encoder = OneHotEncoder(categories=[list(range(4))]) #'auto' Gender = [Male, Female] -----> Gender = [[0 , 1], \n",
" # [1 , 0]]"
]
},
{
"cell_type": "code",
"source": [
"sequence =list(\"AATGTC\") #['A', 'A', 'T', 'G', 'T', 'C']\n",
"integer_encoded = integer_encoder.fit_transform(sequence)\n",
"print(integer_encoded)\n",
"print(type(integer_encoded))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Nrqwp6z9OTgd",
"outputId": "69cc057a-b7c4-43c6-8327-909a6e568970"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[0 0 3 2 3 1]\n",
"<class 'numpy.ndarray'>\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"integer_encoded = integer_encoded.reshape(-1, 1)\n",
"print(\"Vetical_integer_encoded = \\n\", integer_encoded)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4Cvfc6ZAOaEa",
"outputId": "0f00d928-81fe-4f7a-f2fa-44593569badd"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Vetical_integer_encoded = \n",
" [[0]\n",
" [0]\n",
" [3]\n",
" [2]\n",
" [3]\n",
" [1]]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)\n",
"print(one_hot_encoded)\n",
"print(type(one_hot_encoded))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "skl9Tk5oPLNN",
"outputId": "fe26d7b8-73b7-4c61-cbd4-0f4117c4b27c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" (0, 0)\t1.0\n",
" (1, 0)\t1.0\n",
" (2, 3)\t1.0\n",
" (3, 2)\t1.0\n",
" (4, 3)\t1.0\n",
" (5, 1)\t1.0\n",
"<class 'scipy.sparse.csr.csr_matrix'>\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"input_features = one_hot_encoded.toarray()\n",
"print('One hot encoding of Sequence:\\n', input_features ,'\\n')\n",
"print(type(input_features))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0GiQYY-kPrsM",
"outputId": "4b8f3f8b-f98d-47fc-9953-2e763e149fa1"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"One hot encoding of Sequence:\n",
" [[1. 0. 0. 0.]\n",
" [1. 0. 0. 0.]\n",
" [0. 0. 0. 1.]\n",
" [0. 0. 1. 0.]\n",
" [0. 0. 0. 1.]\n",
" [0. 1. 0. 0.]] \n",
"\n",
"<class 'numpy.ndarray'>\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "iTQznWLodBvL"
},
"execution_count": null,
"outputs": []
}
]
}