From c64e622e2886cf4492b407ff61c2de169394f890 Mon Sep 17 00:00:00 2001 From: ParisaHTM Date: Mon, 21 Nov 2022 19:45:02 -0500 Subject: [PATCH] one-Hot encoding version of a short genomic sequence --- one_hot_encoding.ipynb | 170 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 one_hot_encoding.ipynb diff --git a/one_hot_encoding.ipynb b/one_hot_encoding.ipynb new file mode 100644 index 0000000..d67f60e --- /dev/null +++ b/one_hot_encoding.ipynb @@ -0,0 +1,170 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IwUR_lpKN6f5" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n", + "\n", + "# The LabelEncoder encodes a sequence of bases as a sequence of integers.\n", + "#Create an instance of LabelEncoder()\n", + "integer_encoder = LabelEncoder() # Height = [\"short\", \"medium\", \"tall\"] ----> Height = [1, 0, 2]\n", + "\n", + "# The OneHotEncoder converts an array of integers to a sparse matrix where \n", + "# each column corresponds to one possible value of one feature.\n", + "#Create an instance of One-hot-encoder\n", + "one_hot_encoder = OneHotEncoder(categories=[list(range(4))]) #'auto' Gender = [Male, Female] -----> Gender = [[0 , 1], \n", + " # [1 , 0]]" + ] + }, + { + "cell_type": "code", + "source": [ + "sequence =list(\"AATGTC\") #['A', 'A', 'T', 'G', 'T', 'C']\n", + "integer_encoded = integer_encoder.fit_transform(sequence)\n", + "print(integer_encoded)\n", + "print(type(integer_encoded))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Nrqwp6z9OTgd", + "outputId": "69cc057a-b7c4-43c6-8327-909a6e568970" + }, + "execution_count": null, + 
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[0 0 3 2 3 1]\n", + "<class 'numpy.ndarray'>\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "integer_encoded = integer_encoded.reshape(-1, 1)\n", + "print(\"Vetical_integer_encoded = \\n\", integer_encoded)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4Cvfc6ZAOaEa", + "outputId": "0f00d928-81fe-4f7a-f2fa-44593569badd" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Vetical_integer_encoded = \n", + " [[0]\n", + " [0]\n", + " [3]\n", + " [2]\n", + " [3]\n", + " [1]]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)\n", + "print(one_hot_encoded)\n", + "print(type(one_hot_encoded))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "skl9Tk5oPLNN", + "outputId": "fe26d7b8-73b7-4c61-cbd4-0f4117c4b27c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " (0, 0)\t1.0\n", + " (1, 0)\t1.0\n", + " (2, 3)\t1.0\n", + " (3, 2)\t1.0\n", + " (4, 3)\t1.0\n", + " (5, 1)\t1.0\n", + "<class 'scipy.sparse.csr.csr_matrix'>\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "input_features = one_hot_encoded.toarray()\n", + "print('One hot encoding of Sequence:\\n', input_features ,'\\n')\n", + "print(type(input_features))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0GiQYY-kPrsM", + "outputId": "4b8f3f8b-f98d-47fc-9953-2e763e149fa1" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "One hot encoding of Sequence:\n", + " [[1. 0. 0. 0.]\n", + " [1. 0. 0. 0.]\n", + " [0. 0. 0. 1.]\n", + " [0. 0. 1. 0.]\n", + " [0. 0. 0. 1.]\n", + " [0. 1. 0. 
0.]] \n", + "\n", + "<class 'numpy.ndarray'>\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "iTQznWLodBvL" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file