From c64e622e2886cf4492b407ff61c2de169394f890 Mon Sep 17 00:00:00 2001
From: ParisaHTM <qxy699@mocs.utc.edu>
Date: Mon, 21 Nov 2022 19:45:02 -0500
Subject: [PATCH] one-Hot encoding version of a short genomic sequence

---
 one_hot_encoding.ipynb | 170 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 one_hot_encoding.ipynb
diff --git a/one_hot_encoding.ipynb b/one_hot_encoding.ipynb
new file mode 100644
index 0000000..d67f60e
--- /dev/null
+++ b/one_hot_encoding.ipynb
@@ -0,0 +1,170 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "IwUR_lpKN6f5"
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
+        "\n",
+        "# The LabelEncoder encodes a sequence of bases as a sequence of integers.\n",
+        "# Create an instance of LabelEncoder.\n",
+        "integer_encoder = LabelEncoder()  # Height = [\"short\", \"medium\", \"tall\"] ----> Height = [1, 0, 2]\n",
+        "\n",
+        "# The OneHotEncoder converts an array of integers to a sparse matrix where \n",
+        "# each column corresponds to one possible value of each feature.\n",
+        "# Create an instance of OneHotEncoder.\n",
+        "one_hot_encoder = OneHotEncoder(categories=[list(range(4))]) #'auto'  Gender = [Male, Female] -----> Gender = [[0 , 1], \n",
+        " #                                                                                                             [1 , 0]]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "sequence =list(\"AATGTC\")  #['A', 'A', 'T', 'G', 'T', 'C']\n",
+        "integer_encoded = integer_encoder.fit_transform(sequence)\n",
+        "print(integer_encoded)\n",
+        "print(type(integer_encoded))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Nrqwp6z9OTgd",
+        "outputId": "69cc057a-b7c4-43c6-8327-909a6e568970"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[0 0 3 2 3 1]\n",
+            "<class 'numpy.ndarray'>\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "integer_encoded = integer_encoded.reshape(-1, 1)\n",
+        "print(\"Vetical_integer_encoded = \\n\", integer_encoded)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "4Cvfc6ZAOaEa",
+        "outputId": "0f00d928-81fe-4f7a-f2fa-44593569badd"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Vetical_integer_encoded = \n",
+            " [[0]\n",
+            " [0]\n",
+            " [3]\n",
+            " [2]\n",
+            " [3]\n",
+            " [1]]\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)\n",
+        "print(one_hot_encoded)\n",
+        "print(type(one_hot_encoded))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "skl9Tk5oPLNN",
+        "outputId": "fe26d7b8-73b7-4c61-cbd4-0f4117c4b27c"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "  (0, 0)\t1.0\n",
+            "  (1, 0)\t1.0\n",
+            "  (2, 3)\t1.0\n",
+            "  (3, 2)\t1.0\n",
+            "  (4, 3)\t1.0\n",
+            "  (5, 1)\t1.0\n",
+            "<class 'scipy.sparse.csr.csr_matrix'>\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "input_features = one_hot_encoded.toarray()\n",
+        "print('One hot encoding of Sequence:\\n', input_features ,'\\n')\n",
+        "print(type(input_features))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "0GiQYY-kPrsM",
+        "outputId": "4b8f3f8b-f98d-47fc-9953-2e763e149fa1"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "One hot encoding of Sequence:\n",
+            " [[1. 0. 0. 0.]\n",
+            " [1. 0. 0. 0.]\n",
+            " [0. 0. 0. 1.]\n",
+            " [0. 0. 1. 0.]\n",
+            " [0. 0. 0. 1.]\n",
+            " [0. 1. 0. 0.]] \n",
+            "\n",
+            "<class 'numpy.ndarray'>\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "iTQznWLodBvL"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file