From f879cc4422aeefc1e6436bfbe71e4a7a3ed2f317 Mon Sep 17 00:00:00 2001 From: Chanin Nantasenamat <51851491+dataprofessor@users.noreply.github.com> Date: Sat, 2 May 2020 00:03:21 +0700 Subject: [PATCH] Add files via upload --- python/CDD_ML_Part_1_bioactivity_data.ipynb | 1999 +++++++++++++------ 1 file changed, 1379 insertions(+), 620 deletions(-) diff --git a/python/CDD_ML_Part_1_bioactivity_data.ipynb b/python/CDD_ML_Part_1_bioactivity_data.ipynb index c16a9d6..67b418c 100644 --- a/python/CDD_ML_Part_1_bioactivity_data.ipynb +++ b/python/CDD_ML_Part_1_bioactivity_data.ipynb @@ -69,10 +69,10 @@ "metadata": { "id": "cJGExHQBfLh7", "colab_type": "code", - "outputId": "3ea7812f-d7ec-40ab-8542-e65d4d0e39f8", + "outputId": "783c9cb5-c5d4-4545-a9d3-6c2a2f2b0e53", "colab": { "base_uri": "https://localhost:8080/", - "height": 181 + "height": 349 } }, "source": [ @@ -83,14 +83,24 @@ { "output_type": "stream", "text": [ - "Requirement already satisfied: chembl_webresource_client in /usr/local/lib/python3.6/dist-packages (0.10.1)\n", - "Requirement already satisfied: easydict in /usr/local/lib/python3.6/dist-packages (from chembl_webresource_client) (1.9)\n", + "Collecting chembl_webresource_client\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/74/c4/6526156c7e2f164a0fc061aae20d383f0b6b1e79957510a64382e676e2dc/chembl-webresource-client-0.10.1.tar.gz (53kB)\n", + "\r\u001b[K |██████▏ | 10kB 18.6MB/s eta 0:00:01\r\u001b[K |████████████▎ | 20kB 6.7MB/s eta 0:00:01\r\u001b[K |██████████████████▍ | 30kB 7.8MB/s eta 0:00:01\r\u001b[K |████████████████████████▌ | 40kB 8.4MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▊ | 51kB 7.2MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 61kB 4.6MB/s \n", + "\u001b[?25hRequirement already satisfied: urllib3 in /usr/local/lib/python3.6/dist-packages (from chembl_webresource_client) (1.24.3)\n", "Requirement already satisfied: requests>=2.18.4 in /usr/local/lib/python3.6/dist-packages (from chembl_webresource_client) (2.21.0)\n", - "Requirement already satisfied: requests-cache>=0.4.7 in /usr/local/lib/python3.6/dist-packages (from chembl_webresource_client) (0.5.2)\n", - "Requirement already satisfied: urllib3 in /usr/local/lib/python3.6/dist-packages (from chembl_webresource_client) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.18.4->chembl_webresource_client) (2019.11.28)\n", + "Collecting requests-cache>=0.4.7\n", + " Downloading https://files.pythonhosted.org/packages/7f/55/9b1c40eb83c16d8fc79c5f6c2ffade04208b080670fbfc35e0a5effb5a92/requests_cache-0.5.2-py2.py3-none-any.whl\n", + "Requirement already satisfied: easydict in /usr/local/lib/python3.6/dist-packages (from chembl_webresource_client) (1.9)\n", "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.18.4->chembl_webresource_client) (3.0.4)\n", - "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.18.4->chembl_webresource_client) (2.8)\n" + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.18.4->chembl_webresource_client) (2020.4.5.1)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.18.4->chembl_webresource_client) (2.8)\n", + "Building wheels for collected packages: chembl-webresource-client\n", + " Building wheel for chembl-webresource-client (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for chembl-webresource-client: filename=chembl_webresource_client-0.10.1-cp36-none-any.whl size=57153 sha256=c1e2f3e1514ff0b430f04c4dfe2d1bf8fc00e424b9883d7caae9ac0f6d94c36a\n", + " Stored in directory: /root/.cache/pip/wheels/81/8e/3b/4ec9940a01673307821600bfac28b17971caf84ff2b64653cb\n", + "Successfully built chembl-webresource-client\n", + "Installing collected packages: requests-cache, chembl-webresource-client\n", + "Successfully installed chembl-webresource-client-0.10.1 requests-cache-0.5.2\n" ], "name": "stdout" } @@ -146,16 +156,16 @@ "metadata": { "id": "Vxtp79so4ZjF", "colab_type": "code", - "outputId": "b27a6a74-5a2e-45f4-da6a-2f27836a69be", + "outputId": "e90dde45-1c0d-4fd9-f693-cb6e6032e2cd", "colab": { "base_uri": "https://localhost:8080/", - "height": 507 + "height": 145 } }, "source": [ "# Target search for coronavirus\n", "target = new_client.target\n", - "target_query = target.search('coronavirus')\n", + "target_query = target.search('aromatase')\n", "targets = pd.DataFrame.from_dict(target_query)\n", "targets" ], @@ -197,109 +207,44 @@ " \n", " \n", " 0\n", - " []\n", - " Coronavirus\n", - " Coronavirus\n", - " 17.0\n", - " False\n", - " CHEMBL613732\n", - " []\n", - " ORGANISM\n", - " 11119\n", - " \n", - " \n", - " 1\n", - " []\n", - " SARS coronavirus\n", - " SARS coronavirus\n", - " 14.0\n", - " False\n", - " CHEMBL612575\n", - " []\n", - " ORGANISM\n", - " 227859\n", - " \n", - " \n", - " 2\n", - " []\n", - " Feline coronavirus\n", - " Feline coronavirus\n", - " 14.0\n", - " False\n", - " CHEMBL612744\n", - " []\n", - " ORGANISM\n", - " 12663\n", - " \n", - " \n", - " 3\n", - " []\n", - " Human coronavirus 229E\n", - " Human coronavirus 229E\n", - " 12.0\n", + " [{'xref_id': 'P11511', 'xref_name': None, 'xre...\n", + " Homo sapiens\n", + " Cytochrome P450 19A1\n", + " 19.0\n", " False\n", - " CHEMBL613837\n", - " []\n", - " ORGANISM\n", - " 11137\n", - " \n", - " \n", - " 4\n", - " [{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...\n", - " SARS coronavirus\n", - " SARS coronavirus 3C-like proteinase\n", - " 10.0\n", - " False\n", - " CHEMBL3927\n", - " [{'accession': 'P0C6U8', 'component_descriptio...\n", + " CHEMBL1978\n", + " [{'accession': 'P11511', 'component_descriptio...\n", " SINGLE PROTEIN\n", - " 227859\n", - " \n", - " \n", - " 5\n", - " []\n", - " Middle East respiratory syndrome-related coron...\n", - " Middle East respiratory syndrome-related coron...\n", - " 9.0\n", - " False\n", - " CHEMBL4296578\n", - " []\n", - " ORGANISM\n", - " 1335626\n", + " 9606\n", " \n", " \n", - " 6\n", - " [{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...\n", - " SARS coronavirus\n", - " Replicase polyprotein 1ab\n", - " 4.0\n", + " 1\n", + " [{'xref_id': 'P22443', 'xref_name': None, 'xre...\n", + " Rattus norvegicus\n", + " Cytochrome P450 19A1\n", + " 19.0\n", " False\n", - " CHEMBL5118\n", - " [{'accession': 'P0C6X7', 'component_descriptio...\n", + " CHEMBL3859\n", + " [{'accession': 'P22443', 'component_descriptio...\n", " SINGLE PROTEIN\n", - " 227859\n", + " 10116\n", " \n", " \n", "\n", "" ], "text/plain": [ - " cross_references ... tax_id\n", - "0 [] ... 11119\n", - "1 [] ... 227859\n", - "2 [] ... 12663\n", - "3 [] ... 11137\n", - "4 [{'xref_id': 'P0C6U8', 'xref_name': None, 'xre... ... 227859\n", - "5 [] ... 1335626\n", - "6 [{'xref_id': 'P0C6X7', 'xref_name': None, 'xre... ... 227859\n", + " cross_references ... tax_id\n", + "0 [{'xref_id': 'P11511', 'xref_name': None, 'xre... ... 9606\n", + "1 [{'xref_id': 'P22443', 'xref_name': None, 'xre... ... 10116\n", "\n", - "[7 rows x 9 columns]" + "[2 rows x 9 columns]" ] }, "metadata": { "tags": [] }, - "execution_count": 32 + "execution_count": 42 } ] }, @@ -310,7 +255,7 @@ "colab_type": "text" }, "source": [ - "### **Select and retrieve bioactivity data for *SARS coronavirus 3C-like proteinase* (fourth entry)**" + "### **Select and retrieve bioactivity data for *SARS coronavirus 3C-like proteinase* (fifth entry)**" ] }, { @@ -320,7 +265,7 @@ "colab_type": "text" }, "source": [ - "We will assign the fourth entry (which corresponds to the target protein, *coronavirus 3C-like proteinase*) to the ***selected_target*** variable " + "We will assign the fifth entry (which corresponds to the target protein, *coronavirus 3C-like proteinase*) to the ***selected_target*** variable " ] }, { @@ -328,7 +273,7 @@ "metadata": { "id": "StrcHMVLha7u", "colab_type": "code", - "outputId": "01d78bf2-b4aa-4b6e-b80c-feafba88e7c9", + "outputId": "a558535b-c66a-42ce-8604-3cf34dbff90b", "colab": { "base_uri": "https://localhost:8080/", "height": 35 @@ -350,7 +295,7 @@ "metadata": { "tags": [] }, - "execution_count": 33 + "execution_count": 4 } ] }, @@ -396,14 +341,14 @@ "metadata": { "id": "s9iUAXFdSkoM", "colab_type": "code", - "outputId": "d35ce7b2-4fea-4478-f859-401343657db3", + "outputId": "c9b681cc-97ab-40fb-a735-5e2b39612f8c", "colab": { "base_uri": "https://localhost:8080/", - "height": 779 + "height": 265 } }, "source": [ - "df" + "df.head(3)" ], "execution_count": 0, "outputs": [ @@ -613,400 +558,53 @@ " None\n", " 13.5\n", " \n", - " \n", - " 3\n", - " None\n", - " 1481065\n", - " []\n", - " CHEMBL829584\n", - " In vitro inhibitory concentration against SARS...\n", - " B\n", - " BAO_0000190\n", - " BAO_0000357\n", - " single protein format\n", - " O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21\n", - " None\n", - " None\n", - " CHEMBL1139624\n", - " Bioorg. Med. Chem. Lett.\n", - " 2005\n", - " {'bei': '16.64', 'le': '0.32', 'lle': '1.25', ...\n", - " CHEMBL426082\n", - " None\n", - " CHEMBL426082\n", - " 4.88\n", - " False\n", - " http://www.openphacts.org/units/Nanomolar\n", - " 384075\n", - " =\n", - " 1\n", - " True\n", - " =\n", - " None\n", - " IC50\n", - " nM\n", - " None\n", - " 13110.0\n", - " CHEMBL3927\n", - " SARS coronavirus\n", - " SARS coronavirus 3C-like proteinase\n", - " 227859\n", - " None\n", - " None\n", - " IC50\n", - " uM\n", - " UO_0000065\n", - " None\n", - " 13.11\n", - " \n", - " \n", - " 4\n", - " None\n", - " 1481066\n", - " []\n", - " CHEMBL829584\n", - " In vitro inhibitory concentration against SARS...\n", - " B\n", - " BAO_0000190\n", - " BAO_0000357\n", - " single protein format\n", - " O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-]\n", - " None\n", - " None\n", - " CHEMBL1139624\n", - " Bioorg. Med. Chem. Lett.\n", - " 2005\n", - " {'bei': '16.84', 'le': '0.32', 'lle': '2.16', ...\n", - " CHEMBL187717\n", - " None\n", - " CHEMBL187717\n", - " 5.70\n", - " False\n", - " http://www.openphacts.org/units/Nanomolar\n", - " 384234\n", - " =\n", - " 1\n", - " True\n", - " =\n", - " None\n", - " IC50\n", - " nM\n", - " None\n", - " 2000.0\n", - " CHEMBL3927\n", - " SARS coronavirus\n", - " SARS coronavirus 3C-like proteinase\n", - " 227859\n", - " None\n", - " None\n", - " IC50\n", - " uM\n", - " UO_0000065\n", - " None\n", - " 2.0\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 128\n", - " None\n", - " 12041507\n", - " []\n", - " CHEMBL2150313\n", - " Inhibition of SARS-CoV PLpro expressed in Esch...\n", - " B\n", - " BAO_0000190\n", - " BAO_0000019\n", - " assay format\n", - " COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...\n", - " None\n", - " None\n", - " CHEMBL2146458\n", - " Bioorg. Med. Chem.\n", - " 2012\n", - " {'bei': '14.70', 'le': '0.27', 'lle': '1.57', ...\n", - " CHEMBL2146517\n", - " METHYL TANSHINONATE\n", - " CHEMBL2146517\n", - " 4.97\n", - " False\n", - " http://www.openphacts.org/units/Nanomolar\n", - " 1727226\n", - " =\n", - " 1\n", - " True\n", - " =\n", - " None\n", - " IC50\n", - " nM\n", - " None\n", - " 10600.0\n", - " CHEMBL3927\n", - " SARS coronavirus\n", - " SARS coronavirus 3C-like proteinase\n", - " 227859\n", - " None\n", - " None\n", - " IC50\n", - " uM\n", - " UO_0000065\n", - " None\n", - " 10.6\n", - " \n", - " \n", - " 129\n", - " None\n", - " 12041508\n", - " []\n", - " CHEMBL2150313\n", - " Inhibition of SARS-CoV PLpro expressed in Esch...\n", - " B\n", - " BAO_0000190\n", - " BAO_0000019\n", - " assay format\n", - " C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C\n", - " None\n", - " None\n", - " CHEMBL2146458\n", - " Bioorg. Med. Chem.\n", - " 2012\n", - " {'bei': '16.86', 'le': '0.31', 'lle': '1.56', ...\n", - " CHEMBL187460\n", - " CRYPTOTANSHINONE\n", - " CHEMBL187460\n", - " 5.00\n", - " False\n", - " http://www.openphacts.org/units/Nanomolar\n", - " 1727227\n", - " =\n", - " 1\n", - " True\n", - " =\n", - " None\n", - " IC50\n", - " nM\n", - " None\n", - " 10100.0\n", - " CHEMBL3927\n", - " SARS coronavirus\n", - " SARS coronavirus 3C-like proteinase\n", - " 227859\n", - " None\n", - " None\n", - " IC50\n", - " uM\n", - " UO_0000065\n", - " None\n", - " 10.1\n", - " \n", - " \n", - " 130\n", - " None\n", - " 12041509\n", - " []\n", - " CHEMBL2150313\n", - " Inhibition of SARS-CoV PLpro expressed in Esch...\n", - " B\n", - " BAO_0000190\n", - " BAO_0000019\n", - " assay format\n", - " Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12\n", - " None\n", - " None\n", - " CHEMBL2146458\n", - " Bioorg. Med. Chem.\n", - " 2012\n", - " {'bei': '17.88', 'le': '0.32', 'lle': '0.84', ...\n", - " CHEMBL363535\n", - " TANSHINONE I\n", - " CHEMBL363535\n", - " 4.94\n", - " False\n", - " http://www.openphacts.org/units/Nanomolar\n", - " 1727228\n", - " =\n", - " 1\n", - " True\n", - " =\n", - " None\n", - " IC50\n", - " nM\n", - " None\n", - " 11500.0\n", - " CHEMBL3927\n", - " SARS coronavirus\n", - " SARS coronavirus 3C-like proteinase\n", - " 227859\n", - " None\n", - " None\n", - " IC50\n", - " uM\n", - " UO_0000065\n", - " None\n", - " 11.5\n", - " \n", - " \n", - " 131\n", - " None\n", - " 12041510\n", - " []\n", - " CHEMBL2150313\n", - " Inhibition of SARS-CoV PLpro expressed in Esch...\n", - " B\n", - " BAO_0000190\n", - " BAO_0000019\n", - " assay format\n", - " Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1\n", - " None\n", - " None\n", - " CHEMBL2146458\n", - " Bioorg. Med. Chem.\n", - " 2012\n", - " {'bei': '17.86', 'le': '0.32', 'lle': '1.68', ...\n", - " CHEMBL227075\n", - " DIHYDROTANSHINONE I\n", - " CHEMBL227075\n", - " 4.97\n", - " False\n", - " http://www.openphacts.org/units/Nanomolar\n", - " 1727229\n", - " =\n", - " 1\n", - " True\n", - " =\n", - " None\n", - " IC50\n", - " nM\n", - " None\n", - " 10700.0\n", - " CHEMBL3927\n", - " SARS coronavirus\n", - " SARS coronavirus 3C-like proteinase\n", - " 227859\n", - " None\n", - " None\n", - " IC50\n", - " uM\n", - " UO_0000065\n", - " None\n", - " 10.7\n", - " \n", - " \n", - " 132\n", - " None\n", - " 12041511\n", - " []\n", - " CHEMBL2150313\n", - " Inhibition of SARS-CoV PLpro expressed in Esch...\n", - " B\n", - " BAO_0000190\n", - " BAO_0000019\n", - " assay format\n", - " CC(C)C1=Cc2ccc3c(c2C(=O)C1=O)CCCC3(C)C\n", - " None\n", - " None\n", - " CHEMBL2146458\n", - " Bioorg. Med. Chem.\n", - " 2012\n", - " {'bei': '14.53', 'le': '0.27', 'lle': '-0.01',...\n", - " CHEMBL45830\n", - " MILTIRONE\n", - " CHEMBL45830\n", - " 4.10\n", - " False\n", - " http://www.openphacts.org/units/Nanomolar\n", - " 1727230\n", - " =\n", - " 1\n", - " True\n", - " =\n", - " None\n", - " IC50\n", - " nM\n", - " None\n", - " 78900.0\n", - " CHEMBL3927\n", - " SARS coronavirus\n", - " SARS coronavirus 3C-like proteinase\n", - " 227859\n", - " None\n", - " None\n", - " IC50\n", - " uM\n", - " UO_0000065\n", - " None\n", - " 78.9\n", - " \n", " \n", "\n", - "

133 rows × 43 columns

\n", "" ], "text/plain": [ - " activity_comment activity_id ... upper_value value\n", - "0 None 1480935 ... None 7.2\n", - "1 None 1480936 ... None 9.4\n", - "2 None 1481061 ... None 13.5\n", - "3 None 1481065 ... None 13.11\n", - "4 None 1481066 ... None 2.0\n", - ".. ... ... ... ... ...\n", - "128 None 12041507 ... None 10.6\n", - "129 None 12041508 ... None 10.1\n", - "130 None 12041509 ... None 11.5\n", - "131 None 12041510 ... None 10.7\n", - "132 None 12041511 ... None 78.9\n", + " activity_comment activity_id ... upper_value value\n", + "0 None 1480935 ... None 7.2\n", + "1 None 1480936 ... None 9.4\n", + "2 None 1481061 ... None 13.5\n", "\n", - "[133 rows x 43 columns]" + "[3 rows x 43 columns]" ] }, "metadata": { "tags": [] }, - "execution_count": 36 + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oNtBv36dYhxy", + "colab_type": "code", + "outputId": "db6a7832-55eb-484c-b56c-98cdcd5944dd", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + } + }, + "source": [ + "df.standard_type.unique()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array(['IC50'], dtype=object)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 11 } ] }, @@ -1058,10 +656,10 @@ "metadata": { "id": "6RBX658q65A5", "colab_type": "code", - "outputId": "8211e0e0-2e3b-4538-fcff-cc584def8283", + "outputId": "04a014cd-9f34-4a8f-e45f-50b380d9d41b", "colab": { "base_uri": "https://localhost:8080/", - "height": 35 + "height": 124 } }, "source": [ @@ -1073,6 +671,10 @@ { "output_type": "stream", "text": [ + "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n", + "\n", + "Enter your authorization code:\n", + "··········\n", "Mounted at /content/gdrive/\n" ], "name": "stdout" @@ -1094,25 +696,13 @@ "metadata": { "id": "tew-UtUWIS__", "colab_type": "code", - "outputId": "7ed3c80f-c7fa-4168-dfe4-2695e60ff08f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - } + "colab": {} }, "source": [ - "! mkdir \"/content/gdrive/My Drive/Colab Notebooks/data\"" + "! mkdir \"/content/gdrive/My Drive/Colab Notebooks/data2\"" ], "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "mkdir: cannot create directory ‘/content/gdrive/My Drive/Colab Notebooks/data’: File exists\n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "code", @@ -1132,21 +722,22 @@ "metadata": { "id": "iRIr1QiEJtuw", "colab_type": "code", - "outputId": "a919e653-e47b-40bb-b229-3fde27ebf202", + "outputId": "e400f4d9-3ce7-4822-8837-33eb2499c1c1", "colab": { "base_uri": "https://localhost:8080/", - "height": 35 + "height": 52 } }, "source": [ - "! ls \"/content/gdrive/My Drive/Colab Notebooks/data\"" + "! ls -l \"/content/gdrive/My Drive/Colab Notebooks/data\"" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ - "bioactivity_data.csv\n" + "total 69\n", + "-rw------- 1 root root 70010 Apr 29 17:10 bioactivity_data.csv\n" ], "name": "stdout" } @@ -1167,13 +758,25 @@ "metadata": { "id": "FO3cZC5vnCht", "colab_type": "code", - "colab": {} + "outputId": "f5e07f1f-7a24-4d8e-ca52-e5e36e4daea1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + } }, "source": [ - "!ls" + "! ls" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "bioactivity_data.csv gdrive sample_data\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -1190,13 +793,34 @@ "metadata": { "id": "jwEJjx5b5gAn", "colab_type": "code", - "colab": {} + "outputId": "69dce8c6-565d-4537-952e-b01da9f2fd83", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 211 + } }, "source": [ "! head bioactivity_data.csv" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value\n", + ",1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS coronavirus main protease (SARS CoV 3C-like protease),B,BAO_0000190,BAO_0000357,single protein format,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,\"{'bei': '18.28', 'le': '0.33', 'lle': '3.25', 'sei': '5.90'}\",CHEMBL187579,,CHEMBL187579,5.14,False,http://www.openphacts.org/units/Nanomolar,384103,=,1,True,=,,IC50,nM,,7200.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2\n", + ",1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS coronavirus main protease (SARS CoV 3C-like protease),B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,\"{'bei': '12.10', 'le': '0.33', 'lle': '1.22', 'sei': '13.45'}\",CHEMBL188487,,CHEMBL188487,5.03,False,http://www.openphacts.org/units/Nanomolar,383984,=,1,True,=,,IC50,nM,,9400.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4\n", + ",1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS coronavirus main protease (SARS CoV 3C-like protease) at 20 uM,B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,\"{'bei': '11.56', 'le': '0.29', 'lle': '2.21', 'sei': '8.72'}\",CHEMBL185698,,CHEMBL185698,4.87,False,http://www.openphacts.org/units/Nanomolar,384106,=,1,True,=,,IC50,nM,,13500.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5\n", + ",1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS coronavirus main protease (SARS CoV 3C-like protease),B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,\"{'bei': '16.64', 'le': '0.32', 'lle': '1.25', 'sei': '13.06'}\",CHEMBL426082,,CHEMBL426082,4.88,False,http://www.openphacts.org/units/Nanomolar,384075,=,1,True,=,,IC50,nM,,13110.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.11\n", + ",1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS coronavirus main protease (SARS CoV 3C-like protease),B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,\"{'bei': '16.84', 'le': '0.32', 'lle': '2.16', 'sei': '7.08'}\",CHEMBL187717,,CHEMBL187717,5.70,False,http://www.openphacts.org/units/Nanomolar,384234,=,1,True,=,,IC50,nM,,2000.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,2.0\n", + ",1481068,[],CHEMBL828143,In vitro inhibitory concentration SARS coronavirus main protease (SARS CoV 3C-like protease) ,B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c(Br)cccc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,\"{'bei': '16.14', 'le': '0.37', 'lle': '1.62', 'sei': '16.07'}\",CHEMBL365134,,CHEMBL365134,6.01,False,http://www.openphacts.org/units/Nanomolar,384081,=,1,True,=,,IC50,nM,,980.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,0.98\n", + ",1481088,[],CHEMBL829584,In vitro inhibitory concentration against SARS coronavirus main protease (SARS CoV 3C-like protease),B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccc(F)cc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,\"{'bei': '17.08', 'le': '0.33', 'lle': '1.55', 'sei': '14.22'}\",CHEMBL187598,,CHEMBL187598,5.32,False,http://www.openphacts.org/units/Nanomolar,384303,=,1,True,=,,IC50,nM,,4820.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,4.82\n", + ",1481089,[],CHEMBL829584,In vitro inhibitory concentration against SARS coronavirus main protease (SARS CoV 3C-like protease),B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccc(I)cc21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,\"{'bei': '14.36', 'le': '0.37', 'lle': '1.78', 'sei': '16.11'}\",CHEMBL190743,,CHEMBL190743,6.02,False,http://www.openphacts.org/units/Nanomolar,384329,=,1,True,=,,IC50,nM,,950.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,0.95\n", + ",1481093,[],CHEMBL829584,In vitro inhibitory concentration against SARS coronavirus main protease (SARS CoV 3C-like protease),B,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(Cc2cc3ccccc3s2)c2cccc(Cl)c21,,,CHEMBL1139624,Bioorg. Med. Chem. Lett.,2005,\"{'bei': '15.10', 'le': '0.31', 'lle': '0.67', 'sei': '13.24'}\",CHEMBL365469,,CHEMBL365469,4.95,False,http://www.openphacts.org/units/Nanomolar,384283,=,1,True,=,,IC50,nM,,11200.0,CHEMBL3927,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,11.2\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", @@ -1214,161 +838,1307 @@ "metadata": { "id": "hkVOdk6ZR396", "colab_type": "code", - "colab": {} + "outputId": "fc08d57e-f832-4cb0-90f2-dc7394b0209d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 782 + } }, "source": [ "df2 = df[df.standard_value.notna()]\n", "df2" ], "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Y-qNsUlmjS25", - "colab_type": "text" - }, - "source": [ - "Apparently, for this dataset there is no missing data. But we can use the above code cell for bioactivity data of other target protein." - ] - }, - { - "cell_type": "markdown", - "metadata": { + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
activity_commentactivity_idactivity_propertiesassay_chembl_idassay_descriptionassay_typebao_endpointbao_formatbao_labelcanonical_smilesdata_validity_commentdata_validity_descriptiondocument_chembl_iddocument_journaldocument_yearligand_efficiencymolecule_chembl_idmolecule_pref_nameparent_molecule_chembl_idpchembl_valuepotential_duplicatequdt_unitsrecord_idrelationsrc_idstandard_flagstandard_relationstandard_text_valuestandard_typestandard_unitsstandard_upper_valuestandard_valuetarget_chembl_idtarget_organismtarget_pref_nametarget_tax_idtext_valuetoidtypeunitsuo_unitsupper_valuevalue
0None1480935[]CHEMBL829584In vitro inhibitory concentration against SARS...BBAO_0000190BAO_0000357single protein formatCc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21NoneNoneCHEMBL1139624Bioorg. Med. Chem. Lett.2005{'bei': '18.28', 'le': '0.33', 'lle': '3.25', ...CHEMBL187579NoneCHEMBL1875795.14Falsehttp://www.openphacts.org/units/Nanomolar384103=1True=NoneIC50nMNone7200.0CHEMBL3927SARS coronavirusSARS coronavirus 3C-like proteinase227859NoneNoneIC50uMUO_0000065None7.2
1None1480936[]CHEMBL829584In vitro inhibitory concentration against SARS...BBAO_0000190BAO_0000357single protein formatO=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21NoneNoneCHEMBL1139624Bioorg. Med. Chem. Lett.2005{'bei': '12.10', 'le': '0.33', 'lle': '1.22', ...CHEMBL188487NoneCHEMBL1884875.03Falsehttp://www.openphacts.org/units/Nanomolar383984=1True=NoneIC50nMNone9400.0CHEMBL3927SARS coronavirusSARS coronavirus 3C-like proteinase227859NoneNoneIC50uMUO_0000065None9.4
2None1481061[]CHEMBL830868In vitro inhibitory concentration against SARS...BBAO_0000190BAO_0000357single protein formatO=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21NoneNoneCHEMBL1139624Bioorg. Med. Chem. Lett.2005{'bei': '11.56', 'le': '0.29', 'lle': '2.21', ...CHEMBL185698NoneCHEMBL1856984.87Falsehttp://www.openphacts.org/units/Nanomolar384106=1True=NoneIC50nMNone13500.0CHEMBL3927SARS coronavirusSARS coronavirus 3C-like proteinase227859NoneNoneIC50uMUO_0000065None13.5
3None1481065[]CHEMBL829584In vitro inhibitory concentration against SARS...BBAO_0000190BAO_0000357single protein formatO=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21NoneNoneCHEMBL1139624Bioorg. Med. Chem. Lett.2005{'bei': '16.64', 'le': '0.32', 'lle': '1.25', ...CHEMBL426082NoneCHEMBL4260824.88Falsehttp://www.openphacts.org/units/Nanomolar384075=1True=NoneIC50nMNone13110.0CHEMBL3927SARS coronavirusSARS coronavirus 3C-like proteinase227859NoneNoneIC50uMUO_0000065None13.11
4None1481066[]CHEMBL829584In vitro inhibitory concentration against SARS...BBAO_0000190BAO_0000357single protein formatO=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-]NoneNoneCHEMBL1139624Bioorg. Med. Chem. Lett.2005{'bei': '16.84', 'le': '0.32', 'lle': '2.16', ...CHEMBL187717NoneCHEMBL1877175.70Falsehttp://www.openphacts.org/units/Nanomolar384234=1True=NoneIC50nMNone2000.0CHEMBL3927SARS coronavirusSARS coronavirus 3C-like proteinase227859NoneNoneIC50uMUO_0000065None2.0
....................................................................................................................................
128None12041507[]CHEMBL2150313Inhibition of SARS-CoV PLpro expressed in Esch...BBAO_0000190BAO_0000019assay formatCOC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...NoneNoneCHEMBL2146458Bioorg. Med. Chem.2012{'bei': '14.70', 'le': '0.27', 'lle': '1.57', ...CHEMBL2146517METHYL TANSHINONATECHEMBL21465174.97Falsehttp://www.openphacts.org/units/Nanomolar1727226=1True=NoneIC50nMNone10600.0CHEMBL3927SARS coronavirusSARS coronavirus 3C-like proteinase227859NoneNoneIC50uMUO_0000065None10.6
129None12041508[]CHEMBL2150313Inhibition of SARS-CoV PLpro expressed in Esch...BBAO_0000190BAO_0000019assay formatC[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)CNoneNoneCHEMBL2146458Bioorg. Med. Chem.2012{'bei': '16.86', 'le': '0.31', 'lle': '1.56', ...CHEMBL187460CRYPTOTANSHINONECHEMBL1874605.00Falsehttp://www.openphacts.org/units/Nanomolar1727227=1True=NoneIC50nMNone10100.0CHEMBL3927SARS coronavirusSARS coronavirus 3C-like proteinase227859NoneNoneIC50uMUO_0000065None10.1
130None12041509[]CHEMBL2150313Inhibition of SARS-CoV PLpro expressed in Esch...BBAO_0000190BAO_0000019assay formatCc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12NoneNoneCHEMBL2146458Bioorg. Med. Chem.2012{'bei': '17.88', 'le': '0.32', 'lle': '0.84', ...CHEMBL363535TANSHINONE ICHEMBL3635354.94Falsehttp://www.openphacts.org/units/Nanomolar1727228=1True=NoneIC50nMNone11500.0CHEMBL3927SARS coronavirusSARS coronavirus 3C-like proteinase227859NoneNoneIC50uMUO_0000065None11.5
131None12041510[]CHEMBL2150313Inhibition of SARS-CoV PLpro expressed in Esch...BBAO_0000190BAO_0000019assay formatCc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1NoneNoneCHEMBL2146458Bioorg. Med. Chem.2012{'bei': '17.86', 'le': '0.32', 'lle': '1.68', ...CHEMBL227075DIHYDROTANSHINONE ICHEMBL2270754.97Falsehttp://www.openphacts.org/units/Nanomolar1727229=1True=NoneIC50nMNone10700.0CHEMBL3927SARS coronavirusSARS coronavirus 3C-like proteinase227859NoneNoneIC50uMUO_0000065None10.7
132None12041511[]CHEMBL2150313Inhibition of SARS-CoV PLpro expressed in Esch...BBAO_0000190BAO_0000019assay formatCC(C)C1=Cc2ccc3c(c2C(=O)C1=O)CCCC3(C)CNoneNoneCHEMBL2146458Bioorg. Med. Chem.2012{'bei': '14.53', 'le': '0.27', 'lle': '-0.01',...CHEMBL45830MILTIRONECHEMBL458304.10Falsehttp://www.openphacts.org/units/Nanomolar1727230=1True=NoneIC50nMNone78900.0CHEMBL3927SARS coronavirusSARS coronavirus 3C-like proteinase227859NoneNoneIC50uMUO_0000065None78.9
\n", + "

133 rows × 43 columns

\n", + "
" + ], + "text/plain": [ + " activity_comment activity_id ... upper_value value\n", + "0 None 1480935 ... None 7.2\n", + "1 None 1480936 ... None 9.4\n", + "2 None 1481061 ... None 13.5\n", + "3 None 1481065 ... None 13.11\n", + "4 None 1481066 ... None 2.0\n", + ".. ... ... ... ... ...\n", + "128 None 12041507 ... None 10.6\n", + "129 None 12041508 ... None 10.1\n", + "130 None 12041509 ... None 11.5\n", + "131 None 12041510 ... None 10.7\n", + "132 None 12041511 ... None 78.9\n", + "\n", + "[133 rows x 43 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 23 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y-qNsUlmjS25", + "colab_type": "text" + }, + "source": [ + "Apparently, for this dataset there is no missing data. But we can use the above code cell for bioactivity data of other target protein." + ] + }, + { + "cell_type": "markdown", + "metadata": { "id": "5H4sSFAWhV9B", "colab_type": "text" }, "source": [ - "## **Data pre-processing of the bioactivity data**" + "## **Data pre-processing of the bioactivity data**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tO22XVlzhkXR", + "colab_type": "text" + }, + "source": [ + "### **Labeling compounds as either being active, inactive or intermediate**\n", + "The bioactivity data is in the IC50 unit. Compounds having values of less than 1000 nM will be considered to be **active** while those greater than 10,000 nM will be considered to be **inactive**. As for those values in between 1,000 and 10,000 nM will be referred to as **intermediate**. " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "1E8rz7oMOd-5", + "colab_type": "code", + "colab": {} + }, + "source": [ + "bioactivity_class = []\n", + "for i in df2.standard_value:\n", + " if float(i) >= 10000:\n", + " bioactivity_class.append(\"inactive\")\n", + " elif float(i) <= 1000:\n", + " bioactivity_class.append(\"active\")\n", + " else:\n", + " bioactivity_class.append(\"intermediate\")" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PFsmb2N9hnTB", + "colab_type": "text" + }, + "source": [ + "### **Iterate the *molecule_chembl_id* to a list**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "DMJng9xnVnMM", + "colab_type": "code", + "colab": {} + }, + "source": [ + "mol_cid = []\n", + "for i in df2.molecule_chembl_id:\n", + " mol_cid.append(i)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YRieJc9dhuVZ", + "colab_type": "text" + }, + "source": [ + "### **Iterate *canonical_smiles* to a list**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "AT8qUBk1eVmj", + "colab_type": "code", + "colab": {} + }, + "source": [ + "canonical_smiles = []\n", + "for i in df2.canonical_smiles:\n", + " canonical_smiles.append(i)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DZFugUXxhwjE", + "colab_type": "text" + }, + "source": [ + "### **Iterate *standard_value* to a list**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZaPt-FjEZNBe", + "colab_type": "code", + "colab": {} + }, + "source": [ + "standard_value = []\n", + "for i in df2.standard_value:\n", + " standard_value.append(i)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Nv2dzid_hzKd", + "colab_type": "text" + }, + "source": [ + "### **Combine the 4 lists into a dataframe**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "TWlYO4I3Wrh-", + "colab_type": "code", + "colab": {} + }, + "source": [ + "data_tuples = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value))\n", + "df3 = pd.DataFrame( data_tuples, columns=['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'standard_value'])" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Li64nUiZQ-y2", + "colab_type": "code", + "outputId": "a1d4cdb5-922d-4573-9f8b-abcb7ef72a58", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 415 + } + }, + "source": [ + "df3" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
molecule_chembl_idcanonical_smilesbioactivity_classstandard_value
0CHEMBL187579Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21intermediate7200.0
1CHEMBL188487O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21intermediate9400.0
2CHEMBL185698O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21inactive13500.0
3CHEMBL426082O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21inactive13110.0
4CHEMBL187717O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-]intermediate2000.0
...............
128CHEMBL2146517COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...inactive10600.0
129CHEMBL187460C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)Cinactive10100.0
130CHEMBL363535Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12inactive11500.0
131CHEMBL227075Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1inactive10700.0
132CHEMBL45830CC(C)C1=Cc2ccc3c(c2C(=O)C1=O)CCCC3(C)Cinactive78900.0
\n", + "

133 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " molecule_chembl_id ... standard_value\n", + "0 CHEMBL187579 ... 7200.0\n", + "1 CHEMBL188487 ... 9400.0\n", + "2 CHEMBL185698 ... 13500.0\n", + "3 CHEMBL426082 ... 13110.0\n", + "4 CHEMBL187717 ... 2000.0\n", + ".. ... ... ...\n", + "128 CHEMBL2146517 ... 10600.0\n", + "129 CHEMBL187460 ... 10100.0\n", + "130 CHEMBL363535 ... 11500.0\n", + "131 CHEMBL227075 ... 10700.0\n", + "132 CHEMBL45830 ... 78900.0\n", + "\n", + "[133 rows x 4 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 34 + } ] }, { "cell_type": "markdown", "metadata": { - "id": "tO22XVlzhkXR", + "id": "vE0Vvo6ic3MI", "colab_type": "text" }, "source": [ - "### **Labeling compounds as either being active, inactive or intermediate**\n", - "The bioactivity data is in the IC50 unit. Compounds having values of less than 1000 nM will be considered to be **active** while those greater than 10,000 nM will be considered to be **inactive**. As for those values in between 1,000 and 10,000 nM will be referred to as **intermediate**. " + "### **Alternative method**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "VICiiCtqc2ne", + "colab_type": "code", + "outputId": "0b39e703-b724-4f02-d86c-ea07791cdeed", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 415 + } + }, + "source": [ + "selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']\n", + "df3 = df2[selection]\n", + "df3" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
molecule_chembl_idcanonical_smilesstandard_value
0CHEMBL187579Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc217200.0
1CHEMBL188487O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc219400.0
2CHEMBL185698O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc2113500.0
3CHEMBL426082O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc2113110.0
4CHEMBL187717O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-]2000.0
............
128CHEMBL2146517COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...10600.0
129CHEMBL187460C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C10100.0
130CHEMBL363535Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc1211500.0
131CHEMBL227075Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO110700.0
132CHEMBL45830CC(C)C1=Cc2ccc3c(c2C(=O)C1=O)CCCC3(C)C78900.0
\n", + "

133 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " molecule_chembl_id ... standard_value\n", + "0 CHEMBL187579 ... 7200.0\n", + "1 CHEMBL188487 ... 9400.0\n", + "2 CHEMBL185698 ... 13500.0\n", + "3 CHEMBL426082 ... 13110.0\n", + "4 CHEMBL187717 ... 2000.0\n", + ".. ... ... ...\n", + "128 CHEMBL2146517 ... 10600.0\n", + "129 CHEMBL187460 ... 10100.0\n", + "130 CHEMBL363535 ... 11500.0\n", + "131 CHEMBL227075 ... 10700.0\n", + "132 CHEMBL45830 ... 78900.0\n", + "\n", + "[133 rows x 3 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 37 + } ] }, { "cell_type": "code", "metadata": { - "id": "1E8rz7oMOd-5", + "id": "d8nV77oWdbq1", "colab_type": "code", - "colab": {} + "outputId": "2df59721-3567-48bc-a732-a0b09fa8aa12", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 415 + } }, "source": [ - "bioactivity_class = []\n", - "for i in df2.standard_value:\n", - " if float(i) >= 10000:\n", - " bioactivity_class.append(\"inactive\")\n", - " elif float(i) <= 1000:\n", - " bioactivity_class.append(\"active\")\n", - " else:\n", - " bioactivity_class.append(\"intermediate\")" + "pd.concat([df3,pd.Series(bioactivity_class)], axis=1)" ], "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PFsmb2N9hnTB", - "colab_type": "text" - }, - "source": [ - "### **Iterate the *molecule_chembl_id* to a list**" + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
molecule_chembl_idcanonical_smilesbioactivity_classstandard_value0
0CHEMBL187579Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21intermediate7200.0intermediate
1CHEMBL188487O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21intermediate9400.0intermediate
2CHEMBL185698O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21inactive13500.0inactive
3CHEMBL426082O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21inactive13110.0inactive
4CHEMBL187717O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-]intermediate2000.0intermediate
..................
128CHEMBL2146517COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...inactive10600.0inactive
129CHEMBL187460C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)Cinactive10100.0inactive
130CHEMBL363535Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12inactive11500.0inactive
131CHEMBL227075Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1inactive10700.0inactive
132CHEMBL45830CC(C)C1=Cc2ccc3c(c2C(=O)C1=O)CCCC3(C)Cinactive78900.0inactive
\n", + "

133 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " molecule_chembl_id ... 0\n", + "0 CHEMBL187579 ... intermediate\n", + "1 CHEMBL188487 ... intermediate\n", + "2 CHEMBL185698 ... inactive\n", + "3 CHEMBL426082 ... inactive\n", + "4 CHEMBL187717 ... intermediate\n", + ".. ... ... ...\n", + "128 CHEMBL2146517 ... inactive\n", + "129 CHEMBL187460 ... inactive\n", + "130 CHEMBL363535 ... inactive\n", + "131 CHEMBL227075 ... inactive\n", + "132 CHEMBL45830 ... inactive\n", + "\n", + "[133 rows x 5 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 36 + } ] }, - { - "cell_type": "code", - "metadata": { - "id": "DMJng9xnVnMM", - "colab_type": "code", - "colab": {} - }, - "source": [ - "mol_cid = []\n", - "for i in df2.molecule_chembl_id:\n", - " mol_cid.append(i)" - ], - "execution_count": 0, - "outputs": [] - }, { "cell_type": "markdown", "metadata": { - "id": "YRieJc9dhuVZ", + "id": "9tlgyexWh7YJ", "colab_type": "text" }, "source": [ - "### **Iterate *canonical_smiles* to a list**" + "Saves dataframe to CSV file" ] }, { "cell_type": "code", "metadata": { - "id": "AT8qUBk1eVmj", + "id": "nSNia7suXstR", "colab_type": "code", "colab": {} }, "source": [ - "canonical_smiles = []\n", - "for i in df2.canonical_smiles:\n", - " canonical_smiles.append(i)" + "df3.to_csv('bioactivity_preprocessed_data.csv', index=False)" ], "execution_count": 0, "outputs": [] }, - { - "cell_type": "markdown", - "metadata": { - "id": "DZFugUXxhwjE", - "colab_type": "text" - }, - "source": [ - "### **Iterate *standard_value* to a list**" - ] - }, { "cell_type": "code", "metadata": { - "id": "ZaPt-FjEZNBe", + "id": "UuZf5-MEd-H5", "colab_type": "code", - "colab": {} + "outputId": "19e008f4-267b-490b-9b2c-e5f88a47a48a", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 104 + } }, "source": [ - "standard_value = []\n", - "for i in df2.standard_value:\n", - " standard_value.append(i)" + "! ls -l" ], "execution_count": 0, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "text": [ + "total 92\n", + "-rw-r--r-- 1 root root 70010 Apr 29 17:07 bioactivity_data.csv\n", + "-rw-r--r-- 1 root root 9326 Apr 29 17:24 bioactivity_preprocessed_data.csv\n", + "drwx------ 4 root root 4096 Apr 29 17:08 gdrive\n", + "drwxr-xr-x 1 root root 4096 Apr 3 16:24 sample_data\n" + ], + "name": "stdout" + } + ] }, { "cell_type": "markdown", "metadata": { - "id": "Nv2dzid_hzKd", + "id": "_C7rqJKTePhV", "colab_type": "text" }, "source": [ - "### **Combine the 4 lists into a dataframe**" + "Let's copy to the Google Drive" ] }, { "cell_type": "code", "metadata": { - "id": "TWlYO4I3Wrh-", + "id": "ZfyvJcENeHDB", "colab_type": "code", "colab": {} }, "source": [ - "data_tuples = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value))\n", - "df3 = pd.DataFrame( data_tuples, columns=['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'standard_value'])" + "! cp bioactivity_preprocessed_data.csv \"/content/gdrive/My Drive/Colab Notebooks/data\"" ], "execution_count": 0, "outputs": [] @@ -1376,39 +2146,28 @@ { "cell_type": "code", "metadata": { - "id": "Li64nUiZQ-y2", + "id": "7PU7yU9leLV5", "colab_type": "code", - "colab": {} + "outputId": "c07ddf6b-e372-4807-bc0f-7f0b2944a0cf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + } }, "source": [ - "df3" + "! ls \"/content/gdrive/My Drive/Colab Notebooks/data\"" ], "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9tlgyexWh7YJ", - "colab_type": "text" - }, - "source": [ - "Saves dataframe to CSV file" + "outputs": [ + { + "output_type": "stream", + "text": [ + "bioactivity_data.csv bioactivity_preprocessed_data.csv\n" + ], + "name": "stdout" + } ] }, - { - "cell_type": "code", - "metadata": { - "id": "nSNia7suXstR", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df3.to_csv('bioactivity_preprocessed_data.csv', index=False)" - ], - "execution_count": 0, - "outputs": [] - }, { "cell_type": "markdown", "metadata": {