
Commit

Adding files
[email protected] committed Nov 18, 2018
1 parent eeb93fc commit 1c6c663
Showing 3 changed files with 621 additions and 0 deletions.
233 changes: 233 additions & 0 deletions Me_Bot.ipynb
@@ -0,0 +1,233 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.\n",
"SentencePiece model loaded at b'/tmp/tfhub_modules/539544f0a997d91c327c23285ea00c37588d92cc/assets/universal_encoder_8k_spm.model'.\n"
]
}
],
"source": [
"import sys\n",
"sys.path.append('/usr/local/lib/python3.5/dist-packages/')\n",
"import tensorflow as tf\n",
"import tensorflow_hub as hub\n",
"import numpy as np\n",
"import os\n",
"import http.client, urllib.request, urllib.parse, urllib.error, base64\n",
"import json\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"import pickle\n",
"import sentencepiece as spm\n",
"\n",
"module_url = \"https://tfhub.dev/google/universal-sentence-encoder-lite/2\"\n",
"embed = hub.Module(module_url)\n",
"tf.logging.set_verbosity(tf.logging.WARN)\n",
"\n",
"module = hub.Module(\"https://tfhub.dev/google/universal-sentence-encoder-lite/2\")\n",
"input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])\n",
"encodings = module(\n",
" inputs=dict(\n",
" values=input_placeholder.values,\n",
" indices=input_placeholder.indices,\n",
" dense_shape=input_placeholder.dense_shape))\n",
"\n",
"with tf.Session() as sess:\n",
" spm_path = sess.run(module(signature=\"spm_path\"))\n",
"\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.Load(spm_path)\n",
"print(\"SentencePiece model loaded at {}.\".format(spm_path))\n",
"\n",
"def process_to_IDs_in_sparse_format(sp, sentences):\n",
" # An utility method that processes sentences with the sentence piece processor\n",
" # 'sp' and returns the results in tf.SparseTensor-similar format:\n",
" # (values, indices, dense_shape)\n",
" ids = [sp.EncodeAsIds(x) for x in sentences]\n",
" max_len = max(len(x) for x in ids)\n",
" dense_shape=(len(ids), max_len)\n",
" values=[item for sublist in ids for item in sublist]\n",
" indices=[[row,col] for row in range(len(ids)) for col in range(len(ids[row]))]\n",
" return (values, indices, dense_shape)\n",
"\n",
"def embed_sentence_lite(sentences):\n",
" messages = sentences\n",
" values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, messages)\n",
"\n",
" # Reduce logging output.\n",
" tf.logging.set_verbosity(tf.logging.ERROR)\n",
"\n",
" with tf.Session() as session:\n",
" session.run([tf.global_variables_initializer(), tf.tables_initializer()])\n",
" message_embeddings = session.run(\n",
" encodings,\n",
" feed_dict={input_placeholder.values: values,\n",
" input_placeholder.indices: indices,\n",
" input_placeholder.dense_shape: dense_shape})\n",
" \n",
" return message_embeddings"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def find_closest(sentence_rep,query_rep,K):\n",
" top_K = np.argsort(np.sqrt((np.sum(np.square(sentence_rep - query_rep),axis=1))))[:K]\n",
" return top_K"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"f = open('res/other_embeddings.p','rb')\n",
"other_embeddings = pickle.load(f)\n",
"f.close()\n",
"\n",
"f = open('res/your_embeddings.p','rb')\n",
"your_embeddings = pickle.load(f)\n",
"f.close()\n",
"\n",
"f = open('res/dilogues.p','rb')\n",
"pr_to_sp = pickle.load(f)\n",
"f.close()\n",
"\n",
"\n",
"f = open('res/your_sents.p','rb')\n",
"your_sentences = pickle.load(f)\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"keys = list(pr_to_sp.keys())"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"f = open('res/key_embeddings.p','rb')\n",
"key_embeddings = pickle.load(f)\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"def speak_like_me(query,K,your_embeddings,other_embeddings,your_sen):\n",
" other_query = [query]\n",
" query_embedding = embed_sentence_lite(other_query)\n",
" closest_your = find_closest(your_embeddings,query_embedding,K)\n",
" for cl in closest_your:\n",
" print(your_sentences[cl])"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"def respond_like_me(query,K,key_embeddings,keys):\n",
" other_query = [query]\n",
" query_embedding = embed_sentence_lite(other_query)\n",
" closest_other = find_closest(key_embeddings,query_embedding,K+2)\n",
" for k in closest_other[3:]:\n",
" print(pr_to_sp[keys[k]])"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Work time now\n",
"\n",
"Potty :P\n",
"\n",
"Probably the first time you'll hear me say jt\n",
"\n"
]
}
],
"source": [
"respond_like_me(\"What's up?\",4,key_embeddings,keys)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"So so hungry\n",
"\n",
"Reeeaaaallly hungry\n",
"\n",
"I am in the mood to eat\n",
"\n",
"I want to eat that so badly. 😣\n",
"\n",
"I want that food\n",
"\n"
]
}
],
"source": [
"speak_like_me(\"I am so hungry\",5,your_embeddings,other_embeddings,your_sentences)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "torch_tens",
"language": "python",
"name": "torch_tens"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
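Note: the notebook above loads res/your_embeddings.p, res/other_embeddings.p, and res/key_embeddings.p, but no script in this commit produces them. Below is a minimal sketch of how they could be generated, assuming the sentence pickles written by clean_whatsapp_chats.py (further down) and the embed_sentence_lite helper from the notebook's first cell; the file name build_embeddings.py and the exact layout are hypothetical.

# build_embeddings.py -- hypothetical sketch, not part of this commit.
# Assumes embed_sentence_lite() (from Me_Bot.ipynb's first cell) is available
# in scope, and that clean_whatsapp_chats.py has already written its pickles.
import pickle

with open('res/your_sents.p', 'rb') as f:
    your_sents = pickle.load(f)
with open('res/other_sents.p', 'rb') as f:
    other_sents = pickle.load(f)
with open('res/dilogues.p', 'rb') as f:
    pr_to_sp = pickle.load(f)

# Embed each set of sentences with the Universal Sentence Encoder Lite.
your_embeddings = embed_sentence_lite(your_sents)
other_embeddings = embed_sentence_lite(other_sents)
# The notebook's "keys" are the other person's messages that received a reply.
key_embeddings = embed_sentence_lite(list(pr_to_sp.keys()))

for name, arr in [('your_embeddings', your_embeddings),
                  ('other_embeddings', other_embeddings),
                  ('key_embeddings', key_embeddings)]:
    with open('res/%s.p' % name, 'wb') as f:
        pickle.dump(arr, f)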
62 changes: 62 additions & 0 deletions clean_whatsapp_chats.py
@@ -0,0 +1,62 @@
import pickle
import random
import sys

chat_file = sys.argv[1]

with open(chat_file, 'r') as f:
    content = f.readlines()

all_text = []
your_sents = []
other_sents = []

YOUR_NAME = 'Spandan Madan'
OTHER_NAME = 'Pragya Maini'

# Maps each of the other person's messages to the reply you sent ('pr' -> 'sp').
prev_pr_to_sp = {}
prev = None
# Skip the first line (typically the WhatsApp export header).
for line in content[1:]:
    # Skip placeholders for calls and omitted media.
    if 'Missed Voice Call' in line:
        continue
    if 'image omitted' in line:
        continue
    if ' %s: ' % YOUR_NAME in line:
        text = line.split(' %s: ' % YOUR_NAME)[-1]
        your_sents.append(text)
        all_text.append(text)
        if prev is None:
            continue
        if prev == 'pr':
            # Your message answers the other person's previous message.
            prev_pr_to_sp[other_sents[-1]] = text
        prev = 'sp'
    elif ' %s: ' % OTHER_NAME in line:
        text = line.split(' %s: ' % OTHER_NAME)[-1]
        other_sents.append(text)
        all_text.append(text)
        prev = 'pr'
    else:
        # Continuation of a multi-line message: append it to the last sentence.
        print(line)
        all_text[-1] += line

        if prev == 'sp':
            your_sents[-1] += line
        elif prev == 'pr':
            other_sents[-1] += line

with open('res/dilogues.p', 'wb') as f:
    pickle.dump(prev_pr_to_sp, f)


with open('res/your_sents.p', 'wb') as f:
    pickle.dump(your_sents, f)

with open('res/other_sents.p', 'wb') as f:
    pickle.dump(other_sents, f)
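A usage note: the script takes the exported chat file as its only argument and writes its pickles into a res/ directory, which must already exist. Assuming an export named _chat.txt (the file name is illustrative):

mkdir res
python clean_whatsapp_chats.py _chat.txt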
