
Commit

Adding files
[email protected] committed Nov 18, 2018
1 parent eeb93fc commit 1c6c663
Showing 3 changed files with 621 additions and 0 deletions.
233 changes: 233 additions & 0 deletions Me_Bot.ipynb
@@ -0,0 +1,233 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.\n",
"SentencePiece model loaded at b'/tmp/tfhub_modules/539544f0a997d91c327c23285ea00c37588d92cc/assets/universal_encoder_8k_spm.model'.\n"
]
}
],
"source": [
"import sys\n",
"sys.path.append('/usr/local/lib/python3.5/dist-packages/')\n",
"import tensorflow as tf\n",
"import tensorflow_hub as hub\n",
"import numpy as np\n",
"import os\n",
"import http.client, urllib.request, urllib.parse, urllib.error, base64\n",
"import json\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"import pickle\n",
"import sentencepiece as spm\n",
"\n",
"module_url = \"https://tfhub.dev/google/universal-sentence-encoder-lite/2\"\n",
"embed = hub.Module(module_url)\n",
"tf.logging.set_verbosity(tf.logging.WARN)\n",
"\n",
"module = hub.Module(\"https://tfhub.dev/google/universal-sentence-encoder-lite/2\")\n",
"input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])\n",
"encodings = module(\n",
" inputs=dict(\n",
" values=input_placeholder.values,\n",
" indices=input_placeholder.indices,\n",
" dense_shape=input_placeholder.dense_shape))\n",
"\n",
"with tf.Session() as sess:\n",
" spm_path = sess.run(module(signature=\"spm_path\"))\n",
"\n",
"sp = spm.SentencePieceProcessor()\n",
"sp.Load(spm_path)\n",
"print(\"SentencePiece model loaded at {}.\".format(spm_path))\n",
"\n",
"def process_to_IDs_in_sparse_format(sp, sentences):\n",
" # An utility method that processes sentences with the sentence piece processor\n",
" # 'sp' and returns the results in tf.SparseTensor-similar format:\n",
" # (values, indices, dense_shape)\n",
" ids = [sp.EncodeAsIds(x) for x in sentences]\n",
" max_len = max(len(x) for x in ids)\n",
" dense_shape=(len(ids), max_len)\n",
" values=[item for sublist in ids for item in sublist]\n",
" indices=[[row,col] for row in range(len(ids)) for col in range(len(ids[row]))]\n",
" return (values, indices, dense_shape)\n",
"\n",
"def embed_sentence_lite(sentences):\n",
" messages = sentences\n",
" values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, messages)\n",
"\n",
" # Reduce logging output.\n",
" tf.logging.set_verbosity(tf.logging.ERROR)\n",
"\n",
" with tf.Session() as session:\n",
" session.run([tf.global_variables_initializer(), tf.tables_initializer()])\n",
" message_embeddings = session.run(\n",
" encodings,\n",
" feed_dict={input_placeholder.values: values,\n",
" input_placeholder.indices: indices,\n",
" input_placeholder.dense_shape: dense_shape})\n",
" \n",
" return message_embeddings"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def find_closest(sentence_rep,query_rep,K):\n",
" top_K = np.argsort(np.sqrt((np.sum(np.square(sentence_rep - query_rep),axis=1))))[:K]\n",
" return top_K"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"f = open('res/other_embeddings.p','rb')\n",
"other_embeddings = pickle.load(f)\n",
"f.close()\n",
"\n",
"f = open('res/your_embeddings.p','rb')\n",
"your_embeddings = pickle.load(f)\n",
"f.close()\n",
"\n",
"f = open('res/dilogues.p','rb')\n",
"pr_to_sp = pickle.load(f)\n",
"f.close()\n",
"\n",
"\n",
"f = open('res/your_sents.p','rb')\n",
"your_sentences = pickle.load(f)\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"keys = list(pr_to_sp.keys())"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"f = open('res/key_embeddings.p','rb')\n",
"key_embeddings = pickle.load(f)\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"def speak_like_me(query,K,your_embeddings,other_embeddings,your_sen):\n",
" other_query = [query]\n",
" query_embedding = embed_sentence_lite(other_query)\n",
" closest_your = find_closest(your_embeddings,query_embedding,K)\n",
" for cl in closest_your:\n",
" print(your_sentences[cl])"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"def respond_like_me(query,K,key_embeddings,keys):\n",
" other_query = [query]\n",
" query_embedding = embed_sentence_lite(other_query)\n",
" closest_other = find_closest(key_embeddings,query_embedding,K+2)\n",
" for k in closest_other[3:]:\n",
" print(pr_to_sp[keys[k]])"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Work time now\n",
"\n",
"Potty :P\n",
"\n",
"Probably the first time you'll hear me say jt\n",
"\n"
]
}
],
"source": [
"respond_like_me(\"What's up?\",4,key_embeddings,keys)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"So so hungry\n",
"\n",
"Reeeaaaallly hungry\n",
"\n",
"I am in the mood to eat\n",
"\n",
"I want to eat that so badly. 😣\n",
"\n",
"I want that food\n",
"\n"
]
}
],
"source": [
"speak_like_me(\"I am so hungry\",5,your_embeddings,other_embeddings,your_sentences)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "torch_tens",
"language": "python",
"name": "torch_tens"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
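Note: the notebook above loads res/your_embeddings.p, res/other_embeddings.p, and res/key_embeddings.p, but no script in this commit produces them. Below is a minimal sketch of how they could be generated, assuming the sentence pickles written by clean_whatsapp_chats.py (further down) and the embed_sentence_lite helper from the notebook's first cell; the file name build_embeddings.py and the exact layout are hypothetical.

# build_embeddings.py -- hypothetical sketch, not part of this commit.
# Assumes embed_sentence_lite() (from Me_Bot.ipynb's first cell) is available
# in scope, and that clean_whatsapp_chats.py has already written its pickles.
import pickle

with open('res/your_sents.p', 'rb') as f:
    your_sents = pickle.load(f)
with open('res/other_sents.p', 'rb') as f:
    other_sents = pickle.load(f)
with open('res/dilogues.p', 'rb') as f:
    pr_to_sp = pickle.load(f)

# Embed each set of sentences with the Universal Sentence Encoder Lite.
your_embeddings = embed_sentence_lite(your_sents)
other_embeddings = embed_sentence_lite(other_sents)
# The notebook's "keys" are the other person's messages that received a reply.
key_embeddings = embed_sentence_lite(list(pr_to_sp.keys()))

for name, arr in [('your_embeddings', your_embeddings),
                  ('other_embeddings', other_embeddings),
                  ('key_embeddings', key_embeddings)]:
    with open('res/%s.p' % name, 'wb') as f:
        pickle.dump(arr, f)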
62 changes: 62 additions & 0 deletions clean_whatsapp_chats.py
@@ -0,0 +1,62 @@
import pickle
import random
import sys

chat_file = sys.argv[1]

with open(chat_file, 'r') as f:
    content = f.readlines()

all_text = []
your_sents = []
other_sents = []

YOUR_NAME = 'Spandan Madan'
OTHER_NAME = 'Pragya Maini'

# Maps each of the other person's messages to the reply you sent ('pr' -> 'sp').
prev_pr_to_sp = {}
prev = None
# Skip the first line (typically the WhatsApp export header).
for line in content[1:]:
    # Skip placeholders for calls and omitted media.
    if 'Missed Voice Call' in line:
        continue
    if 'image omitted' in line:
        continue
    if ' %s: ' % YOUR_NAME in line:
        text = line.split(' %s: ' % YOUR_NAME)[-1]
        your_sents.append(text)
        all_text.append(text)
        if prev is None:
            continue
        if prev == 'pr':
            # Your message answers the other person's previous message.
            prev_pr_to_sp[other_sents[-1]] = text
        prev = 'sp'
    elif ' %s: ' % OTHER_NAME in line:
        text = line.split(' %s: ' % OTHER_NAME)[-1]
        other_sents.append(text)
        all_text.append(text)
        prev = 'pr'
    else:
        # Continuation of a multi-line message: append it to the last sentence.
        print(line)
        all_text[-1] += line

        if prev == 'sp':
            your_sents[-1] += line
        elif prev == 'pr':
            other_sents[-1] += line

with open('res/dilogues.p', 'wb') as f:
    pickle.dump(prev_pr_to_sp, f)


with open('res/your_sents.p', 'wb') as f:
    pickle.dump(your_sents, f)

with open('res/other_sents.p', 'wb') as f:
    pickle.dump(other_sents, f)
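A usage note: the script takes the exported chat file as its only argument and writes its pickles into a res/ directory, which must already exist. Assuming an export named _chat.txt (the file name is illustrative):

mkdir res
python clean_whatsapp_chats.py _chat.txt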
