# import libraries used:
import re
import string
import multiprocessing
from time import time

import numpy as np
import pandas as pd
from tqdm import tqdm

import nltk
from nltk import pos_tag, ne_chunk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

import spacy

import gensim
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.initializers import Constant
from keras.optimizers import Adam
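The NLTK tokenizer, stop-word lists, and WordNet lemmatizer used below rely on corpora that are downloaded separately from the library itself; a minimal one-time setup (assuming a fresh environment such as Colab):
nltk.download('punkt')       # tokenizer models used by word_tokenize / sent_tokenize
nltk.download('stopwords')   # stop-word lists used by stopwords.words()
nltk.download('wordnet')     # WordNet data used by WordNetLemmatizer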
Task 1:
a. Document Similarity using:
1. word2vec
2. GloVe
Method 1: Using word2vec
df = pd.read_csv("/content/simpsons_dataset.csv")
Basic EDA:
| | raw_character_text | spoken_words |
|---|---|---|
| 0 | Miss Hoover | No, actually, it was a little of both. Sometim... |
| 1 | Lisa Simpson | Where's Mr. Bergstrom? |
| 2 | Miss Hoover | I don't know. Although I'd sure like to talk t... |
| 3 | Lisa Simpson | That life is worth living. |
| 4 | Edna Krabappel-Flanders | The polls will be open from now until the end ... |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106730 entries, 0 to 106729
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 raw_character_text 94628 non-null object
1 spoken_words 88999 non-null object
dtypes: object(2)
memory usage: 1.6+ MB
| | raw_character_text | spoken_words |
|---|---|---|
| count | 94628 | 88999 |
| unique | 4842 | 82720 |
| top | Homer Simpson | No. |
| freq | 20587 | 212 |
raw_character_text object
spoken_words object
dtype: object
Preprocessing:
df.isnull().sum()
raw_character_text 12102
spoken_words 17731
dtype: int64
df = df.dropna().reset_index(drop=True)
df.isnull().sum()
raw_character_text 0
spoken_words 0
dtype: int64
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])  # disabling NER and the parser for speed
def cleaning(doc):
    # Lemmatizes and removes stopwords; doc needs to be a spaCy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word;
    # if a sentence is only one or two words long, the benefit for training is very small
    if len(txt) > 2:
        return ' '.join(txt)

brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])
t = time()
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))
Time to clean up everything: 3.76 mins
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape
(58381, 1)
sent = [row.split() for row in df_clean['clean']]
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
sentences = bigram[sent]  # apply the detected bigrams; this is the corpus used for training below
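As a quick sanity check (illustrative only, not part of the original run), the fitted Phraser can be applied to a single cleaned line; token pairs seen at least min_count=30 times are merged into one token, which is why phrases such as 'homer_simpson' appear in the similarity results further down:
print(bigram[df_clean['clean'].iloc[0].split()])  # frequent pairs are joined with an underscore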
Training the model
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores - 1)
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
Time to build vocab: 0.01 mins
t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
Time to train the model: 0.74 mins
w2v_model.init_sims(replace=True)
<ipython-input-36-c7757d71a30b>:1: DeprecationWarning: Call to deprecated `init_sims` (Gensim 4.0.0 implemented internal optimizations that make calls to init_sims() unnecessary. init_sims() is now obsoleted and will be completely removed in future versions. See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4).
w2v_model.init_sims(replace=True)
WARNING:gensim.models.keyedvectors:destructive init_sims(replace=True) deprecated & no longer required for space-efficiency
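As the warning states, Gensim 4.x normalizes vectors on demand, so the init_sims() call can simply be dropped; if explicitly unit-length vectors are wanted, the Gensim 4 replacement is along these lines:
normed = w2v_model.wv.get_normed_vectors()  # L2-normalized copies of all word vectors (Gensim 4 API)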
w2v_model.wv.most_similar(positive=["homer"])
[('sweetheart', 0.9087448120117188),
('happen', 0.8906446099281311),
('mom_dad', 0.8843812942504883),
('gee', 0.8783718347549438),
('sorry', 0.8774828910827637),
('married', 0.874936580657959),
('worried', 0.8655418157577515),
('glad', 0.8642256855964661),
('screw', 0.8609371781349182),
('kid', 0.8605560064315796)]
w2v_model.wv.most_similar(positive=["homer_simpson"])
[('lady_gentleman', 0.9090253710746765),
('select', 0.8961604833602905),
('trial', 0.8919457793235779),
('host', 0.8909618854522705),
('winner', 0.8879182934761047),
('arrest', 0.8862953186035156),
('enemy', 0.8829513788223267),
('dedicate', 0.8790059089660645),
('broadcast', 0.8761047124862671),
('current', 0.8749897480010986)]
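The calls above compare single words. For the document-similarity goal of the task, one option (a sketch, not part of the original run; doc_vector is a hypothetical helper) is to average the Word2Vec vectors of each document's in-vocabulary tokens and compare two documents with cosine similarity:
from scipy.spatial import distance

def doc_vector(tokens, kv):
    # average the vectors of the tokens that are in the Word2Vec vocabulary
    vecs = [kv[w] for w in tokens if w in kv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(kv.vector_size)

doc_a = doc_vector(bigram[df_clean['clean'].iloc[0].split()], w2v_model.wv)
doc_b = doc_vector(bigram[df_clean['clean'].iloc[1].split()], w2v_model.wv)
print('document cosine similarity:', 1 - distance.cosine(doc_a, doc_b))
Gensim's own w2v_model.wv.n_similarity(tokens_a, tokens_b) implements essentially the same averaged-vector comparison.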
Method 2: Using GloVe (building a GloVe-style word co-occurrence matrix)
df = pd.read_table("/content/Restaurant_Reviews.tsv")
| | Review | Liked |
|---|---|---|
| 0 | Wow... Loved this place. | 1 |
| 1 | Crust is not good. | 0 |
| 2 | Not tasty and the texture was just nasty. | 0 |
| 3 | Stopped by during the late May bank holiday of... | 1 |
| 4 | The selection on the menu was great and so wer... | 1 |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Review 1000 non-null object
1 Liked 1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.8+ KB
nlp = spacy.load('en_core_web_sm')
array = df.to_numpy()  # convert to a NumPy array
# stop-word list
stop_words = set(stopwords.words('english'))
# add punctuation marks to stop_words
stop_words.add(',')
stop_words.add('.')
stop_words.add('“')
stop_words.add('’')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_list = set(stopwords.words())  # full NLTK stop-word list, precomputed once for speed
l = len(array)
for i in range(l):
    array[i][0] = array[i][0].lower()  # upper to lower case
    array[i][0] = ''.join(z for z in array[i][0] if not z.isdigit())  # remove numbers
    array[i][0] = array[i][0].strip()  # remove surrounding white space
    array[i][0] = array[i][0].translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    array[i][0] = re.sub(r'http://\S+|https://\S+', '', array[i][0])  # remove http addresses
    word_tokens = word_tokenize(array[i][0])  # tokenize
    array[i][0] = word_tokens
    array[i][0] = [word for word in array[i][0] if word not in stop_list]  # remove stop words
    a = []
    for word in array[i][0]:
        w1 = stemmer.stem(word)   # stemmed form (computed but not used below)
        w2 = "".join(word)        # keep the original token
        # w2 = lemmatizer.lemmatize(word)  # lemmatize instead, if preferred
        a.append(w2)
    array[i][0] = a
a_list = array[:, 0]
Total_keywords = []
l = len(a_list)
for i in range(l):
    l2 = len(a_list[i])
    for j in range(l2):
        Total_keywords.append(a_list[i][j])
# keep only the unique keywords
Total_keywords = set(Total_keywords)
print('Total number of keywords is : ', len(Total_keywords))
Total number of keywords is : 1714
n = len(Total_keywords)  # dimension of the square co-occurrence matrix
matrix = np.zeros(shape=(n, n))  # n*n zero matrix
Total_keywords = list(Total_keywords)
# count, for every keyword i, how often each keyword s appears directly before or after it (±1-word window)
l = len(array)
for i in range(n):
    for j in range(l):
        l2 = len(array[j, 0])
        for k in range(l2):
            if Total_keywords[i] == array[j, 0][k]:
                if k > 0 and k < l2 - 1:
                    for s in range(n):
                        if array[j, 0][k - 1] == Total_keywords[s]:
                            matrix[s][i] = matrix[s][i] + 1
                        elif array[j, 0][k + 1] == Total_keywords[s]:
                            matrix[s][i] = matrix[s][i] + 1
                elif k == 0 and l2 > 1:
                    for s in range(n):
                        if array[j, 0][k + 1] == Total_keywords[s]:
                            matrix[s][i] = matrix[s][i] + 1
                elif k == l2 - 1 and l2 != 0:
                    for s in range(n):
                        if array[j, 0][k - 1] == Total_keywords[s]:
                            matrix[s][i] = matrix[s][i] + 1
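The four nested loops above rescan the whole corpus once per vocabulary word, which becomes slow for larger vocabularies. A single-pass sketch that builds the same ±1-word co-occurrence counts (index and fast_matrix are hypothetical names; the only difference is the single-word-review edge case, where the loop above reads position k-1 = -1 and counts a word as its own neighbour):
index = {w: i for i, w in enumerate(Total_keywords)}  # word -> row/column position
fast_matrix = np.zeros((n, n))
for tokens in a_list:
    for k, w in enumerate(tokens):
        col = index[w]
        # neighbours in a ±1 window; set() mirrors the elif above, which counts a word
        # only once when it is both the left and the right neighbour of an occurrence
        for neighbour in set(tokens[max(k - 1, 0):k] + tokens[k + 1:k + 2]):
            fast_matrix[index[neighbour]][col] += 1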
Glove = pd.DataFrame(matrix, columns=Total_keywords)
Glove.set_index([pd.Index(Total_keywords)])  # display with word labels; not assigned, so the positional .loc lookups below still work
| | nyc | loving | revisiting | vegetables | blanket | occasional | welcome | coziness | tartare | feeling | ... | fillet | app | saffron | quality | hella | rightthe | waiting | drawing | unbelievably | flatlined |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| nyc | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| loving | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| revisiting | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| vegetables | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| blanket | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| rightthe | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| waiting | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| drawing | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| unbelievably | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| flatlined | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1714 rows × 1714 columns
Cosine similarity (via SciPy's cosine distance):
# Comparing two word vectors by cosine distance
from scipy.spatial import distance
a = Glove['unsatisfying']  # word vector a
b = Glove['refused']  # word vector b
a = a.to_numpy()
b = b.to_numpy()
print('cosine distance is :', distance.cosine(a, b))
cosine distance is : 1.0
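Note that scipy.spatial.distance.cosine returns the cosine distance, i.e. 1 minus the cosine similarity, so the similarity itself can be reported as:
print('cosine similarity is :', 1 - distance.cosine(a, b))  # 0.0 here, since the distance above is 1.0: the two vectors share no co-occurring neighbours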
a = Glove['menu']  # word vector a
b = Glove['great']  # word vector b
a = a.to_numpy()
b = b.to_numpy()
l1 = len(array)
counter = 0
# count the occurrences of word a ('menu') across all reviews
for i in range(l1):
    l2 = len(array[i, 0])
    for j in range(l2):
        if 'menu' == array[i, 0][j]:
            counter = counter + 1
# find the row index of 'menu' in the co-occurrence matrix
d = len(Total_keywords)
for i in range(d):
    if 'menu' == Total_keywords[i]:
        f = i
s = Glove.loc[f, 'great']
print('Total number of occurrences of word a:', counter)
print('Number of times word a appears directly before or after word b:', Glove.loc[f, 'great'])
print('share of word-a occurrences with word b adjacent:', s / counter * 100, '%')
print('cosine distance is :', round(distance.cosine(a, b), 3))
Total number of occurrences of word a: 15
Number of times word a appears directly before or after word b: 2.0
share of word-a occurrences with word b adjacent: 13.333333333333334 %
cosine distance is : 0.961
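Finally, the same co-occurrence vectors can be used for the document-similarity goal of the task: represent each review by the average of its words' vectors and compare two reviews with the cosine measure. A sketch reusing Glove, array and Total_keywords from above (review_vector is a hypothetical helper):
def review_vector(tokens):
    # average the co-occurrence columns of the review's words (zero vector if the review is empty)
    vecs = [Glove[w].to_numpy() for w in tokens if w in Glove.columns]
    return np.mean(vecs, axis=0) if vecs else np.zeros(len(Total_keywords))

r0 = review_vector(array[0, 0])  # first preprocessed review
r1 = review_vector(array[1, 0])  # second preprocessed review
print('document cosine similarity:', 1 - distance.cosine(r0, r1))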