Document Similarity

Measuring Similarity between Documents using Word Embeddings

Photo Credit : Geeks for Geeks
# import libraries used:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, ne_chunk
import gensim
import re
import spacy
from time import time
from gensim.models import Word2Vec

from nltk.stem import WordNetLemmatizer

from gensim.models.phrases import Phrases, Phraser
import multiprocessing
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re

Task 1:

a. Document Similarity using:

1. word2vec

2. glove

Method 1: Using word2vec

df = pd.read_csv("/content/simpsons_dataset.csv")

Basic EDA:

df.head()

   raw_character_text        spoken_words
0  Miss Hoover               No, actually, it was a little of both. Sometim...
1  Lisa Simpson              Where's Mr. Bergstrom?
2  Miss Hoover               I don't know. Although I'd sure like to talk t...
3  Lisa Simpson              That life is worth living.
4  Edna Krabappel-Flanders   The polls will be open from now until the end ...
<script>
  const buttonEl =
    document.querySelector('#df-3c5bc971-a412-472a-9349-37a97632bdb0 button.colab-df-convert');
  buttonEl.style.display =
    google.colab.kernel.accessAllowed ? 'block' : 'none';

  async function convertToInteractive(key) {
    const element = document.querySelector('#df-3c5bc971-a412-472a-9349-37a97632bdb0');
    const dataTable =
      await google.colab.kernel.invokeFunction('convertToInteractive',
                                                [key], {});
    if (!dataTable) return;

    const docLinkHtml = 'Like what you see? Visit the ' +
      '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
      + ' to learn more about interactive tables.';
    element.innerHTML = '';
    dataTable['output_type'] = 'display_data';
    await google.colab.output.renderOutput(dataTable, element);
    const docLink = document.createElement('div');
    docLink.innerHTML = docLinkHtml;
    element.appendChild(docLink);
  }
</script>

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106730 entries, 0 to 106729
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   raw_character_text  94628 non-null  object
 1   spoken_words        88999 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB
df.describe()

raw_character_textspoken_words
count9462888999
unique484282720
topHomer SimpsonNo.
freq20587212
<script>
  const buttonEl =
    document.querySelector('#df-8b0c8324-6a2b-4f6c-bd83-40ca4da975b1 button.colab-df-convert');
  buttonEl.style.display =
    google.colab.kernel.accessAllowed ? 'block' : 'none';

  async function convertToInteractive(key) {
    const element = document.querySelector('#df-8b0c8324-6a2b-4f6c-bd83-40ca4da975b1');
    const dataTable =
      await google.colab.kernel.invokeFunction('convertToInteractive',
                                                [key], {});
    if (!dataTable) return;

    const docLinkHtml = 'Like what you see? Visit the ' +
      '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
      + ' to learn more about interactive tables.';
    element.innerHTML = '';
    dataTable['output_type'] = 'display_data';
    await google.colab.output.renderOutput(dataTable, element);
    const docLink = document.createElement('div');
    docLink.innerHTML = docLinkHtml;
    element.appendChild(docLink);
  }
</script>

df.dtypes
raw_character_text    object
spoken_words          object
dtype: object

Preprocessing:

df.isnull().sum()
raw_character_text    12102
spoken_words          17731
dtype: int64
df = df.dropna().reset_index(drop=True)
df.isnull().sum()
raw_character_text    0
spoken_words          0
dtype: int64
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
# nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    """Lemmatize a parsed line and drop its stop words.

    ``doc`` is iterated token by token; each token must expose ``lemma_``
    and ``is_stop`` (as spaCy tokens do).

    Returns the remaining lemmas joined by single spaces, or ``None`` when
    two or fewer lemmas remain. Word2Vec learns a word from its context
    words, so lines that short contribute almost nothing to training.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)
# Lazily reduce every spoken line to lowercase letters and apostrophes;
# any other run of characters collapses to a single space.
brief_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(row)).lower()
    for row in df['spoken_words']
)
t = time()
# Stream the lines through spaCy in batches and clean each parsed Doc.
parsed_docs = nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)
txt = list(map(cleaning, parsed_docs))

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))
Time to clean up everything: 3.76 mins
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape
(58381, 1)
# Turn each cleaned line into a list of whitespace-separated tokens.
sent = [row.split() for row in df_clean['clean']]
# Collect phrase (bigram) statistics over the tokenised corpus.
# NOTE(review): min_count=30 presumably ignores rarer words/bigrams -- confirm against the gensim docs.
phrases = Phrases(sent, min_count=30, progress_per=10000)
# Wrap the fitted Phrases model in a Phraser for applying it to sentences.
bigram = Phraser(phrases)
# Apply the phrase model to the corpus; detected bigrams appear as joined
# tokens (e.g. 'mom_dad', 'homer_simpson' in the similarity results below).
sentences = bigram[sent]

Training the model

# Number of CPU cores available for training.
cores = multiprocessing.cpu_count()
# Configure (but do not yet train) the Word2Vec model.
w2v_model = Word2Vec(min_count=20,       # ignore words seen fewer than 20 times
                     window=2,           # context: up to 2 words on each side
                     vector_size=300,    # dimensionality of the word vectors
                     sample=6e-5,        # down-sampling threshold for frequent words
                     alpha=0.03,         # initial learning rate
                     min_alpha=0.0007,   # learning rate decays down to this value
                     negative=20,        # negative samples drawn per positive example
                     # Leave one core free, but never request zero workers:
                     # on a single-core machine `cores - 1` would be 0 and
                     # no training thread would be started.
                     workers=max(1, cores - 1))
# Build the model vocabulary from the bigram-merged corpus and time the step.
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_mins))
Time to build vocab: 0.01 mins
# Train for 30 epochs over the corpus counted during build_vocab, timing the run.
t = time()

w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))
Time to train the model: 0.74 mins
# NOTE: init_sims() is deprecated since Gensim 4.0.0 and is no longer required
# (see the DeprecationWarning this call emits); the call is kept unchanged here.
w2v_model.init_sims(replace=True)
<ipython-input-36-c7757d71a30b>:1: DeprecationWarning: Call to deprecated `init_sims` (Gensim 4.0.0 implemented internal optimizations that make calls to init_sims() unnecessary. init_sims() is now obsoleted and will be completely removed in future versions. See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4).
  w2v_model.init_sims(replace=True)
WARNING:gensim.models.keyedvectors:destructive init_sims(replace=True) deprecated & no longer required for space-efficiency

Let’s check out which words are most similar to the input word "homer":

w2v_model.wv.most_similar(positive=["homer"])
[('sweetheart', 0.9087448120117188),
 ('happen', 0.8906446099281311),
 ('mom_dad', 0.8843812942504883),
 ('gee', 0.8783718347549438),
 ('sorry', 0.8774828910827637),
 ('married', 0.874936580657959),
 ('worried', 0.8655418157577515),
 ('glad', 0.8642256855964661),
 ('screw', 0.8609371781349182),
 ('kid', 0.8605560064315796)]
w2v_model.wv.most_similar(positive=["homer_simpson"])
[('lady_gentleman', 0.9090253710746765),
 ('select', 0.8961604833602905),
 ('trial', 0.8919457793235779),
 ('host', 0.8909618854522705),
 ('winner', 0.8879182934761047),
 ('arrest', 0.8862953186035156),
 ('enemy', 0.8829513788223267),
 ('dedicate', 0.8790059089660645),
 ('broadcast', 0.8761047124862671),
 ('current', 0.8749897480010986)]

Method 2: Using glove

df = pd.read_table("/content/Restaurant_Reviews.tsv")
df.head()

   Review                                               Liked
0  Wow... Loved this place.                             1
1  Crust is not good.                                   0
2  Not tasty and the texture was just nasty.            0
3  Stopped by during the late May bank holiday of...    1
4  The selection on the menu was great and so wer...    1
<script>
  const buttonEl =
    document.querySelector('#df-8faed915-e973-429e-a59b-4367dd42a0c5 button.colab-df-convert');
  buttonEl.style.display =
    google.colab.kernel.accessAllowed ? 'block' : 'none';

  async function convertToInteractive(key) {
    const element = document.querySelector('#df-8faed915-e973-429e-a59b-4367dd42a0c5');
    const dataTable =
      await google.colab.kernel.invokeFunction('convertToInteractive',
                                                [key], {});
    if (!dataTable) return;

    const docLinkHtml = 'Like what you see? Visit the ' +
      '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
      + ' to learn more about interactive tables.';
    element.innerHTML = '';
    dataTable['output_type'] = 'display_data';
    await google.colab.output.renderOutput(dataTable, element);
    const docLink = document.createElement('div');
    docLink.innerHTML = docLinkHtml;
    element.appendChild(docLink);
  }
</script>

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB
# Load the small English spaCy pipeline with its default components.
nlp = spacy.load('en_core_web_sm')
# Row-wise NumPy view of the reviews frame: column 0 is Review, column 1 is Liked.
array = df.to_numpy()

# English stop words from NLTK, extended with a few punctuation tokens.
stop_words = set(stopwords.words('english')) | {',', '.', '“', '’'}

# Stemmer and lemmatizer instances for the preprocessing that follows.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Normalise every review in place: array[i][0] goes from the raw review string
# to a list of lowercase tokens with URLs, digits, punctuation and stop words
# removed.

# NLTK's stop-word list across *all* languages (same list the filter always
# used), built once as a set instead of being re-read for every single token.
# NOTE(review): the English-only `stop_words` set defined earlier is never
# used here -- switch to it if English-only filtering was the intent; doing so
# would change the resulting vocabulary.
all_stopwords = set(stopwords.words())
# Raw string so that "\S" is a regex class, not an invalid string escape.
url_pattern = re.compile(r'http://\S+|https://\S+')
punctuation_table = str.maketrans('', '', string.punctuation)

for row in array:
    text = row[0].lower()                                # upper to lower case
    # URLs have to be removed BEFORE punctuation is stripped: once "://" is
    # gone the pattern can no longer match anything.
    text = url_pattern.sub('', text)
    text = ''.join(z for z in text if not z.isdigit())   # remove digits
    text = text.strip()                                  # trim outer whitespace
    text = text.translate(punctuation_table)             # remove punctuation
    word_tokens = word_tokenize(text)                    # tokenize
    # Tokens are kept as-is (no stemming or lemmatisation is applied).
    row[0] = [word for word in word_tokens if word not in all_stopwords]
# Column 0 of the array holds one token list per review.
a_list = array[:, 0]

# Flatten all token lists into a single list of keywords...
Total_keywords = []
for tokens in a_list:
    Total_keywords.extend(tokens)

# ...and reduce it to the distinct vocabulary.
Total_keywords = set(Total_keywords)
print('Total number of keywords is : ', len(Total_keywords))
Total number of keywords is :  1714
def _cooccurrence_counts(documents, keywords):
    """Build the keyword-by-keyword adjacency count matrix.

    For every token occurrence (column ``i``) the row of each *distinct*
    directly adjacent token (previous and/or next) is incremented by one, so
    ``counts[s][i]`` is how often keyword ``s`` stood next to keyword ``i``.
    When the previous and next token are the same word it is counted once,
    as before.

    A review consisting of a single token has no neighbours and contributes
    nothing. Previously ``tokens[k - 1]`` with ``k == 0`` wrapped around to
    index -1 and counted the word as its own neighbour.

    Args:
        documents: iterable of token lists.
        keywords: list of distinct keywords; position defines row/column.

    Returns:
        A ``len(keywords) x len(keywords)`` float array of counts.
    """
    # One dict lookup per token replaces a scan over the whole vocabulary.
    index_of = {word: idx for idx, word in enumerate(keywords)}
    size = len(keywords)
    counts = np.zeros(shape=(size, size))

    for tokens in documents:
        last = len(tokens) - 1
        for k, word in enumerate(tokens):
            col = index_of.get(word)
            if col is None:
                continue
            neighbours = set()
            if k > 0:
                neighbours.add(tokens[k - 1])
            if k < last:
                neighbours.add(tokens[k + 1])
            for neighbour in neighbours:
                row = index_of.get(neighbour)
                if row is not None:
                    counts[row][col] += 1
    return counts


n = len(Total_keywords)  # side length of the square co-occurrence matrix
Total_keywords = list(Total_keywords)  # fix an order so words map to indices
matrix = _cooccurrence_counts(array[:, 0], Total_keywords)  # n x n counts
# Wrap the co-occurrence matrix in a DataFrame whose columns are the keywords.
Glove = pd.DataFrame(matrix, columns = Total_keywords )
# set_index returns a new frame and the result is not assigned, so this line
# only produces a keyword-labelled view for display. Glove itself keeps its
# integer row index, which the later `Glove.loc[f, 'great']` lookup relies on.
Glove.set_index([pd.Index(Total_keywords)])

nyclovingrevisitingvegetablesblanketoccasionalwelcomecozinesstartarefeeling...filletappsaffronqualityhellarightthewaitingdrawingunbelievablyflatlined
nyc0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
loving0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
revisiting0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
vegetables0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
blanket0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
rightthe0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
waiting0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
drawing0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
unbelievably0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
flatlined0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0

1714 rows × 1714 columns

<script>
  const buttonEl =
    document.querySelector('#df-cfb379d2-73f4-446b-b215-59281731bc39 button.colab-df-convert');
  buttonEl.style.display =
    google.colab.kernel.accessAllowed ? 'block' : 'none';

  async function convertToInteractive(key) {
    const element = document.querySelector('#df-cfb379d2-73f4-446b-b215-59281731bc39');
    const dataTable =
      await google.colab.kernel.invokeFunction('convertToInteractive',
                                                [key], {});
    if (!dataTable) return;

    const docLinkHtml = 'Like what you see? Visit the ' +
      '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
      + ' to learn more about interactive tables.';
    element.innerHTML = '';
    dataTable['output_type'] = 'display_data';
    await google.colab.output.renderOutput(dataTable, element);
    const docLink = document.createElement('div');
    docLink.innerHTML = docLinkHtml;
    element.appendChild(docLink);
  }
</script>

Cosine Similarity distance:

#Comparing two vectors by Cosin distance
from scipy.spatial import distance
a = Glove['unsatisfying']#word vector a
b = Glove['refused']#word vector b
a = a.to_numpy()
b = b.to_numpy()
print('cosin similarity is :', distance.cosine(a, b))
cosin similarity is : 1.0
# Relate the keywords 'menu' (word a) and 'great' (word b).
a = Glove['menu'].to_numpy()  # word vector a
b = Glove['great'].to_numpy()  # word vector b

# Total number of occurrences of 'menu' across all tokenised reviews.
counter = sum(tokens.count('menu') for tokens in array[:, 0])

# Row position of 'menu' in the table (rows follow the Total_keywords order).
f = Total_keywords.index('menu')
# How often 'menu' was counted directly before or after 'great'.
s = Glove.loc[f, 'great']

print('Total number of word a :',counter)
print('Total number of word a after or befor b:',Glove.loc[f, 'great'])
print('probability of event word a to word b is:',s/counter*100,'%')
# distance.cosine is a distance (1 - similarity); convert it so the value
# matches the "similarity" label.
print('cosin similarity is :', round(1 - distance.cosine(a, b),3))
Total number of word a : 15
Total number of word a after or befor b: 2.0
probability of event word a to word b is: 13.333333333333334 %
cosin similarity is : 0.961

Edit this page

Srihari Thyagarajan
Srihari Thyagarajan
B Tech AI Senior Student

Hi, I’m Haleshot, a final-year student studying B Tech Artificial Intelligence. I like projects relating to ML, AI, DL, CV, NLP, Image Processing, etc. Currently exploring Python, FastAPI, projects involving AI and platforms such as HuggingFace and Kaggle.

Next
Previous

Related