Document Similarity

Measuring Similarity between Documents using Word Embeddings

Srihari Thyagarajan

Last updated on Jun 3, 2024 8 min read Programming, Text Analysis, Data Science, Academic

Photo Credit : Geeks for Geeks

# import libraries used:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, ne_chunk
import gensim
import re
import spacy
from time import time
from gensim.models import Word2Vec

from nltk.stem import WordNetLemmatizer

from gensim.models.phrases import Phrases, Phraser
import multiprocessing
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re

Task 1:

a. Document Similarity using:

1. word2vec

2. glove

Method 1: Using word2vec

df = pd.read_csv("/content/simpsons_dataset.csv")

Basic EDA:

df.head()

	raw_character_text	spoken_words
0	Miss Hoover	No, actually, it was a little of both. Sometim...
1	Lisa Simpson	Where's Mr. Bergstrom?
2	Miss Hoover	I don't know. Although I'd sure like to talk t...
3	Lisa Simpson	That life is worth living.
4	Edna Krabappel-Flanders	The polls will be open from now until the end ...

<script>
  const buttonEl =
    document.querySelector('#df-3c5bc971-a412-472a-9349-37a97632bdb0 button.colab-df-convert');
  buttonEl.style.display =
    google.colab.kernel.accessAllowed ? 'block' : 'none';

  async function convertToInteractive(key) {
    const element = document.querySelector('#df-3c5bc971-a412-472a-9349-37a97632bdb0');
    const dataTable =
      await google.colab.kernel.invokeFunction('convertToInteractive',
                                                [key], {});
    if (!dataTable) return;

    const docLinkHtml = 'Like what you see? Visit the ' +
      '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
      + ' to learn more about interactive tables.';
    element.innerHTML = '';
    dataTable['output_type'] = 'display_data';
    await google.colab.output.renderOutput(dataTable, element);
    const docLink = document.createElement('div');
    docLink.innerHTML = docLinkHtml;
    element.appendChild(docLink);
  }
</script>

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106730 entries, 0 to 106729
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   raw_character_text  94628 non-null  object
 1   spoken_words        88999 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB

df.describe()

	raw_character_text	spoken_words
count	94628	88999
unique	4842	82720
top	Homer Simpson	No.
freq	20587	212

<script>
  const buttonEl =
    document.querySelector('#df-8b0c8324-6a2b-4f6c-bd83-40ca4da975b1 button.colab-df-convert');
  buttonEl.style.display =
    google.colab.kernel.accessAllowed ? 'block' : 'none';

  async function convertToInteractive(key) {
    const element = document.querySelector('#df-8b0c8324-6a2b-4f6c-bd83-40ca4da975b1');
    const dataTable =
      await google.colab.kernel.invokeFunction('convertToInteractive',
                                                [key], {});
    if (!dataTable) return;

    const docLinkHtml = 'Like what you see? Visit the ' +
      '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
      + ' to learn more about interactive tables.';
    element.innerHTML = '';
    dataTable['output_type'] = 'display_data';
    await google.colab.output.renderOutput(dataTable, element);
    const docLink = document.createElement('div');
    docLink.innerHTML = docLinkHtml;
    element.appendChild(docLink);
  }
</script>

df.dtypes

raw_character_text    object
spoken_words          object
dtype: object

Preprocessing:

df.isnull().sum()

raw_character_text    12102
spoken_words          17731
dtype: int64

df = df.dropna().reset_index(drop=True)
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
# nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    """Return the space-joined lemmas of the non-stop-word tokens in *doc*.

    *doc* is iterated token by token; every token must expose ``lemma_`` and
    ``is_stop`` (a spaCy ``Doc`` satisfies this).

    Word2Vec learns a word from its surrounding context, so a sentence that
    is only one or two words long contributes very little to training.
    When two or fewer lemmas survive the stop-word filter, ``None`` is
    returned instead of a string.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

# Replace every run of characters other than letters and apostrophes with a
# single space, then lowercase.  This is a generator, so rows are cleaned
# lazily as nlp.pipe consumes them.
non_letters = re.compile("[^A-Za-z']+")
brief_cleaning = (non_letters.sub(' ', str(row)).lower() for row in df['spoken_words'])

t = time()
# Stream the rows through the spaCy pipeline in batches and reduce each
# resulting Doc with cleaning().
txt = []
for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1):
    txt.append(cleaning(doc))

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 3.76 mins

# cleaning() returns None for lines with two or fewer remaining lemmas, so
# drop those rows first and then drop repeated lines.
df_clean = pd.DataFrame({'clean': txt}).dropna().drop_duplicates()
df_clean.shape

(58381, 1)

# Split every cleaned line on whitespace into a list of tokens.
sent = [row.split() for row in df_clean['clean']]

# Fit the phrase model on the tokenised corpus, with a count threshold of 30.
phrases = Phrases(sent, min_count=30, progress_per=10000)

# NOTE(review): Phraser presumably wraps the fitted model in a lighter form
# meant only for applying it - confirm against the gensim documentation.
bigram = Phraser(phrases)

# Apply the phrase model to the corpus.  Tokens containing an underscore in
# the results further down (e.g. 'homer_simpson') are joined word pairs.
sentences = bigram[sent]

Training the model

cores = multiprocessing.cpu_count()

# Configure the model only; the vocabulary and the training run follow in
# separate steps.
w2v_model = Word2Vec(min_count=20,        # ignore words seen fewer than 20 times
                     window=2,            # context: up to 2 words on either side
                     vector_size=300,     # dimensionality of the word vectors
                     sample=6e-5,         # down-sampling threshold for frequent words
                     alpha=0.03,          # initial learning rate
                     min_alpha=0.0007,    # learning rate decays to this value
                     negative=20,         # noise words drawn per negative sample
                     # Leave one core free, but never request fewer than one
                     # worker: on a single-core machine `cores - 1` is 0.
                     workers=max(1, cores - 1))

# Build the model vocabulary from the corpus and report the elapsed minutes.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
elapsed_mins = round((time() - t) / 60, 2)

print('Time to build vocab: {} mins'.format(elapsed_mins))

Time to build vocab: 0.01 mins

# Train for 30 epochs over the corpus counted by build_vocab and report the
# elapsed minutes.
t = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_mins = round((time() - t) / 60, 2)

print('Time to train the model: {} mins'.format(elapsed_mins))

Time to train the model: 0.74 mins

# NOTE(review): init_sims() is deprecated since Gensim 4.0 (see the warning
# printed below).  The call is no longer required and is slated for removal,
# so it can be dropped the next time this notebook is re-run.
w2v_model.init_sims(replace=True)

<ipython-input-36-c7757d71a30b>:1: DeprecationWarning: Call to deprecated `init_sims` (Gensim 4.0.0 implemented internal optimizations that make calls to init_sims() unnecessary. init_sims() is now obsoleted and will be completely removed in future versions. See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4).
  w2v_model.init_sims(replace=True)
WARNING:gensim.models.keyedvectors:destructive init_sims(replace=True) deprecated & no longer required for space-efficiency

Let’s check out the words most similar to the input word “homer”:

w2v_model.wv.most_similar(positive=["homer"])

[('sweetheart', 0.9087448120117188),
 ('happen', 0.8906446099281311),
 ('mom_dad', 0.8843812942504883),
 ('gee', 0.8783718347549438),
 ('sorry', 0.8774828910827637),
 ('married', 0.874936580657959),
 ('worried', 0.8655418157577515),
 ('glad', 0.8642256855964661),
 ('screw', 0.8609371781349182),
 ('kid', 0.8605560064315796)]

w2v_model.wv.most_similar(positive=["homer_simpson"])

[('lady_gentleman', 0.9090253710746765),
 ('select', 0.8961604833602905),
 ('trial', 0.8919457793235779),
 ('host', 0.8909618854522705),
 ('winner', 0.8879182934761047),
 ('arrest', 0.8862953186035156),
 ('enemy', 0.8829513788223267),
 ('dedicate', 0.8790059089660645),
 ('broadcast', 0.8761047124862671),
 ('current', 0.8749897480010986)]

Method 2: Using glove

df = pd.read_table("/content/Restaurant_Reviews.tsv")

df.head()

	Review	Liked
0	Wow... Loved this place.	1
1	Crust is not good.	0
2	Not tasty and the texture was just nasty.	0
3	Stopped by during the late May bank holiday of...	1
4	The selection on the menu was great and so wer...	1

<script>
  const buttonEl =
    document.querySelector('#df-8faed915-e973-429e-a59b-4367dd42a0c5 button.colab-df-convert');
  buttonEl.style.display =
    google.colab.kernel.accessAllowed ? 'block' : 'none';

  async function convertToInteractive(key) {
    const element = document.querySelector('#df-8faed915-e973-429e-a59b-4367dd42a0c5');
    const dataTable =
      await google.colab.kernel.invokeFunction('convertToInteractive',
                                                [key], {});
    if (!dataTable) return;

    const docLinkHtml = 'Like what you see? Visit the ' +
      '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
      + ' to learn more about interactive tables.';
    element.innerHTML = '';
    dataTable['output_type'] = 'display_data';
    await google.colab.output.renderOutput(dataTable, element);
    const docLink = document.createElement('div');
    docLink.innerHTML = docLinkHtml;
    element.appendChild(docLink);
  }
</script>

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB

nlp = spacy.load('en_core_web_sm')

# Convert the frame to a NumPy array.  Column 0 (the review text) is replaced
# in place, row by row, with that review's list of cleaned tokens.
array = df.to_numpy()

# English stop words plus a few punctuation tokens.
stop_words = set(stopwords.words('english'))
stop_words.update({',', '.', '“', '’'})

# Build the filter set once so each membership test is a hash lookup instead
# of re-reading the stop-word lists for every token.
# NOTE(review): stopwords.words() without a language returns the lists of
# every language NLTK ships, so English words that happen to be stop words in
# another language are dropped as well.  Filter on `stop_words` alone if
# English-only filtering is wanted (this will change the vocabulary).
all_stop_words = stop_words.union(stopwords.words())

# Kept available for experimentation; tokens are currently neither stemmed
# nor lemmatized.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

url_pattern = re.compile(r'http://\S+|https://\S+')
strip_punctuation = str.maketrans('', '', string.punctuation)

for row in array:
    text = row[0].lower()
    # URLs have to be removed before punctuation is stripped: once '://' is
    # gone the pattern can no longer match anything.
    text = url_pattern.sub('', text)
    text = ''.join(ch for ch in text if not ch.isdigit())  # drop digits
    text = text.strip().translate(strip_punctuation)
    row[0] = [word for word in word_tokenize(text) if word not in all_stop_words]

# Column 0 holds one token list per review.
a_list = array[:, 0]

# The vocabulary is every distinct token that occurs in any review.
Total_keywords = {token for tokens in a_list for token in tokens}
print('Total number of keywords is : ',len(Total_keywords))

Total number of keywords is :  1714

# Square co-occurrence matrix over the vocabulary: matrix[s][i] counts how
# often keyword s appears immediately before or after keyword i.
Total_keywords = list(Total_keywords)  # fix an order so positions are stable
n = len(Total_keywords)
matrix = np.zeros(shape=(n, n))

# Map each keyword to its row/column so lookups do not scan the vocabulary.
keyword_index = {word: pos for pos, word in enumerate(Total_keywords)}

for tokens in array[:, 0]:
    # Tokens missing from the vocabulary map to None and are skipped.
    positions = [keyword_index.get(word) for word in tokens]
    for k, col in enumerate(positions):
        if col is None:
            continue
        # Count the left and the right neighbour independently.  The explicit
        # bounds check keeps k - 1 from wrapping around to the last token, so
        # a one-token review no longer counts the word as its own neighbour,
        # and a word surrounded by the same neighbour twice is counted twice
        # (which keeps the matrix symmetric).
        for neighbour in (k - 1, k + 1):
            if 0 <= neighbour < len(positions) and positions[neighbour] is not None:
                matrix[positions[neighbour], col] += 1

# Wrap the neighbour-count matrix in a DataFrame whose columns are the keywords.
Glove = pd.DataFrame(matrix, columns = Total_keywords )
# Display only: the result of set_index() is not assigned, so Glove itself
# keeps its default integer row index.  The later Glove.loc[f, 'great']
# lookup, where f is an integer position, relies on that.
Glove.set_index([pd.Index(Total_keywords)])

	nyc	loving	revisiting	vegetables	blanket	occasional	welcome	coziness	tartare	feeling	...	fillet	app	saffron	quality	hella	rightthe	waiting	drawing	unbelievably	flatlined
nyc	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
loving	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
revisiting	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
vegetables	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
blanket	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
rightthe	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
waiting	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
drawing	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
unbelievably	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
flatlined	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

1714 rows × 1714 columns

<script>
  const buttonEl =
    document.querySelector('#df-cfb379d2-73f4-446b-b215-59281731bc39 button.colab-df-convert');
  buttonEl.style.display =
    google.colab.kernel.accessAllowed ? 'block' : 'none';

  async function convertToInteractive(key) {
    const element = document.querySelector('#df-cfb379d2-73f4-446b-b215-59281731bc39');
    const dataTable =
      await google.colab.kernel.invokeFunction('convertToInteractive',
                                                [key], {});
    if (!dataTable) return;

    const docLinkHtml = 'Like what you see? Visit the ' +
      '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
      + ' to learn more about interactive tables.';
    element.innerHTML = '';
    dataTable['output_type'] = 'display_data';
    await google.colab.output.renderOutput(dataTable, element);
    const docLink = document.createElement('div');
    docLink.innerHTML = docLinkHtml;
    element.appendChild(docLink);
  }
</script>

Cosine distance (1 − cosine similarity):

#Comparing two vectors by Cosin distance
from scipy.spatial import distance
a = Glove['unsatisfying']#word vector a
b = Glove['refused']#word vector b
a = a.to_numpy()
b = b.to_numpy()

print('cosin similarity is :', distance.cosine(a, b))

cosin similarity is : 1.0

# Words under comparison; change them here only.
word_a, word_b = 'menu', 'great'

a = Glove[word_a].to_numpy()  # neighbour-count vector of word a
b = Glove[word_b].to_numpy()  # neighbour-count vector of word b

# Total occurrences of word a over all tokenised reviews.
counter = sum(tokens.count(word_a) for tokens in array[:, 0])

# Row position of word a.  Glove keeps an integer row index, so the row is
# addressed by position.  list.index raises ValueError when word a is not in
# the vocabulary, instead of leaving f undefined or stale.
f = Total_keywords.index(word_a)

# Number of times word a appears directly before or after word b.
s = Glove.loc[f, word_b]

print('Total number of word a :',counter)
print('Total number of word a after or befor b:',s)
print('probability of event word a to word b is:',s/counter*100,'%')
# distance.cosine is the cosine *distance* (1 - cosine similarity), so it is
# labelled as a distance.
print('cosin distance is :', round(distance.cosine(a, b),3))

Total number of word a : 15
Total number of word a after or befor b: 2.0
probability of event word a to word b is: 13.333333333333334 %
cosin similarity is : 0.961

Edit this page

Natural Language Processing Document Similarity Word Embeddings Word2Vec GloVe