# Load the small English pipeline with NER and the dependency parser
# disabled — only the tagger/lemmatizer is needed, and dropping the other
# components makes nlp.pipe() substantially faster.
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
# nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed


def cleaning(doc):
    """Lemmatize *doc* and drop stop words.

    ``doc`` must be a spaCy ``Doc`` object.  Returns the lemmas joined with
    spaces, or ``None`` (implicitly) when fewer than three tokens survive.
    """
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec learns a target word's vector from its context words; a
    # sentence of only one or two tokens contributes almost nothing to
    # training, so such sentences are discarded here.
    if len(lemmas) > 2:
        return ' '.join(lemmas)
# Stream every raw text through the spaCy pipeline in large batches on all
# available cores, collect the cleaned strings, and report the wall time.
t = time()
txt = [
    cleaning(doc)
    for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)
]
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))
# Scan the tokenised sentences once to build the Word2Vec vocabulary,
# timing the pass.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
Time to build vocab: 0.01 mins
# Train the Word2Vec model for 30 epochs over the full corpus and report
# how long training took.
t = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
Time to train the model: 0.74 mins
# `init_sims(replace=True)` is deprecated and destructive in Gensim 4
# (see the DeprecationWarning this very call emits): norms are now handled
# internally and in-place normalisation is no longer needed for memory
# efficiency.  Precompute the vector norms explicitly instead — the
# supported, non-destructive equivalent for read-only similarity queries.
w2v_model.wv.fill_norms()
<ipython-input-36-c7757d71a30b>:1: DeprecationWarning: Call to deprecated `init_sims` (Gensim 4.0.0 implemented internal optimizations that make calls to init_sims() unnecessary. init_sims() is now obsoleted and will be completely removed in future versions. See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4).
w2v_model.init_sims(replace=True)
WARNING:gensim.models.keyedvectors:destructive init_sims(replace=True) deprecated & no longer required for space-efficiency
Let’s check out the words most similar to the input word “homer”.
# Clean every review in the dataframe in place: lowercase, strip digits /
# whitespace / punctuation / URLs, tokenise, and drop stop words.
array = df.to_numpy()  # convert to numpy so rows can be mutated in place

# Build the stop-word set ONCE, outside the loop.  The original code built
# this set and then never used it — the loop called `stopwords.words()`
# with no language argument, which re-reads the stop lists of *every*
# NLTK language on every single row (both wrong and extremely slow).
stop_words = set(stopwords.words('english'))
# Quote characters that survive the punctuation translate() below.
stop_words.update({',', '.', '“', '’'})

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

for i in range(len(array)):
    text = array[i][0]
    text = text.lower()                                    # upper to lower
    text = ''.join(ch for ch in text if not ch.isdigit())  # removing numbers
    text = text.strip()                                    # removing white space
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'http://\S+|https://\S+', '', text)     # remove http address
    tokens = word_tokenize(text)                           # tokenize
    # O(1) membership test against the prebuilt set.
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming/lemmatisation is intentionally disabled: the original loop
    # computed `stemmer.stem(word)` but discarded the result and appended
    # the raw token.  Uncomment one line below to enable either step.
    cleaned = []
    for word in tokens:
        # word = stemmer.stem(word)          # stem
        # word = lemmatizer.lemmatize(word)  # lemmatize
        cleaned.append(word)
    array[i][0] = cleaned
<script>
// Colab boilerplate injected next to a dataframe output cell: it reveals a
// "convert to interactive table" button (only when the notebook kernel is
// reachable) and, on click, asks the kernel to re-render the dataframe as
// an interactive data table.
const buttonEl =
document.querySelector('#df-cfb379d2-73f4-446b-b215-59281731bc39 button.colab-df-convert');
// Hide the button entirely when kernel access is not allowed (e.g. in a
// static/exported view of the notebook).
buttonEl.style.display =
google.colab.kernel.accessAllowed ? 'block' : 'none';
// key: identifier of the dataframe output to convert; the kernel-side
// 'convertToInteractive' function returns a display_data payload.
async function convertToInteractive(key) {
const element = document.querySelector('#df-cfb379d2-73f4-446b-b215-59281731bc39');
const dataTable =
await google.colab.kernel.invokeFunction('convertToInteractive',
[key], {});
// Kernel returned nothing — leave the static table untouched.
if (!dataTable) return;
const docLinkHtml = 'Like what you see? Visit the ' +
'<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
+ ' to learn more about interactive tables.';
// Replace the static HTML table with the freshly rendered interactive one.
element.innerHTML = '';
dataTable['output_type'] = 'display_data';
await google.colab.output.renderOutput(dataTable, element);
// Append a promotional link below the rendered table.
const docLink = document.createElement('div');
docLink.innerHTML = docLinkHtml;
element.appendChild(docLink);
}
</script>
Cosine similarity (computed via SciPy’s cosine distance, i.e. 1 − similarity):
# Comparing two word vectors by cosine similarity.
# BUG FIX: scipy's distance.cosine() returns the cosine *distance*
# (1 - similarity), but the original code printed that raw value under a
# "similarity" label — hence the implausible "1.0 similarity" output for
# two unrelated words.  Convert distance to similarity before printing.
from scipy.spatial import distance

a = Glove['unsatisfying']  # word vector a
b = Glove['refused']       # word vector b
a = a.to_numpy()
b = b.to_numpy()
print('cosine similarity is :', 1 - distance.cosine(a, b))
cosin similarity is : 1.0
# Estimate how likely 'great' is to occur next to 'menu': count total
# occurrences of 'menu' in the tokenised corpus, look up the stored
# co-occurrence count with 'great', and also report the vectors' cosine
# similarity.
a = Glove['menu'].to_numpy()   # word vector a
b = Glove['great'].to_numpy()  # word vector b

# Total occurrences of word a ('menu') across all tokenised documents.
counter = 0
for i in range(len(array)):
    for token in array[i, 0]:
        if token == 'menu':
            counter += 1

# Find the row index of 'menu' in the keyword list and read its
# co-occurrence count with 'great'.
# BUG FIX: the original contained the invalid fused statement
# `f=is=Glove.loc[f,'great']` — whitespace-mangled remains of the two
# statements `f = i` and `s = Glove.loc[f, 'great']`, restored here.
for i in range(len(Total_keywords)):
    if Total_keywords[i] == 'menu':
        f = i
        s = Glove.loc[f, 'great']

print('Total number of word a :', counter)
print('Total number of word a after or before b:', Glove.loc[f, 'great'])
print('probability of event word a to word b is:', s / counter * 100, '%')
# distance.cosine() returns the cosine *distance*; convert to similarity
# so the printed label is accurate.
print('cosine similarity is :', round(1 - distance.cosine(a, b), 3))
Total number of word a : 15
Total number of word a after or befor b: 2.0
probability of event word a to word b is: 13.333333333333334 %
cosin similarity is : 0.961
Hi, I’m Haleshot, a senior-year B.Tech student in Artificial Intelligence. I enjoy projects relating to ML, AI, DL, CV, NLP, Image Processing, etc. I’m currently exploring Python, FastAPI, AI-related projects, and platforms such as Hugging Face and Kaggle.