# Cosine Similarity
# TF-IDF vectorizer and pairwise cosine similarity from scikit-learn.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy corpus: four short sentences that are compared against each other.
documents = (
    "The sky is blue",
    "The sun is bright",
    "The sun in the sky is bright",
    "We can see the shining sun, the bright sun",
)

# Instantiate the scikit-learn TF-IDF vectorizer and transform the documents
# into a TF-IDF matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# A bare expression only echoes in an interactive session, so print explicitly.
print(tfidf_matrix.shape)  # (4, 11): 4 sentences, 11 unique words

# Calculate the cosine similarity between the first document and each
# document in the corpus. The first document is itself part of the matrix,
# so the first entry is its similarity with itself.
similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)

# `print` is used instead of the IPython-only `display` so that the script
# also runs outside a notebook.
print(similarities)
# Output recorded on the original page (notebook repr):
# array([[1. , 0.36651513, 0.52305744, 0.13448867]])