Python Word Similarity Models
This document presents Python code for implementing word similarity models based on vector representations and common similarity metrics.
Imports and Utilities
from __future__ import division
import collections
import math
# Short alias for collections.defaultdict.
ddict = collections.defaultdict
Word Similarity Model Class
A generic model for calculating word similarity, parameterized by two functions: one for producing a vector from a word, and one for comparing two vectors.
class WordSimilarityModel(object):
    """Score the similarity of two words by comparing their vectors.

    The model is parameterized by two callables: one that maps a word to a
    vector, and one that maps a pair of vectors to a similarity score.
    Instances are themselves callable: ``model(word1, word2)``.
    """

    def __init__(self, get_vector, get_similarity):
        """Store the vector and similarity callables.

        Args:
            get_vector: callable taking a word and returning its vector.
            get_similarity: callable taking two vectors and returning
                their similarity score.
        """
        self._get_vector = get_vector
        self._get_similarity = get_similarity

    def __call__(self, word1, word2):
        """Return the similarity of ``word1`` and ``word2``.

        ``get_vector`` is called once per word (first ``word1``, then
        ``word2``) and the two results are passed, in that order, to
        ``get_similarity``; its return value is returned unchanged.
        """
        vector1 = self._get_vector(word1)
        vector2 = self._get_vector(word2)
        return self._get_similarity(vector1, vector2)
Word Vector Model Class
A generic model for creating feature vectors for words, built from an iterable of (word, relations) pairs, where each pair gives a word and the relations it was observed with.
class WordVectorModel(object):
    """Build per-word feature vectors from observed word/relation pairs.

    Every vector has one component per distinct relation seen during
    construction, ordered by ``sorted()`` over those relations, so vectors
    for different words are directly comparable component by component.

    Query methods never modify the collected counts; asking about a word or
    relation that was never observed yields 0.0 rather than raising.
    """

    def __init__(self, word_relation_lists):
        """Count how often each word was observed with each relation.

        Args:
            word_relation_lists: iterable of ``(word, relations)`` pairs,
                where ``relations`` is an iterable of the relations observed
                with ``word``. It is consumed exactly once. Relations must
                be hashable and mutually orderable (they are sorted).
        """
        # word -> relation -> number of times they were observed together
        self._word_rel_counts = collections.defaultdict(
            lambda: collections.defaultdict(int))
        # total number of (word, relation) observations
        self._word_rel_count = 0
        # word -> number of observations involving that word
        self._word_counts = collections.defaultdict(int)
        # relation -> number of observations involving that relation
        self._rel_counts = collections.defaultdict(int)

        # Calculate counts of words and relations
        for word, relations in word_relation_lists:
            for relation in relations:
                self._word_rel_counts[word][relation] += 1
                self._word_rel_count += 1
                self._word_counts[word] += 1
                self._rel_counts[relation] += 1

        # Pick a canonical order for the vectors
        self._rels = sorted(self._rel_counts)

    def get_probability(self, word, rel):
        """Return P(rel | word): the fraction of ``word``'s observations
        that were with ``rel``.

        Returns 0.0 when ``word`` was never observed (instead of dividing
        by zero) or when it was never observed with ``rel``.
        """
        # Use .get() so that a query never inserts keys into the
        # defaultdict count tables.
        word_count = self._word_counts.get(word, 0)
        if not word_count:
            return 0.0
        rel_counts = self._word_rel_counts.get(word, {})
        return rel_counts.get(rel, 0) / word_count

    def get_mutual_information(self, word, rel):
        """Return the pointwise mutual information of ``word`` and ``rel``.

        Computed as ``log2(P(rel | word) / P(rel))``. When the pair was
        never observed together (or either the word or the relation was
        never observed at all) the logarithm is undefined and 0.0 is
        returned instead.
        """
        rel_given_word_prob = self.get_probability(word, rel)
        rel_count = self._rel_counts.get(rel, 0)
        # Guard explicitly: math.log(0) raises ValueError (not
        # OverflowError) on current Python versions, and an unseen
        # relation would otherwise cause a division by zero.
        if rel_given_word_prob <= 0 or not rel_count:
            return 0.0
        # rel_count > 0 implies at least one observation in total.
        rel_prob = rel_count / self._word_rel_count
        return math.log(rel_given_word_prob / rel_prob, 2)

    def get_probability_vector(self, word):
        """Return the vector of P(rel | word) over all known relations."""
        return self._get_vector(self.get_probability, word)

    def get_mutual_information_vector(self, word):
        """Return the vector of word/relation pointwise mutual
        informations over all known relations."""
        return self._get_vector(self.get_mutual_information, word)

    def _get_vector(self, func, word):
        """Apply ``func(word, rel)`` to every known relation, in the
        canonical (sorted) relation order, and return the list of results."""
        return [func(word, rel) for rel in self._rels]
Vector Similarity Functions
Functions to calculate similarity between two vectors.
def get_jaccard_similarity(vector1, vector2):
    """Return the (weighted) Jaccard similarity of two vectors.

    Computed as ``sum(min(x1, x2)) / sum(max(x1, x2))`` over the paired
    components. The vectors are paired with ``zip``, so any extra
    trailing components of the longer one are ignored.

    Returns 0.0 when the denominator is zero (for example when both
    vectors are empty or all-zero) instead of raising ZeroDivisionError.
    """
    # Materialize the pairs once so that iterator/generator arguments are
    # not exhausted before the denominator is computed.
    pairs = list(zip(vector1, vector2))
    top = sum(min(x1, x2) for x1, x2 in pairs)
    bottom = sum(max(x1, x2) for x1, x2 in pairs)
    if not bottom:
        return 0.0
    return top / bottom
def get_dice_similarity(vector1, vector2):
    """Return the (weighted) Dice similarity of two vectors.

    Computed as ``2 * sum(min(x1, x2)) / sum(x1 + x2)`` over the paired
    components. The vectors are paired with ``zip``, so any extra
    trailing components of the longer one are ignored.

    Returns 0.0 when the denominator is zero (for example when both
    vectors are empty or all-zero) instead of raising ZeroDivisionError.
    """
    # Materialize the pairs once so that iterator/generator arguments are
    # not exhausted before the denominator is computed.
    pairs = list(zip(vector1, vector2))
    top = 2 * sum(min(x1, x2) for x1, x2 in pairs)
    bottom = sum(x1 + x2 for x1, x2 in pairs)
    if not bottom:
        return 0.0
    return top / bottom
Example Usage
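Scores may then be generated like this (`get_window_relations` is not defined in this section; `WordVectorModel` only requires that it yield (word, relations) pairs):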
>>> words = ...
>>> vector_model = WordVectorModel(get_window_relations(5, words))
>>> get_sim = WordSimilarityModel(vector_model.get_probability_vector,
...                               get_jaccard_similarity)
>>> get_sim('red', 'green')
0.046843607909485496