Python Word Similarity Models

This document presents Python code for implementing word similarity models based on vector representations and common similarity metrics.

Imports and Utilities

from __future__ import division
import collections
import math

# Short alias for collections.defaultdict.
ddict = collections.defaultdict

Word Similarity Model Class

A generic model for calculating word similarity, parameterized by two functions: one for producing a vector from a word, and one for comparing two vectors.

class WordSimilarityModel(object):
    """Callable that scores how similar two words are.

    The model is parameterized by two callables: ``get_vector`` turns a
    word into a vector, and ``get_similarity`` compares two such vectors.
    """

    def __init__(self, get_vector, get_similarity):
        """Store the vector-producing and vector-comparing callables."""
        self._get_similarity = get_similarity
        self._get_vector = get_vector

    def __call__(self, word1, word2):
        """Return ``get_similarity`` applied to the vectors of both words.

        The vector for ``word1`` is produced before the one for ``word2``.
        """
        vectors = [self._get_vector(word) for word in (word1, word2)]
        return self._get_similarity(*vectors)

Word Vector Model Class

A generic model for creating feature vectors for words. It is built from a sequence of (word, relations) pairs, where each pair gives a word and the relations it was observed with.

class WordVectorModel(object):
    """Build relation-based feature vectors for words.

    The model is constructed from ``(word, relations)`` pairs and counts
    how often each word was seen with each relation. Vectors have one
    component per distinct relation, in sorted relation order.
    """

    def __init__(self, word_relation_lists):
        """Count words, relations and word/relation co-occurrences.

        Args:
            word_relation_lists: iterable of ``(word, relations)`` pairs,
                where ``relations`` is an iterable of the relations observed
                with ``word``. Relations are used as dictionary keys and
                sorted, so they must be hashable and mutually orderable.
        """
        self._word_rel_counts = collections.defaultdict(
            lambda: collections.defaultdict(int))
        self._word_rel_count = 0
        self._word_counts = collections.defaultdict(int)
        self._rel_counts = collections.defaultdict(int)

        # Calculate counts of words and relations
        for word, relations in word_relation_lists:
            for relation in relations:
                self._word_rel_counts[word][relation] += 1
                self._word_rel_count += 1
                self._word_counts[word] += 1
                self._rel_counts[relation] += 1

        # Pick a canonical order for the vectors
        self._rels = sorted(self._rel_counts)

    def get_probability(self, word, rel):
        """Return the probability of ``rel`` given ``word``.

        This is ``count(word, rel) / count(word)``. A word that was never
        observed yields ``0.0`` instead of a division by zero.
        """
        # Use .get() so that querying never inserts new zero-count entries
        # into the defaultdict tables.
        word_count = self._word_counts.get(word, 0)
        if not word_count:
            return 0.0
        word_rel_count = self._word_rel_counts.get(word, {}).get(rel, 0)
        return word_rel_count / word_count

    def get_mutual_information(self, word, rel):
        """Return the pointwise mutual information of ``word`` and ``rel``.

        Computed as ``log2(P(rel | word) / P(rel))``. When the value is
        undefined (the word never occurred with the relation, or the word
        or relation is unknown) the result is ``0.0``.
        """
        rel_given_word_prob = self.get_probability(word, rel)
        rel_count = self._rel_counts.get(rel, 0)
        # math.log raises ValueError for a zero argument, so the undefined
        # cases are filtered out before taking the logarithm.
        if not rel_given_word_prob or not rel_count:
            return 0.0
        # rel_count > 0 implies the total count is > 0 as well.
        rel_prob = rel_count / self._word_rel_count
        return math.log(rel_given_word_prob / rel_prob, 2)

    def get_probability_vector(self, word):
        """Return the vector of relation probabilities for ``word``."""
        return self._get_vector(self.get_probability, word)

    def get_mutual_information_vector(self, word):
        """Return the vector of word/relation PMI values for ``word``."""
        return self._get_vector(self.get_mutual_information, word)

    def _get_vector(self, func, word):
        """Apply ``func(word, rel)`` to every relation in canonical order."""
        return [func(word, rel) for rel in self._rels]

Vector Similarity Functions

Functions to calculate similarity between two vectors.

# Calculate Jaccard similarity
def get_jaccard_similarity(vector1, vector2):
    """Return the weighted Jaccard similarity of two vectors.

    The score is ``sum(min(x1, x2)) / sum(max(x1, x2))`` over paired
    components. Components are paired with ``zip``, so extra trailing
    components of the longer vector are ignored.

    Returns ``0.0`` when the denominator is zero (for example for empty or
    all-zero vectors) instead of raising ``ZeroDivisionError``.
    """
    top = 0
    bottom = 0
    # Single pass, so one-shot iterables work as well as lists.
    for x1, x2 in zip(vector1, vector2):
        top += min(x1, x2)
        bottom += max(x1, x2)
    if not bottom:
        return 0.0
    return top / bottom

# Calculate Dice similarity
def get_dice_similarity(vector1, vector2):
    """Return the weighted Dice similarity of two vectors.

    The score is ``2 * sum(min(x1, x2)) / sum(x1 + x2)`` over paired
    components. Components are paired with ``zip``, so extra trailing
    components of the longer vector are ignored.

    Returns ``0.0`` when the denominator is zero (for example for empty or
    all-zero vectors) instead of raising ``ZeroDivisionError``.
    """
    shared = 0
    bottom = 0
    # Single pass, so one-shot iterables work as well as lists.
    for x1, x2 in zip(vector1, vector2):
        shared += min(x1, x2)
        bottom += x1 + x2
    if not bottom:
        return 0.0
    return 2 * shared / bottom

Example Usage

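Scores may then be generated like this (`get_window_relations` is not defined in this document; it is assumed to be a helper that produces the (word, relations) pairs expected by `WordVectorModel`):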

>>> words = ...
>>> vector_model = WordVectorModel(get_window_relations(5, words))
>>> get_sim = WordSimilarityModel(vector_model.get_probability_vector,
... get_jaccard_similarity)
>>> get_sim('red', 'green')
0.046843607909485496