Python Word Similarity Models

Posted on May 11, 2025 in Law and Journalism

This document presents Python code for implementing word similarity models based on vector representations and common similarity metrics.

Imports and Utilities

from __future__ import division
import collections
import math

# Short alias for collections.defaultdict.
ddict = collections.defaultdict

Word Similarity Model Class

A generic model for calculating word similarity, parameterized by two functions: one for producing a vector from a word, and one for comparing two vectors.

class WordSimilarityModel(object):
    """Callable that scores a pair of words by comparing their vectors.

    The model is a thin composition of two caller-supplied functions: one
    that maps a word to a vector, and one that maps two vectors to a score.
    """

    def __init__(self, get_vector, get_similarity):
        """Remember the two strategy callables.

        Args:
            get_vector: callable taking a word and returning its vector.
            get_similarity: callable taking two vectors and returning
                their similarity score.
        """
        self._get_similarity = get_similarity
        self._get_vector = get_vector

    def __call__(self, word1, word2):
        """Return the similarity score between ``word1`` and ``word2``."""
        to_vector = self._get_vector
        # Vectors are built in argument order: word1 first, then word2.
        first, second = to_vector(word1), to_vector(word2)
        return self._get_similarity(first, second)

Word Vector Model Class

A generic model for creating feature vectors for words, built from a list of (word, relations) pairs, where each pair lists the relations that the word was observed with.

class WordVectorModel(object):
    """Build per-word feature vectors from observed word/relation data.

    The model counts how often each word was seen with each relation and
    exposes two kinds of vectors over the sorted list of all observed
    relations: conditional probabilities and pointwise mutual information.
    """

    def __init__(self, word_relation_lists):
        """Count words, relations and their co-occurrences.

        Args:
            word_relation_lists: iterable of ``(word, relations)`` pairs,
                where ``relations`` is an iterable of hashable, mutually
                sortable keys observed together with ``word``.
        """
        defaultdict = collections.defaultdict
        self._word_rel_counts = defaultdict(lambda: defaultdict(int))
        self._word_rel_count = 0
        self._word_counts = defaultdict(int)
        self._rel_counts = defaultdict(int)

        # Calculate counts of words and relations
        for word, relations in word_relation_lists:
            for relation in relations:
                self._word_rel_counts[word][relation] += 1
                self._word_rel_count += 1
                self._word_counts[word] += 1
                self._rel_counts[relation] += 1

        # Pick a canonical order for the vectors
        self._rels = sorted(self._rel_counts)

    # Probability of the word appearing with the relation
    def get_probability(self, word, rel):
        """Return the fraction of ``word``'s observations that were ``rel``.

        Returns 0.0 for a word that was never observed, instead of dividing
        by zero. Lookups use ``.get`` so that querying unknown words or
        relations does not insert phantom entries into the count tables.
        """
        word_count = self._word_counts.get(word, 0)
        if not word_count:
            return 0.0
        # word_count > 0 guarantees ``word`` already has an inner table, so
        # this indexing cannot create a new defaultdict entry.
        word_rel_count = self._word_rel_counts[word].get(rel, 0)
        return word_rel_count / word_count

    # Pointwise mutual information between word and relation events
    def get_mutual_information(self, word, rel):
        """Return log2(P(rel | word) / P(rel)), or 0.0 when undefined.

        The logarithm is undefined when the word was never seen with the
        relation (``math.log(0, 2)`` raises ``ValueError``), and the ratio
        is undefined when the relation was never seen at all; both cases
        yield 0.0.
        """
        rel_given_word_prob = self.get_probability(word, rel)
        rel_count = self._rel_counts.get(rel, 0)
        if not rel_given_word_prob or not rel_count:
            return 0.0
        # rel_count > 0 implies at least one observation overall, so the
        # total below is non-zero.
        rel_prob = rel_count / self._word_rel_count
        return math.log(rel_given_word_prob / rel_prob, 2)

    # Vector of relation probabilities
    def get_probability_vector(self, word):
        """Return ``get_probability(word, rel)`` for every known relation."""
        return self._get_vector(self.get_probability, word)

    # Vector of word-relation pointwise mutual informations
    def get_mutual_information_vector(self, word):
        """Return ``get_mutual_information(word, rel)`` for every relation."""
        func = self.get_mutual_information
        return self._get_vector(func, word)

    # Helper for creating vectors
    def _get_vector(self, func, word):
        """Apply ``func(word, rel)`` over the relations in canonical order."""
        return [func(word, rel) for rel in self._rels]

Vector Similarity Functions

Functions to calculate similarity between two vectors.

# Calculate Jaccard similarity
def get_jaccard_similarity(vector1, vector2):
    """Return sum(min(x1, x2)) / sum(max(x1, x2)) over paired elements.

    Args:
        vector1: iterable of numbers.
        vector2: iterable of numbers, paired element-wise with ``vector1``
            (extra elements of the longer input are ignored by ``zip``).

    Returns:
        The weighted Jaccard score, or 0.0 when the denominator is zero
        (for example, empty or all-zero vectors).
    """
    # Pair the elements once so one-shot iterators are not exhausted
    # before the denominator is computed.
    pairs = list(zip(vector1, vector2))
    top = sum(min(x1, x2) for x1, x2 in pairs)
    bottom = sum(max(x1, x2) for x1, x2 in pairs)
    if not bottom:
        return 0.0
    return top / bottom

# Calculate Dice similarity
def get_dice_similarity(vector1, vector2):
    """Return 2 * sum(min(x1, x2)) / sum(x1 + x2) over paired elements.

    Args:
        vector1: iterable of numbers.
        vector2: iterable of numbers, paired element-wise with ``vector1``
            (extra elements of the longer input are ignored by ``zip``).

    Returns:
        The weighted Dice score, or 0.0 when the denominator is zero
        (for example, empty or all-zero vectors).
    """
    # Pair the elements once so one-shot iterators are not exhausted
    # before the denominator is computed.
    pairs = list(zip(vector1, vector2))
    top = 2 * sum(min(x1, x2) for x1, x2 in pairs)
    bottom = sum(x1 + x2 for x1, x2 in pairs)
    if not bottom:
        return 0.0
    return top / bottom

Example Usage

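Scores may then be generated like this (`get_window_relations` is a helper not shown in this document; it is assumed to produce the (word, relations) pairs that `WordVectorModel` expects):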

>>> words = ...
>>> vector_model = WordVectorModel(get_window_relations(5, words))
>>> get_sim = WordSimilarityModel(vector_model.get_probability_vector,
... get_jaccard_similarity)
>>> get_sim('red', 'green')
0.046843607909485496

Python Word Similarity Models

Imports and Utilities

Word Similarity Model Class

Word Vector Model Class

Vector Similarity Functions

Example Usage

Recent Notes

Subjects

Publicidad