In this assignment, your task will be to implement and analyze inference in the Bayesian Latent Dirichlet Allocation (LDA) model as described in the corresponding slides from the BAYa class. You will accomplish this task by completing this Jupyter Notebook, which already comes with code for generating the training data and some plotting functions for presenting the results. If you do not have any experience with Jupyter Notebook, the easiest way to start is to install Anaconda3, run Jupyter Notebook, and open this notebook downloaded as BAYa_Assignment2022.ipynb. You can also find some inspiration and pieces of code to reuse (e.g. the KL divergence for the Dirichlet distribution) in the other Jupyter Notebooks provided for this class.
The Notebook is organized as follows:
Do not edit the code in the following cells for generating and presenting the training data!
# Run this code! But there is no need to pay much attention to this cell at the first pass through the notebook
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.tri as tri
import scipy.stats as sps
# Pre-calculate some global data used by functions plot_simplex, plot_dirichlet, and plot_points_in_simplex
_corners = np.array([[0, 0], [1, 0], [0.5, 0.75**0.5]])
_invT = np.linalg.inv(_corners[:2]-_corners[2])
_triangle = tri.Triangulation(_corners[:, 0], _corners[:, 1])
_refiner = tri.UniformTriRefiner(_triangle)
_trimesh = _refiner.refine_triangulation(subdiv=7)
# Now convert _trimesh 2D cartesian coordinates to 3D barycentric coordinates (i.e. 3D point on the 2D simplex)
# as described in https://en.wikipedia.org/wiki/Barycentric_coordinate_system#Edge_approach
# We calculate only the first 2 barycentric coordinates as sps.dirichlet.pdf is happy without the last one (l3 = 1 - l1 - l2)
_tol=1.e-8
_l1l2 = (np.c_[_trimesh.x, _trimesh.y] -_corners[2]) @ _invT
_l1l2 = np.clip(_l1l2, 2*_tol, 1.0-_tol) - _tol # to make sure that none of the probabilities is exactly zero or one
def plot_simplex(class_labels=[""]*3):
'''Plot "axis" for 2-simplex. It simply plots a triangle into which we will be ploting points
representing Categorical distributions (for 3 categories/topics) or a Dirichlet distribution.
Arguments:
class_labels: list of 3 strings that are used as labels (i.e. category/topic names) for the
simplex (triangle) corners
'''
plt.triplot(_triangle, linewidth=1)
#plt.xlim(0, 1)
#plt.ylim(0, 0.75**0.5)
plt.axis('equal')
plt.axis('off')
plt.text(0, 0, class_labels[0], horizontalalignment='left', verticalalignment='top')
plt.text(1, 0, class_labels[1], horizontalalignment='right', verticalalignment='top')
plt.text(0.5, 0.75**0.5, class_labels[2], horizontalalignment='center', verticalalignment='bottom')
def plot_dirichlet(alpha, nlevels=128, **kwargs):
'''Plot Dirichlet pdf in an equilateral triangle (2-simplex).
Arguments:
alpha: Dirichlet distribution parameters.
nlevels (int): Number of contours (shades) to draw.
kwargs: Keyword args passed on to `plt.tricontourf`.
'''
plt.tricontourf(_trimesh, sps.dirichlet.pdf(_l1l2.T, alpha), nlevels, cmap='gray_r', **kwargs)
def plot_points_in_simplex(X, **kwargs):
'''Plots a set of points in the 2-simplex. Each point can represent
a categorical distribution with 3 categories/topics.
Arguments:
X: A Nx3 array in barycentric coordinates of points to plot.
kwargs: Keyword args passed on to `plt.plot`.
'''
plt.plot(*(X @ _corners).T, **kwargs)
def plot_topic_distributions(Phi, vocabulary, top_V=10):
'''Plot words and their probabilities for each topic.
Arguments:
Phi: A KxD matrix where rows are topic specific distributions
vocabulary: List of strings that are words corresponding to columns of Phi
top_V: Only top_V most likely words and their probabilities are shown.
'''
plt.figure(figsize=(20, 4))
for k, topic_dist in enumerate(Phi):
plt.subplot(1, len(Phi), k+1)
sort_ixs = np.argsort(topic_dist)[::-1]
top_V_words = [vocabulary[i] for i in sort_ixs[:top_V]]
plt.barh(np.arange(len(top_V_words)/2, 0, -0.5), topic_dist[sort_ixs[:top_V]], height=0.4)
plt.yticks(np.arange(len(top_V_words)/2, 0, -0.5), top_V_words, fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.title("Topic %2d" % (k+1,))
plt.xlabel("Probability")
The generative process assumed by the Bayesian Latent Dirichlet Allocation model is
\begin{align} {\boldsymbol\varphi}_k &\sim \operatorname{Dir}(\boldsymbol{\beta}_0), && \text{for } k=1, \dots, K\\ {\boldsymbol\theta}_d &\sim \operatorname{Dir}(\boldsymbol{\alpha}_0), && \text{for } d=1, \dots, D\\ z_{dn} &\sim \operatorname{Cat}(\boldsymbol\theta_d), &&\text{for } d=1, \dots, D,\quad n=1, \dots, N_d\\ w_{dn} &\sim \operatorname{Cat}(\boldsymbol\varphi_{z_{dn}}), &&\text{for } d=1, \dots, D,\quad n=1, \dots, N_d,\\ \end{align}where ${\boldsymbol\varphi}_k$ are topic-specific word distributions, ${\boldsymbol\theta}_d$ are document-specific topic distributions (topic mixture weights), $z_{dn}=k$ denotes that the $n^{th}$ word in document $d$ comes from topic $k$, and $w_{dn}=v$ denotes that the $n^{th}$ word in document $d$ is $v$. However, our training data will not be represented in terms of $w_{dn}$. It will be compactly represented by a $D\times V$ word count matrix $\boldsymbol{M}$ with elements $M_{dv}$ counting how many times document $d$ contains word $v$.
Let the $V\times K$ matrix of topic-specific word distributions be $$\boldsymbol\Phi = [{\boldsymbol\varphi}_1, {\boldsymbol\varphi}_2, \dots, {\boldsymbol\varphi}_K],$$ and let the $K\times D$ matrix of document-specific topic distributions (topic mixture weights) be $$\boldsymbol\Theta = [{\boldsymbol\theta}_1, {\boldsymbol\theta}_2, \dots, {\boldsymbol\theta}_D]$$
Mathematicians like to use column vectors and form matrices by stacking the vectors into the columns of the matrices. In python, it is more convenient to work with row vectors and stack them into matrix rows. Therefore, everything in the code will be transposed as compared to the equations. E.g., the $V\times K$ matrix $\boldsymbol\Phi$ will be represented by numpy.array named Phi with shape (K, V). Similarly, $\boldsymbol\Theta$ will be represented by numpy.array named Theta with shape (D, K).
First, we handcraft the matrix $\boldsymbol\Phi$ containing topic-specific word distributions. We handcraft $\boldsymbol\Phi$ directly rather than sampling it from the Dirichlet prior as the generative process for Bayesian LDA would suggest. The Dirichlet prior will still be used later for the LDA model training. We store the handcrafted matrix $\boldsymbol\Phi$ in the variable Phi_gt, where _gt stands for "ground truth". We will generate our training data using this matrix, and we hope to learn this matrix (or some matrix close to it) back during the LDA model training. We consider only a toy example with $K=3$ topics ("sports", "computers", "food") and a vocabulary of only $V=5$ distinct words (stored in the variable vocabulary).
vocabulary = [ 'tenis', 'surfing','software','apple', 'burger'] # TOPIC:
Phi_gt = np.array([[0.7, 0.3, 0.0, 0.0, 0.0], # sports
[0.0, 0.2, 0.5, 0.3, 0.0], # computers
[0.0, 0.0, 0.0, 0.4, 0.6]]) # food
plot_topic_distributions(Phi_gt, vocabulary)
Note that the word "surfing" can likely occur for both topics "sports" and "computers". Similarly, the word "apple" can likely occur for both topics "food" and "computers". Note also that we have given names to the topics in our example, but in real data, the topics in the training data and their number (the parameter $K$) are unknown.
We used the function plot_topic_distributions to plot the word distributions for each topic. Each plot shows the probabilities of words for one topic sorted by the word probability. The function shows (at most) only the top_V=10 most likely words, which will be useful when figuring out what the topics correspond to for real data with many words in the vocabulary.
Now, we will handcraft the matrix $\boldsymbol\Theta$ with the topic distributions (or topic mixture weights) ${\boldsymbol\theta}_d$, one for each training document $d$. We will pretend that there are 4 thematically focused document collections in our training data with documents about: sports, computers, food, and food for sportsmen.
Each collection will contain documents_per_collection=20 training documents. In the following code, for each document $d$, the topic mixture weights ${\boldsymbol\theta}_d$ are sampled from a collection-specific Dirichlet distribution (see the grayscale plots produced by the next cell). The parameters of these 4 Dirichlet distributions are given in the rows of the matrix alpha4collections in the code below.
# sports computers food # Collection of documents about ...
alpha4collections = np.array([[ 12, 1, 1], # sports
[ 2, 8, 1], # computers
[ 1, 1, 13], # food
[ 12, 2, 10]]) # food for sportsmen
documents_per_collection = 20
Theta_gt = []
plt.figure(figsize=(14, 3))
# For each collection
for i, a in enumerate(alpha4collections):
plt.subplot(1, len(alpha4collections), i+1)
plt.title(r'$\alpha$ = (%.3f, %.3f, %.3f)' % tuple(a))
# Plot 2-simplex (a triangle) and add topic labels to the corners of the triangle where those topics have probability 1
plot_simplex(['Sports', 'Computers', 'Food'])
# Plot collection-specific Dirichlet distribution (in grayscale)
plot_dirichlet(a)
# Sample topic mixture weights for all documents in one collection
thetas4collection = sps.dirichlet.rvs(a, documents_per_collection)
# Plot topic mixture weights as points in the simplex
plot_points_in_simplex(thetas4collection, c='b', ls='none', marker='+')
Theta_gt.append(thetas4collection)
# Concatenate all topic weights from all collections into a single matrix
Theta_gt = np.concatenate(Theta_gt)
# For each document, remember the label saying which collection it comes from
collection_label = np.repeat(range(len(alpha4collections)), documents_per_collection)
Each plot (triangle) produced by the code above is a 2-dimensional simplex corresponding to one of the collections. It shows the corresponding Dirichlet distribution (in grayscale). Each blue cross is a sample from that Dirichlet distribution representing the topic mixture weights ${\boldsymbol\theta}_d$ for one training document. We can see that, in the first collection (documents about sports), all documents have high weights for the topic "sports" and low weights for the other two topics. In contrast, in the last collection (documents about food for sportsmen), all documents have comparably high weights for the topics "sports" and "food" and a low weight for the topic "computers".
At the end of the code in the cell above, we concatenate all the document-specific mixture weights ${\boldsymbol\theta}_d$ from all the collections into a single matrix Theta_gt representing the "ground truth" matrix $\boldsymbol\Theta$. Just like in the case of Phi_gt, our task will be to recover Theta_gt (or something close to it) during the LDA model training.
We also remember the collection label for each training document in the variable collection_label. One of your tasks will be to cluster the documents into collections, and collection_label will be useful to see whether you manage to recover the original "ground truth" document collections.
In the code below, we plot all the mixture weights ${\boldsymbol\theta}_d$ for all training documents into a single simplex. We also stop pretending that we know the topic names. Instead, we use the most likely word from each topic-specific distribution as the representative topic labels (i.e. the labels in the corners of the simplex). We also indicate the "ground truth" collection label by the color and marker of each point. This is how you will later plot the learned mixture weights ${\boldsymbol\theta}_d$ and how you will be able to indicate your clustering of the documents.
plot_simplex([vocabulary[np.argmax(p)] for p in Phi_gt])
markers = ['+', 'x', '^', '*']
colors = ['r', 'b', 'g', 'm']
for i, t in enumerate(Theta_gt):
plot_points_in_simplex(t, c=colors[collection_label[i]], ls='none', marker=markers[collection_label[i]])
Once we have the matrices $\boldsymbol\Phi$ and $\boldsymbol\Theta$, we can obtain the word distribution specific to document $d$ as $\boldsymbol\Phi {\boldsymbol\theta}_d$, or, more efficiently, the matrix of distributions for all documents as $\boldsymbol\Phi\boldsymbol\Theta$. Finally, for document $d$, we can sample the vector of word counts (i.e. a row of the word count matrix $\boldsymbol{M}$) from the distribution $\operatorname{Multinomial}(\boldsymbol\Phi{\boldsymbol\theta}_d, N_d)$. For simplicity, in the code below, we assume that each training document contains the same number of $N_d=N=100$ words. The training word count matrix is stored in the variable M.
PhiTheta = Theta_gt @ Phi_gt
N=100
M =np.vstack([sps.multinomial.rvs(N, dd) for dd in PhiTheta])
print("The word count matrix for the first 10 training documents:\n", M[:10])
The word count matrix for the first 10 training documents:
 [[69 26  0  2  3]
 [69 25  0  2  4]
 [68 30  2  0  0]
 [60 31  2  3  4]
 [60 27  2  4  7]
 [56 25  4  7  8]
 [63 36  0  0  1]
 [58 18  4  9 11]
 [59 22  5  5  9]
 [70 23  4  2  1]]
Note that $C_{dv}^k$ can be stored as elements of a 3D matrix and $\sum_{d=1}^D C_{dv}^k$ can be easily evaluated for all $v$ and $k$ by summing the matrix over the $d$ dimension. Similarly, we can sum the matrix over the $v$ dimension to get all $\sum_{v=1}^V C_{dv}^k$.
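For illustration only (an assumption about how you might store the counts; D, V and K are as defined in the cells below), with the counts held in a numpy array C of shape (D, V, K), both marginal sums are single calls:
C = np.zeros((D, V, K))            # C[d, v, k] = number of tokens of word v in document d assigned to topic k
word_topic_counts = C.sum(axis=0)  # shape (V, K): sum over documents, i.e. sum_d C_dv^k for all v, k
doc_topic_counts = C.sum(axis=1)   # shape (D, K): sum over the vocabulary, i.e. sum_v C_dv^k for all d, k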
To monitor the progress, we can evaluate $\ln P(\boldsymbol{W}, \boldsymbol{Z}, \boldsymbol{\Theta}, \boldsymbol{\Phi})$ in every iteration (See slide 8. "Joint probability using the counts" in the BAYa class slides).
1. Implement Gibbs Sampling inference for training the Bayesian Latent Dirichlet Allocation model described above.
Store $\ln P(\boldsymbol{W}, \boldsymbol{Z}, \boldsymbol{\Theta}, \boldsymbol{\Phi})$ in every iteration to monitor how the Gibbs Sampling inference progresses.
The following cell comes with the definition of the variables that you will use in your implementation. Namely, alpha0 are the parameters of the prior $p({\boldsymbol\theta}_d) = \operatorname{Dir}(\boldsymbol{\alpha}_0)$ and beta0 are the parameters of the prior $p({\boldsymbol\varphi}_k) = \operatorname{Dir}(\boldsymbol{\beta}_0)$. For simplicity, we will use the flat non-informative prior (i.e. $\boldsymbol{\alpha}_0$ and $\boldsymbol{\beta}_0$ are vectors of ones). In the variables Theta and Phi, your code will store the samples of $\boldsymbol{\Theta}$ and $\boldsymbol{\Phi}$. Start the GS iterations from the initial values of $\boldsymbol{\Theta}$ and $\boldsymbol{\Phi}$ provided in the code.
Note that, for the LDA model training, one needs to choose the hyperparameter $K$, which is the assumed number of topics. We choose $K=3$, which will allow us to plot each sampled $\boldsymbol{\theta}_d$ in the 2-dimensional simplex. However, we are somewhat cheating here, as we know that this is the ground-truth number of topics.
# Make use of the following variables
D, V = M.shape # number of training documents D and size of vocabulary V
K = 3
alpha0 = np.ones(K, dtype='float') # Parameters of prior for Theta
beta0 = np.ones(V, dtype='float') # Parameters of prior for Phi
Theta = np.ones((D,K))/K # Initial values for Theta; store new Theta sampled in each iteration to this variable
Phi = np.ones((K,V))/V # Initial values for Phi; store new Phi sampled in each iteration to this variable
#Your code for Gibbs sampling inference
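For reference, below is a minimal sketch of one possible implementation (an assumption, not the only correct approach): in each iteration it samples the topic assignment counts C[d, v, k] from their multinomial conditionals, then samples new rows of Theta and Phi from the corresponding Dirichlet conditionals, and finally evaluates the joint log-probability through the counts. The names n_iters, C, and log_joint are introduced here for illustration.
n_iters = 200
log_joint = []                     # ln P(W, Z, Theta, Phi) after each iteration
C = np.zeros((D, V, K))            # C[d, v, k] = number of tokens of word v in document d assigned to topic k
for it in range(n_iters):
    # Sample Z (represented by the counts C): all tokens of word v in document d share
    # the same conditional P(z = k), proportional to Theta[d, k] * Phi[k, v]
    for d in range(D):
        for v in range(V):
            p = Theta[d] * Phi[:, v]
            C[d, v] = np.random.multinomial(M[d, v], p / p.sum())
    # Sample Theta[d] from Dir(alpha0 + sum_v C[d, v, :]) and Phi[k] from Dir(beta0 + sum_d C[d, :, k])
    Theta = np.vstack([sps.dirichlet.rvs(alpha0 + C[d].sum(axis=0))[0] for d in range(D)])
    Phi = np.vstack([sps.dirichlet.rvs(beta0 + C[:, :, k].sum(axis=0))[0] for k in range(K)])
    # Joint log-probability ln P(W, Z, Theta, Phi) expressed through the counts
    lp = np.sum(C * (np.log(Theta)[:, None, :] + np.log(Phi).T[None, :, :]))
    lp += sum(sps.dirichlet.logpdf(Theta[d], alpha0) for d in range(D))
    lp += sum(sps.dirichlet.logpdf(Phi[k], beta0) for k in range(K))
    log_joint.append(lp)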
2. Plot the evolution of the joint probability $\ln P(\boldsymbol{W}, \boldsymbol{Z}, \boldsymbol{\Theta}, \boldsymbol{\Phi})$ over the iterations to monitor how the Gibbs sampling inference progresses.
#Your code goes here
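A minimal plotting snippet, assuming the list log_joint collected in the sketch above:
plt.plot(log_joint)
plt.xlabel('Gibbs sampling iteration')
plt.ylabel(r'$\ln P(W, Z, \Theta, \Phi)$')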
3. Use the function plot_topic_distributions to plot the sampled topic-specific word distributions $\boldsymbol{\Phi}$. Also, plot the ground truth version of it.
#Your code goes here
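For example, assuming Phi holds the sample from the last GS iteration:
plot_topic_distributions(Phi, vocabulary)     # sampled topic-specific word distributions
plot_topic_distributions(Phi_gt, vocabulary)  # ground truth for comparison
Keep in mind that the learned topics may come out in a different order than in Phi_gt, since the topic labels are not identifiable.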
4. The final sampled topic mixture weights $\boldsymbol{\theta}_d$ can be seen as low-dimensional representations of the documents that carry information about their topics. Cluster the training documents into 4 clusters, where each cluster should ideally correspond to one of the ground-truth document collections.
#Your code for clustering
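One possible approach (a sketch, using scikit-learn's KMeans as an extra dependency not prescribed by the assignment): run K-means on the final sampled Theta and plot the points marked by the found cluster, analogously to the ground-truth plot above.
from sklearn.cluster import KMeans
cluster_label = KMeans(n_clusters=4, n_init=10).fit_predict(Theta)
plot_simplex([vocabulary[np.argmax(p)] for p in Phi])
markers = ['+', 'x', '^', '*']
colors = ['r', 'b', 'g', 'm']
for i, t in enumerate(Theta):
    plot_points_in_simplex(t, c=colors[cluster_label[i]], ls='none', marker=markers[cluster_label[i]])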
5. We now consider the case where we use the trained LDA model to extract the topic mixture weight ${\boldsymbol\theta}_d$ representations for some additional "test" documents. For this purpose, we fix the $\boldsymbol{\Phi}$ obtained from the last GS iteration and only iteratively re-estimate the $\boldsymbol{\theta}_d$ and the topic assignment counts $C_{dv}^k$ for each additional test document.
In the code below we handcraft the word count matrix M_test for 4 additional documents.
M_test = np.array([[ 6, 3, 1, 0, 0],
[16, 171, 110, 50, 160],
[ 6, 1, 0, 8, 5],
[159, 78, 14, 97, 121]])
print("Word count matrix for additional test documents\n", M_test)
#
D_test=len(M_test)
Theta_test=np.array([[1,0,0]]*D_test)
Word count matrix for additional test documents
 [[  6   3   1   0   0]
 [ 16 171 110  50 160]
 [  6   1   0   8   5]
 [159  78  14  97 121]]
#Your code goes here
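A minimal sketch of one possible solution: run the same count and Theta sampling as in the training loop, but only for the test documents and without re-sampling Phi. The names n_iters and C_test are introduced here for illustration.
n_iters = 200
for it in range(n_iters):
    C_test = np.zeros((D_test, V, K))
    for d in range(D_test):
        for v in range(V):
            p = Theta_test[d] * Phi[:, v]
            C_test[d, v] = np.random.multinomial(M_test[d, v], p / p.sum())
    Theta_test = np.vstack([sps.dirichlet.rvs(alpha0 + C_test[d].sum(axis=0))[0] for d in range(D_test)])
# Plot the final sampled topic mixture weights of the test documents in the simplex
plot_simplex([vocabulary[np.argmax(p)] for p in Phi])
plot_points_in_simplex(Theta_test, c='k', ls='none', marker='o')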
The VB inference attempts to obtain a good approximation of the posterior distribution
$$q(\boldsymbol{Z}, \boldsymbol{\Theta}, \boldsymbol{\Phi}) \approx P(\boldsymbol{Z}, \boldsymbol{\Theta}, \boldsymbol{\Phi}| \boldsymbol{W}).$$For that, we use the mean-field approximation, where we assume the factorization
$$q(\boldsymbol{Z}, \boldsymbol{\Theta}, \boldsymbol{\Phi}) = q(\boldsymbol{Z})\, q(\boldsymbol{\Theta}, \boldsymbol{\Phi}),$$which, because of the induced factorizations, further factorizes as $$q(\boldsymbol{Z})\, q(\boldsymbol{\Theta}, \boldsymbol{\Phi}) = \prod_{d=1}^D \prod_{n=1}^{N_d} q(z_{dn}) \prod_{d=1}^D q(\boldsymbol{\theta}_d) \prod_{k=1}^K q(\boldsymbol{\varphi}_k).$$
Therefore, we will be interested in estimating the independent approximate posterior distributions $q(\boldsymbol{\theta}_d)$ and $q(\boldsymbol{\varphi}_k)$ for each document $d$ and topic $k$. The Variational Bayes updates dictate that these distributions are $\operatorname{Dir}(\boldsymbol{\theta}_d|\boldsymbol{\alpha}_d^*)$ and $\operatorname{Dir}(\boldsymbol{\varphi}_k|\boldsymbol{\beta}_k^*)$. Our main task in the VB inference is to estimate the parameters $\boldsymbol{\alpha}_d^*=[\alpha_{d1}^*,\alpha_{d2}^*,\dots,\alpha_{dK}^*]^T$ and $\boldsymbol{\beta}_k^*=[\beta_{k1}^*,\beta_{k2}^*,\dots,\beta_{kV}^*]^T$.
The iterative algorithm for the VB inference goes as follows:
To monitor progress, we can evaluate the evidence lower bound (ELBO) $\mathcal{L}(q(\boldsymbol{Z}, \boldsymbol{\Theta}, \boldsymbol{\Phi}))$ in every iteration. This is easiest to do right after step 2, as described on slide 31 ("Efficient ELBO calculation") in the BAYa class slides.
After the convergence, we can represent each document by the expected vector of topic mixture weights (i.e. expected value of variable $\boldsymbol{\theta}_d$)
$$\boldsymbol{\hat{\theta}}_d = \mathbb{E}_{q(\boldsymbol{\theta}_d)}[\boldsymbol{\theta}_d] = \int \boldsymbol{\theta}_d \operatorname{Dir}(\boldsymbol{\theta}_d|\boldsymbol{\alpha}_d^*) d\boldsymbol{\theta}_d = \frac{\boldsymbol{\alpha}_d^*}{\sum_{k=1}^K \alpha_{dk}^*},$$which can be seen as the posterior predictive distribution over topics for document $d$. Similarly, we can obtain the expected topic-specific word distributions $$\boldsymbol{\hat{\varphi}}_k = \mathbb{E}_{q(\boldsymbol{\varphi}_k)}[\boldsymbol{\varphi}_k] = \frac{\boldsymbol{\beta}_k^*}{\sum_{v=1}^V \beta_{kv}^*}.$$
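In code, assuming the variables alpha (shape (D, K)) and beta (shape (K, V)) introduced in the cell below hold the converged parameters $\boldsymbol{\alpha}_d^*$ and $\boldsymbol{\beta}_k^*$ in their rows, these expectations are simply:
Theta_hat = alpha / alpha.sum(axis=1, keepdims=True)  # rows are the expected theta_d
Phi_hat = beta / beta.sum(axis=1, keepdims=True)      # rows are the expected varphi_k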
1. Implement Variational Bayes inference for training the Bayesian Latent Dirichlet Allocation model described above.
Store the ELBO $\mathcal{L}(q(\boldsymbol{Z}, \boldsymbol{\Theta}, \boldsymbol{\Phi}))$ in every iteration to monitor the convergence.
The following cell comes with the definition of the variables that you will use in your implementation. As done before for the Gibbs Sampling inference, we provide the parameters of the priors alpha0 and beta0. The rows of the matrix alpha are the initial parameters of the individual approximate posteriors $q(\boldsymbol{\theta}_d)$ for all documents $d$. Similarly, the rows of beta are the initial parameters of the approximate posteriors $q(\boldsymbol{\varphi}_k)$. Please reuse the variables alpha and beta to store the parameters updated in each VB iteration. As before, we choose $K=3$.
# Make use of the following variables
D, V = M.shape # number of training documents D and size of vocabulary V
K = 3
alpha0 = np.ones(K, dtype='float') # Parameters of prior for Theta
beta0 = np.ones(V, dtype='float') # Parameters of prior for Phi
alpha=np.abs(np.random.randn(D,K)) # The rows are the initial parameters of the approximate posterior for each q(theta_d)
beta =np.abs(np.random.randn(K,V)) # The rows are the initial parameters of the approximate posterior for each q(varphi_k)
# Reuse the variables alpha and beta to store the parameters of the approximate posteriors updated in each VB iteration.
#Your code for Variational Bayes inference
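For reference, below is a minimal sketch of one possible mean-field VB loop (an assumption, not necessarily identical to the update scheme in the slides): in each iteration it computes the responsibilities q(z_dn = k) from the expected log-parameters, accumulates the expected counts, evaluates the ELBO, and then updates alpha and beta. The Dirichlet KL divergence helper uses the standard closed form (an equivalent function is available in the other class notebooks); the names n_iters, elbos, and dirichlet_KL are introduced here for illustration.
from scipy.special import digamma, gammaln, logsumexp

def dirichlet_KL(a, b):
    # KL(Dir(a) || Dir(b)) between two Dirichlet distributions (standard closed form)
    return (gammaln(a.sum()) - gammaln(a).sum() - gammaln(b.sum()) + gammaln(b).sum()
            + np.sum((a - b) * (digamma(a) - digamma(a.sum()))))

n_iters = 100
elbos = []
for it in range(n_iters):
    # Expected log-parameters under the current q(theta_d) and q(varphi_k)
    ElnTheta = digamma(alpha) - digamma(alpha.sum(axis=1, keepdims=True))   # (D, K)
    ElnPhi = digamma(beta) - digamma(beta.sum(axis=1, keepdims=True))       # (K, V)
    # Responsibilities q(z_dn = k) for each (document, word) pair and expected counts E[C_dv^k]
    log_rho = ElnTheta[:, None, :] + ElnPhi.T[None, :, :]                   # (D, V, K)
    log_norm = logsumexp(log_rho, axis=2, keepdims=True)
    r = np.exp(log_rho - log_norm)
    C_hat = M[:, :, None] * r
    # ELBO via the normalizers and the KL divergences from the priors (evaluated before the update)
    elbo = np.sum(M * log_norm[:, :, 0])
    elbo -= sum(dirichlet_KL(alpha[d], alpha0) for d in range(D))
    elbo -= sum(dirichlet_KL(beta[k], beta0) for k in range(K))
    elbos.append(elbo)
    # Update the parameters of q(theta_d) and q(varphi_k)
    alpha = alpha0 + C_hat.sum(axis=1)     # (D, K)
    beta = beta0 + C_hat.sum(axis=0).T     # (K, V)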
2. Plot the evolution of ELBO $\mathcal{L}(q(\boldsymbol{Z}, \boldsymbol{\Theta}, \boldsymbol{\Phi}))$ over the iterations to monitor the convergence of the VB inference.
#Your code goes here
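Assuming the list elbos collected in the sketch above:
plt.plot(elbos)
plt.xlabel('VB iteration')
plt.ylabel(r'ELBO $\mathcal{L}(q)$')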
3. Use the function plot_topic_distributions to plot the expected topic-specific word distributions $\boldsymbol{\hat{\Phi}} = [\boldsymbol{\hat{\varphi}}_1, \boldsymbol{\hat{\varphi}}_2, \dots, \boldsymbol{\hat{\varphi}}_K]$. Also, plot the ground truth $\boldsymbol{\Phi}$. Is the learned $\boldsymbol{\hat{\Phi}}$ close to the ground truth?
#Your code goes here
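For example, using the expectations derived above (assuming alpha and beta hold the converged VB parameters):
Phi_hat = beta / beta.sum(axis=1, keepdims=True)   # expected topic-specific word distributions
plot_topic_distributions(Phi_hat, vocabulary)
plot_topic_distributions(Phi_gt, vocabulary)       # ground truth for comparison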
4. Evaluate expected topic mixture weights $\boldsymbol{\hat{\theta}}_d$ for each training document $d$ and cluster them into 4 clusters in a similar way as we did for the GS inference.
#Your code for clustering
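A sketch analogous to the GS case (again assuming scikit-learn's KMeans as an extra dependency):
from sklearn.cluster import KMeans
Theta_hat = alpha / alpha.sum(axis=1, keepdims=True)       # expected topic mixture weights
cluster_label_vb = KMeans(n_clusters=4, n_init=10).fit_predict(Theta_hat)
plot_simplex([vocabulary[np.argmax(p)] for p in Phi_hat])
markers = ['+', 'x', '^', '*']
colors = ['r', 'b', 'g', 'm']
for i, t in enumerate(Theta_hat):
    plot_points_in_simplex(t, c=colors[cluster_label_vb[i]], ls='none', marker=markers[cluster_label_vb[i]])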
5. As we did before for the GS inference, consider now the case where we use the LDA model trained with VB inference to extract the expected topic mixture weights $\boldsymbol{\hat{\theta}}_d$ for the 4 additional test documents represented by the matrix M_test (as defined before). For this purpose, run VB inference where all $\boldsymbol{\beta}_k^*$ (and therefore also $q(\boldsymbol{\varphi}_k)$) stay fixed.
#Your code goes here
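A minimal sketch of one possible solution: iterate only the q(theta_d) updates for the test documents, keeping beta (and therefore q(varphi_k) and its expected log-parameters) fixed at the values from training. The names alpha_test and Theta_hat_test are introduced here for illustration.
from scipy.special import digamma, logsumexp
D_test = len(M_test)
alpha_test = np.ones((D_test, K))                                  # initial q(theta_d) parameters for the test documents
ElnPhi = digamma(beta) - digamma(beta.sum(axis=1, keepdims=True))  # fixed, from the trained model
for it in range(50):
    ElnTheta_test = digamma(alpha_test) - digamma(alpha_test.sum(axis=1, keepdims=True))
    log_rho = ElnTheta_test[:, None, :] + ElnPhi.T[None, :, :]
    r = np.exp(log_rho - logsumexp(log_rho, axis=2, keepdims=True))
    alpha_test = alpha0 + (M_test[:, :, None] * r).sum(axis=1)
Theta_hat_test = alpha_test / alpha_test.sum(axis=1, keepdims=True)
# Plot the expected topic mixture weights of the test documents in the simplex
plot_simplex([vocabulary[np.argmax(p)] for p in Phi_hat])
plot_points_in_simplex(Theta_hat_test, c='k', ls='none', marker='o')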