Dirichlet过程混合模型(DPMM)的Gibbs抽样程序
来源:互联网 发布:怎么看淘宝订单编号 编辑:程序博客网 时间:2024/05/22 03:02
Dirichlet过程混合模型(DPMM)的详细细节介绍包括抽样过程,请看如下这篇文章:
Yu X. Gibbs Sampling Methods for Dirichlet Process Mixture Model: Technical Details[J]. 2009.
假设我们现在有一个巨大的空间,整个空间中包含了无数的混合组成成分,每次选择其中的几个成分,然后利用这几个成分生成一组数据。我们把一组一组的数据放在一起,就得到了我们现在所拥有的数据。
自己打算写个java版本的,这里把别人写的python版本的放在这里,留作参考:
地址:https://github.com/lee813/pydpmm
distribution.py文件
from __future__ import divisionimport numpy as np#fix var = 1 standard normal distribution#with prior mu conjugate to a normal distributionclass UnivariateGaussian(object): def __init__(self,mu=None): self.mu = mu def rvs(self, size=None): return np.random.normal(self.mu,1,size) def set_mu(self,mu=None): self.mu = mu def sample_new_mu(self,x): return np.random.normal(0.5*x, 0.5, 1) def log_likelihood(self,x): x = np.reshape(x, (-1, 1)) return (-0.5 * (x - self.mu) ** 2 - np.log(2 * np.pi )).ravel() @staticmethod def epsilon_log_univariate_normal(self, mu, sigma): return np.log(1/(sigma * np.sqrt(2*np.pi))) - mu**2/2*sigma**2
以下是抽样过程
gibbs.py
from __future__ import division, print_function
from distribution import UnivariateGaussian
import numpy as np


class dpmm_gibbs_base(object):
    """Shared state and initialisation for the DPMM Gibbs samplers.

    Holds the data ``x``, per-point component assignments ``zz``, the
    per-component data counts ``nn`` and the list of active mixture
    components.  Component variance is fixed to 1; only means are sampled.
    """

    def __init__(self, init_K=5, x=None, alpha_prior=None):
        # BUGFIX: the original used a mutable default argument (x=[]).
        self.x = np.asarray(x if x is not None else [])
        self.K = init_K                  # current number of components
        self.nn = np.ones(self.K)        # data count per component
        self.alpha_prior = alpha_prior   # dict with keys 'a', 'b' (gamma prior on alpha)
        # DP concentration parameter; resampled by sample_alpha_0().
        self.alpha_0 = 1  # np.random.gamma(self.alpha_prior['a'], self.alpha_prior['b'])
        # Assign every point uniformly at random to one of the init_K components.
        self.zz = np.random.randint(init_K, size=len(self.x))
        self.mu_0 = 1                    # prior mean used by sample_mu()
        self.mu = np.ones(self.K)
        self.components = [mixture_component(ss=[], distn=UnivariateGaussian(mu=mu_i))
                           for mu_i in self.mu]
        for idx, c in enumerate(self.components):
            c.ss = self.x[self.zz == idx]
            self.nn[idx] = len(c.ss)
        self.n = len(self.x)

    def _detach_observation(self, idx):
        """Remove observation ``idx`` from its current component.

        If the component becomes empty it is deleted and all assignment
        labels above it are shifted down by one.  This bookkeeping was
        duplicated verbatim in both samplers' sample_z; factored out here.
        """
        kk = self.zz[idx]
        member_idx, = np.where(self.zz == kk)
        member_idx = np.setdiff1d(member_idx, np.array([idx]))
        self.nn[kk] -= 1
        remaining = self.x[member_idx]
        self.components[kk].ss = remaining
        if len(remaining) == 0:
            self.components = np.delete(self.components, kk)
            # NOTE(review): in the collapsed sampler self.components carries
            # a trailing empty component, so K = len(components) here counts
            # it too while nn does not -- preserved exactly from the original.
            self.K = len(self.components)
            self.nn = np.delete(self.nn, kk)
            self.zz[np.where(self.zz > kk)] -= 1


# TODO Add variance parameter
# fix var = 1
class direct_dpmm_gibbs(dpmm_gibbs_base):
    """Direct (non-collapsed) Gibbs sampler: component means are explicit
    state and resampled by sample_mu()."""

    def __init__(self, init_K=5, x=None, alpha_prior=None):
        super(direct_dpmm_gibbs, self).__init__(init_K, x, alpha_prior)

    def new_component_probability(self, x):
        """Prior predictive density of x under a new component:
        integral of N(x|mu,1) N(mu|0,1) dmu = N(x|0,2).  # TODO check formula
        """
        return (1 / (2 * np.sqrt(np.pi))) * np.exp(-x ** 2 / 4)

    def new_component_log_integral(self, x):
        """Log form of the new-component integral.  # TODO check formula
        NOTE(review): sign of the log(2*sqrt(pi)) term disagrees with the
        log of new_component_probability; unused here, left as written.
        """
        return np.log(2 * np.sqrt(np.pi)) - (x ** 2 / 4)

    def sample_z(self):
        """One Gibbs scan over all assignment variables z_i (step 2(d)):
        each point chooses an existing component in proportion to
        n_k * N(x_i | mu_k, 1) or a brand-new component in proportion to
        alpha_0 times the prior predictive."""
        for idx, x_i in enumerate(self.x):
            self._detach_observation(idx)
            # Weight for every existing component.
            weights = np.array([])
            for k in range(0, self.K):
                n_k = self.nn[k]
                w_k = (n_k / (self.n + self.alpha_0 - 1)) * \
                    np.exp(self.components[k].distn.log_likelihood(x_i))
                weights = np.append(weights, w_k)
            # Weight for opening a new component.
            w_new = (self.alpha_0 / (self.n + self.alpha_0 - 1)) * \
                self.new_component_probability(x_i)
            weights = np.append(weights, w_new)
            weights = weights / np.sum(weights)
            draw = np.random.multinomial(1, weights, size=1)
            z_index = np.where(draw == 1)[1][0]
            self.zz[idx] = z_index
            if z_index == self.K:
                # New component chosen: sample its mean from the posterior
                # given the single observation x_i (base measure G_0 = N(0,1)).
                self.K += 1
                new_mu = np.random.normal(0.5 * x_i, 0.5, 1)
                new_component = mixture_component(ss=[x_i],
                                                  distn=UnivariateGaussian(mu=new_mu))
                self.components = np.append(self.components, new_component)
                self.nn = np.append(self.nn, 1)
            else:
                # Existing component chosen: attach x_i to it.
                self.components[z_index].ss = np.append(self.components[z_index].ss, x_i)
                self.nn[z_index] += 1
        for component in self.components:
            component.print_self()
        print('alpha -> ' + str(self.alpha_0))

    def sample_mu(self):
        """Resample every component mean from its conjugate posterior
        N((mu_0 + sum x_k) / (1 + n_k), 1 / (1 + n_k))."""
        for k in range(0, self.K):
            x_k = self.components[k].ss
            mu_k = np.random.normal((self.mu_0 + sum(x_k)) / (1 + len(x_k)),
                                    1 / (1 + len(x_k)), 1)
            self.components[k].distn.set_mu(mu=mu_k)

    def sample_alpha_0(self):
        """Resample the concentration parameter alpha_0.

        Auxiliary-variable scheme of Escobar & West (1995): draw
        eta ~ Beta(alpha_0 + 1, n), then alpha_0 from a two-component
        gamma mixture (mixture indicator s as in Teh's HDP notes, 2005).
        """
        eta = np.random.beta(self.alpha_0 + 1, self.n, 1)
        # Mixture weight for the two gamma components.
        pi = self.n / self.alpha_0
        pi = pi / (1 + pi)
        s = np.random.binomial(1, pi, 1)
        self.alpha_0 = np.random.gamma(self.alpha_prior['a'] + self.K - s,
                                       1 / (self.alpha_prior['b'] - np.log(eta)), 1)


class collapsed_dpmm_gibbs(dpmm_gibbs_base):
    """Collapsed Gibbs sampler: component means are integrated out and
    assignments are sampled from the predictive ratio log_predictive()."""

    def __init__(self, init_K=5, x=None, alpha_prior=None, observation_prior=None):
        super(collapsed_dpmm_gibbs, self).__init__(init_K, x, alpha_prior)
        # Dict with keys 'mu' and 'sigma' describing the base measure G_0.
        self.observation_prior = observation_prior
        # Keep one empty component at the tail of self.components to stand
        # for "open a new component" during sample_z().
        new_mu = np.random.normal(self.observation_prior['mu'],
                                  self.observation_prior['sigma'], 1)
        new_component = mixture_component(ss=[], distn=UnivariateGaussian(mu=new_mu))
        self.components = np.append(self.components, new_component)

    def sample_z(self):
        """One collapsed Gibbs scan over all assignment variables z_i."""
        for idx, x_i in enumerate(self.x):
            self._detach_observation(idx)
            # Log weights: counts for the existing components plus alpha_0
            # for the trailing empty ("new") component.
            pp = np.log(np.append(self.nn, self.alpha_0))
            for k in range(0, self.K):
                # Compute the predictive once per component (the original
                # evaluated it a second time just to print it).
                lp = self.log_predictive(self.components[k], x_i)
                pp[k] = pp[k] + lp
                print(lp)
            # Normalise in a numerically stable way (subtract the max in
            # log space before exponentiating).
            pp = np.exp(pp - np.max(pp))
            pp = pp / np.sum(pp)
            draw = np.random.multinomial(1, pp, size=1)
            print(x_i)
            z_index = np.where(draw == 1)[1][0]
            self.zz[idx] = z_index
            if z_index == len(self.components) - 1:
                # The trailing empty component was chosen: materialise a new
                # component seeded with x_i.
                print('component added')
                new_mu = np.random.normal(0.5 * x_i, 0.5, 1)
                new_component = mixture_component(ss=[x_i],
                                                  distn=UnivariateGaussian(mu=new_mu))
                self.components = np.append(self.components, new_component)
                self.K = len(self.components)
                self.nn = np.append(self.nn, 1)
            else:
                self.components[z_index].ss = np.append(self.components[z_index].ss, x_i)
                self.nn[z_index] += 1
        print('----Summary----')

    def sample_alpha_0(self):
        """Resample alpha_0; same Escobar & West (1995) auxiliary-variable
        scheme as in direct_dpmm_gibbs.sample_alpha_0."""
        eta = np.random.beta(self.alpha_0 + 1, self.n, 1)
        pi = self.n / self.alpha_0
        pi = pi / (1 + pi)
        s = np.random.binomial(1, pi, 1)
        self.alpha_0 = np.random.gamma(self.alpha_prior['a'] + self.K - s,
                                       1 / (self.alpha_prior['b'] - np.log(eta)), 1)
        print(self.alpha_0)

    def log_predictive(self, component, x_i):
        """Log predictive weight of x_i under ``component`` with the mean
        integrated out, expressed as a ratio of two normal log-densities
        built from the observation prior and the component's sufficient
        statistics (sum of data, count minus one)."""
        ll = UnivariateGaussian.epsilon_log_univariate_normal(
            self,
            self.observation_prior['mu'] + np.sum(component.get_ss()) + x_i,
            self.observation_prior['sigma'] + component.get_n_k_minus_i() + 1) - \
            UnivariateGaussian.epsilon_log_univariate_normal(
            self,
            self.observation_prior['mu'] + np.sum(component.get_ss()),
            self.observation_prior['sigma'] + component.get_n_k_minus_i())
        return ll


class mixture_component(object):
    """One mixture component: its member data ``ss`` and its observation
    distribution ``distn`` (a UnivariateGaussian)."""

    def __init__(self, ss, distn):
        self.ss = ss        # observations currently assigned to this component
        self.distn = distn  # the component's likelihood model
        # Cached "count minus one" statistic used by the collapsed sampler.
        if len(ss) > 0:
            self.n_k_minus_i = len(ss) - 1
        else:
            self.n_k_minus_i = 0

    def get_n_k_minus_i(self):
        """Return max(len(ss) - 1, 0), refreshing the cached value."""
        if len(self.ss) > 1:
            self.n_k_minus_i = len(self.ss) - 1
        else:
            self.n_k_minus_i = 0
        return self.n_k_minus_i

    def get_ss(self):
        """Return the component's member observations."""
        return self.ss

    def print_self(self):
        """Debug dump: member data and current mean."""
        print(self.ss)
        print('Mu: ' + str(self.distn.mu))
阅读全文
0 0
- Dirichlet过程混合模型(DPMM)的Gibbs抽样程序
- 关于Dirichlet过程混合模型(DPMM)的理解
- DPMM(狄利克雷过程混合模型)浅解和添加似然函数的问题
- DPMM的理解、公式推导及抽样
- DP混合模型参数分析(DPMM)
- 狄利克莱过程模型(一):非参数贝叶斯无限混合模型和Dirichlet过程
- LDA -Gibbs抽样
- LDA -Gibbs抽样
- Gibbs抽样方法详解
- LDA的Gibbs抽样详细推理与理解
- 也谈MCMC方法与Gibbs抽样
- 抽样,mcmc, Metropolis-Hastings,Gibbs Sampling
- 分层Dirichlet过程(HDP)的理解
- 一个LDA(Latent Dirichlet Allocation)主题模型的Java实现
- RBM中的Gibbs,CD-K,PCD三种抽样方式
- 狄利克雷过程(dirichlet process )的五种理解
- 狄利克雷过程(dirichlet process )的五种理解
- 狄利克雷过程(dirichlet process )的五种理解
- Linux基础:文件颜色,ls,ll,文件详情解析
- CSS画三角形
- jsp的优劣势与php的比较
- 单源最短路——Dijstra
- 打成jar包 基于tcpProxy的consumer(netty )
- Dirichlet过程混合模型(DPMM)的Gibbs抽样程序
- IE下无法使用半透明rgba的问题及解决方案
- HDU6113 度度熊的01世界(dfs)
- Spring中的AOP
- ubuntu下chrome“系统无法读取您的偏好设置”的解决方法
- 关于 Que's HTTP Server 源代码的问题
- 将一个链表进行反转
- BZOJ 1607: [Usaco2008 Dec]Patting Heads 轻拍牛头
- React Native运行Hellow World(Windows+Android)