I am using Python to simulate the central limit theorem by taking the sum of n Bernoulli random variables. This is my code:
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
"""
Created on Sun Nov 12 08:44:37 2017
@author: Belter
"""
def sampling2pmf(n, dist, m=10000):
    """Estimate the PMF of the sum of ``n`` i.i.d. draws from ``dist``.

    :param n: sample size for each experiment (number of draws summed)
    :param dist: frozen scipy.stats distribution to sample from
    :param m: how many experiments to run (default 10000)
    :return: tuple ``(val, pmf)`` of the distinct sums observed and their
             empirical relative frequencies
    """
    # Draw all m*n variates in one call and sum each row: a vectorized
    # replacement for the original Python-level loop over m experiments.
    sum_of_samples = dist.rvs(size=(m, n)).sum(axis=1)
    val, cnt = np.unique(sum_of_samples, return_counts=True)
    # Relative frequency of each distinct sum; cnt is an integer array,
    # so true division yields a float PMF that sums to 1.
    pmf = cnt / m
    return val, pmf
def plot(n, dist, subplot, plt_handle):
    """Draw one subplot comparing the empirical distribution of the sum
    of ``n`` samples with its normal (CLT) approximation.

    :param n: sample size
    :param dist: distribution of each single sample
    :param subplot: location of sub-graph, such as 221, 222, 223, 224
    :param plt_handle: plt object
    :return: plt object
    """
    plt = plt_handle
    plt.subplot(subplot)
    # Mean and standard deviation of the sum of n i.i.d. samples.
    mu = n * dist.mean()
    sigma = np.sqrt(n * dist.var())
    val, pmf = sampling2pmf(n=n, dist=dist)
    plt.vlines(val, 0, pmf,
               colors='g', linestyles='-', lw=3)
    plt.ylabel('Probability')
    plt.title('Sum of bernoulli dist. (n={})'.format(n))
    # Overlay the limiting normal density across +/- 2.8 sigma.
    approx = stats.norm(mu, sigma)
    xs = np.linspace(mu - 2.8 * sigma, mu + 2.8 * sigma, 10000)
    plt.plot(xs, approx.pdf(xs), 'r-')
    return plt
# Sample sizes to demonstrate CLT convergence; one subplot per size.
size = [1, 4, 20, 80, 200, 10000]
# sum of bernoulli distribution
dist_type = 'bern'
bern_para = [0.4]
single_sample_dist = stats.bernoulli(p=bern_para[0])  # define a bernoulli distribution
# plot by matplotlib: fill the 3x2 grid (subplots 321..326) in a loop
# instead of six copy-pasted calls.
plt.figure(1)
for i, n in enumerate(size):
    plt = plot(n=n, dist=single_sample_dist, subplot=321 + i, plt_handle=plt)
plt.tight_layout()
plt.savefig('sum_of_{}_dist.png'.format(dist_type), dpi=200)
Then I got this result:
As we can see, when $n=20$ or $n=80$, the empirical distribution matches the normal curve very closely. But as $n$ grows to $200$ or even $10000$, the agreement appears to get worse.
Did I miss something? Any help would be appreciated.
