Using Python to simulate the central limit theorem: why does a bigger n give a worse fit?

1.4k Views

I use Python to simulate the central limit theorem by summing n Bernoulli random variables. This is my code:

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

"""
Created on Sun Nov 12 08:44:37 2017

@author: Belter
"""


def sampling2pmf(n, dist, m=10000):
    """
    Estimate the PMF of the sum of ``n`` i.i.d. draws from ``dist``.

    :param n: sample size for each experiment
    :param dist: frozen scipy.stats distribution to draw from
    :param m: how many times the experiment is repeated (default 10000)
    :return: tuple ``(val, pmf)`` where ``val`` holds the distinct sums
             observed and ``pmf`` their empirical relative frequencies
             (``pmf`` sums to 1)
    """
    # Draw all m experiments in one vectorized call and sum each row,
    # instead of m separate Python-level rvs() calls — same distribution,
    # far less interpreter overhead.
    sums = dist.rvs(size=(m, n)).sum(axis=1)
    val, cnt = np.unique(sums, return_counts=True)
    pmf = cnt / m
    return val, pmf


def plot(n, dist, subplot, plt_handle):
    """
    Draw one sub-plot comparing the empirical PMF of the sum of ``n``
    draws from ``dist`` against the normal PDF with matching moments.

    :param n: sample size
    :param dist: frozen distribution of each single sample
    :param subplot: location of sub-graph, such as 221, 222, 223, 224
    :param plt_handle: plt object
    :return: plt object
    """
    canvas = plt_handle
    canvas.subplot(subplot)
    # Mean and standard deviation of the sum of n i.i.d. draws.
    sum_mean = n * dist.mean()
    sum_std = np.sqrt(n * dist.var())
    val, pmf = sampling2pmf(n=n, dist=dist)
    # Empirical PMF as green vertical bars.
    canvas.vlines(val, 0, pmf, colors='g', linestyles='-', lw=3)
    canvas.ylabel('Probability')
    canvas.title('Sum of bernoulli dist. (n={})'.format(n))
    # Overlay the limiting normal curve in red.
    limit_dist = stats.norm(sum_mean, sum_std)
    xs = np.linspace(sum_mean - 2.8 * sum_std, sum_mean + 2.8 * sum_std, 10000)
    canvas.plot(xs, limit_dist.pdf(xs), 'r-')
    return canvas

# p = 0.999
size = [1, 4, 20, 80, 200, 10000]

# sum of bernoulli distribution
dist_type = 'bern'
bern_para = [0.4]
single_sample_dist = stats.bernoulli(p=bern_para[0])  # define a bernoulli distribution

# plot by matplotlib: one sub-plot per sample size in a 3x2 grid
plt.figure(1)
for idx, n in enumerate(size):
    plt = plot(n=n, dist=single_sample_dist, subplot=321 + idx, plt_handle=plt)
plt.tight_layout()
plt.savefig('sum_of_{}_dist.png'.format(dist_type), dpi=200)

Then I got this result:

enter image description here

As we can see, when $n=20$ or $n=80$ the empirical distribution matches the normal curve very closely. But when $n$ grows to $200$ or even $10000$, the fit looks worse.

Did I miss something? Any help would be appreciated.