Estimate the average difference from the expected amount of bit-runs in true random data?

230 Views Asked by At

I'm creating blocks of totally random bits using a TRNG and per X amount of bits I count the number of times "bit-runs" of different lengths occurred.

With bit-run I mean sequences of repeated bits. E.g. in this block 011110001 we can count the following bit-runs:

[ Bit-run length , Count ]
[              1 ,     2 ] (0 and 1)
[              3 ,     1 ] (000)
[              4 ,     1 ] (1111)

I'm terrible at math, but I know how to program, here I will use JavaScript.

I made this function to calculate the expected amount of bit-runs:

function expectedRuns(blockSize, runLength) {
  if (runLength > blockSize) return 0
  if (runLength == blockSize) {
    return (blockSize-(runLength-2)) / 2**(runLength)
  }
  return (blockSize-(runLength-3)) / 2**(runLength+1)
}
// 2**(runLength+1) is the same as Math.pow(2, runLength+1)

A block of 1000 bits is expected to have this amount (on average):

[ 250.5000, // of run length 1
  125.1250, // of run length 2
   62.5000, // of run length 3
   31.2188,
   15.5938,
    7.7891,
    3.8906,
    1.9434,
    0.9707 ]

So that's 250.5 times single non-repeated bits are to be expected (1 or 0). And 62.5 times a bit-run of 3 is to be expected (000 or 111).

Do you agree with the math so far?

I did of course test it, and this was the average result after checking 62077927 blocks of 1000 bits:

[ 250.4962,
  125.1266,
   62.4994,
   31.2185,  
   15.5952,  
    7.7889,
    3.8902,
    1.9436,  
    0.9706 ]

So I guess that validates my math.

But one thing I can't figure out is how to calculate the expected average difference from these values?

After checking 62077927 blocks of 1000 bits these are the observed average differences:

[ 14.1123, // so a count from 243.44 to 257.55 is quite normal
   8.3462,
   5.6911,
   4.0978,
   2.9831,
   2.1491,
   1.5275,
   1.0685,
   0.7301 ]

And after checking 6872370 blocks of 10.000 bits these are the observed average differences:

[ 44.5928,
  26.3933,
  17.9704,
  12.9517,  
   9.4097,  
   6.7917,
   4.8671,  
   3.4863,  
    2.476 ]

I think there must be a way to use math to estimate these values? But I have failed at all my attempts, since I don't really understand advanced math very well...

If you know the solution I would love to have it explained in JavaScript, e.g. a function that I could run?

As a bonus I would love to also know how to calculate the expected avg difference e.g. after 1000 and 10.0000 blocks? Since it will continuously narrow in to be more accurate; it would be nice to know what was within normal at any point.

Update 1, here is some code to play with (can run in browser/Node/Deno):

const DEPTH = 100

function expAvgRunCount(blockSize, runLength) {
  if (runLength > blockSize) return 0
  if (runLength == blockSize) {
    return (blockSize-(runLength-2)) / 2**(runLength)
  }
  return (blockSize-(runLength-3)) / 2**(runLength+1)
}

class RunCounter {
  #blockSize; #prevBit; #onRunCount
  #bitRun = Array(DEPTH).fill(0)
  #runLength = 0; #bitIndex = 0

  constructor(blockSize, onCountCompleted) {
    this.#blockSize = blockSize
    this.#onRunCount = onCountCompleted
  }

  processByte(byte) {
    for (let i=0; i<8; i++) {
      const bit = (byte >> i) & 1
      if (this.#runLength === 0) { // first
        this.#runLength = 1
        this.#prevBit = bit
      } else if (bit === this.#prevBit) { // same
        this.#runLength ++
      } else { // different (end of sequence)
        this.#bitRun[this.#runLength] ++ // count it
        this.#runLength = 1
        this.#prevBit = bit
      }
      if (++this.#bitIndex === this.#blockSize) {
        this.#bitRun[this.#runLength] ++ // count last
        this.#onRunCount(this.#bitRun)
        // reset
        this.#bitRun = Array(DEPTH).fill(0)
        this.#runLength = 0
        this.#bitIndex = 0
      }
    }
  }
}

function bitRunsInBlock(block, blockSizeBits) {
  const bitRun = Array(DEPTH).fill(0)
  let prevBit, runLength = 0, bitIndex = 0

  outer: for (const byte of block) {
    for (let i=0; i<8; i++) {
      const bit = (byte >> i) & 1
      if (runLength === 0) { // first
        runLength = 1
        prevBit = bit
      } else if (bit === prevBit) { // same
        runLength ++
      } else { // different (end of sequence)
        bitRun[runLength] ++ // count it
        runLength = 1
        prevBit = bit
      }
      if (++bitIndex === blockSizeBits) break outer
    }
  }

  bitRun[runLength] ++ // count last
  return bitRun
}

const blockSizeBits = 10
const totalRunLength = Array(DEPTH).fill(0)
const totalDeviation = Array(DEPTH).fill(0)
const expRuns = []
for (let runLength=1; runLength<DEPTH; runLength++) {
  expRuns[runLength] = expAvgRunCount(blockSizeBits, runLength)
}
let blockNumber = 0, lastPrint = Date.now()
const runCounter = new RunCounter(blockSizeBits, bitRuns => {
  blockNumber ++
  for (let runLength=1; runLength<DEPTH && runLength<=blockSizeBits; runLength++) {
    totalRunLength[runLength] += bitRuns[runLength]
    totalDeviation[runLength] += Math.abs(expRuns[runLength] - bitRuns[runLength])
  }
})

while (1) {
  const randomBytes = crypto.getRandomValues(new Uint8Array(65_536))
  for (const byte of randomBytes) {
    runCounter.processByte(byte)
  }
  if (Date.now() < lastPrint + 1_000) continue
  lastPrint = Date.now()
  console.log(blockSizeBits, blockNumber)
  printAverage(totalRunLength, blockNumber)
  printAverage(totalDeviation, blockNumber)
}

function printAverage(total, count) {
  const average = Array(DEPTH).fill(0)
  for (let i=0; i<DEPTH && i<=blockSizeBits; i++) {
    average[i] = total[i] / count
  }
  console.log(average.slice(1,11))
}

Update 2:

I can almost properly (they are still off) guess this kind of jump if I cheat using a magic number:

console.log(0.4463 * Math.sqrt( 1000)) // 14.1123
console.log(0.4463 * Math.sqrt(10000)) // 44.6299

But how to go from 14.1123 to 8.3462 I have no idea. But the percentages of reduction between each step seems to be the same no matter the block size. But how to calculate it instead of extracting it from the observed data I don't know...

I don't want to end up using any magic (extracted) numbers, everything should have a formula/algorithm.

Update 3:

Demo of magic number use:

function magicFromAvgDiff(blockSize, avgDiff) {
  const result = []
  for (let i=0; i<avgDiff.length; i++) {
    result.push((avgDiff[i]) / Math.sqrt(blockSize))
  }
  return result
}

const magic = magicFromAvgDiff(1000, [
  14.113736720289142, // after 1103188176 blocks
  8.34446417235712,
  5.691317269883429,
  4.096940789048124,
  2.983369701879401,
  2.149007548273999,
  1.5274827545604512,
  1.0683298643635877,
  0.730175845988502
])

function avgEstDiff(blockSize, runLength) {
  return magic[runLength-1] * Math.sqrt(blockSize)
}

for (const blockSize of [1000,2000,4000,10_000]) {
  const r = ['blockSize: '+blockSize]
  for (let runLength=1; runLength<=9; runLength++) {
    r.push(avgEstDiff(blockSize, runLength).toFixed(4))
  }
  console.table(r)
}

Update 4:

I'm close to something? Or just another magic...

function t(bs, rl) {
  const estRl = estRunLength(bs, rl)
  return (1.79*estRl) / Math.sqrt(bs)
}

console.log(t(  1000, 1)) // 14.179494914312004
console.log(t(  2000, 1)) // 20.03282120702174
console.log(t(  4000, 1)) // 28.31653625103625
console.log(t( 10000, 1)) // 44.758950000000006
console.log(t(100000, 1)) // 141.51475553104083
```
2

There are 2 best solutions below

7
On BEST ANSWER

$\def\eqdef{\stackrel{\text{def}}{=}}$ If a random variable $\ X\ $ has a discrete distribution, and mean $\ \mu\ $, then its average absolute deviation from its mean can be computed from its probability mass function: $$ \mathbb{E}(|X-\mu|)=\sum_x|x-\mu|\,\mathbb{P}(X=x)\ . $$ In your case, the random variables in question are the numbers of bit-runs of a given length that occur in a block of random bits of a given length. The distribution of this quantity can be computed by means of a recursion that includes a little more information about the structure of the block.

I shall call a run of bits in a block completed if its last bit is not the last bit of the block. That is, it has either the form $\ xxx\dots xy\ $ at the start of the block, or $\ yxxx\dots xy\ $ elsewhere (where $\ y\ne x\ $). Let $\ J_t\ $ be the number of completed runs of a given length ($\ r\ $, say) that occur within the first $\ t\ $ bits of a random sequence $\ \big\{\beta_s\big\}_{s=1}^b\ $ of bits, and $\ I_t\ $ the length of the last run at the end of this $\ t$-bit block if this is not greater than $\ r\ $, or $\ I_t=r+1\ $ otherwise. Then \begin{align} \big\{I_{t+1}=1,J_{t+1}=j\big\}&=\big\{I_t=r,J_t=j-1,\beta_{t+1}\ne\beta_t\big\}\\ &\hspace{1em}\cup\big\{I_t=r+1,J_t=j,\beta_{t+1}\ne\beta_t\big\}\\ &\hspace{1em}\cup\bigcup_{i=1}^{r-1}\big\{I_t=i,J_t=j,\beta_{t+1}\ne\beta_t\big\}\\ \big\{I_{t+1}=i,J_{t+1}=j\big\}&=\big\{I_t=i-1,J_t=j,\beta_{t+1}=\beta_t\big\}\\ &\hspace{4em}\ \text{for}\ \ 2\le i\le r,\\ \big\{I_{t+1}=r+1,J_{t+1}=j\big\}&=\big\{I_t=r,J_t=j,\beta_{t+1}=\beta_t\big\}\\ &\hspace{1em}\cup\big\{I_t=r+1,J_t=j,\beta_{t+1}=\beta_t\big\}\\ I_1&=1\ \ \text{, and}\\\\ J_1&=0\ . \end{align} If we put $$ \pi_{t,i,j}\eqdef\mathbb{P}\big(I_t=i,J_t=j\big)\ , $$ then we obtain the following recursion from the above identities: \begin{align} \pi_{t+1,1,j}&=\frac{1}{2}\left(\pi_{t,r,j-1}+\pi_{t,r+1,j}+\sum_{i=1}^r\pi_{t,i,j}\right)\\ \pi_{t+1,i,j}&=\frac{\pi_{t,i-1,j}}{2}\ \ \text{ for }\ \ 2\le i\le r,\\ \pi_{t+1,r+1,j}&=\frac{1}{2}\big(\pi_{t,r,j}+\pi_{t,r+1,j}\big)\ , \end{align} with initial condition: $$ \pi_{1,i,j}=\cases{1&if $\ i=1,j=0$\\ 0&otherwise.} $$ The number $\ R_t\ $ of runs of length $\ r\ $ within the first $\ t\ $ bits of your random string is $\ J_t+1\ $ if $\ I_t=r\ $, or $\ J_t\ $ otherwise. Therefore \begin{align} \Bbb{P}\big(R_t=j\big)&=\Bbb{P}\big(J_t=j-1,I_t= r\big)+\Bbb{P}\big(J_t=j,I_t\ne r\big)\\ &=\pi_{t,r,j-1}+\pi_{t,r+1,j}+\sum_{i=1}^{r-1}\pi_{t,i,j}\ , \end{align} and this probability mass function of $\ R_t\ $ enables you to calculate its average absolute deviation from its mean, using the expression given at the top of this answer.

Unfortunately, I have barely a nodding acquaintance with JavaScript. However I've written the Magma script given below to calculate the average absolute deviations of the numbers of runs of any length in a random string of any length. You can run it by copying and pasting it into the online Magma calculator. The values it obtains for the average numbers of runs of lengths up to $9$ in strings of length $1,000$ agree with the ones given by your formula, and the values it obtains for the average absolute deviations from the means are consistent with your simulations. Here are the values

\begin{array}{|c|c|c|} \hline\\ \text{run length}&\text{mean number of runs}&\text{average absolute deviation}\\ &&\text{from mean}\\ \hline 1&250.50000000000000000&14.113510463531267307\\ \hline 2&125.12500000000000000&8.3446389024482110929\\ \hline 3&62.500000000000000000&5.6912778405691599315\\ \hline 4&31.218750000000000000&4.0970066266939432457\\ \hline 5&15.593750000000000000&2.9833451409959858316\\ \hline 6&7.7890625000000000000&2.1490440851634585814\\ \hline 7&3.8906250000000000000&1.5275032760655310069\\ \hline 8&1.9433593750000000000&1.0683524059248089311\\ \hline 9&0.97070312500000000000&0.73018743329332705340\\ \hline \end{array}

Unfortunately, for strings of length $10,000$, the run time of the script exceeds the limit imposed by the online Magma calculator. Nevertheless, I'm sure the computing resources needed to carry out the calculation for strings of length $10,000$ would not be exorbitant.

getdist:=function(r,b)
  u:=Floor(Rationals()!b/Rationals()!r);
  p:=ZeroMatrix(Rationals(),r+1,u+1);
  rdist:=ZeroMatrix(Rationals(),1,u+1);
  pold:=p;
  pold[1,1]:=1;
  pnew:=p;
  for t in [1..b-1] do
    for j in [0..u] do
      for i in [1..r-1]do
        pnew[1,j+1]:=pnew[1,j+1]+pold[i,j+1]/2;
        if i ge 2 then
          pnew[i,j+1]:=pold[i-1,j+1]/2;
        end if;
      end for;
      if r ge 2 then
        pnew[r,j+1]:=pold[r-1,j+1]/2;
      end if;
      pnew[1,j+1]:=pnew[1,j+1]+pold[r+1,j+1]/2;
      if j gt 0 then
        pnew[1,j+1]:=pnew[1,j+1]+pold[r,j]/2;
      end if;
      pnew[r+1,j+1]:=(pold[r,j+1]+pold[r+1,j+1])/2;
    end for;
    pold:=pnew;
    pnew:=p;
  end for;

  for j in [0..u] do
    for i in [1..r-1] do
      rdist[1,j+1]:=rdist[1,j+1]+pold[i,j+1];
    end for;
    rdist[1,j+1]:=rdist[1,j+1]+pold[r+1,j+1];
    if j gt 0 then
      rdist[1,j+1]:=rdist[1,j+1]+pold[r,j];
    end if;
  end for;

  return(rdist);
end function;

b:=1000;
for r in [1..10] do
  pdist:=getdist(r,b);
  x:=Ncols(pdist);
  av:=0;
  for j in [0..x-1] do
    av:=av+j*pdist[1,j+1];
  end for;
  avdiff:=0;
  for j in [0..x-1] do
    avdiff:=avdiff+AbsoluteValue(j-av)*pdist[1,j+1];
  end for;
  print r, RealField(20)!av, RealField(20)!avdiff;
end for;
0
On

I converted Lonza Leggiera's Magma script to JavaScript. It gives the correct output.

function create2dArray(rows, cols, initialValue) {
  const array = Array(rows)
  for (let i=0; i<rows; i++) {
    array[i] = Array(cols).fill(initialValue)
  }
  return array
}

function copy2dArray(array) {
  const copy = Array(array.length)
  for (let i=0; i<array.length; i++) {
    copy[i] = array[i].slice()
  }
  return copy
}

function getDist(rl, bs) {
  const u = Math.floor(bs / rl)
  const dist = Array(u+2).fill(0)
  let old = create2dArray(rl+2, u+2, 0)
  let cur = create2dArray(rl+2, u+2, 0)

  old[1][1] = 1

  for (let block=1; block < bs; block++) {
    for (let j=1; j <= u+1; j++) {
      for (let r=1; r < rl; r++) {
        cur[1][j] += old[r][j] / 2
        if (r > 1) cur[r][j] = old[r-1][j] / 2
      }
      if (rl > 1) cur[rl][j] = old[rl-1][j] / 2
      cur[1][j] += old[rl+1][j] / 2
      if (j > 1) cur[1][j] += old[rl][j-1] / 2
      cur[rl+1][j] = (old[rl][j] + old[rl+1][j]) / 2
    }
    old = copy2dArray(cur)
    cur = create2dArray(rl+2, u+2, 0)
  }

  for (let j=1; j <= u+1; j++) {
    for (let i=1; i < rl; i++) {
      dist[j] += old[i][j]
    }
    dist[j] += old[rl+1][j]
    if (j > 1) dist[j] += old[rl][j-1]
  }

  return dist
}

// updated to be correct when runLength == blockSize
function expRunCount(blockSize, runLength) {
  if (runLength > blockSize) return 0
  if (runLength == blockSize) {
    return (blockSize-(runLength-2)) / 2**(runLength)
  }
  return (blockSize-(runLength-3)) / 2**(runLength+1)
}

function getExpAndAvgDiff(blockSize) {
  const result = [{}]
  for (let runLength=1; runLength <= 10 && runLength <= blockSize; runLength++) {
    const dist = getDist(runLength, blockSize).slice(1)
    let expAvg = 0, expAvgDiff = 0
    for (let i=0; i < dist.length; i++) {
      expAvg += i * dist[i]
    }
    const control = expRunCount(blockSize, runLength)
    if (expAvg !== control) console.log(control+' != '+expAvg)
    for (let i=0; i < dist.length; i++) {
      expAvgDiff += Math.abs(i - expAvg) * dist[i]
    }
    result.push({expAvg: expAvg.toFixed(10), expAvgDiff: expAvgDiff.toFixed(10)})
  }
  return result
}

console.table(getExpAndAvgDiff(10))