Minimising inconclusive range for binary classification with two thresholds

98 Views Asked by At

Background

I have an optimisation problem related to a binary classification task.

I have arrays of probabilistic predictions, $x$, and true binary labels, $y$, i.e. $x_i\in [0,\,1]$ and $y_i\in \{0,\,1\}$, $\forall i \in [N]$, where $N$ is the length of the vectors $x$ and $y$.

I want to optimize the trade-off between the true positive rate (TPR), true negative rate (TNR), and the number of inconclusive predictions.

More precisely, I want to minimise the number of inconclusive predictions ($|S|$, where $S$ is the set of excluded indexes), constrained on TPR and TNR exceeding some given constants $p$ and $q$, over the threshold variables $a$ and $b$, where $0 < a < b < 1$. Model predictions between $a$ and $b$ are considered inconclusive, and hence excluded.

Problem Definition

The problem can formally be defined as the following constrained minimisation problem:

$$ \begin{align} \min_{a,\,b} \quad\quad & |\{ i \mid a \leq x_i < b \}| & (|S|)\\ \text{subject to} \quad\quad & \\ & TNR - p > 0, & (TNR > p)\\ & TPR - q > 0, & (TPR > q)\\ & b - a > 0, & (b > a)\\ & a > 0, & (a > 0)\\ & b > 0, & (b > 0)\\ & 1 - a > 0, & (a < 1)\\ & 1 - b > 0, & (b < 1)\\ \text{where} \quad\quad & &\\ & TNR = \frac{TN}{TN + FP}, &\\ & TPR = \frac{TP}{TP + FN}, &\\ & z_i = \mathbb{1}\left[x_i \geq b\right], &\\ & TN = \sum_{i \notin S} \mathbb{1}[y_i = 0, z_i = 0], &\\ & FP = \sum_{i \notin S} \mathbb{1}[y_i = 0, z_i = 1], &\\ & TP = \sum_{i \notin S} \mathbb{1}[y_i = 1, z_i = 1], &\\ & FN = \sum_{i \notin S} \mathbb{1}[y_i = 1, z_i = 0], &\\ & p,\, q \in (0,1). & \end{align} $$

Bound-Constrained Minimisation

I have also considered if the constrained minimisation problem can be reformulated into a bound-constrained minimisation problem. However, the TPR and TNR constraints are more complex and not simple linear constraints. As far as I know, these constraints cannot be directly translated into bounds for $a$ and $b$.

Coding Attempt

While my question is mathematical in nature, I describe my attempts to solve the problem programmatically to highlight my issues. I am using Python with the minimize function from the optimize module in the scipy package (scipy.optimize.minimize).

from typing import Tuple
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.metrics import confusion_matrix


def objective_function(
    thresholds: Tuple[float, float],
    probs: np.ndarray,
    targets: np.ndarray,
) -> float:
    """Exclusion rate: the share of probs falling in the inconclusive band [a, b).

    `targets` is unused here but kept so the signature matches the
    scipy.optimize callback convention used by the constraints.
    """
    lower, upper = thresholds
    conclusive_mask = (probs < lower) | (probs >= upper)
    # 1 - (fraction kept) == fraction excluded
    return 1 - float(np.mean(conclusive_mask))


def calculate_rates(
    targets: np.ndarray,
    probs: np.ndarray,
    thresholds: Tuple[float, float],
) -> Tuple[float, float]:
    """Compute (TPR, TNR) over the conclusive predictions only.

    Samples with probs in [a, b) are excluded; the remaining samples are
    classified positive iff probs >= b.

    Returns (tpr, tnr); a rate is 0 when its denominator is empty.
    """
    a, b = thresholds
    included = (probs < a) | (probs >= b)
    y_true = targets[included].astype(bool)
    y_pred = probs[included] >= b
    # Count the confusion-matrix cells directly with numpy. Unlike
    # sklearn's confusion_matrix(...).ravel(), this cannot fail when the
    # included subset contains only one class (sklearn then returns a
    # 1x1 matrix and the 4-way unpack raises ValueError).
    tp = int(np.count_nonzero(y_true & y_pred))
    fn = int(np.count_nonzero(y_true & ~y_pred))
    tn = int(np.count_nonzero(~y_true & ~y_pred))
    fp = int(np.count_nonzero(~y_true & y_pred))
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
    return tpr, tnr


def constraint_tnr(
    thresholds: Tuple[float, float],
    probs: np.ndarray,
    targets: np.ndarray,
    p: float,
) -> float:
    """Inequality constraint value for scipy: positive iff TNR exceeds p."""
    _, tnr = calculate_rates(targets, probs, thresholds)
    return tnr - p


def constraint_tpr(
    thresholds: Tuple[float, float],
    probs: np.ndarray,
    targets: np.ndarray,
    q: float,
) -> float:
    """Inequality constraint value for scipy: positive iff TPR exceeds q."""
    tpr, _ = calculate_rates(targets, probs, thresholds)
    return tpr - q


def optimize_thresholds(
    probs: np.ndarray,
    targets: np.ndarray,
    p: float,
    q: float,
    initial_guess: Tuple[float, float],
) -> Tuple[float, float]:
    """Minimise the exclusion rate subject to TNR > p and TPR > q.

    Runs COBYLA from `initial_guess` over the thresholds (a, b), prints
    the solution, and returns (a, b).

    Raises:
        ValueError: if the solver reports failure.
    """
    # COBYLA treats every inequality as fun(x) >= 0.
    constraints = (
        {'type': 'ineq', 'fun': constraint_tnr, 'args': (probs, targets, p)},  # TNR > p (1)
        {'type': 'ineq', 'fun': constraint_tpr, 'args': (probs, targets, q)},  # TPR > q (2)
        {'type': 'ineq', 'fun': lambda thrs: thrs[0]},                         # 0 < a   (3)
        {'type': 'ineq', 'fun': lambda thrs: 1 - thrs[1]},                     # b < 1   (4)
        {'type': 'ineq', 'fun': lambda thrs: thrs[1] - thrs[0]},               # a < b   (5)
        # (3) and (5) => b > 0
        # (4) and (5) => a < 1
    )

    # COBYLA does not support the `bounds` argument, so the box
    # 0 < a <= b < 1 is expressed through constraints (3)-(5) instead.
    result = minimize(
        objective_function,
        x0=initial_guess,
        args=(probs, targets),
        method='COBYLA',
        constraints=constraints,
    )

    if not result.success:
        print(result)
        raise ValueError('Optimisation was not successful.')

    optimal_a, optimal_b = result.x
    # exclusion rate at the optimum
    optimal_r = objective_function((optimal_a, optimal_b), probs, targets)

    # BUG FIX: the original code passed the already-filtered labels and the
    # *binary predictions* into calculate_rates (whose signature is
    # (targets, probs, thresholds)), and additionally unpacked its
    # (tpr, tnr) return value in the wrong order, swapping the reported
    # rates. Call it with the raw arrays and unpack in the right order.
    optimal_tpr, optimal_tnr = calculate_rates(
        targets, probs, (optimal_a, optimal_b)
    )

    print(f'Optimal a: {optimal_a}, Optimal b: {optimal_b}')
    print(f'Exclusion rate r: {optimal_r}')
    print(f'True Negative Rate: {optimal_tnr}')
    print(f'True Positive Rate: {optimal_tpr}')

    return optimal_a, optimal_b


def main() -> None:
    """Run the threshold optimisation on the sample data from several starts."""
    data = pd.read_csv('sample_data.csv')
    probabilities = data['x'].to_numpy()
    labels = data['y'].to_numpy()

    # works
    optimize_thresholds(probabilities, labels, p=0.9, q=0.9, initial_guess=(0.2, 0.8))

    # does not work
    optimize_thresholds(probabilities, labels, p=0.9, q=0.9, initial_guess=(0.3, 0.7))

    # optimal solution
    optimal_thresholds = (0.3610841998732966, 0.6057616712011725)
    # does not work
    optimize_thresholds(probabilities, labels, p=0.9, q=0.9, initial_guess=optimal_thresholds)

Issue

With an initial guess of $a = 0.2$ and $b = 0.8$, i.e.

# setup x, y
...
optimize_thresholds(x, y, p=0.9, q=0.9, initial_guess=(0.2, 0.8))

it converges to a seemingly optimal solution:

Optimal a: 0.370554470806464, Optimal b: 0.5917056391027792
Exclusion rate r: 0.13576158940397354
True Negative Rate: 0.9005128205128206
True Positive Rate: 0.9002911208151383
(0.370554470806464, 0.5917056391027792)

However, when $a$ and $b$ are initialised to most other values, even values close to the optimal solution, e.g. $a = 0.37$ and $b = 0.6$, it fails to find a solution.

 message: Did not converge to a solution satisfying the constraints. See `maxcv` for magnitude of violation.
 success: False
  status: 4
     fun: 0.13355408388520973
       x: [ 3.719e-01  5.891e-01]
    nfev: 18
   maxcv: 0.000944081336238245

ValueError: Optimisation was not successful.

I would like to improve the stability of my solution, so that it does not depend on a seemingly arbitrary choice of initialisation.

Alternative methods

With SLSQP, the optimisation fails if the initialisation does not satisfy the constraints related to TNR and TPR, and returns the initialisation values otherwise, i.e. a sub-optimal solution with TNR and TPR unnecessarily high, leading to a larger number of excluded cases than the optimal solution.

I would be happy with any form of input and guidance!

I have previously asked my question in a modified form in Stack Overflow (link), however, since my question is really mathematical in nature, I am asking it here.

Sample data

sample_data.csv

Similar problem

I have found a paper that addresses a similar problem; however, instead of constraints on the TPR and the TNR independently, they have a constraint on the overall accuracy.

Patrone PN, Bedekar P, Pisanic N, et al. Optimal decision theory for diagnostic testing: Minimizing indeterminate classes with applications to saliva-based SARS-CoV-2 antibody assays. Mathematical Biosciences. 2022 Sep 1;351:108858. doi: 10.1016/j.mbs.2022.108858.

2

There are 2 best solutions below

1
On BEST ANSWER

You can linearize the problem and use a mixed integer linear programming solver, which will find a globally optimal solution without relying on an initial solution.

Besides the decision variables $a$, $b$, $z_i$, TN, FP, TP, and FN that you have already introduced, let binary decision variables $\ell_i$ and $s_i$ indicate whether $x_i<a$ or $a \le x_i < b$, respectively. Introduce a small positive constant tolerance $\epsilon$ to replace strict inequalities with weak inequalities. The problem is to minimize $$\sum_{i=1}^N s_i$$ subject to \begin{align} 0 \le a \le b &\le 1 \tag1\label1 \\ \ell_i + s_i + z_i &= 1 && \forall i \tag2\label2 \\ x_i - a + \epsilon &\le (x_i-0+\epsilon)(1-\ell_i) && \forall i \tag3\label3 \\ a - x_i &\le (1-x_i)(1-s_i) && \forall i \tag4\label4 \\ x_i - b + \epsilon &\le (x_i-0+\epsilon)(1-s_i) && \forall i \tag5\label5 \\ b - x_i &\le (1-x_i)(1-z_i) && \forall i \tag6\label6 \\ \text{TN} &= \sum_{i:y_i=0} \ell_i \tag7\label7 \\ \text{FP} &= \sum_{i:y_i=0} z_i \tag8\label8 \\ \text{TP} &= \sum_{i:y_i=1} z_i \tag9\label9 \\ \text{FN} &= \sum_{i:y_i=1} \ell_i \tag{10}\label{10} \\ \text{TN} &\ge p(\text{TN}+\text{FP}) \tag{11}\label{11} \\ \text{TP} &\ge q(\text{TP}+\text{FN}) \tag{12}\label{12} \end{align} Constraint \eqref{1} is clear. Constraints \eqref{2} through \eqref{6} enforce the definitions of the binary variables. For example, \eqref{3} enforces the logical implication $\ell_i = 1 \implies x_i \le a - \epsilon$. Constraints \eqref{7} through \eqref{10} define the confusion matrix in terms of the binary variables. Constraints \eqref{11} and \eqref{12} linearize the ratio constraints for the true negative and true positive rates.

For your sample data, the optimal objective value is $160$, attained for example by $(a,b)=(0.36256, 0.60603)$.


By request, here is SAS code:

/* Read the sample data (x = predicted probability, y = binary label). */
data indata;
   infile 'sample_data.csv' dlm=',' firstobs=2;
   input x y;
run;

/* MILP: minimise the number of inconclusive predictions subject to
   TNR >= p and TPR >= q, using per-observation binary indicators. */
proc optmodel;
   set OBS;
   num x {OBS}, num y {OBS};
   read data indata into OBS=[_N_] x y;

   /* p, q: rate floors; eps replaces strict inequalities (x < a etc.). */
   num p = 0.9, q = 0.9, eps = 1e-6;

   /* Thresholds 0 <= a <= b <= 1. */
   var a >= 0 <= 1;
   var b >= 0 <= 1;
   /* l[i]: x[i] < a (conclusive negative); s[i]: a <= x[i] < b
      (inconclusive); z[i]: x[i] >= b (conclusive positive). */
   var l {OBS} binary;
   var s {OBS} binary;
   var z {OBS} binary;

   /* Objective: number of excluded (inconclusive) observations. */
   min NumExcluded = sum {i in OBS} s[i];

   con a_le_b:
      a <= b;

   /* Each observation is in exactly one of the three bands. */
   con Partition {i in OBS}:
      l[i] + s[i] + z[i] = 1;
   /* Indicator constraints link the binaries to the thresholds;
      the solver linearises the implications internally. */
   con Indicator1 {i in OBS}:
      l[i] = 1 implies x[i] <= a - eps;
   con Indicator2 {i in OBS}:
      s[i] = 1 implies x[i] >= a;
   con Indicator3 {i in OBS}:
      s[i] = 1 implies x[i] <= b - eps;
   con Indicator4 {i in OBS}:
      z[i] = 1 implies x[i] >= b;

   /* Confusion-matrix counts over the conclusive observations only. */
   var TN, FP, TP, FN;
   con TNcon:
      TN = sum {i in OBS: y[i] < 0.5} l[i];
   con FPcon:
      FP = sum {i in OBS: y[i] < 0.5} z[i];
   con TPcon:
      TP = sum {i in OBS: y[i] > 0.5} z[i];
   con FNcon:
      FN = sum {i in OBS: y[i] > 0.5} l[i];

   /* Linearised ratio constraints: TNR >= p and TPR >= q. */
   con TNRcon:
      TN >= p * (TN + FP);
   con TPRcon:
      TP >= q * (TP + FN);

   solve;

   create data outdata(drop=i) from [i] x y l s z;
quit;
0
On

This is just building on top of the answer provided by @RobPratt, for someone reading this thread and who is, like me, more used to Python than SAS.

Here is a Python implementation for the solution by @RobPratt.

# MILP formulation of the two-threshold problem, implemented with PuLP.
import pandas as pd
import pulp

# Load the sample data: x holds probabilistic predictions, y binary labels.
indata = pd.read_csv('sample_data.csv')
x = indata['x'].to_numpy()
y = indata['y'].to_numpy()
N = len(x)

# Rate floors and the tolerance used to model strict inequalities.
p = 0.9
q = 0.9
eps = 1e-6

# Minimisation problem: as few inconclusive predictions as possible.
problem = pulp.LpProblem('Binary_Classification', pulp.LpMinimize)

# Threshold variables, both constrained to [0, 1].
a = pulp.LpVariable('a', 0, 1)
b = pulp.LpVariable('b', 0, 1)

# Per-observation indicators: l[i] <=> x_i < a, s[i] <=> a <= x_i < b
# (inconclusive), z[i] <=> x_i >= b.
obs = range(N)
l = pulp.LpVariable.dicts('l', obs, cat='Binary')
s = pulp.LpVariable.dicts('s', obs, cat='Binary')
z = pulp.LpVariable.dicts('z', obs, cat='Binary')

# Confusion-matrix counts (continuous; fixed by equality constraints below).
TN = pulp.LpVariable('TN', lowBound=0, cat='Continuous')
FP = pulp.LpVariable('FP', lowBound=0, cat='Continuous')
TP = pulp.LpVariable('TP', lowBound=0, cat='Continuous')
FN = pulp.LpVariable('FN', lowBound=0, cat='Continuous')

# Objective: number of excluded observations.
problem += pulp.lpSum(s[i] for i in obs)

# Threshold ordering.
problem += a <= b

# Indicator linking via big-M constraints, with M tailored per observation.
for i in obs:
    problem += l[i] + s[i] + z[i] == 1
    problem += x[i] - a + eps <= (1 - l[i]) * (x[i] - 0 + eps)
    problem += a - x[i] <= (1 - s[i]) * (1 - x[i])
    problem += x[i] - b + eps <= (1 - s[i]) * (x[i] - 0 + eps)
    problem += b - x[i] <= (1 - z[i]) * (1 - x[i])

# Confusion-matrix definitions over the conclusive observations.
problem += TN == pulp.lpSum(l[i] for i in obs if y[i] == 0)
problem += FP == pulp.lpSum(z[i] for i in obs if y[i] == 0)
problem += TP == pulp.lpSum(z[i] for i in obs if y[i] == 1)
problem += FN == pulp.lpSum(l[i] for i in obs if y[i] == 1)

# Linearised rate constraints: TNR >= p and TPR >= q.
problem += TN >= p * (TN + FP)
problem += TP >= q * (TP + FN)

# Solve and report.
problem.solve()

print('Status:', pulp.LpStatus[problem.status])
print('Optimal value of a:', a.varValue)
print('Optimal value of b:', b.varValue)