$ mse = \dfrac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y_i})^2 $ where
$ \hat{y_i} = w_1 * x_{i1} + w_2 * x_{i2} + b $ where $ x_{i1} $, $ x_{i2} $, and $ b $ are scalars
$ \dfrac{\partial}{\partial w_1} mse = \dfrac{1}{n} \sum_{i=1}^{n} \dfrac{\partial}{\partial w_1}(y_i - \hat{y_i})^2 $
$ \dfrac{\partial}{\partial w_1} mse = \dfrac{-2}{n} \sum_{i=1}^{n} (y_i - \hat{y_i}) \dfrac{\partial}{\partial w_1} \hat{y_i} $
$ \dfrac{\partial}{\partial w_1} mse = \dfrac{-2}{n} \sum_{i=1}^{n} (y_i - \hat{y_i}) x_{i1} $
$ \dfrac{\partial}{\partial w_2} mse = \dfrac{-2}{n} \sum_{i=1}^{n} (y_i - \hat{y_i}) x_{i2} $
Edit: Corrected the derivation with guidance from Paul. I validated the formula against the PyTorch autograd result using the code below. Please ignore the code itself — it exists only to check the math.
import torch
from torch.nn import Linear, functional
import numpy as np

# Helper: print a label in red (ANSI escape codes) to separate output sections.
red = lambda x: print(f'\x1b[31m{x}\x1b[0m')

# Two samples, two features each.
X = torch.tensor([[0.1019, 0.0604],
[1.0000, 0.7681]], dtype=torch.float32)
y = torch.tensor([[1.],
[0.]], dtype=torch.float32)

# First feature column x_{i1}, reshaped to (n, 1) so it broadcasts
# against the (n, 1) residual (y - ycap) below.
xi1 = X.numpy()[:,0].reshape(2,1)
red('xi1')
print(xi1)
red('y')
print(y)

n = len(X)

# Fixed seed so the randomly initialised Linear weights are reproducible.
torch.manual_seed(44)
model = Linear(2,1)
w1, w2 = model.weight[0]
red('Initial weights')
print(f'w1={w1}, w2={w2}')

# Forward pass: ycap_i = w1*x_{i1} + w2*x_{i2} + b.
ycap = model(X)
red('ycap')
print(f'{ycap.detach().numpy()}' )

# Autograd gradient of the MSE loss w.r.t. the weights.
loss = functional.mse_loss(ycap, y)
loss.backward()
d1, d2 = model.weight.grad[0]
red('partial diff of w1 of mse_loss fn')
print(f'd1={d1}, d2={d2}')

# Manual gradient from the derivation above:
#   d(mse)/d(w1) = (-2/n) * sum_i (y_i - ycap_i) * x_{i1}
# FIX: the original used the undefined name `xi` here (NameError);
# the derivation calls for the first feature column `xi1`.
d_x_i1 = - 2/n * np.sum( (y.numpy() - ycap.detach().numpy()) * xi1 )
red('manual diff of w1 using formula')
print(d_x_i1)

# FIX: compare with a tolerance instead of `==`. Exact float equality
# between the float32 autograd result and the numpy computation is
# fragile (different summation order can change the last bits).
assert np.isclose(float(d1), float(d_x_i1), rtol=1e-4, atol=1e-6)
