import numpy as np

class NNLayer:
    def __init__(self, input_size, output_size, activation_fn, activation_derivative_fn):
        self.input_size = input_size
        self.output_size = output_size
        self.activation_fn = activation_fn
        self.activation_derivative_fn = activation_derivative_fn
        # Weights drawn uniformly from [-1, 1); biases start at zero.
        self.weights = 2 * np.random.rand(input_size, output_size) - 1
        self.biases = np.zeros((1, output_size))

    def forward(self, input_data):
        # Cache the input and pre-activation for use in backward().
        self.input_data = input_data
        self.z = np.dot(input_data, self.weights) + self.biases
        self.a = self.activation_fn(self.z)

        return self.a

    def backward(self, dA, learning_rate=0.01):
        # Chain rule through the activation, then average gradients over the batch.
        dZ = dA * self.activation_derivative_fn(self.z)
        dW = np.dot(self.input_data.T, dZ) / self.input_data.shape[0]
        dB = np.sum(dZ, axis=0, keepdims=True) / self.input_data.shape[0]
        # Gradient with respect to this layer's input, passed to the previous layer.
        dInputs = np.dot(dZ, self.weights.T)

        self.weights -= learning_rate * dW
        self.biases -= learning_rate * dB

        return dInputs

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    s = sigmoid(x)  # evaluate the sigmoid once instead of twice
    return s * (1 - s)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    # Subgradient of ReLU; the derivative at x == 0 is taken to be 1.
    return np.heaviside(x, 1)

def softmax(x):
    # Subtract the row-wise max before exponentiating for numerical stability.
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def softmax_derivative(S):
    # Jacobian of softmax for a single sample: J[i, j] = S[i] * (delta_ij - S[j]).
    # S must be a 1-D vector of softmax outputs.
    return np.diag(S) - np.outer(S, S)
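
# Illustrative sanity check (not part of the original API; the helper name and
# epsilon are this sketch's choices): compare the analytic softmax Jacobian
# against a central finite-difference estimate.
def check_softmax_jacobian(x, eps=1e-6):
    S = softmax(x.reshape(1, -1))[0]
    analytic = softmax_derivative(S)
    numeric = np.zeros_like(analytic)
    for j in range(x.size):
        xp = x.astype(float)
        xm = x.astype(float)
        xp[j] += eps
        xm[j] -= eps
        # Column j holds dS_i / dx_j for all outputs i.
        numeric[:, j] = (softmax(xp.reshape(1, -1))[0]
                         - softmax(xm.reshape(1, -1))[0]) / (2 * eps)
    return np.max(np.abs(analytic - numeric))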

def cross_entropy_loss(y_pred, y_true):
    # Clip predictions away from 0 and 1 so the log is always finite.
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

    # Per-sample cross-entropy, then the mean over the batch.
    loss = -np.sum(y_true * np.log(y_pred), axis=1)

    return np.mean(loss)

def cross_entropy_derivative(y_pred, y_true):
    # Gradient of cross-entropy *combined with* a softmax output layer, taken
    # with respect to the pre-activations. Pass it back through an output layer
    # whose activation derivative is the identity so backward() does not
    # rescale it.
    return y_pred - y_true
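
if __name__ == "__main__":
    # Minimal usage sketch (illustrative; the architecture, seed, and
    # hyperparameters below are arbitrary choices, not part of the original):
    # a 2-4-2 network trained on XOR-style one-hot labels.
    np.random.seed(0)

    print("max softmax Jacobian error:",
          check_softmax_jacobian(np.array([1.0, 2.0, 3.0])))

    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)
    y = np.array([[1, 0], [0, 1], [0, 1], [1, 0]], dtype=float)

    hidden = NNLayer(2, 4, relu, relu_derivative)
    # Identity derivative on the output layer: cross_entropy_derivative already
    # returns the gradient with respect to the softmax pre-activations.
    output = NNLayer(4, 2, softmax, lambda z: np.ones_like(z))

    for epoch in range(5000):
        y_pred = output.forward(hidden.forward(X))
        if epoch % 1000 == 0:
            print(f"epoch {epoch}: loss = {cross_entropy_loss(y_pred, y):.4f}")
        dA = cross_entropy_derivative(y_pred, y)
        hidden.backward(output.backward(dA, learning_rate=0.5), learning_rate=0.5)

    print("predictions:\n", np.round(output.forward(hidden.forward(X)), 2))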