
    {Kg                        d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZmZmZmZmZmZmZmZ dd	lmZmZmZmZmZmZ  G d
 d      Z G d de      Z G d de      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z& G d de      Z' G d  d!e      Z(eee e!e"e#e$e&e'e(d"
Z)y)#z
This module contains loss classes suitable for fitting.

It is not part of the public API.
Specific losses are used for regression, binary classification or multiclass
classification.
"""

import numbers

import numpy as np
from scipy.special import xlogy

from ..utils import check_scalar
from ..utils.stats import _weighted_percentile
from ._loss import (
    CyAbsoluteError,
    CyExponentialLoss,
    CyHalfBinomialLoss,
    CyHalfGammaLoss,
    CyHalfMultinomialLoss,
    CyHalfPoissonLoss,
    CyHalfSquaredError,
    CyHalfTweedieLoss,
    CyHalfTweedieLossIdentity,
    CyHuberLoss,
    CyPinballLoss,
)
from .link import (
    HalfLogitLink,
    IdentityLink,
    Interval,
    LogitLink,
    LogLink,
    MultinomialLogit,
)


class BaseLoss:
    """Base class for a loss function of 1-dimensional targets.

    Conventions:

        - y_true.shape = sample_weight.shape = (n_samples,)
        - y_pred.shape = raw_prediction.shape = (n_samples,)
        - If is_multiclass is true (multiclass classification), then
          y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
          Note that this corresponds to the return value of decision_function.

    y_true, y_pred, sample_weight and raw_prediction must either be all float64
    or all float32.
    gradient and hessian must be either both float64 or both float32.

    Note that y_pred = link.inverse(raw_prediction).

    Specific loss classes can inherit specific link classes to satisfy
    BaseLink's abstractmethods.

    Parameters
    ----------
    sample_weight : {None, ndarray}
        If sample_weight is None, the hessian might be constant.
    n_classes : {None, int}
        The number of classes for classification, else None.

    Attributes
    ----------
    closs: CyLossFunction
    link : BaseLink
    interval_y_true : Interval
        Valid interval for y_true
    interval_y_pred : Interval
        Valid interval for y_pred
    differentiable : bool
        Indicates whether or not the loss function is differentiable in
        raw_prediction everywhere.
    need_update_leaves_values : bool
        Indicates whether decision trees in gradient boosting need to update
        leaf values after having been fit to the (negative) gradients.
    approx_hessian : bool
        Indicates whether the hessian is approximated or exact. If
        approximated, it should be larger than or equal to the exact one.
    constant_hessian : bool
        Indicates whether the hessian is one for this loss.
    is_multiclass : bool
        Indicates whether n_classes > 2 is allowed.
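
    Examples
    --------
    Illustrative sketch only (it assumes the compiled ``sklearn._loss._loss``
    extension is built); concrete subclasses are used along these lines::

        y_true = np.array([0.3, 1.2, -0.5])
        loss = HalfSquaredError(sample_weight=None)
        raw_prediction = np.full_like(y_true, loss.fit_intercept_only(y_true))
        per_sample_loss = loss.loss(y_true=y_true, raw_prediction=raw_prediction)
        gradient = loss.gradient(y_true=y_true, raw_prediction=raw_prediction)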
    """

    # Class attribute defaults; see the Attributes section above.
    differentiable = True
    need_update_leaves_values = False
    is_multiclass = False

    def __init__(self, closs, link, n_classes=None):
        self.closs = closs
        self.link = link
        self.approx_hessian = False
        self.constant_hessian = False
        self.n_classes = n_classes
        self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        self.interval_y_pred = self.link.interval_y_pred

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_true.includes(y)

    def in_y_pred_range(self, y):
        """Return True if y is in the valid range of y_pred.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_pred.includes(y)

    def loss(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        loss_out=None,
        n_threads=1,
    ):
        """Compute the pointwise loss value for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.
        """
        if loss_out is None:
            loss_out = np.empty_like(y_true)
        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)

        self.closs.loss(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            loss_out=loss_out,
            n_threads=n_threads,
        )
        return loss_out

    def loss_gradient(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        loss_out=None,
        gradient_out=None,
        n_threads=1,
    ):
        """Compute loss and gradient w.r.t. raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the loss is stored. If None, a new array
            might be created.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.

        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        dtyper   r   )r5   r6   r7   r8   gradient_outr9   )r"   r:   rA   r;   r<   r=   r   loss_gradient)r&   r5   r6   r7   r8   rB   r9   s          r'   rC   zBaseLoss.loss_gradient   s    L #==0!}}^<==|7I7IJ!==x~~NL !#(<(<Q(?1(D+33A6N!l&8&8&;q&@'//2L

  )'% 	! 	
 %%r)   c                 <   |t        j                  |      }|j                  dk(  r#|j                  d   dk(  r|j	                  d      }|j                  dk(  r#|j                  d   dk(  r|j	                  d      }| j
                  j                  |||||       |S )a  Compute gradient of loss w.r.t raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        """
        if gradient_out is None:
            gradient_out = np.empty_like(raw_prediction)

        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)

        self.closs.gradient(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            n_threads=n_threads,
        )
        return gradient_out

    def gradient_hessian(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        hessian_out=None,
        n_threads=1,
    ):
        """Compute gradient and hessian of loss w.r.t raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the hessian is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.

        hessian : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise hessians.
        """
        if gradient_out is None:
            if hessian_out is None:
                gradient_out = np.empty_like(raw_prediction)
                hessian_out = np.empty_like(raw_prediction)
            else:
                gradient_out = np.empty_like(hessian_out)
        elif hessian_out is None:
            hessian_out = np.empty_like(gradient_out)

        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)
        if hessian_out.ndim == 2 and hessian_out.shape[1] == 1:
            hessian_out = hessian_out.squeeze(1)

        self.closs.gradient_hessian(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            hessian_out=hessian_out,
            n_threads=n_threads,
        )
        return gradient_out, hessian_out

    def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1):
        """Compute the weighted average loss.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : float
            Mean or averaged loss function.
        Nr4   weights)r"   averager>   )r&   r5   r6   r7   r9   s        r'   __call__zBaseLoss.__call__|  s:    ( zzII-"#   "	
 		
r)   c                    t        j                  ||d      }dt        j                  |j                        j                  z  }| j
                  j                  t         j                   k(  rd}nF| j
                  j                  r| j
                  j                  }n| j
                  j                  |z   }| j
                  j                  t         j                  k(  rd}nF| j
                  j                  r| j
                  j                  }n| j
                  j                  |z
  }||| j                  j                  |      S | j                  j                  t        j                  |||            S )a#  Compute raw_prediction of an intercept-only model.

        This can be used as initial estimates of predictions, i.e. before the
        first iteration in fit.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or array of shape (n_samples,)
            Sample weights.

        Returns
        -------
        raw_prediction : numpy scalar or array of shape (n_classes,)
            Raw predictions of an intercept-only model.
        r   rK   axis
   N)r"   rL   finforA   epsr%   lowr#   low_inclusivehighhigh_inclusiver   clip)r&   r5   r7   y_predrS   a_mina_maxs          r'   fit_intercept_onlyzBaseLoss.fit_intercept_only  s   ( FMB288FLL)---##w.E!!//((,,E((,,s2E$$.E!!00((--E((--3E=U]99>>&))99>>"''&%"?@@r)   c                 ,    t        j                  |      S )zpCalculate term dropped in loss.

        With this term added, the loss of perfect predictions is zero.
        )r"   
zeros_liker&   r5   r7   s      r'   constant_to_optimal_zeroz!BaseLoss.constant_to_optimal_zero  s    
 }}V$$r)   Fc                 V   |t         j                  t         j                  fvrt        d| d      | j                  r|| j
                  f}n|f}t        j                  |||      }| j                  rt        j                  d|      }||fS t        j                  |||      }||fS )au  Initialize arrays for gradients and hessians.

        Unless hessians are constant, arrays are initialized with undefined values.

        Parameters
        ----------
        n_samples : int
            The number of samples, usually passed to `fit()`.
        dtype : {np.float64, np.float32}, default=np.float64
            The dtype of the arrays gradient and hessian.
        order : {'C', 'F'}, default='F'
            Order of the arrays gradient and hessian. The default 'F' makes the arrays
            contiguous along samples.

        Returns
        -------
        gradient : C-contiguous array of shape (n_samples,) or array of shape             (n_samples, n_classes)
            Empty array (allocated but not initialized) to be used as argument
            gradient_out.
        hessian : C-contiguous array of shape (n_samples,), array of shape
            (n_samples, n_classes) or shape (1,)
            Empty (allocated but not initialized) array to be used as argument
            hessian_out.
            If constant_hessian is True (e.g. `HalfSquaredError`), the array is
            initialized to ``1``.
        zCValid options for 'dtype' are np.float32 and np.float64. Got dtype=z	 instead.)r<   rA   order)r   )r<   rA   )	r"   float32float64
ValueErroris_multiclassr!   emptyr    ones)r&   	n_samplesrA   rc   r<   rE   hessians          r'   init_gradient_and_hessianz"BaseLoss.init_gradient_and_hessian  s    8 RZZ00"G9. 
 /ELE88%uEB  
 ggD6G    hhU%uEG  r)   N)NNr   NNNr   Nr   )__name__
__module____qualname____doc__differentiableneed_update_leaves_valuesrg   r(   r/   r2   r>   rC   rE   rH   rM   r\   r`   r"   re   rl    r)   r'   r   r   C   s    /t N %M900 +b =&F /j @)D
>(AT% :<3 1!r)   r   c                   $     e Zd ZdZd fd	Z xZS )HalfSquaredErrora  Half squared error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half squared error is defined as::

        loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2

    The factor of 0.5 simplifies the computation of gradients and results in a
    unit hessian (and is consistent with what is done in LightGBM). It is also
    half the Normal distribution deviance.
    c                 Z    t         |   t               t                      |d u | _        y )Nr   r   )superr(   r   r   r    r&   r7   	__class__s     r'   r(   zHalfSquaredError.__init__  s(    13,.I - 5r)   rm   rp   rq   rr   rs   r(   __classcell__r}   s   @r'   rx   rx     s    "6 6r)   rx   c                   4     e Zd ZdZdZdZd fd	ZddZ xZS )AbsoluteErrora  Absolute error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the absolute error is defined as::

        loss(x_i) = |y_true_i - raw_prediction_i|

    Note that the exact hessian = 0 almost everywhere (except at one point, therefore
    differentiable = False). Optimization routines like in HGBT, however, need a
    hessian > 0. Therefore, we assign 1.
    FTc                 h    t         |   t               t                      d| _        |d u | _        y )Nrz   T)r{   r(   r	   r   r   r    r|   s     r'   r(   zAbsoluteError.__init__0  s/    0|~F" - 5r)   c                 N    |t        j                  |d      S t        ||d      S )Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        r   rP   2   )r"   medianr   r_   s      r'   r\   z AbsoluteError.fit_intercept_only5  s*      99V!,,'rBBr)   rm   	rp   rq   rr   rs   rt   ru   r(   r\   r   r   s   @r'   r   r     s     " N $6
	Cr)   r   c                   4     e Zd ZdZdZdZd fd	ZddZ xZS )PinballLossa  Quantile loss aka pinball loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the pinball loss is defined as::

        loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i)

        rho_{quantile}(u) = u * (quantile - 1_{u<0})
                          = -u *(1 - quantile)  if u < 0
                             u * quantile       if u >= 0

    Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError().

    Note that the exact hessian = 0 almost everywhere (except at one point, therefore
    differentiable = False). Optimization routines like in HGBT, however, need a
    hessian > 0. Therefore, we assign 1.

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level of the quantile to be estimated. Must be in range (0, 1).
    FTc                     t        |dt        j                  ddd       t        |   t        t        |            t                      d| _        |d u | _	        y )	Nquantiler   r   neithertarget_typemin_valmax_valinclude_boundaries)r   rz   T)
r   numbersRealr{   r(   r   floatr   r   r    )r&   r7   r   r}   s      r'   r(   zPinballLoss.__init__b  s]    (	
 	x9 	 	
 # - 5r)   c                     |/t        j                  |d| j                  j                  z  d      S t	        ||d| j                  j                  z        S )r   d   r   r   )r"   
percentiler   r   r   r_   s      r'   r\   zPinballLoss.fit_intercept_onlyr  sO      ==tzz/B/B)BKK'sTZZ-@-@'@ r)   )N      ?rm   r   r   s   @r'   r   r   A  s    : N $6 r)   r   c                   4     e Zd ZdZdZdZd fd	ZddZ xZS )	HuberLossa  Huber loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the Huber loss is defined as::

        loss(x_i) = 1/2 * abserr**2            if abserr <= delta
                    delta * (abserr - delta/2) if abserr > delta

        abserr = |y_true_i - raw_prediction_i|
        delta = quantile(abserr, self.quantile)

    Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0)
    equals delta * (AbsoluteError() - delta/2).

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level which defines the breaking point `delta` to distinguish
        between absolute error and squared error. Must be in range (0, 1).

     Reference
    ---------
    .. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient
      boosting machine <10.1214/aos/1013203451>`.
      Annals of Statistics, 29, 1189-1232.
    FTc                     t        |dt        j                  ddd       || _        t        |   t        t        |            t                      d| _	        d	| _
        y )
Nr   r   r   r   r   )deltarz   TF)r   r   r   r   r{   r(   r   r   r   r   r    )r&   r7   r   r   r}   s       r'   r(   zHuberLoss.__init__  s_    (	
 !E%L1 	 	
 # %r)   c                 6   |t        j                  |dd      }nt        ||d      }||z
  }t        j                  |      t        j                  | j
                  j                  t        j                  |            z  }|t        j                  ||      z   S )r   r   r   r   rJ   )	r"   r   r   signminimumr   r   absrL   )r&   r5   r7   r   diffterms         r'   r\   zHuberLoss.fit_intercept_only  sx      ]]62A6F)&-DFwwt}rzz$***:*:BFF4LII

4???r)   )Ng?r   rm   r   r   s   @r'   r   r     s!    B N $&"@r)   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )HalfPoissonLossa  Half Poisson deviance loss with log-link, for regression.

    Domain:
    y_true in non-negative real numbers
    y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half the Poisson deviance is defined as::

        loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i))
                    - y_true_i + exp(raw_prediction_i)

    Half the Poisson deviance is actually the negative log-likelihood up to
    constant terms (not involving raw_prediction) and simplifies the
    computation of the gradients.
    We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`.
    c                     t         |   t               t                      t	        dt
        j                  dd      | _        y )Nrz   r   TF)r{   r(   r   r   r   r"   r#   r$   r|   s     r'   r(   zHalfPoissonLoss.__init__  s2    02C'2664?r)   c                 2    t        ||      |z
  }|||z  }|S rm   r   r&   r5   r7   r   s       r'   r`   z(HalfPoissonLoss.constant_to_optimal_zero  s(    VV$v-$M!Dr)   rm   rp   rq   rr   rs   r(   r`   r   r   s   @r'   r   r     s    (@r)   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )HalfGammaLossaV  Half Gamma deviance loss with log-link, for regression.

    Domain:
    y_true and y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Gamma deviance loss is defined as::

        loss(x_i) = log(exp(raw_prediction_i)/y_true_i)
                    + y_true/exp(raw_prediction_i) - 1

    Half the Gamma deviance is actually proportional to the negative log-
    likelihood up to constant terms (not involving raw_prediction) and
    simplifies the computation of the gradients.
    We also skip the constant term `-log(y_true_i) - 1`.
    c                     t         |   t               t                      t	        dt
        j                  dd      | _        y )Nrz   r   F)r{   r(   r   r   r   r"   r#   r$   r|   s     r'   r(   zHalfGammaLoss.__init__  s1    0wyA'2665%@r)   c                 F    t        j                  |       dz
  }|||z  }|S ro   )r"   logr   s       r'   r`   z&HalfGammaLoss.constant_to_optimal_zero  s+    v"$M!Dr)   rm   r   r   s   @r'   r   r     s    &Ar)   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )HalfTweedieLossa  Half Tweedie deviance loss with log-link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers
    power in real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p)
                    + exp(raw_prediction_i)**(2-p) / (2-p)

    Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link,
    HalfPoissonLoss and HalfGammaLoss.

    We also skip constant terms, but those are different for p=0, 1, 2.
    Therefore, the loss is not continuous in `power`.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    c                    t         |   t        t        |            t	                      | j
                  j                  dk  r1t        t        j                   t        j                  dd      | _
        y | j
                  j                  dk  r"t        dt        j                  dd      | _
        y t        dt        j                  dd      | _
        y N)powerrz   r   Fr   T)r{   r(   r   r   r   r   r   r   r"   r#   r$   r&   r7   r   r}   s      r'   r(   zHalfTweedieLoss.__init__'  s    #%,7 	 	
 ::q #+RVVGRVVUE#JD ZZ!#+ArvvtU#CD #+Arvvue#DD r)   c                    | j                   j                  dk(  rt               j                  ||      S | j                   j                  dk(  rt	               j                  ||      S | j                   j                  dk(  rt               j                  ||      S | j                   j                  }t        j                  t        j                  |d      d|z
        d|z
  z  d|z
  z  }|||z  }|S )Nr   )r5   r7   r   r   )r   r   rx   r`   r   r   r"   maximum)r&   r5   r7   pr   s        r'   r`   z(HalfTweedieLoss.constant_to_optimal_zero3  s    ::q #%>>] ?   ZZ""$==] >   ZZ" ?;;] <   

  A88BJJvq11q59QUCq1uMD(%Kr)   Ng      ?rm   r   r   s   @r'   r   r     s    <
Er)   r   c                   $     e Zd ZdZd fd	Z xZS )HalfTweedieLossIdentityan  Half Tweedie deviance loss with identity link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers for power != 0
    y_pred in real numbers for power = 0
    power in real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * raw_prediction_i**(1-p) / (1-p)
                    + raw_prediction_i**(2-p) / (2-p)

    Note that the minimum value of this loss is 0.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    c                    t         |   t        t        |            t	                      | j
                  j                  dk  r1t        t        j                   t        j                  dd      | _
        n\| j
                  j                  dk  r"t        dt        j                  dd      | _
        n!t        dt        j                  dd      | _
        | j
                  j                  dk(  r1t        t        j                   t        j                  dd      | _        y t        dt        j                  dd      | _        y r   )r{   r(   r   r   r   r   r   r   r"   r#   r$   r%   r   s      r'   r(   z HalfTweedieLossIdentity.__init__d  s    +%,? 	 	
 ::q #+RVVGRVVUE#JD ZZ!#+ArvvtU#CD #+Arvvue#DD ::q #+RVVGRVVUE#JD #+Arvvue#DD r)   r   r~   r   s   @r'   r   r   H  s    6E Er)   r   c                   2     e Zd ZdZd fd	ZddZd Z xZS )HalfBinomialLossaY  Half Binomial deviance loss with logit link, for binary classification.

    This is also know as binary cross entropy, log-loss and logistic loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(raw_prediction)

    For a given sample x_i, half Binomial deviance is defined as the negative
    log-likelihood of the Binomial/Bernoulli distribution and can be expressed
    as::

        loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i

    See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
    section 4.4.1 (about logistic regression).

    Note that the formulation works for classification, y = {0, 1}, as well as
    logistic regression, y = [0, 1].
    If you add `constant_to_optimal_zero` to the loss, you get half the
    Bernoulli/binomial deviance.

    More details: Inserting the predicted probability y_pred = expit(raw_prediction)
    in the loss gives the well known::

        loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i)
    c                 p    t         |   t               t               d       t	        dddd      | _        y Nr   r   r   r!   r   r   T)r{   r(   r   r   r   r$   r|   s     r'   r(   zHalfBinomialLoss.__init__  s8    $& 	 	

  (1dD9r)   c                 R    t        ||      t        d|z
  d|z
        z   }|||z  }|S ro   r   r   s       r'   r`   z)HalfBinomialLoss.constant_to_optimal_zero  s7    VV$uQZV'DD$M!Dr)   c                 4   |j                   dk(  r#|j                  d   dk(  r|j                  d      }t        j                  |j                  d   df|j
                        }| j                  j                  |      |dddf<   d|dddf   z
  |dddf<   |S a=  Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        r   r   r   r@   Nr;   r<   r=   r"   rh   rA   r   inverser&   r6   probas      r'   predict_probazHalfBinomialLoss.predict_proba       !#(<(<Q(?1(D+33A6N...q115^=Q=QRii''7ad%1+oadr)   rm   rp   rq   rr   rs   r(   r`   r   r   r   s   @r'   r   r   v  s    >:r)   r   c                   L     e Zd ZdZdZd fd	Zd Zd	dZd Z	 	 	 	 d
dZ	 xZ
S )HalfMultinomialLossa  Categorical cross-entropy loss, for multiclass classification.

    Domain:
    y_true in {0, 1, 2, 3, .., n_classes - 1}
    y_pred has n_classes elements, each element in (0, 1)

    Link:
    y_pred = softmax(raw_prediction)

    Note: We assume y_true to be already label encoded. The inverse link is
    softmax. But the full link function is the symmetric multinomial logit
    function.

    For a given sample x_i, the categorical cross-entropy loss is defined as
    the negative log-likelihood of the multinomial distribution, it
    generalizes the binary cross-entropy to more than 2 classes::

        loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
                - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)

    See [1].

    Note that for the hessian, we calculate only the diagonal part in the
    classes: If the full hessian for classes k and l and sample i is H_i_k_l,
    we calculate H_i_k_k, i.e. k=l.

    Reference
    ---------
    .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
        "A Blockwise Descent Algorithm for Group-penalized Multiresponse and
        Multinomial Regression".
        <1311.6529>`
    Tc                     t         |   t               t               |       t	        dt
        j                  dd      | _        t	        dddd      | _        y )Nr   r   TFr   )	r{   r(   r   r   r   r"   r#   r$   r%   )r&   r7   r!   r}   s      r'   r(   zHalfMultinomialLoss.__init__  sP    ')!# 	 	

  (2664?'1eU;r)   c                     | j                   j                  |      xr+ t        j                  |j	                  t
              |k(        S r+   )r$   r,   r"   allastypeintr-   s     r'   r/   z#HalfMultinomialLoss.in_y_true_range  s6     ##,,Q/NBFF188C=A;M4NNr)   c                    t        j                  | j                  |j                        }t        j                  |j                        j
                  }t        | j                        D ]@  }t        j                  ||k(  |d      ||<   t        j                  ||   |d|z
        ||<   B | j                  j                  |dddf         j                  d      S )zCompute raw_prediction of an intercept-only model.

        This is the softmax of the weighted average of the target, i.e. over
        the samples axis=0.
        r@   r   rO   r   N)r"   zerosr!   rA   rR   rS   rangerL   rX   r   reshape)r&   r5   r7   outrS   ks         r'   r\   z&HalfMultinomialLoss.fit_intercept_only  s     hht~~V\\:hhv||$((t~~&AZZ!]KCFWWSVS!c'2CF ' yy~~c$'l+33B77r)   c                 8    | j                   j                  |      S )a=  Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        )r   r   )r&   r6   s     r'   r   z!HalfMultinomialLoss.predict_proba  s     yy  00r)   c                    |C|+t        j                  |      }t        j                  |      }n-t        j                  |      }n|t        j                  |      }| j                  j                  ||||||       ||fS )aK  Compute gradient and class probabilities fow raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or array of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        proba_out : None or array of shape (n_samples, n_classes)
            A location into which the class probabilities are stored. If None,
            a new array might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples, n_classes)
            Element-wise gradients.

        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        )r5   r6   r7   rB   	proba_outr9   )r"   r:   r   gradient_proba)r&   r5   r6   r7   rB   r   r9   s          r'   r   z"HalfMultinomialLoss.gradient_proba  s    H  !}}^<MM.9	!}}Y7l3I

!!)'% 	" 	
 Y&&r)   )N   rm   rn   )rp   rq   rr   rs   rg   r(   r/   r\   r   r   r   r   s   @r'   r   r     s8     D M<O81& 5'r)   r   c                   2     e Zd ZdZd fd	ZddZd Z xZS )ExponentialLossa"  Exponential loss with (half) logit link, for binary classification.

    This is also know as boosting loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(2 * raw_prediction)

    For a given sample x_i, the exponential loss is defined as::

        loss(x_i) = y_true_i * exp(-raw_pred_i)) + (1 - y_true_i) * exp(raw_pred_i)

    See:
    - J. Friedman, T. Hastie, R. Tibshirani.
      "Additive logistic regression: a statistical view of boosting (With discussion
      and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000.
      https://doi.org/10.1214/aos/1016218223
    - A. Buja, W. Stuetzle, Y. Shen. (2005).
      "Loss Functions for Binary Class Probability Estimation and Classification:
      Structure and Applications."

    Note that the formulation works for classification, y = {0, 1}, as well as
    "exponential logistic" regression, y = [0, 1].
    Note that this is a proper scoring rule, but without it's canonical link.

    More details: Inserting the predicted probability
    y_pred = expit(2 * raw_prediction) in the loss gives::

        loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i)
            + (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i))
    c                 p    t         |   t               t               d       t	        dddd      | _        y r   )r{   r(   r
   r   r   r$   r|   s     r'   r(   zExponentialLoss.__init__j  s8    #% 	 	

  (1dD9r)   c                 P    dt        j                  |d|z
  z        z  }|||z  }|S )Nr   )r"   sqrtr   s       r'   r`   z(ExponentialLoss.constant_to_optimal_zeror  s3    BGGFa&j122$M!Dr)   c                 4   |j                   dk(  r#|j                  d   dk(  r|j                  d      }t        j                  |j                  d   df|j
                        }| j                  j                  |      |dddf<   d|dddf   z
  |dddf<   |S r   r   r   s      r'   r   zExponentialLoss.predict_probay  r   r)   rm   r   r   s   @r'   r   r   F  s    !F:r)   r   )
_LOSSES = {
    "squared_error": HalfSquaredError,
    "absolute_error": AbsoluteError,
    "pinball_loss": PinballLoss,
    "huber_loss": HuberLoss,
    "poisson_loss": HalfPoissonLoss,
    "gamma_loss": HalfGammaLoss,
    "tweedie_loss": HalfTweedieLoss,
    "binomial_loss": HalfBinomialLoss,
    "multinomial_loss": HalfMultinomialLoss,
    "exponential_loss": ExponentialLoss,
}
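

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of scikit-learn itself).
# It relies only on behaviour documented in the docstrings above: losses take
# C-contiguous float64 arrays y_true and raw_prediction of shape (n_samples,)
# and return per-sample values. The block is guarded so it never runs on
# import; execute it with `python -m sklearn._loss.loss` (requires the
# compiled Cython extension to be built).
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    y_true = rng.normal(size=5)
    raw_prediction = rng.normal(size=5)

    loss = _LOSSES["squared_error"](sample_weight=None)
    # Per-sample loss values, shape (n_samples,).
    print(loss.loss(y_true=y_true, raw_prediction=raw_prediction))
    # Per-sample gradients w.r.t. raw_prediction, shape (n_samples,).
    print(loss.gradient(y_true=y_true, raw_prediction=raw_prediction))
    # Weighted-average loss, a single float.
    print(loss(y_true=y_true, raw_prediction=raw_prediction))
    # Raw prediction of an intercept-only (constant) model.
    print(loss.fit_intercept_only(y_true=y_true))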