
    {Kgh                     <    d Z ddlZddlmZ ddlmZ  G d d      Zy)zA
Loss functions for linear models with raw_prediction = X @ coef
    N)sparse   )squared_normc                   ~    e Zd ZdZd ZddZd Zd Zd Z	 	 	 	 ddZ		 	 	 	 dd	Z
	 	 	 	 dd
Z	 	 	 	 	 	 ddZ	 ddZy)LinearModelLossa  General class for loss functions with raw_prediction = X @ coef + intercept.

    Note that raw_prediction is also known as linear predictor.

    The loss is the average of per sample losses and includes a term for L2
    regularization::

        loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept)
               + 1/2 * l2_reg_strength * ||coef||_2^2

    with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i.

    Gradient and hessian, for simplicity without intercept, are::

        gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef
        hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X
                  + l2_reg_strength * identity

    Conventions:
        if fit_intercept:
            n_dof =  n_features + 1
        else:
            n_dof = n_features

        if base_loss.is_multiclass:
            coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
        else:
            coef.shape = (n_dof,)

        The intercept term is at the end of the coef array:
        if base_loss.is_multiclass:
            if coef.shape (n_classes, n_dof):
                intercept = coef[:, -1]
            if coef.shape (n_classes * n_dof,)
                intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
            intercept.shape = (n_classes,)
        else:
            intercept = coef[-1]

    Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as

        coef.reshape((n_classes, -1), order="F")

    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
    coefficients without intercept, coef[:, :-1], contiguous and speeds up
    matrix-vector computations.

    Note: If the average loss per sample is wanted instead of the sum of the loss per
    sample, one can simply use a rescaled sample_weight such that
    sum(sample_weight) = 1.

    Parameters
    ----------
    base_loss : instance of class BaseLoss from sklearn._loss.
    fit_intercept : bool
    c                      || _         || _        y N)	base_lossfit_intercept)selfr
   r   s      e/home/alanp/www/video.onchill/myenv/lib/python3.12/site-packages/sklearn/linear_model/_linear_loss.py__init__zLinearModelLoss.__init__E   s    "*    Nc                    |j                   d   }| j                  j                  }| j                  r|dz   }n|}| j                  j                  rt        j                  |||f|d      }|S t        j                  |||      }|S )a  Allocate coef of correct shape with zeros.

        Parameters:
        -----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        dtype : data-type, default=None
            Overrides the data type of coef. With dtype=None, coef will have the same
            dtype as X.

        Returns
        -------
        coef : ndarray of shape (n_dof,) or (n_classes, n_dof)
            Coefficients of a linear model.
           F)shapedtypeorderr   r   )r   r
   	n_classesr   is_multiclassnp
zeros_like)r   Xr   
n_featuresr   n_dofcoefs          r   init_zero_coefzLinearModelLoss.init_zero_coefI   s~      WWQZ
NN,,	NEE>>''==9e*<EQTUD  ==%u=Dr   c                 <   | j                   j                  s"| j                  r|d   }|dd }||fS d}|}||fS |j                  dk(  r*|j	                  | j                   j
                  dfd      }n|}| j                  r|dddf   }|ddddf   }||fS d}||fS )a  Helper function to get coefficients and intercept.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        N        r   r   r   )r
   r   r   ndimreshaper   )r   r   	interceptweightss       r   weight_interceptz LinearModelLoss.weight_intercepte   s    $ ~~++!! H	s)  	!!  	 	!! yyA~,,(@(@"'ES,Q!!#ArEN	!!SbS&/ 	!!  		!!r   c                     | j                  |      \  }}| j                  j                  s	||z  |z   }n||j                  z  |z   }|||fS )ai  Helper function to get coefficients, intercept and raw_prediction.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        raw_prediction : ndarray of shape (n_samples,) or             (n_samples, n_classes)
        )r(   r
   r   T)r   r   r   r'   r&   raw_predictions         r   weight_intercept_rawz$LinearModelLoss.weight_intercept_raw   sU    , "2248~~++[94N ]Y6N	>11r   c                 P    |j                   dk(  r||z  n
t        |      }d|z  |z  S )z5Compute L2 penalty term l2_reg_strength/2 *||w||_2^2.r   g      ?)r$   r   )r   r'   l2_reg_strengthnorm2_ws       r   
l2_penaltyzLinearModelLoss.l2_penalty   s.    '.||q'8'G#l7>S_$w..r   c                     || j                  ||      \  }}	}n| j                  |      \  }}	| j                  j                  ||d|      }
t	        j
                  |
|      }
|
| j                  ||      z   S )a  Compute the loss as weighted average over point-wise losses.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Weighted average of losses per sample, plus penalty.
        Ny_truer+   sample_weight	n_threads)r'   )r,   r(   r
   lossr   averager0   )r   r   r   yr4   r.   r5   r+   r'   r&   r6   s              r   r6   zLinearModelLoss.loss   s    N !151J1J4QR1S.GY!%!6!6t!<GY~~"")	 # 
 zz$6doog???r   c                    |j                   | j                  j                  c\  }}	}
|	t        | j                        z   }|| j                  ||      \  }}}n| j                  |      \  }}| j                  j                  ||||      \  }}||nt        j                  |      }|j                         |z  }|| j                  ||      z  }||z  }| j                  j                  s\t        j                  ||j                        }|j                  |z  ||z  z   |d|	 | j                  r|j                         |d<   ||fS t        j                  |
|f|j                  d      }|j                  |z  ||z  z   |ddd|	f<   | j                  r|j                  d      |dddf<   |j                   d	k(  r|j#                  d
      }||fS )a\  Computes the sum of loss and gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Weighted average of losses per sample, plus penalty.

        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        Nr2   r   r!   r   r   r   r   axisr   r#   )r   r
   r   intr   r,   r(   loss_gradientr   sumr0   r   
empty_liker   r*   emptyr$   ravel)r   r   r   r8   r4   r.   r5   r+   	n_samplesr   r   r   r'   r&   r6   grad_pointwisesw_sumgrads                     r   r?   zLinearModelLoss.loss_gradient   s   T ./WWdnn6N6N*JS!3!344!151J1J4QR1S.GY!%!6!6t!<GY#~~;;)'	  <  
n ,39NxxzF"99& ~~++==W]];D !n 47P PD*!!)--/R Tz 88Y.gmm3OD#1#3#3a#7/G:S#SDKZK !!,00a08QUyyA~zzz,Tzr   c                 F   |j                   | j                  j                  c\  }}	}
|	t        | j                        z   }|| j                  ||      \  }}}n| j                  |      \  }}| j                  j                  ||||      }||nt        j                  |      }||z  }| j                  j                  sZt        j                  ||j                        }|j                  |z  ||z  z   |d|	 | j                  r|j                         |d<   |S t        j                  |
|f|j                  d      }|j                  |z  ||z  z   |ddd|	f<   | j                  r|j                  d      |dddf<   |j                  d	k(  r|j!                  d
      S |S )a  Computes the gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        Nr2   r:   r!   r   r;   r   r<   r   r#   )r   r
   r   r>   r   r,   r(   gradientr   r@   r   rA   r   r*   rB   r$   rC   )r   r   r   r8   r4   r.   r5   r+   rD   r   r   r   r'   r&   rE   rF   rG   s                    r   rI   zLinearModelLoss.gradient5  s   N ./WWdnn6N6N*JS!3!344!151J1J4QR1S.GY!%!6!6t!<GY00)'	 1 
 ,39N& ~~++==W]];D !n 47P PD*!!)--/RK88Y.gmm3OD#1#3#3a#7/G:S#SDKZK !!,00a08QUyyA~zzz,,r   c
                    |j                   \  }
}|t        | j                        z   }|	| j                  ||      \  }}}	n| j	                  |      \  }}| j
                  j                  ||	||      \  }}||
nt        j                  |      }||z  }||z  }t        j                  |dk        dkD  }t        j                  |      }| j
                  j                  s|"t        j                  ||j                        }n|}|j                  |z  ||z  z   |d| | j                  r|j                         |d<   |$t        j                  ||f|j                        }n|}|r|||fS t!        j"                  |      rC|j                  t!        j$                  |df|
|
f      z  |z  j'                         |d|d|f<   n5|dddf   |z  }t        j(                  |j                  |      |d|d|f<   |dkD  r%|j+                  d      d||z  |d	z   xx   |z  cc<   | j                  r;|j                  |z  }||dddf<   ||dddf<   |j                         |d
<   nt,        |||fS )a  Computes gradient and hessian w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        gradient_out : None or ndarray of shape coef.shape
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or ndarray
            A location into which the hessian is stored. If None, a new array
            might be created.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessian : ndarray
            Hessian matrix.

        hessian_warning : bool
            True if pointwise hessian has more than half of its elements non-positive.
        Nr2   r   g      ?r:   r!   r   r   r   )r!   r!   )r   r>   r   r,   r(   r
   gradient_hessianr   r@   meanabsr   rA   r   r*   rB   r   issparse
dia_matrixtoarraydotr%   NotImplementedError)r   r   r   r8   r4   r.   r5   gradient_outhessian_outr+   rD   r   r   r'   r&   rE   hess_pointwiserF   hessian_warningrG   hessWXXhs                          r   rL   z LinearModelLoss.gradient_hessian~  s   j !"	:S!3!344!151J1J4QR1S.GY!%!6!6t!<GY)-)H)H)'	 *I *
& ,39N& & 
 ''.A"56=/~~++#}}T?# !n 47P PD*!!)--/R "xxuenGMMJ"T?22
 q!CC'''+Iy3I 	
 ') [j[+:+-. $AtG,q013R[j[+:+-." R 8zE)eai8$%  !! SS>) "SbS"W "R"W-113V &%T?**r   c                 B    j                    j                  j                  c\  }t         j                        z    j                        \  }}	|nt        j                         j                  j                  sJ j                  j                  ||	|      \  }
}|
z  }
|z  }t        j                  j                        }j                  |
z  z  z   |d  j                  r|
j                         |d<   |j                         t        j                        rt        j                  |df||f      z  n|ddt        j                   f   z   j                  rMt        j"                  t        j$                  j                  d                  t        j&                         fd}||fS  j                  j)                  ||	|      \  }
|
z  }
t        j*                  fj                  d	
      }|
j                  z  z  z   |dddf<    j                  r|
j                  d      |dddf<    fd}j,                  dk(  r|j/                  d	      |fS ||fS )a  Computes gradient and hessp (hessian product function) w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessp : callable
            Function that takes in a vector input of shape of gradient and
            and returns matrix-vector product with hessian.
        Nr2   r:   r!   r   rK   r<   c                    t        j                  |       }t        j                        rj                  | d  z  z  |d  n2t         j
                  j                  j                  | d  g      |d  |d xxx | d  z  z  ccc j                  r(|d xxx | d   z  z  ccc | d  z  | d   z  z   |d<   |S )Nr!   )r   rA   r   rO   r*   linalg	multi_dotr   )	sretr   hXhX_sumhessian_sumr.   r   r   s	     r   hesspz7LinearModelLoss.gradient_hessian_product.<locals>.hesspR  s    mmA&??1%'(ssb1[j>.A'BC$')yy':':ACCQ{
^;T'UC$KZ Oan$DD %%$"6$$q*~5ae8KKCG
r   r   r;   c                 V   | j                  dfd      } j                  r| d d df   }| d d d df   } nd}| j                  z  |z   }|
 |z  j                  d      d d t        j
                  f   z  }|
z  }|d d t        j
                  f   z  }t	        j                  fj                  d      }|j                  z  z  | z  z   |d d d 	f<   j                  r|j                  d      z  |d d df<   j                  dk(  r|j                  d      S |S )Nr!   r   r#   r   r   r<   r;   )
r%   r   r*   r@   r   newaxisrB   r   r$   rC   )r_   s_intercepttmp	hess_prodr   r   r.   r   r   r   probar4   r   rF   r'   s       r   rd   z7LinearModelLoss.gradient_hessian_product.<locals>.hessp  s8   IIy"oSI9%%"#ArE(K!SbS&	A"#K!##g+))q)1!RZZ-@@u ,=BJJ77C HHi%7w}}TWX	-0UUQY&,@?UVCV,V	![j[.)%%'*wwAw'?Iae$99>$???55$$r   r   r#   )r   r
   r   r>   r   r,   r   r@   r   rL   rA   r   r*   r   rO   rP   rf   squeezeasarray
atleast_1dgradient_probarB   r$   rC   )r   r   r   r8   r4   r.   r5   rD   r&   r+   rE   rV   rG   rd   ra   rb   rc   r   r   r   rj   rF   r'   s   ``` ``        @@@@@@@@@r   gradient_hessian_productz(LinearModelLoss.gradient_hessian_product	  s   @ ./WWdnn6N6N*JS!3!344-1-F-FtQ-O*N+39N~~++-1^^-L-L-+#	 .M .*NN f$Nf$N==W]];D !n 47P PD*!!)--/R ),,.Kq!%%~q&9)YAWX 
 $ArzzM2Q6!! BJJrvv1v~$>?v. \ U{w %)NN$A$A-+#	 %B %!NE f$N88Y.gmm3OD#1#3#3a#7/G:S#SDKZK !!,00a08QU.% %. yyA~zzz,e33U{r   r	   )Nr"   r   N)Nr"   r   NNN)Nr"   r   )__name__
__module____qualname____doc__r   r   r(   r,   r0   r6   r?   rI   rL   ro    r   r   r   r      s    7r+8%"N2@/ 4@v Lf G\ I+X NOWr   r   )rs   numpyr   scipyr   utils.extmathr   r   rt   r   r   <module>rx      s!      (U
 U
r   