
    {Kgql              	          d dl Z d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZmZmZmZ d Zej2                  j5                  d	 ej6                  g d
ej8                        gdf ej6                  ddej:                  gej<                        gdf ej6                  g de      gdfdg      ej2                  j5                  dddg      ej2                  j5                  dddg      d                      Z ej2                  j5                  d ej6                  g d
ej8                        gdf ej6                  g de      gddgfg      ej2                  j5                  d ej6                  g d       ej6                  g d      g      ej2                  j5                  dddg      d                      Z!ej2                  j5                  d  ej6                  d gd!z  d"gd!z  z   d#gz   gej8                        jD                  g d
gf ej6                  d$gd!z  d%gd!z  z   d&gz   ge      jD                  g d'gfg      ej2                  j5                  dd(dg      d)               Z#ej2                  j5                  d*g d+d,f ej6                  g d-g dg      jD                  d.fg      d/        Z$d0 Z%ej2                  j5                  d1d"d2gd!z  d3d4gfg dd5z  d"d2gz   g d6fg d7d5z  d8d9gz   g d:fg      d;        Z&ej2                  j5                  d<d=d>g      ej2                  j5                  dddg      ej2                  j5                  dg d?      d@                      Z'ej2                  j5                  dA ej6                  dBgdCz        dBf ej6                  d gdCz        d f ej6                  dDgdCz  e      d fgg dEF      ej2                  j5                  dg dG      dH               Z(dI Z)dJ Z*ej2                  j5                  dg dK      dL        Z+ej2                  j5                  ddMdg      dN        Z,dO Z-y)P    N)assert_allcloseassert_array_equal)RandomForestRegressor)Ridge)KFoldShuffleSplitStratifiedKFoldcross_val_scoretrain_test_split)make_pipeline)KBinsDiscretizerLabelBinarizerLabelEncoderTargetEncoderc                 <   t        j                  |t         j                        }t        j                  |      }|dk(  rt        j                  |      }t        |      D ]j  }|| |k(     }|j                  d   }	|	dk(  r|||<   %t        j                  |      }
|
|z  }|	|	|z   z  }|t        j                  |      z  d|z
  |z  z   ||<   l |S t        |      D ]?  }|| |k(     }t        j                  |      ||z  z   }|j                  d   |z   }||z  ||<   A |S )z0Simple Python implementation of target encoding.dtypeautor      )npzerosfloat64meanvarrangeshapesum)	X_ordinal	y_numericn_categoriessmoothcur_encodingsy_mean
y_variancecy_subsetn_iy_subset_variancemlambda_current_sumcurrent_cnts                  s/home/alanp/www/video.onchill/myenv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_target_encoder.py_encode_targetr.      s+   HH\<MWWYFVVI&
|$A a0H..#Cax#)a  "x 0!J.AS1WoG&)::a'kV=SSM! % |$A a0H&&*Vf_<K"..+f4K*[8M!	 %
     zcategories, unknown_valuer   r      r            ?      @      @)catdogsnakebear)r      r!         @r   target_typebinary
continuousc                 H   d}t        j                  dgdz  dgdz  z   dgdz  z   gt         j                        j                  }t        j                  g d	gt         j                        j                  }|j                  d   }| d
k(  r|}	|}
n| d   |   }	| d   |   }
t        j
                  |
|ggf      }
t         j                  j                  |      }d}|dk(  r7|j                  dd|      }t        j                  ddgt              }||   }n|dk(  sJ |j                  dd|      }|}|j                  |      }||   }|	|   }	||   }||   }|dk(  rt        ||d      }nt        ||d      }t        j                  |t         j                        }|j!                  ||      D ].  \  }}||df   ||   }}t#        ||||      }|||df      ||df<   0 t%        || ||      }|j'                  |	|      }|j(                  |k(  sJ t+        ||       t-        |j.                        dk(  sJ |dk(  rt1        |j2                         n|j2                  J t        j4                  |      }t#        |dddf   |||      }t+        |j.                  d   |       |j6                  t9        j:                  |      k(  sJ t        j
                  |t        j                  |g      f      j=                  dd      }|j?                  |
      }t+        ||       y)zCheck encoding for binary and continuous targets.

    Compare the values returned by `TargetEncoder.fit_transform` against the
    expected encodings for cv splits from a naive reference Python
    implementation in _encode_target.
    r:   r      r      r1   (   r   r0   r   r=   lowhighsizer6   r7   r>   Tn_splitsrandom_stateshuffle)r!   
categoriescvrJ   N) r   arrayint64Tr   concatenaterandomRandomStaterandintobjectuniformpermutationr	   r   
empty_liker   splitr.   r   fit_transformtarget_type_r   len
encodings_r   classes_r   target_mean_pytestapproxreshape	transform)rL   unknown_valueglobal_random_seedr!   r<   r    X_train_int_arrayX_test_int_array	n_samplesX_trainX_testdata_rngrI   r   target_namesy_trainshuffled_idxrM   expected_X_fit_transform	train_idxtest_idxX_y_r"   target_encoderX_fit_transformr#   expected_encodingsexpected_X_test_transformX_test_transforms                                 r-   test_encodingrz   7   s\   & L1#(aS2X"5b"@!ARTTxx288<>>!''*IV#!Q- 12A/0^^V}o%678Fyy$$%78HHh$$$C	xxf=y) l***$$2I$F	''	2L),7l#Gl#G,'I h,>
 H3EtT  "}}->bjjQ!xx(97C	8"9a<0)I2FB&r2|VD0=hk*1
 1-  D #'	N %227GDO&&+555O%=>~(()Q...h>22LA&&... WWYF'!Q$L& N--a02DE&&&--*???? !#	RXXvh/0!gb!n  &//7$&?@r/   zcategories, unknown_valuesrabbittarget_labels)r   r1   r:   )abr%   c           
         t         j                  j                  |       }d}d}t        j                  |j	                  dd|            }t        j                  |j	                  dd|            }	|d   |   }
|d   |	   }t        j
                  |
|f      }t        j
                  ||	f      }ddgg dg}d}t        j                  |j	                  d||            }||   }t               j                  |      }d}t        || d	      }t        j                  |j                  d   |j                  d   |z  ft         j                  
      }t        |      D ]r  \  }}t        |      D ]_  }|j                  ||      D ]H  \  }}|dd|f   }|||f   ||   }}t        ||t!        |      |      }|||z  z   }||||f      |||f<   J a t t#        |||       } | j                  ||      }!| j$                  dk(  sJ t'        |!|       g }"t        |      D ]M  \  }}t        |      D ]:  }|dd|f   }t        |dd|f   |t!        |      |      }|"j)                  |       < O t!        | j*                        ||z  k(  sJ t        ||z        D ]  }#t'        | j*                  |#   |"|#            t-        | j.                  |       t        j                  ddgddgddgg      }$|dk(  r|$}%nnt        j0                  |$ddddf   t2        
      }%t        |$j                  d         D ]  }&|d   |$dd|&f      |%dd|&f<    t        j4                  |%|f      }%t        j6                  |d      }'t        j                  |$j                  d   |$j                  d   |z  ft         j                  
      }(|$j                  d   })g d}t        |)dz
        D ]'  }*t        |"      D ]  \  }#}+|+|$|*||#   f      |(|*|#f<    ) g d},t        ||z        D ]  }#|'|,|#      |(|)dz
  |#f<    | j9                  |%      }-t'        |-|(       y)z&Check encoding for multiclass targets.P   r1   r   rC   r:   r   r0   TrH   r   Nr!   rM   rJ   
multiclassr2      r   rN   axis)r   r   r   r   r   r   )r   r   r1   r   r   r1   )r   rS   rT   rO   rU   column_stackr   r[   r	   emptyr   r   	enumerater   rZ   r.   r]   r   r\   r   appendr^   r   r_   rY   rV   vstackr   rd   ).rf   rL   unknown_valuesr|   r!   rngri   
n_features
feat_1_int
feat_2_intfeat_1feat_2rj   X_train_intcategories_	n_classesy_train_intrn   y_train_encrI   rM   rp   f_idxcatsc_idxrq   rr   y_classrs   rt   current_encodingexp_idxru   rv   rw   i
X_test_intrk   
column_idxr#   rx   n_rowsrow_idxencmean_idxry   s.                                                 r-   test_encoding_multiclassr      s    ))

 2
3CIJ#++!!)+DEJ#++!!)+DEJ]:&F]:&Foovv./G//:z":;Kq69%KI((3;;199;MNKK(G "009KH	(:D
B
  "xx			1	{003i?@jj  !-t9%E')xx'A#	8%ah/$Y%56	8JB#1"b#d)V#L   59#45>N%0?(7):; (B & . #'N
 %227GDO&&,666O%=>  -t9%E!!U(+G-AuH%wD	6  %%&67 & . ~(()Z)-CCCC:	)*11!46H6KL +~..> Aq6Aq6Aq623Jz#2#q&1@
 0 0 34J$.qM*SbS*_2M$NF1j=! 5 FN34WW[q)F "			!	j..q1I=>jj! a FE!$ 23FAs47
7ERSHCT8U4V%gqj1 4 % "H9z)*39(1+3F!&1*a-0 + &//7$&?@r/   zX, categories
   r   r:   r6   r7   r8   )r7   r6   cow      @c                    t         j                  j                  d      }|j                  dd| j                  d         }t        ||d      j                  | |      }|j                         }|j                  | dd       }|d   t        j                  |      k(  sJ t        |j                        d	k(  sJ |j                  d   d   t        j                  |      k(  sJ y)
zHCustom categories with unknown categories that are not in training data.r   rG   r@   rC   )rL   r!   rJ   rN   N)r   r   r   )r   rS   rT   rW   r   r   fitr   rd   ra   rb   r]   r^   )XrL   r!   r   yr   r#   X_transs           r-   test_custom_categoriesr     s    $ ))


"C"1771:6A
:f1
M
Q
QRSUV
WC VVXFmmAbcF#G4=FMM&1111s~~!###>>!R FMM&$9999r/   zy, msg)r   r1   r   r   z'Found input variables with inconsistent)r   r1   r   z7Target type was inferred to be 'multiclass-multioutput'c                     t        j                  g dg      j                  }t               }t	        j
                  t        |      5  |j                  ||        ddd       y# 1 sw Y   yxY w)zCheck invalidate input.)r   r   r   matchN)r   rO   rQ   r   ra   raises
ValueErrorr[   )r   msgr   r   s       r-   test_errorsr   5  sK     	)A
/C	z	-!Q 
.	-	-s   A$$A-c                     t        j                  g dg      j                  } t        j                  g d      }t        d      }t	        j
                  t        t        j                  d            5  |j                  | |       ddd       |j                  dk(  sJ t        dd	
      }|j                  | |       |j                  d	k(  sJ y# 1 sw Y   KxY w)z@Check inferred and specified `target_type` on regression target.)r   r   r   r   r   r   )r3          @r4   r   r4   r   r1   rM   zQThe least populated class in y has only 1 members, which is less than n_splits=2.r   Nr   r>   )rM   r<   )r   rO   rQ   r   ra   warnsUserWarningreescaper[   r\   )r   r   r   s      r-   test_use_regression_targetr   H  s    
$%&((A
/0A
1
C	ii

 	!Q
 |+++
1,
7Ca|+++
 
s   4CCzy, feature_namesr1   AB   )A_1A_2A_3B_1B_2B_3)y1y2y3r   r   )A_y1A_y2A_y3B_y1B_y2B_y3c                    t        j                  d      }|j                  ddgdz  ddgdz  d      }t        ddd	
      }|j	                  d       t        ddd	
      }|j	                  d       |j                  ||       }|j                  ||       }t        |j                         |       t        |j                         |       t        |j                         |j                         y)z*Check TargetEncoder works with set_output.pandasr}   r~   r   r   r1   )r   r   r4   r   rM   r!   rJ   default)rd   N)ra   importorskip	DataFramer   
set_outputr[   r   to_numpyr   get_feature_names_outcolumns)r   feature_namespdX_dfenc_default
enc_pandas	X_defaultX_pandass           r-   !test_feature_names_out_set_outputr   ]  s     
		X	&B<<sCj2oQFRK@AD1SqAKY/!Ca@JH-))$2I''a0HH%%'3z779=Iz7798;K;KLr/   	to_pandasTF)binary-ints
binary-strr>   c                    t        j                  ddgddgddgddgddgddgddgddggt         j                        }|dk(  r?t        j                  g d      }t               j	                  |      }t        ddd      }n{|d	k(  r?t        j                  g d
      }t               j	                  |      }t        ddd      }n7t        j                  g dt         j                        }|}t        ddd      }t        j                  |      }g dddgg}t        j                  ddgddgddggt         j                        }	| rst        j                  d      }
|
j                  |dddf   t        j                  ddgt              |dddf      d      }|
j                  |	dddf   g dd      }	n|}t        j                  |t         j                        }t        |      D ]Q  \  }}|j!                  ||      D ]7  \  }}|||f   ||   }}t#        ||t%        |      |      }||||f      |||f<   9 S g }t        |      D ]4  \  }}t#        |dd|f   |t%        |      |      }|j'                  |       6 t        j                  |d   d   |d   d   g||d   d   g|d   d   |ggt         j                        }t)        |dd      }|j	                  ||      }t+        ||       t%        |j,                        dk(  sJ t/        d      D ]  }t+        |j,                  |   ||            |j1                  |	      }t+        ||       y)z,Check target encoder with multiple features.r   r   r1   r   r   )r}   r~   r}   r}   r~   r~   r}   r~   T)rJ   rK   r   )r:   r2   r:   r:   r:   r2   r2   r2   )r4   gffffff@g333333@g      @gffffff@g      @皙$@g333333@r0   r:   r   r   Nr6   r7   )feat0feat1)r7   r6   r8   r   )r   rO   rP   r   r[   r	   float32r   r   ra   r   r   rV   rY   r   r   rZ   r.   r]   r   r   r   r^   r   rd   )r   r!   r<   r   rn   	y_integerrM   r#   rL   rk   r   rj   rp   r   r   rq   rr   rs   rt   r   rw   rx   r   rv   r   ry   s                             r-   test_multiple_features_quickr   {  sq   
 
Q!Q!Q!Q!Q!Q!Q!QHPRPXPXI l"((CD N009	QQ=		%((34 N009	QQ=((DBJJW	11d3WWYFaV$JXXFFG	

 hhF   *,,"1a45%.?	!Q$P
 q!t?VWX  "}}YbjjI ,t#%88Iy#AIxy%/0)I2FB-b"c$iH8H(E/*9$Xu_5 $B -  ,t)ahCIv
 	!!"23	 - !#"1%'9!'<Q'?@'*1-."1%v.	

 jj! v!!
<C''9OO%=>s~~!###1Xq)+=a+@A  }}V,$&?@r/   z	y, y_meang333333@r@   r}   )r>   r=   zbinary-string)ids)r   r           c                 *   t        j                  dgdz  g      j                  }|j                  d   }t	        d|d      }|j                  ||       }t        |t        j                  |gg|d             |j                  d   d   t        j                  |      k(  sJ |j                  t        j                  |      k(  sJ t        j                  dgdgg      }|j                  |      }t        |t        j                  |ggdd             y)z5Check edge case where feature and target is constant.r   r@   r   r1   r   r   N)r   rO   rQ   r   r   r[   r   repeatr^   ra   rb   r`   rd   )	r   r#   r!   r   ri   r   r   rk   X_test_transs	            r-    test_constant_target_and_featurer     s     	1#(A
I
1V!
<C1%GGRYYz91EF>>!Q6==#8888v}}V4444XXsQCj!F==(LL"))fXJ"BCr/   c                 f   d}d}t         j                  j                  |       }|j                  |      }|j	                  d||      j                  dd      }|j                         }||   }||   }t        d|       }|j                  ||      }t        d	
      }|j                  ||      }	t        dd|       }
t        d|       }t        |
|||      j                         dk  sJ t        |
|||      j                         dk  sJ t        |
|	||      j                         dkD  sJ y )NrA   i  rF   r   rN   r   T)rK   rJ   F)rK   r   r@   )n_estimatorsmin_samples_leafrJ   2   )rI   rJ   r   皙?      ?)r   rS   rT   normalrU   rc   argsortr   r[   r   r   r
   r   )rf   cardinalityri   r   rn   rj   y_sorted_indicesru   X_encoded_train_shuffledX_encoded_train_no_shuffled	regressorrM   s               r-   Ftest_fit_transform_not_associated_with_y_if_ordinal_categorical_is_notr     sQ    KI
))

 2
3Cjjij(Gkk![yk9AA"aHG (&'G&'G"4>PQN-;;GWM"51N"0">">w"P &";MI 
r0B	CB9gw2>CCEKKK	#;WLQQS
		 		#>BOTTV
		r/   c                  J   t        j                  g dg      j                  } t        j                  g d      }t        ddd      }|j	                  | |      }t        |d   t        j                  |dd	              t        |d
   t        j                  |d	d              y	)zECheck edge case with zero smoothing and cv does not contain category.)
r   r   r   r   r   r   r   r   r   r   )
g @g333333@g333333?g@r3   g      "@r   gffffff,@g*@g      .@r   Fr1   )r!   rK   rM   r   r   NrN   )r   rO   rQ   r   r[   r   r   )r   r   r   r   s       r-   test_smooth_zeror    s    
01244A
GHA
sEa
8C1%G GAJ!"/ GBK2A0r/   )r   g     @@r   c                 h   t         j                  j                  |      }|j                  d      }d}t	        |d      j                  |j                  dd            }t        |||      \  }}}}	|j                  |      }
|
|j                  t         j                           }|
|j                  t         j                           }t        | |	      }|j                  ||      }|j                  |      }|j                  ||      }|j                  |      }t        ||       t        ||       y )
Ni  r   rA   ordinal)n_binsencoderN   r   rJ   r!   rJ   )r   rS   rT   r   r   r[   rc   r   rX   astypeint32r   rd   r   )r!   rf   r   r   r    r   rj   rk   rn   y_testpermutated_labelsX_train_permutedX_test_permutedru   X_train_encodedX_test_encodedX_train_permuted_encodedX_test_permuted_encodeds                     r-   3test_invariance_of_encoding_under_label_permutationr  )  s   
 ))

 2
3C 	


ALY?MM			"a	A (8	1-($GVWf 5()AB'bhh(?@O"&?QRN$227GDO#--f5N-;;<LgV,66GO%=>N$;<r/   r   c                    t        ddd      }d}t        j                  j                  |      }|j	                  |      }d|j	                  |      z  }d}t        |dd	|
      j                  ||z   j                  dd            }|j                  |      }	|	|j                  t        j                           }|j                  |      }
|j                  t        d|z        |d      j                  dd      }t        j                  ||
|gd      }t        ||d      \  }}}}|j                  ||      }|j!                  ||      dk  sJ |j!                  ||      dk  sJ t#        t%        | |      |      j                  ||      }|d   j&                  }|j!                  ||      dkD  sJ |       |j!                  ||      dkD  sJ |       |d   t)        j*                  dd      k(  sJ t        j,                  |dd        dk  j/                         sJ t%        | |      j                  ||      }|j1                  |      }|j1                  |      }|j                  ||      }|j&                  }|j!                  ||      dkD  sJ |       |j!                  ||      dk  sJ |       t-        |d         t-        |d         k  sJ y )Ngư>lsqrF)alphasolverfit_interceptiP  g?d   r  rW   )r  r  strategyrJ   rN   r   g?T)rF   replacer   r   r  r   r  r   g{Gz?)absg?gffffff?r1   )r   r   rS   rT   randnr   r[   rc   rX   r  r	  choiceintrR   r   r   scorer   r   coef_ra   rb   r  allrd   )r!   rf   linear_regressionri   r   r   noiser    X_informativer  
X_shuffledX_near_unique_categoriesr   rj   rk   rn   r
  	raw_modelmodel_with_cvcoefru   X_enc_no_cv_trainX_enc_no_cv_testmodel_no_cvs                           r-   *test_target_encoding_for_linear_regressionr-  M  s    DuM I
))

 2
3C		)A
 #))I&&EL$	
 mQY''A./  5%m&:&:288&DEM /J  #zzC)O9d  *  gb!n 
 		
$<=	A (811'M$GVWf
 "%%gw7I??7G,s222??66*S000 "V#68I	c'7  ""Dw036<<6vv.4:d:4 7fmmA40000FF48s"''))) #&sCGGN '009%//7#''(97CK D.83>DD>-v6<BdB<
 tAw<#d1g,&&&r/   c                      t        j                  dd      } | j                  dd      5  | j                  g dg dd      }t	        d	
      j                  |dg   |d          ddd       y# 1 sw Y   yxY w)z
    Test target-encoder cython code when y is read-only.

    The numpy array underlying df["y"] is read-only when copy-on-write is enabled.
    Non-regression test for gh-27879.
    r   z2.0)
minversionzmode.copy_on_writeT)r}   r~   r~   )r   r;   r5   )xr   r>   )r<   r0  r   N)ra   r   option_contextr   r   r   )r   dfs     r-   test_pandas_copy_on_writer3    se     
		X%	8B			/	6\\oFG,/33BuIr#wG 
7	6	6s   <A//A8).r   numpyr   ra   numpy.testingr   r   sklearn.ensembler   sklearn.linear_modelr   sklearn.model_selectionr   r   r	   r
   r   sklearn.pipeliner   sklearn.preprocessingr   r   r   r   r.   markparametrizerO   rP   nanr   rV   rz   r   rQ   r   r   r   r   r   r   r   r  r  r-  r3   r/   r-   <module>r?     s   	   = 2 &  + < 
"((9BHH
-	.2
"((Cbff%RZZ
8	93?
"((*&
9	:FC	 C=1<(@A]A B 2]A@  
"((9BHH
-	.7
"((*&
9	:VX<NO hbhhy)8288O+DE C=1eA 2eAP  BHHqcBh!r)QC/0ACCK	

 BHH2",y89a"#		
 C=1: 2 :  	@ABHHi+,..E	
	 	 ,* 
Q"sCj!	Q!Q	!KL"dD\1<	

M
M& tUm4C=1(STOA U 2 5OAd 	3%"*	s#	1#(	Q	3%"*F	+Q/
 	2   #56D 7D +\1" #56 = 7 =F C=1o' 2o'd
Hr/   