
    {Kge3                    '   d dl Z d dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZmZmZ d dlmZ d Zej(                  j+                  d	d
dg      d        Zej(                  j+                  d	d
dg      d        Zej(                  j+                  dej0                  ej2                  ej4                  g      ej(                  j+                  dej0                  ej2                  ej4                  g      d               Zej(                  j+                  dej0                  ej2                  ej4                  g      d        Zd Zd Zd Zd Z d Z!ej(                  j+                  dg dg dg ejD                  g dg dg       ejD                  g dg dge#       ejD                  g dd ejH                  d!gge#       ejD                  g dd  e%d"      d!gge#       ejD                  g d#g d$ge#       ejD                  g d%d ejH                  dgge#       ejD                  g d%d  e%d"      dgge#      gg d&'      d(        Z&ej(                  j+                  d	d
dg      ej(                  j+                  d)d*d+g      ej(                  j+                  d,dd-g      d.                      Z'ej(                  j+                  d)d*d+g      ej(                  j+                  d/d0d1gd2d1gd0d1ggg d3g d4g d3gfd5d gd6d gd7d8gd6d ggg d9g d:g d;gfg      d<               Z(d= Z)ej(                  j+                  d,g d>      ej(                  j+                  d?g d>      d@               Z*ej(                  j+                  dAdBdCg      ej(                  j+                  dd2d0g ejD                  dDdEg      g      dF               Z+ej(                  j+                  dAdBdCg      dG        Z,ej(                  j+                  dHdId1gdJd1ggdIdJgd1ggejZ                  f ejD                  d2d0gdKd0gg      d2dKgd0ggej\                  f ejD                  dLd!gdMd!gge#      dLdMgd!ggejZ                  f ejD                  dLd!gdMd!gg      dLdMgd!ggej^                  f ejD                  d2d0gejH                  d0gg      d2ejH                  gd0ggej4                  f ejD                  dLejH                  gdejH                  gge#      dLdgejH                  ggejZ                  f ejD                  dL e%d"      gd e%d"      gge#      dLdg e%d"      ggejZ                  fgg dN'      dO        Z0ej(                  j+                  d	d
dg      ej(                  j+                  dP ejD                  d d8gge#      jb                   ejD                  d dQgge#      jb                  g dRgejZ                  f ejD                  d2d0ggdS      jb                   ejD                  d2dTggdS      jb                  g dUgejd                  f ejD                  d d8gge#      jb                   ejD                  d dQgge#      jb                   ejD                  g dR      gejZ                  f ejD                  dd gge#      jb                   ejD                  dd8gge#      jb                  g dVge#f ejD                  d d8gge#      jb                   ejD                  d ejH                  gge#      jb                  g dWge#f ejD                  d dgge#      jb                   ejD                  d ejH                  gge#      jb                  g dXge#fgg dY'      dZ               Z3d[ Z4ej(                  j+                  d\e	e
g      d]        Z5d^ Z6d_ Z7ej(                  j+                  d`d-dadbgfdcg ddfg dedfdggfgg dh'      di        Z8dj Z9ej(                  j+                  dg dg dg ejD                  g dkg dlg       ejD                  g dg dge#      gg dm'      dn        Z:ej(                  j+                  dP ejD                  d d8gge#      jb                   ejD                  d dQgge#      jb                  g dRgejZ                  f ejD                  d2d0ggdS      jb                   ejD                  d2dTggdS      jb                  g dUgejd                  f ejD                  d d8gge#      jb                   ejD                  d dQgge#      jb                   ejD                  g dR      gejZ                  fgg do'      dp        Z;dq Z<dr Z=ej(                  j+                  dse%e>g      dt        Z?du Z@dv ZAdw ZBdx ZCdy ZDdz ZEej(                  j+                  d{ejH                  d e%d"      g      d|        ZFej(                  j+                  d,dIdKgg d}g      d~        ZGej(                  j+                  dd+d*gddg'      ej(                  j+                  d,d-g dgd-dg'      d               ZHej(                  j+                  d\e	e
g      d        ZIej(                  j+                  ddd0iddiddid0dddTddg      ej(                  j+                  ddg dgg      d               ZJej(                  j+                  d,dcd-d8gg      d        ZKej(                  j+                  d,d gdQgg      d        ZLej(                  j+                  dddKiddiddiddiddidKdddTddg      d        ZMej(                  j+                  d,d-d8gg      d        ZNej(                  j+                  d,d gdQgg      d        ZOd ZPej(                  j+                  ddKd2dddTig      d        ZQd ZRd ZSd ZTd ZUd ZVej(                  j+                  ddd2dg      d        ZWej(                  j+                  dd0dKdg      d        ZXej(                  j+                  dg d      ej(                  j+                  dg d      d               ZYd ZZej(                  j+                  d{ejH                  dg      d        Z[d Z\ej(                  j+                  d	dd
g      ej(                  j+                  dddg      d               Z]ej(                  j+                  d	d
dg      d        Z^ej(                  j+                  d	d
dg      d        Z_ej(                  j+                  d	d
dg      d        Z`d Zad Zbej(                  j+                  dejH                  dg      d        Zcej(                  j+                  dddg      ej(                  j+                  dejH                  dg      d               Zdej(                  j+                  dP ejD                  d ejH                  gge#      jb                   ejD                  d d8gge#      jb                   ejD                  d dQejH                  ge#      gejZ                  f ejD                  d ejH                  gge#      jb                   ejD                  d d8gge#      jb                   ejD                  d dQejH                  ge#      gejZ                  f ejD                  dejH                  ggej4                        jb                   ejD                  dDggej4                        jb                   ejD                  ddEejH                  g      gej4                  fgg d'      d        Zeej(                  j+                  d\e	e
g      d        Zfej(                  j+                  d ejD                  dejH                  dDgg      jb                   ejD                  dejH                  dgg      jb                   ejD                  dEgg      f ejD                  g d¢g      jb                   ejD                  g dâg      jb                   ejD                  ejH                  gg      f ejD                  dejH                  d8gge#      jb                   ejD                  dejH                  dgg      jb                   ejD                  dQgge#      f ejD                  g dŢge#      jb                   ejD                  g dƢg      jb                   ejD                  ejH                  gge#      fg      dǄ        Zgej(                  j+                  de      dɄ        Zhdʄ Ziej(                  j+                  dddMgg ejD                  ddMggdͬ       ejD                  ddMggdά      g      ej(                  j+                  ddLdMgg ejD                  dLdMggdͬ       ejD                  dLdMggdά      g      dЄ               Zjdф Zkd҄ Zldӄ Zmej(                  j+                  dd+d*g      dՄ        Znej(                  j+                  d ejD                  d gdgge#      d gejH                  gejH                  gg ej                  dgdgdgge#      f ejD                  ejH                  gdgd gge#      d gejH                  gejH                  gg ej                  dgejH                  gejH                  gge#      fg      d؄        Zpdل Zqdڄ Zrdۄ Zsd܄ Ztd݄ Zuej(                  j+                  dddKiddiddiddiddidKdddTddg      dބ        Zvd߄ Zwd Zxd Zyd Zzej(                  j+                  dddidd0ig      d        Z{ej(                  j+                  ddd2iddig      d        Z|d Z}d Z~ej(                  j+                  d\e	e
g      d        Zy)    N)sparse)NotFittedError)OneHotEncoderOrdinalEncoder)is_scalar_nan)_convert_containerassert_allcloseassert_array_equal)CSR_CONTAINERSc                     t        j                  g dg dg      } t               }t        d      }|j                  |       }|j                  |       }|j                  dk(  sJ |j                  dk(  sJ t        j                  |      sJ t        j                  |      rJ t        |j                         g dg dg       t        |j                         |       y )N         r   r   r   Fsparse_outputr      )              ?r   r   r   )r   r   r   r   r   )	nparrayr   fit_transformshaper   issparser
   toarray)X
enc_sparse	enc_denseX_trans_sparseX_trans_denses        m/home/alanp/www/video.onchill/myenv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_encoders.py!test_one_hot_encoder_sparse_denser$      s     	)Y'(AJE2I--a0N++A.M6)))&(((??>***}---  #<>W"X ~--/?    handle_unknownignoreinfrequent_if_existc                    t        j                  g dg dg dg      }t        j                  g dg      }t        d      }|j                  |       t	        j
                  t        d      5  |j                  |       d d d        t        |       }|j                  |       |j                         }t        |j                  |      j                         t        j                  g d	g             t        ||       y # 1 sw Y   xY w)
N)r   r   r   )r   r   r   )r   r   r   )   r   r   errorr&   Found unknown categoriesmatch)r   r   r   r   r   r   r   )r   r   r   fitpytestraises
ValueError	transformcopyr
   r   r	   r&   r   X2oh	X2_passeds        r#   #test_one_hot_encoder_handle_unknownr:   )   s    
)Y	23A	9+	B 
g	.BFF1I	z)C	D
R 
E 
n	5BFF1I	I
Y'')
567
 B	" 
E	Ds   /DDc                    t        j                  g d      j                  d      }t        j                  ddg      j                  d      }t        |       }|j	                  |       |j                         }t        |j                  |      j                         t        j                  g dg dg             t        ||       y )N)11111111223334444)r   55555r=   r,   )r   r   r   r   r   r   r   r   )	r   r   reshaper   r0   r5   r
   r4   r   r6   s        r#   +test_one_hot_encoder_handle_unknown_stringsrD   A   s    
23;;GDA	7D/	"	*	*7	3B
 
n	5BFF1I	I
Y'')
&(<=>
 r9%r%   output_dtypeinput_dtypec                    t        j                  ddgg|       j                  }t        j                  ddgddgg|      }t        d|      }t	        |j                  |      j                         |       t	        |j                  |      j                  |      j                         |       t        d|d      }t	        |j                  |      |       t	        |j                  |      j                  |      |       y )Nr   r   dtypeauto)
categoriesrI   F)rK   rI   r   )	r   asarrayTr   r
   r   r   r0   r4   )rF   rE   r   
X_expectedr8   s        r#   test_one_hot_encoder_dtyperO   T   s     	

QF8;/11AaVaV,LAJ	&	=Br''*224jArvvay**1-557D	&E	RBr''*J7rvvay**1-z:r%   c                    t        j                  d      }|j                  ddgddgd      }t        j                  g dg dg| 	      }t        | 	      }t        |j                  |      j                         |       t        |j                  |      j                  |      j                         |       t        | d
      }t        |j                  |      |       t        |j                  |      j                  |      |       y )Npandasabr   r   ABr   r   r   r   r   r   r   r   rH   F)rI   r   )r1   importorskip	DataFramer   r   r   r
   r   r   r0   r4   )rE   pdX_dfrN   r8   s        r#   !test_one_hot_encoder_dtype_pandasr]   c   s    			X	&B<<sCj1v67D<6lKJ	\	*Br''-557Drvvd|--d3;;=zJ	\	?Br''-z:rvvd|--d3Z@r%   c                  Z   t               } g dg dg dg dg}| j                  |       | j                         }t        g d|       | j                  g d      }t        g d|       t	        j
                  t        d	      5  | j                  d
dg       d d d        y # 1 sw Y   y xY w)N)Maler   girlr   r   )Female)   r`   r   
   )r_   3   boy   r   )r_   [   r`         )	x0_Femalex0_Malex1_1x1_41x1_51x1_91x2_boyx2_girlx3_1x3_2x3_12x3_21x4_3x4_10x4_30)onetwothreefourfive)
one_Femaleone_Maletwo_1two_41two_51two_91	three_boy
three_girlfour_1four_2four_12four_21five_3five_10five_30z!input_features should have lengthr.   ry   rz   )r   r0   get_feature_names_outr
   r1   r2   r3   )encr   feature_namesfeature_names2s       r#   "test_one_hot_encoder_feature_namesr   s   s    
/C!%"$		A GGAJ--/M	
" 	%* ../VWN	
" 	%* 
z)L	M!!5%.1 
N	M	Ms   B!!B*c                     t               } t        j                  ddggt              j                  }| j                  |       | j                         }t        ddg|       | j                  dg      }t        dd	g|       y )
Nu   c❤t1dat2rH   u	   x0_c❤t1x0_dat2u   n👍meinput_featuresu   n👍me_c❤t1u   n👍me_dat2)r   r   r   objectrM   r0   r   r
   )r   r   r   s      r#   *test_one_hot_encoder_feature_names_unicoder      st    
/C
8V$%V466AGGAJ--/MY/?--i[-IM(.9=Ir%   c                     d } t        |       }t        j                  ddggt              j                  }|j                  |       |j                         }t        ddg|       |j                  dg	      }t        d
dg|       d }t        |      j                  |      }d}t        j                  t        |      5  |j                          ddd       y# 1 sw Y   yxY w)z=Check the behaviour of `feature_name_combiner` as a callable.c                 $    | dz   t        |      z   S )N_)reprfeaturecategorys     r#   name_combinerzHtest_one_hot_encoder_custom_feature_name_combiner.<locals>.name_combiner   s    }tH~--r%   )feature_name_combinerNoneNrH   z	x0_'None'x0_NonerR   r   za_'None'a_Nonec                      y)Nr    r   s     r#   wrong_combinerzItest_one_hot_encoder_custom_feature_name_combiner.<locals>.wrong_combiner   s    r%   zMWhen `feature_name_combiner` is a callable, it should return a Python string.r.   )r   r   r   r   rM   r0   r   r
   r1   r2   	TypeError)r   r   r   r   r   err_msgs         r#   1test_one_hot_encoder_custom_feature_name_combinerr      s    . m
<C
64.!022AGGAJ--/MY/?--cU-CM
H-}= n
=
A
A!
DCW  
y	0!!# 
1	0	0s   CC&c                     t        j                  ddgg      j                  } t               }|j	                  g dg       |j                         d   g dgk(  sJ |j                  |       j                         j                  dk(  sJ |j	                  g dg       |j                  |       j                         j                  dk(  sJ y )	Nr   r   )r   r   r   r   rK   rK   )r   r*   )r   r   r   r   r*   r   )	r   r   rM   r   
set_params
get_paramsr   r   r   )r   r8   s     r#   test_one_hot_encoder_set_paramsr      s    
1a&A	BMMl^M,==?<(\N:::A&&(..&888MMo.M/A&&(..&888r%   c                    t        d      }|j                  |       }t        dd      }|j                  |       }t        |j                         |       t	        j
                  |      r|j                  dk(  sJ |j                         S )NrJ   r   FrK   r   csr)r   r   r	   r   r   r   format)r   r   Xtr1Xtr2s       r#   check_categorical_onehotr      sq    
6
*CQD
6
?CQDDLLND)??4 T[[E%999<<>r%   r   defr   7   abcr   r   )rc   r   r   )r   r   r   )rS   rU   cat)rR   rV   r   rH   )rS   r   r   rR   r   nan)Nr   r   )rR   r   r   )Nr   N)mixednumericr   z	mixed-nanzmixed-float-nanz
mixed-Nonezmixed-None-nanzmixed-None-float-nan)idsc                 \   t        t        j                  |       d d dgf         }t        |ddgddgg       t        t        j                  |       d d ddgf         }t        |g dg dg       t	        d      j                  |       }t        |j                         g dg dg       y )	Nr   r   )r   r   r   r   r   r   r   r   rJ   r   )r   r   r   r   r   )r   r   r   r   r   )r   r   r   r	   r   r   r   )r   Xtrs     r#   test_one_hot_encoderr      s    0 #288A;q1#v#6
7CC1a&1a&)*
"288A;q1a&y#9
:CC,56
6
*
8
8
;CCKKMO_#EFr%   sparse_FTdropfirstc                     g dg dg dg}t        ||      }|j                  |      }t        j                  |t              }t        |j                  |      |       ddgddgd	dgg}t        |d
|      }|j                  |      }t        j                  |      }t        |j                  |      |       |g dg dg dg}t        || ddgddgg dg      }|j                  |      }t        j                  |t              }d |d<   t        |j                  |      |       ddgddgd	dgg}t        |ddgddgg|       }|j                  |      }t        j                  |t              }d |d<   d |d d df<   t        |j                  |      |       t        j                  g dg dg      }t        j                  d      }t        j                  t        |      5  |j                  |       d d d        y # 1 sw Y   y xY w)Nr   r   )r   r   r   r   r   rH   r   r   r   r   rJ   )r   rK   r   r   r   )6   r   8   )r   r&   rK   )r   r   r   r   )r   rK   r&   r   r   r   r   r   r   )Shape of the passed X data is not correctr.   )r   r   r   r   r   r
   inverse_transformreescaper1   r2   r3   )r&   r   r   r   r   X_trexpmsgs           r#   test_one_hot_encoder_inverser     s    
8A
gD
9CQD
((1F
#Cs,,T2C8
R1b'Ar7#A
g&t
LCQD
((1+Cs,,T2C8| ^^<!)A=

   #hhq'D	3006< Wq"g2w'!AR))

   #hhq'D	AqD	3006< 88Y	*+D
))?
@C	z	-d# 
.	-	-s   )HHz
X, X_transr   r   r   r   r   r   r   ry   rz   r{   rS   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   c                     t        |      j                  |       }d}|rt        |d      }t        j                  t
        |      5  |j                  |       ddd       y# 1 sw Y   yxY w)zCheck that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknow="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    r   zqSamples \[(\d )*\d\] can not be inverted when drop=None and handle_unknown='error' because they contain all zerosr   r.   N)r   r0   r   r1   r2   r3   r   )r   X_transr   r   r   s        r#   ?test_one_hot_encoder_inverse_transform_raise_error_with_unknownr   @  s]    & g
.
2
21
5C	A 
 $Wh7	z	-g& 
.	-	-   A""A+c                      t        j                  ddgddgddggt              } t        dd	      }|j	                  |       }t        |j                  |      |        y )
Nr_   r   ra   r   r   rH   	if_binaryFr   r   )r   r   r   r   r   r
   r   )r   oher   s      r#   &test_one_hot_encoder_inverse_if_binaryr   `  sV    
61+!}xm<FKA
[
>CQDs,,T2A6r%   )r   r   N
reset_dropc                    t        j                  ddgddgddggt              }t        | d      }|j	                  |       |j                  |      }|j                         }|j                  |	       t        |j                  |      |       t        |j                  |      |       t        |j                         |       y )
Nr_   r   ra   r   r   rH   Fr   r   )r   r   r   r   r0   r4   r   r   r
   r   r	   )r   r   r   r   r   r   s         r#   test_one_hot_encoder_drop_resetr   g  s     	61+!}xm<FKA
T
7CGGAJ==D--/MNN
N#s,,T2A6CMM!$d+s002MBr%   methodr0   r         @      @c                     t               }d}t        j                  t        |      5   t	        ||      |        d d d        y # 1 sw Y   y xY w)Nz'Expected 2D array, got 1D array insteadr.   )r   r1   r2   r3   getattr)r   r   r8   r   s       r#   test_X_is_not_1Dr   v  s;     
B
3C	z	-FA 
.	-	-s   AAc                 
   t        j                  d      }|j                  g d      }t               }dt	        |       d}t        j
                  t        |      5   t        ||       |       d d d        y # 1 sw Y   y xY w)NrQ   )   r   r*   r   z+Expected a 2-dimensional container but got z	 instead.r.   )r1   rY   Seriesr   typer2   r3   r   )r   r[   r   r8   r   s        r#   test_X_is_not_1D_pandasr     sd    			X	&B
		,A	B7Qy	
JC	z	-FA 
.	-	-s   A99BzX, cat_exp, cat_dtyper   r   r   rU   rV   )r   r   r   stringzmissing-floatzmissing-np.nan-objectzmissing-float-nan-objectc                    | | d d d   fD ]  }t        d      }|j                  |       t        |j                  t              sJ t        |j                  |      D ]w  \  }}|j                         }t        |d         rt        |d         sJ |d d |d d k(  sJ |j                         |k(  sJ t        j                  |j                  |      rwJ   y )Nr@   rJ   r   )r   r0   
isinstancecategories_listziptolistr   r   
issubdtyperI   )r   cat_exp	cat_dtypeXir   resr   res_lists           r#   test_one_hot_encoder_categoriesr     s    F !DbD'lv.#//4000COOW5HCzz|HSW%$Xb\222}CR000zz|s***==I666 6 r%   zX, X2, cats, cat_dtypedrR   rS   cint64r*   r   r   r   )NrR   z)rR   rS   r  )rR   Nr  )r   r   zobject-stringzobject-string-nonezobject-string-nanzobject-None-and-nanc                    t        |      }t        j                  g dg dg      }t        |j	                  |       j                         |       t        |j                  d         t        |d         k(  sJ |j                  d   j                         t        |d         k(  sJ |j                  d   j                  |k(  sJ t        |      }t        j                  t        d      5  |j                  |       d d d        t        ||      }t        j                  g dg dg      }t        |j                  |      j                  |      j                         |       y # 1 sw Y   jxY w)	Nr   r   r   r   r   r   r   r   r-   r.   rK   r&   )r   r   r   )r   r   r   r
   r   r   r   rK   r   r   rI   r1   r2   r3   r0   r4   )r   r7   catsr   r&   r   r   s          r#   )test_one_hot_encoder_specified_categoriesr	    s&   f 4
(C
((O_5
6Cs((+335s;q!"d47m333??1$$&$tAw-777 ??1##y000 4
(C	z)C	D 
E
4
GC
((O_5
6Cswwr{,,R088:C@	 
E	Ds   -E((E1c                     t        j                  ddggt              j                  } t	        g dg      }t        j                  g dg dg      }t        |j                  |       j                  |       j                         |       t        |j                  |       j                         |       |j                  d   j                         g dk(  sJ t        j                  |j                  d   j                  t         j                        sJ t        j                  d	d
gg      j                  } t	        g dg      }d}t        j                   t"        |      5  |j                  |        d d d        y # 1 sw Y   y xY w)NrR   rS   rH   )rS   rR   r   r   r  r  r   r   r   )r   r   r   z%Unsorted categories are not supportedr.   )r   r   r   rM   r   r
   r0   r4   r   r   r   r   r   rI   object_r1   r2   r3   )r   r   r   r   s       r#   (test_one_hot_encoder_unsorted_categoriesr    s   
3*V,..A
O#4
5C
((O_5
6Cswwqz++A.668#>s((+335s;??1$$&/999==+112::>>> 	1a&A
I;
/C
1C	z	-! 
.	-	-s   #E>>FEncoderc                 6   t        j                  dt         j                  dg      g} | |      }t        j                  ddggt              j                  }t        j                  t        d      5  |j                  |       ddd       y# 1 sw Y   yxY w)zTest encoder for specified categories that nan is at the end.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    r   r   r   rH   zNan should be the last elementr.   N)	r   r   r   r   rM   r1   r2   r3   r0   r  r  r   r   s       r#   ,test_encoder_nan_ending_specified_categoriesr    sl     HHa^$%D
T
"C
1a&(**A	z)I	J
 
K	J	Js   4BBc                     t        j                  ddgddggt              j                  } t	        g dg dg      }t        j                  g d	g d
g      }t        |j                  |       j                         |       |j                  d   j                         g dk(  sJ t        j                  |j                  d   j                  t         j                        sJ |j                  d   j                         g dk(  sJ t        j                  |j                  d   j                  t         j                        sJ y )NrR   rS   r   r   rH   r   )r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   rM   r   r
   r   r   r   r   r   rI   r  r   r   r   s      r#   7test_one_hot_encoder_specified_categories_mixed_columnsr  #  s    
3*q!f%V466A
OY#?
@C
((24RS
TCs((+335s;??1$$&/999==+112::>>>??1$$&)333==+112::>>>r%   c                      t        j                  d      } | j                  ddgddgd      }t        |      }t	        |g dg dg       y )	NrQ   rR   rS   r   r   rT   rW   rX   )r1   rY   rZ   r   r	   )r[   r\   r   s      r#   test_one_hot_encoder_pandasr  0  sF    			X	&B<<sCj1v67D
"4
(CC,56r%   zdrop, expected_namesx0_cx2_br   )r  x1_2r  )r   r   rS   x0_bx2_a)r   binarymanualc                     g dg dg}t        |       }|j                  |       |j                         }t        ||       y )N)r   r   rR   )rS   r   rS   r   )r   r0   r   r
   )r   expected_namesr   r   r   s        r#   'test_one_hot_encoder_feature_names_dropr  9  s;     
&A
T
"CGGAJ--/M~}5r%   c                     ddgddgddgg} t        j                  g dg dg dg      }t        j                  d d	g      }t        d
d      }|j                  |       }t	        |j
                  |       t        ||       ddgddgddgg} t        j                  ddgddgddgg      }t        j                  d	d g      }t        d
d      }|j                  |       }t	        |j
                  |       t        ||       y )Nrc   yes   nori   )r   r   r   r   rB   )r   r   r   r   r   r   Fr   truerR   falser   r   )r   r   r   r   r
   	drop_idx_r	   )r   expectedexpected_drop_idxr   results        r#   *test_one_hot_encoder_drop_equals_if_binaryr*  K  s   
er4j2u+.Axx	35IJH $+
[
>Cq!Fs}}&78FH% ###7Axx#sc3Z#s<=H!T+
[
>Cq!Fs}}&78FH%r%   )rc   r   r   )r"  r   r   )r   r   r   c                     t               }t        j                  g dg dgd      }t        |j	                  |       |j                  d             t        d      }t        |j	                  |       |       y )Nr   r   r   r   r   r   r  rH   float64)r   r   r   r
   r   astyper  s      r#   test_ordinal_encoderr0  c  s^     
C
((Iy)
9Cs((+SZZ	-BC
w
'Cs((+S1r%   )r   r   zobject-string-catc                    t        |      }t        j                  dgdgg      }t        |j	                  |       |       t        |j                  d         t        |d         k(  sJ |j                  d   j                         t        |d         k(  sJ |j                  d   j                  |k(  sJ t        |      }t        j                  t        d      5  |j                  |       d d d        y # 1 sw Y   y xY w)Nr   r   r   r   r-   r.   )r   r   r   r
   r   r   rK   r   r   rI   r1   r2   r3   r0   )r   r7   r  r   r   r   s         r#   )test_ordinal_encoder_specified_categoriesr2  t  s    2 D
)C
((SEC5>
"Cs((+S1q!"d47m333??1$$&$tAw-777 ??1##y000 D
)C	z)C	D 
E	D	Ds   C88Dc                     g dg dg} t               }|j                  |       }t        j                  | t              }t        |j                  |      |       t        j                  g dg dg      }t        j                  d      }t        j                  t        |      5  |j                  |       d d d        y # 1 sw Y   y xY w)Nr   r   rH   )r   r   r   r   rW   r   r.   )r   r   r   r   r   r
   r   r   r   r1   r2   r3   )r   r   r   r   r   s        r#   test_ordinal_encoder_inverser4    s    	(A

CQD
((1F
#Cs,,T2C8 88\<01D
))?
@C	z	-d# 
.	-	-s   %C  C	c                     t        dd      } t        j                  ddgddgdd	ggt        
      }t        j                  ddgddgddggt        
      }| j	                  |       | j                  |      }t        j                  ddgddgddggd
      }t        ||       | j                  |      }t        j                  dd gd dgddggt        
      }t        ||       y )Nuse_encoded_valuer&   unknown_valuerR   xrS   yr   r  rH   xyblar   r   r   r  )r   r   r   r   r0   r4   r
   r   )r   X_fitr   X_trans_encr   X_trans_invinv_exps          r#   +test_ordinal_encoder_handle_unknowns_stringrB    s    
(;2
NCHHsCj3*sCj9HEhhdeS\C:>fMGGGEN--(K
((QGb!Wq!f-W
=C{C(''4KhhddC[3*=VLG{G,r%   rI   c                    t        dd      }t        j                  ddgddgdd	gg| 
      }t        j                  ddgddgddgg| 
      }|j                  |       |j	                  |      }t        j                  ddgddgddggd
      }t        ||       |j                  |      }t        j                  dd gd dgddggt        
      }t        ||       y )Nr6  r8  r      r      r   	   rH   rf      r   r  )r   r   r   r0   r4   r
   r   r   )rI   r   r>  r   r?  r   r@  rA  s           r#   ,test_ordinal_encoder_handle_unknowns_numericrI    s    
(;4
PCHHq!fq!fq!f-U;EhhB"a1a&1?GGGEN--(K
((QIay1a&1
AC{C(''4KhhD	D!9q!f5VDG{G,r%   c                      t        dt        j                        } t        j                  dgdgdgg      }| j	                  |       | j                  dgdgdgg      }t        |dgdgt        j                  gg       y )Nr6  r8  r   r   r   r*   r   )r   r   r   r   r0   r4   r
   )r   r>  r   s      r#   (test_ordinal_encoder_handle_unknowns_nanrK    so     (;266
RCHHqcA3_%EGGENmmaS1#sO,Gw!qcBFF8 45r%   c                      t        dt        j                  t              } t        j                  dgdgdgg      }t        j                  t        d      5  | j                  |       d d d        y # 1 sw Y   y xY w)Nr6  )r&   r9  rI   r   r   r   z'dtype parameter should be a float dtyper.   )	r   r   r   intr   r1   r2   r3   r0   )r   r>  s     r#   8test_ordinal_encoder_handle_unknowns_nan_non_float_dtyperN    s\     *"&&C HHqcA3_%E	z)R	S 
T	S	Ss   A22A;c                      t        j                  g dgt              j                  } g d}t	        |      }d}t        j                  t        |      5  |j                  |        d d d        y # 1 sw Y   y xY w)N)LowMediumHighrQ  rP  rH   )rP  rQ  rR  r   z*Shape mismatch: if categories is an array,r.   )	r   r   r   rM   r   r1   r2   r3   r0   )r   r  r   r   s       r#   +test_ordinal_encoder_raise_categories_shaperS    sU    
<=VLNNA$D
D
)C
6C	z	-
 
.	-	-s   A11A:c            	         t        d      } t        j                  g dg dgd      }t        j                  ddgd	d
ggd      t        j                  ddgd	d
ggd      t        j                  ddgddgg      t        j                  ddgddgg      t        j                  ddgd	dggd      fD ]  }| j                  |       t	        t        d      D cg c](  }| j                  |   j                  |j                  k(  * c}      sJ t        | j                  |      j                         |        ddgd	d
gg}| j                  |       t	        t        d      D cg c]=  }t        j                  | j                  |   j                  t        j                        ? c}      sJ t        | j                  |      j                         |       ddgd	dgg}| j                  |       t	        t        d      D cg c]  }| j                  |   j                  dk(    c}      sJ t        | j                  |      j                         |       y c c}w c c}w c c}w )NrJ   r   )r   r   r   r   )r   r   r   r   r.  rH   r   r   r   r*   r  rR   rS   r   r      a   b   c   dr   )r   r   r   r0   allranger   rI   r
   r4   r   r   integer)r   r   r   is       r#   test_encoder_dtypesr]    s   
6
*C
(((*>?y
QC 	1a&1a&!1
1a&1a&!3
3*sCj)*
4,t-.
1c(QH%X6 	
qJACOOA&,,7JKKK3==+335s; Q!QAGGAJUSTXVXcooa066

CXVWWWs}}Q'//137
SAs8AGGAJeAhGh"((H4hGHHHs}}Q'//137 K
 W
 Hs   -I
&AI #I%c                  B   t        j                  d      } t        d      }t        j                  g dg dgd      }| j                  dd	gd
dgddgdd      }|j                  |       t        t        d	      D cg c]  }|j                  |   j                  dk(    c}      sJ t        |j                  |      j                         |       | j                  dd	gddgddgd      }|d   j                  |d   j                  |d   j                  g}|j                  |       t        t        d
      D cg c]!  }|j                  |   j                  ||   k(  # c}      sJ t        |j                  |      j                         |       y c c}w c c}w )NrQ   rJ   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r.  rH   r   r   r   r*   r   r   rU   rV   Cr  rR   rS   r   r   rU   rV   r`  )r1   rY   r   r   r   rZ   r0   rY  rZ  r   rI   r
   r4   r   )r[   r   r   r   r\  X_types         r#   test_encoder_dtypes_pandasrb    sj   			X	&B
6
*C
((	')GHC
 	Aq6AaV<GLAGGAJU1XFX"((G3XFGGGs}}Q'//137
Aq6c
#sDEAfllAcFLL!C&,,7FGGAJuQxHx!"((F1I5xHIIIs}}Q'//137 G Is   ?#F>&Fc                  |    t               } ddgddgg}t        j                  j                  | j                  |       y )Nr_   r   ra   r   )r   r   testingassert_no_warningsr   )r   r   s     r#   test_one_hot_encoder_warningrf    s5    
/C
!xm$AJJ!!#"3"3Q7r%   missing_valuec                    dddd| g}t        |      }g dg ddddd| gg}|j                  |      j                         }g dg d	g d
g}t        ||       |j                  |u sJ t        |j                  |j                        D cg c]
  \  }}||    }}}|j                  |      }	t        j                  |t              }
t        |d         rt        |d d |d d        t        |d         sJ t        |d         sJ t        |
d d d df   |	d d d df          t        |
dd df   |	dd df          t        |
d         sJ t        |	d         sJ y t        ||       t        |
|	       y c c}}w )Nr   rf   r   r   r   )r   rf   r   r   rR   )r   rf   r   r   rR   )r   r   r   r   r   )r   r   r   r   r   r   rH   r@   )r@   r@   )r   r   r   r
   r   r   r   r&  r   r   r   r   r   )rg  cats_to_dropr   r   transr   r   r   dropped_catsX_inv_transX_arrays              r#    test_one_hot_encoder_drop_manualrn  "  s   2q"m4L
\
*C	Ar=)	A
 a ((*EO_
=Cuc"88|### *-S__cmm)L)LgG)L   ''.Khhq'G \"%&<,l3B.?@\"-...\"-...71crc6?K3B3,?@ 	72ss7+[SbS-ABWV_---[0111<67K0)s   E;)r   r   rb   rR   c                     t        |       }d}t        j                  t        |      5  |j	                  g dg dg dg       d d d        y # 1 sw Y   y xY w)Nr   z-`drop` should have length equal to the numberr.   r   r   )r   r   ;   )r   r1   r2   r3   r0   )r   r   r   s      r#   test_invalid_drop_lengthrq  G  s>    
T
"C=G	z	1@A 
2	1	1s   AAdensityr   denserR   r   rS   r  c                    t        |       }t        | |      }g dg dg}|j                  |       |j                  |       t        |j                  |j                         |dk(  rt        |j                  d       n=t        ||j                  |j                        D ]  \  }}}|t        |         |k(  rJ  t        |j                  t        j                        sJ |j                  j                  t        k(  sJ y )Nr   r   )r   r   rR   rt  r   r   )r   r0   r
   r   r&  r   rM  r   r   ndarrayrI   r   )rr  r   ohe_baseohe_testr   drop_catdrop_idxcat_lists           r#   test_categoriesr|  O  s     73H7>H	&ALLOLLOx++X-A-ABw8--q1,/($$h&:&:-
(Hh CM*h666-
 h(("**555##v---r%   c                 <    d |        j                         d   v sJ y )NcategoricalX_types)	_get_tags)r  s    r#   "test_encoders_has_categorical_tagsr  c  s     GI//1)<<<<r%   kwargsmax_categoriesmin_frequency   g(\?r   )r  r  rf   rK   rJ   rR   rS   r   r   c                 .   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        d|d	d
d| j	                  |      }t        |j                  g dg       dgdgdgdgdgg}t        j                  ddgddgddgddgddgg      }|j                  |      }t        ||       dgdgdz  z   D cg c]  }|g }}|j                  |      }	t        ||	       |j                         }
t        ddg|
       yc c}w )zpTest that different parameters for combine 'a', 'c', and 'd' into
    the infrequent category works as expected.rR   r   rS   r"  r   rc   r   r   r(   F)rK   r&   r   rR   r   r   er   r   infrequent_sklearnr*   r  x0_infrequent_sklearnNr   r   r   rM   r   r0   r
   infrequent_categories_r4   r	   r   r   )r  rK   X_trainr   X_testr'  r   colexpected_invX_invr   s              r#   test_ohe_infrequent_two_levelsr  h  sL    hh	SEBJ.#;seaiGHIKKG
 , 	
 
c'l  s11O3DEecUSEC53%0Fxx!Q!Q!Q!Q!Q@AHmmF#GHg&&)U.B-Ca-G%GH%GcSE%GLH!!'*E|U+--/M 78-H Is   

Dc                    t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        d	d
d|       j	                  |      }|j
                  d   |j                  d      dk(  sJ t        j                  dgdgg      }|j                  |      }t        dgdgg|       |j                         }t        dg|       |j                  |      }t        dgdgg|       y)z3Test two levels and dropping the frequent category.rR   r   rS   r"  r   rc   r   r   r(   Fr   r&   r   r  r   r   r   r  r  N)r   r   rM   r   r0   r   r&  r4   r	   r   r
   r   )r   r  r   r  r   r   	X_inverses          r#   ,test_ohe_infrequent_two_levels_drop_frequentr    s    hh	SEBJ.#;seaiGHIKKG
,	
 
c'l  ??1cmmA./3666XXusen%FmmF#GaS1#J(--/M/0-@%%g.I 456	Br%   c                 (   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        d	d
d|       }d| d   d}t	        j
                  t        |      5  |j                  |       ddd       y# 1 sw Y   yxY w)z_Test two levels and dropping any infrequent category removes the
    whole infrequent category.rR   r   rS   r"  r   rc   r   r   r(   Fr   r  Unable to drop category r   ( from feature 0 because it is infrequentr.   Nr   r   rM   r   r1   r2   r3   r0   r   r  r   r   s       r#   5test_ohe_infrequent_two_levels_drop_infrequent_errorsr    s    
 hh	SEBJ.#;seaiGHIKKG
,	C %T!WK/W
XC	z	- 
.	-	-   -BBrG  gQ?g{Gz?rF  c                 
   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        dd	d
d| j	                  |      }t        |j                  ddgg       dgdgdgdgdgg}t        j                  g dg dg dg dg dg      }|j                  |      }t        ||       dgdgdgdgdgg}|j                  |      }t        ||       |j                         }t        g d|       y)zkTest that different parameters for combing 'a', and 'd' into
    the infrequent category works as expected.rR   r   rS   r"  r   rc   r   r   r(   Fr&   r   r  r-  r   r   r   r,  r  )r  r  r  Nr   r  )	r  r  r   r  r'  r   r  r  r   s	            r#    test_ohe_infrequent_three_levelsr    s'     hh	SEBJ.#;seaiGHIKKG
 ,EEK	c'l  s11S#J<@ecUSEC53%0FxxIy)YOPHmmF#GHg& 
				L !!'*E|U+--/M@-Pr%   c                 $   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        d	d
d|       j	                  |      }t        j                  dgdgdgg      }t        ddgddgddgg|j                  |             |j                  d      j	                  |       d}t        j                  t        |      5  |j                  dgdgg      }ddd       t        ddgddgg       y# 1 sw Y   xY w)z5Test three levels and dropping the frequent category.rR   r   rS   r"  r   rc   r   r   r(   Fr  r   r   r'   r,   r-   r.   r  N)r   r   rM   r   r0   r	   r4   r   r1   warnsUserWarning)r   r  r   r  r   r   s         r#   .test_ohe_infrequent_three_levels_drop_frequentr    s    hh	SEBJ.#;seaiGHIKKG
,	
 
c'l  XXusecU+,FaVaVaV,cmmF.CD NN(N+//8
$C	k	---#/ 
. aVaV$g. 
.	-s   DDc                 (   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        d	d
d|       }d| d   d}t	        j
                  t        |      5  |j                  |       ddd       y# 1 sw Y   yxY w)z7Test three levels and dropping the infrequent category.rR   r   rS   r"  r   rc   r   r   r(   Fr  r  r   r  r.   Nr  r  s       r#   7test_ohe_infrequent_three_levels_drop_infrequent_errorsr    s     hh	SEBJ.#;seaiGHIKKG
,	C %T!WK/W
XC	z	- 
.	-	-r  c                      t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  } t        d	d
d      j	                  |       }t        |j                  ddgg       dgdgdgdgg}t        j                  g dg dg dg dg      }|j                  |      }t        ||       dgg}d}t        j                  t        |      5  |j                  |       ddd       y# 1 sw Y   yxY w)zmTest that different parameters for combining 'a', and 'd' into
    the infrequent category works as expected.rR   r   rS   r"  r   rc   r   r   r+   F)r&   r   r  r-  r  r,  badz.Found unknown categories \['bad'\] in column 0r.   N)r   r   rM   r   r0   r
   r  r4   r	   r1   r2   r3   )r  r   r  r'  r   r   s         r#   (test_ohe_infrequent_handle_unknown_errorr  
  s     hh	SEBJ.#;seaiGHIKKG
eA	c'l  s11S#J<@ ecUSEC5)FxxIy)DEHmmF#GHg& gYF
;C	z	-f 
.	-	-s   C44C=c                    t        j                  dgdz  dgdz  z   gt              j                  }t	        dg dgddd	| j                  |      }dgd
gdgdgdgg}t        j                  ddgddgddgddgddgg      }|j                  |      }t        ||       dddgg}dgdgg}|D ]B  }|j                  |      j                  |       t        dgdgg|j                  |             D y)zG'a' is the only frequent category, all other categories are infrequent.rR   r   r  ri   rH   r   r   rR   rS   Fr(   rK   r   r&   rS   r   r   r   r   r   r   r   Nr   )	r   r   r   rM   r   r0   r4   r	   r   )r  r  r   r  r'  r   dropsr   s           r#   5test_ohe_infrequent_two_levels_user_cats_one_frequentr  "  s    hh	SEBJ./v>@@G
 (), 	
 
c'l  ecUSEC53%0Fxx!Q!Q!Q!Q!Q@AHmmF#GHg& kC5)EecU^FD!%%g.!qc
CMM&$9: r%   c                     t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   gt        	      j                  } t	        g d
gddd      j                  |       }t        |j                  g dg       dgdgdgdgdgg}t        j                  ddgddgddgddgddgg      }|j                  |      }t        ||       dgdgdz  z   D cg c]  }|g }}|j                  |      }t        ||       yc c}w )zFTest that the order of the categories provided by a user is respected.rR   r   rS   r"  r   rc   r   r   rH   r  Fr(   r   rK   r   r&   r  )r   r   rR   r  r   r   r  r*   Nr   r   r   rM   r   r0   r
   r  r4   r	   r   )r  r   r  r'  r   r  r  r  s           r#   (test_ohe_infrequent_two_levels_user_catsr  >  s.   hh
cURZ	3%"*	,uqy	89a  (),	
 
c'l  s11O3DEecUSEC53%0Fxx!Q!Q!Q!Q!Q@AHmmF#GHg& '*U.B-Ca-G%GH%GcSE%GLH!!'*E|U+ Is   
C=c                     t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   gt        	      j                  } t	        g d
gddd      j                  |       }t        |j                  ddgg       dgdgdgdgdgg}t        j                  g dg dg dg dg dg      }|j                  |      }t        ||       dgdgdgdgdgg}|j                  |      }t        ||       y)zTest that the order of the categories provided by a user is respected.
    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.rR   r   rS   r"  r   rc   r   r   rH   r   r   rS   rR   Fr(   r  r  r,  r  r-  r  Nr  )r  r   r  r'  r   r  r  s          r#   *test_ohe_infrequent_three_levels_user_catsr  Y  s   
 hh
cURZ	3%"*	,uqy	89a  (),	
 
c'l  s11S#J<@ecUSEC53%0FxxIy)YOPHmmF#GHg&
 
				L !!'*E|U+r%   c                      t         j                  g dg df   } t        ddd      }|j                  |        ddgddgg}|j	                  |      }t        |g d	g d
g       y)zaTest infrequent categories where feature 0 has infrequent categories,
    and feature 1 does not.	r   r   r   r   r   r   r   r   r   	r   r   r   r   r   r   r   r   r   r   r   F)r  r   r   r   r   r   r   r   r   )r   r   r   r   N)r   c_r   r0   r4   r	   )r   r   r  r   s       r#   test_ohe_infrequent_mixedr  }  sc     	)+FFGA
q{%
PCGGAJ!fq!fFmmF#G GlL9:r%   c            
      b   t         j                  g dg dg df   } t        ddd      }|j                  |       j	                         }t        |j                  d   d	d
g       t        |j                  d	   d	dg       t        |j                  d
   d       |j                         }t        g d|       g dg dg dg dg dg dg dg dg dg	}t        ||       g dg dg}|j                  |      }g dg dg}t        ||j	                                |j                  |      }t        j                  g dg dgt              }t        ||       t        ddd      j                  |       }t        j                  t         d      5  |j                  |       ddd       g d g d!g}|j                  |      }g d"g dg}t        ||j	                                |j                  |      }t        j                  g d#g d$gt              }t        ||       y# 1 sw Y   xY w)%z?Test infrequent categories with feature matrix with 3 features.r  )	r   r   r   r   r   rc   r   r   r   )	r   r   r   r   r   r   r   r   r   rJ   r   r(   rK   r  r&   r   r   r   rc   N)x0_0x0_3r  x1_0x1_5x1_infrequent_sklearnx2_0x2_1)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   )r*   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r  N)r  r   NrH   r+   r-   r.   )r   r   r   )r   rc   r   )r   r   r   r   r   r   r   r   )r  r  r   )r   r  r   )r   r  r   r   r   r
   r  r   r	   r4   r   r   r   r0   r1   r2   r3   )	r   r   r   r   r'  r  X_test_transr  r  s	            r#   'test_ohe_infrequent_multiple_categoriesr    s    	#$#	%	A !<QC "**,Gs11!4q!f=s11!4q"g>s11!4d;
 --/M		
 	 	!        
H Hg&#F==(L )*BCHHl2245!!,/E88	(*IJRXL |U+ !G	c!f  
z)C	Df 
E $F==(L(*BCHHl2245!!,/E88	8:VWL |U+! 
E	Ds   H%%H.c            
         t        j                  d      } | j                  g dg ddddg      }t        dd	d
      }|j	                  |      j                         }t        |j                  d   ddg       t        |j                  d   g d       g dg dg dg dg dg dg dg dg dg	}t        ||       | j                  ddgddgdddg      }g dg dg}|j                  |      }t        ||j                                |j                  |      }t        j                  ddgddggt              }t        ||       | j                  ddgddgdddg      }|j                  |      j                         }g dg dg}t        ||       |j                  |      }t        j                  ddgddggt              }t        ||       y)zHTest infrequent categories with a pandas dataframe with multiple dtypes.rQ   	rR   fr   r  r  rR   r   rS   rS   	r   r   r   rc   rc   rf   r   r   r   )strrM  r  rM  columnsrJ   r   r(   r  r   rR   rS   r   r   r   rf   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r     rf   r  rH   r   r   N)r1   rY   rZ   r   r   r   r
   r  r	   r4   r   r   r   r   )	r[   r   r   r   r'  r  r  r  r  s	            r#   .test_ohe_infrequent_multiple_categories_dtypesr    s    
		X	&B
@1	
  	 	A !<QC "**,Gs11!4sCjAs11!4jA 	
H Hg&\\3*b"X>PU\WF"$67H==(LHl2245!!,/E88
 4	5=Q7RSL |U+ \\3*b!W=u~\VF==(002L"$67HHl+!!,/E88
#	$';Q&?@L |U+r%   rh   )r  r  c                     t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        dd	d
d| }|j	                  |       |j                  dgg      }t        |dgg       y),All user provided categories are infrequent.rR   r   rS   r"  r   rc   r   r   r(   Fr  r   Nr   )r   r   rM   r   r0   r4   r	   r  r  r   r   s       r#   $test_ohe_infrequent_one_level_errorsr  +  s     hh	SEBJ.#;seaiGHIKKG
 ,EEKC GGGmmcUG$GGqcU#r%   c                     t        j                  dgdz  gt              j                  }t	        dg dgddd| j                  |      }|j                  dgdgg      }t        |d	gd	gg       y
)r  r  r   rH   r  Fr(   r  rR   r   Nr   )r   r   r   rM   r   r0   r4   r	   r  s       r#   5test_ohe_infrequent_user_cats_unknown_training_errorsr  9  s     hh	{&133G
 (), 	
 
c'l  mmcUSEN+GGqcA3Z(r%   zinput_dtype, category_dtype)OOOUUOUUSOSUSS
array_type)r   r   	dataframec                    t        j                  dgdgg|       }t        j                  ddg|      g}t        |d      j                  |      }t	        dgdgdgdgg||       }|j                  |      }t        j                  ddgddgddgddgg      }t        ||       t        |      j                  |      }	|	j                  |      }t        j                  dgdgdgdgg      }t        ||       y	)
a"  Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    rS   rR   rH   Fr   r   r   r   N)	r   r   r   r0   r   r4   r	   r   r
   )
rF   category_dtyper  r   rK   r   r  r   r'  oes
             r#   test_encoders_string_categoriesr  J  s     	3%#{3A((C:^<=J
:U
C
G
G
JC
use$jF mmF#Gxx!Q!Q!Q!Q89HGX&	:	.	2	21	5Bll6"Gxx!qcA3,-Hw)r%   c                  4   t        j                  dgdggd      } t        j                  ddgd      g}t        |d      }t        j                  d      }t        j                  t        |	      5  |j                  |        d
d
d
       y
# 1 sw Y   y
xY w)zCheck that this mixture of predefined categories and X raises an error.

    Categories defined as bytes can not easily be compared to data that is
    a string.
    rS   rR   UrH   SFr   zjIn column 0, the predefined categories have type 'bytes' which is incompatible with values of type 'str_'.r.   N)	r   r   r   r   r   r1   r2   r3   r0   )r   rK   r   r   s       r#   $test_mixed_string_bytes_categoricalsr  i  sy     	3%#s+A((C:S12J
:U
CC
))	'C
 
z	-
 
.	-	-s   3BBc                     t        j                  dd| d| ggt              j                  }t	        dd      j                  |      }|j                         }t        |ddd	|  g       y )
NrR   rS   rH   Fr'   r   r&   x0_ar  x0_)r   r   r   rM   r   r0   r   r
   )rg  r   r   namess       r#   )test_ohe_missing_values_get_feature_namesr  ~  se     	3]C?@OQQA
eH
E
I
I!
LC%%'Euvv]O/DEFr%   c            	      (   t        j                  d      } | j                  g dt        j                  dddt        j
                  gt              ddd	g
      }t        j                  g dg dg dg dg      }t        |      }t        ||       y )NrQ   )dogr   Nr   r   r   r*   rH   )col1col2r  r  r  )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )	r1   rY   rZ   r   r   r   floatr   r	   )r[   dfexpected_df_transr   s       r#   %test_ohe_missing_value_support_pandasr    s    			X	&B	/HHaArvv.e<	
   
 
B !!!!		
 #2
&CC*+r%   pd_nan_typepd.NAznp.nanc           
         t        j                  d      }| dk(  r|j                  nt        j                  }|j                  d|j                  dd|ddgd      i      }t        j                  g d	g d
g dg dg d
g      }t        d|      }|j                  |      }t        ||       t        |j                        dk(  sJ t        |j                  d   d d g d       t        j                  |j                  d   d         sJ y )NrQ   r  r  r   rR   rS   r   rH   )r   r   r   r   )r   r   r   r   )r   r   r   r   r  Fr  r   r   r@   r   )r1   rY   NAr   r   rZ   r   r   r   r   r	   lenr   r
   isnan)r  r&   r[   pd_missing_valuer  r  r   df_transs           r#   1test_ohe_missing_value_support_pandas_categoricalr     s     
		X	&B +w 6ruuBFF	BIIsC)93DJIW	

B
 	
 eN
KC  $H%x0s1$$$sq)#2.@88COOA&r*+++r%   c                    ddgddgddgg}t        dd|       }|j                  |      }t        j                  g d	g d
g dg      }t	        ||       ddgg}t        j                  g d	g      }d}t        j                  t        |      5  |j                  |      }ddd       t	        ||       |j                  |      }t        |t        j                  ddggt                     y# 1 sw Y   OxY w)zZCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during transform.rR   r   rS   r   r   r   Fr   r   r&   r   r   )r   r   r   r   r   tFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as all zerosr.   NrH   r   r   r   r   r	   r1   r  r  r4   r   r
   r   r&   r   r   r   rN   r  warn_msgr  s           r#   /test_ohe_drop_first_handle_unknown_ignore_warnsr    s     qC8c1X&A
E.C "G	
J GZ( AhZF9+&J	 
 
k	2--' 
3GZ( !!*-Eubhhaz@A 
3	2   C//C8c                    ddgddgddgg}t        dd|       }|j                  |      }t        j                  g d	g d
g dg      }t	        ||       ddgg}t        j                  g dg      }d}t        j                  t        |      5  |j                  |      }ddd       t	        ||       |j                  |      }t        |t        j                  ddggt                     y# 1 sw Y   OxY w)zDCheck drop='if_binary' and handle_unknown='ignore' during transform.rR   r   rS   r   r   r   Fr  r  r   rW   r   r   )r   r   r   r   r  r.   NrH   r  r  s           r#   3test_ohe_drop_if_binary_handle_unknown_ignore_warnsr
    s     qC8c1X&A
nC "G	
J GZ( AhZF<.)J	 
 
k	2--' 
3GZ( !!*-Eubhhd}FCD 
3	2r  c                 >   ddgddgddgg}t        dd| ddgddgg      }|j                  |       d	dgg}t        j                  ddgg      }d
}t	        j
                  t        |      5  |j                  |      }ddd       t        |       y# 1 sw Y   xY w)znCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during fit with categories passed in.rR   r   rS   r   r   r   F)r   r   r&   rK   r   zqFound unknown categories in columns \[0\] during transform. These unknown categories will be encoded as all zerosr.   N)	r   r0   r   r   r1   r  r  r4   r	   )r&   r   r   r  rN   r  r   s          r#   'test_ohe_drop_first_explicit_categoriesr  	  s    
 qC8c1X&A
%#JA'	C GGAJAhZFAq6(#J	A  
k	2--' 
3GZ( 
3	2s   ,BBc                     t        j                  d      } | j                  g dg ddddg      }t        d	      }|j	                  d
       d}t        j
                  t        |      5  |j                  |       ddd       |j                  |       t        j
                  t        |      5  |j                  |       ddd       y# 1 sw Y   PxY w# 1 sw Y   yxY w)zJRaise informative error message when pandas output and sparse_output=True.rQ   r  )r  rS   rS   )rR   rS   rR   rS   r  Tr   r4   zxPandas output does not support sparse data. Set sparse_output=False to output pandas dataframes or disable Pandas outputr.   N)
r1   rY   rZ   r   
set_outputr2   r3   r   r0   r4   )r[   r  r   r   s       r#   'test_ohe_more_informative_error_messager  $  s    			X	&B	IO<sCj	QB
d
+CNNXN&	S  
z	-" 
. GGBK	z	-b 
.	-	 
.	- 
.	-s   -C3CCC#c                  D   t        j                  t         j                  dddgg      j                  } t	        t         j
                        }dt         j
                   }t        j                  t        |      5  |j                  |        ddd       y# 1 sw Y   yxY w)zDTest ordinal encoder with nan passthrough fails when dtype=np.int32.r   r   rH   zdThere are missing values in features \[0\]. For OrdinalEncoder to encode missing values with dtype: r.   N)
r   r   r   rM   r   int32r1   r2   r3   r0   )r   r  r   s      r#   Btest_ordinal_encoder_passthrough_missing_values_float_errors_dtyper  8  su     	2663S)*+--A	bhh	'B	002z	;  
z	-
q	 
.	-	-s   ;BBencoded_missing_valuer7  c                    t        j                  t         j                  dddggt         j                        j                  }t        |       j                  |      }t        |j                        dk(  sJ t        |j                  d   ddt         j                  g       |j                  |      }t        || gdgdgdgg       |j                  |      }t        ||       y)	z.Test ordinal encoder with nan on float dtypes.r   r   rH   r  r   r   r   N)r   r   r   r.  rM   r   r0   r  r   r	   r4   r   )r  r   r  r   r  s        r#   5test_ordinal_encoder_passthrough_missing_values_floatr  F  s     	2663S)*"**=??A	.C	D	H	H	KBr~~!###BNN1%S"&&'9:ll1oGG45usecUKL$$W-IIq!r%   c           
         t        j                  d      }| dk(  r|j                  nt        j                  }|j                  d|j                  dd|ddgd      i      }t        |	      j                  |      }t        |j                        d
k(  sJ t        |j                  d   dd g d       t        j                  |j                  d   d         sJ |j                  |      }t        |dgdg|gdgdgg       |j                  |      }|j                   dk(  sJ t        |dddf   ddg       t        |dddf   ddg       t        j                  |d         sJ y)z0Check ordinal encoder is compatible with pandas.rQ   r  r  r   rR   rS   r   rH   r  r   r   Nr   r   r@          @r   r   )r   r   r   r   )r1   rY   r  r   r   rZ   r   r   r0   r  r   r
   r  r4   r	   r   r   )r  r  r[   r  r  r  r  r  s           r#   =test_ordinal_encoder_missing_value_support_pandas_categoricalr  X  s`    
		X	&B +w 6ruuBFF	BIIsC)93DJIW	

B 
.C	D	H	H	LBr~~!###r~~a(!,o>88BNN1%b)***||BHHuse.C-DsecUST$$X.I??f$$$y!Q'#s4yQ'#s488IdO$$$r%   r  )zobject-None-missing-valuezobject-nan-missing_valueznumeric-missing-valuec                 v   t        |      }t        j                  dgt        j                  gg      }t	        |j                  |       |       |j                  d   j                  |k(  sJ t        |      }t        j                  t        d      5  |j                  |       ddd       y# 1 sw Y   yxY w)z.Test ordinal encoder for specified categories.r   r   r   r-   r.   N)r   r   r   r   r
   r   r   rI   r1   r2   r3   r0   )r   r7   r  r   r  r   s         r#   =test_ordinal_encoder_specified_categories_missing_passthroughr  y  s    L 
4	(B
((SEBFF8$
%Cr''*C0 >>!""i/// 
4	(B	z)C	D
r
 
E	D	Ds   B//B8c                 $   t        j                  g dt              g} | |      }t        j                  ddggt              j                  }t	        j
                  t        d      5  |j                  |       ddd       y# 1 sw Y   yxY w)	zTest encoder for specified categories have duplicate values.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    )rR   rS   rR   rH   r   rR   rS   z5the predefined categories contain duplicate elements.r.   N)r   r   r   rM   r1   r2   r3   r0   r  s       r#   +test_encoder_duplicate_specified_categoriesr    sl     HH_F34D
T
"C
3*V,..A	Q
 	

 
 
s   +BBzX, expected_X_trans, X_testr   r   )r   r   r   )r   r  r   r   )r   rR   rS   )r  r   r   c                     t        dd      }|j                  |       }t        ||       t        |j                  |      dgg       y)z>Test the interaction between missing values and handle_unknownr6  r@   r8  g      N)r   r   r	   r4   )r   expected_X_transr  r  r   s        r#   /test_ordinal_encoder_handle_missing_and_unknownr!    sC    8 
':"	MBq!GG-.BLL(D6(3r%   csr_containerc                    t        j                  g dg dg      } | |      }t               }d}t        j                  t
        |      5  |j                  |       ddd       t        j                  t
        |      5  |j                  |       ddd       |j                  |      } | |      }t        j                  t
        |      5  |j                  |       ddd       y# 1 sw Y   xY w# 1 sw Y   dxY w# 1 sw Y   yxY w)zCheck that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    r   r   z2Sparse data was passed, but dense data is requiredr.   N)	r   r   r   r1   r2   r   r0   r   r   )r"  r   X_sparseencoderr   r   r!   s          r#   test_ordinal_encoder_sparser&    s     	)Y'(AQHGBG	y	0H 
1	y	0h' 
1 ##A&G"7+N	y	0!!.1 
1	0 
1	0	0	0
 
1	0s$   C) C5D)C25C>D
c                  B   t        j                  g d      ddt         j                  f   } t        g dgdd      }|j	                  |        t        g dgd      }t        j                  t        d	
      5  |j	                  |        ddd       y# 1 sw Y   yxY w)zCheck OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    )r   r   r   r   r   r   N)r@   r   r   r6  rD  )rK   r&   r9  r+   r  r-   r.   )r   r   newaxisr   r0   r1   r2   r3   )r   r  s     r#   -test_ordinal_encoder_fit_with_unseen_categoryr)    sw     	#$Q

]3A	<0CSW
B FF1I	J<	HB	z)C	D
q	 
E	D	Ds   :BBr  AAOr  r  c                     t        dd      }|j                  |        |j                  |      }t        |ddgg       y)zChecks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r6  ir8  r   N)r   r0   r4   r	   )r  r  r   r   s       r#   1test_ordinal_encoder_handle_unknown_string_dtypesr-  	  s;    * (;2
NCGGGmmF#GGr1gY'r%   c                  8   t        j                  g d      j                  dd      } t               j	                  |       }t        |j                  t        j                  | d      j                         |j                  |       }t        |dgdgdgdgg       y)	zCheck that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    )l   	HP
1& l   	H]viel   	 :?i}Ga l   IRK2e6kr@   r   r   )axisr   r   N)
r   r   rC   r   r0   r
   r   sortrM   r4   )r   r%  r   s      r#   #test_ordinal_encoder_python_integerr1  %  s     		
	 gb!n  ""1%Gw**BGGAA,>,@,@A"Gw!qcA3 45r%   c                      t        j                  d      } g d}| j                  g dg|      }t               j	                  |      }|j                         }t        ||       y)z-Check feature names out is same as the input.rQ   )rS   r   rR   r  r  N)r1   rY   rZ   r   r0   r   r
   )r[   r  r   r   feature_names_outs        r#   .test_ordinal_encoder_features_names_out_pandasr4  9  sX    			X	&BE
i[%0A



q
!C113u/0r%   c                  &   t        j                  dgdgt         j                  ggt              } t	        dt         j                  d      j                  |       }|j                  |       }t        |dgdgdgg       t        j                  d	gt         j                  ggt              }|j                  |      }t        |t         j                  gdgg       |j                  |      }|d   d   J t        j                  |d   d         sJ y
)zECheck interactions between encode_unknown and missing value encoding.rR   rS   rH   r6  r&   r9  r  r   r   r   N)
r   r   r   r   r   r0   r4   r	   r   r  )r   r  r   r  r  X_roundtrips         r#   0test_ordinal_encoder_unknown_missing_interactionr9  E  s     	3%#)8A	*ff 
 
c!f	  ll1oGGqcA3-. XXurvvh'v6F<<'LLBFF8bT"23 &&|4K q>!$$$ 88KN1%&&&r%   with_pandasc                 t   t        j                  ddgddgdt         j                  ggt              }d}| r0t	        j
                  d      }|j                  |d	d
g      }|dz   }n|dz   }t        d      }t	        j                  t        |      5  |j                  |       ddd       y# 1 sw Y   yxY w)zXCheck OrdinalEncoder errors when encoded_missing_value is used by
    an known category.rR   r  rS   r   r   rH   zTencoded_missing_value \(1\) is already used to encode a known category in features: rQ   letterpetr  z	\['pet'\]z\[1\]r   r  r.   N)r   r   r   r   r1   rY   rZ   r   r2   r3   r0   )r:  r   	error_msgr[   r  s        r#   0test_ordinal_encoder_encoded_missing_value_errorr?  c  s     	3,esBFFm<FKA
	 
   *LLXu$5L6,	(		a	0B	z	3
q	 
4	3	3s   B..B7z4X_train, X_test_trans_expected, X_roundtrip_expected1c                    t        dt        j                  t        j                        j                  |       }t        j                  dgt        j                  gdgg      }|j                  |      }t        ||       |j                  |      }|j                  d   }t        |      D ]A  }||df   }	||df   }
|	|
J t        |	      rt        j                  |
      r9J |
|	k(  rAJ  y)znCheck transform when unknown_value and encoded_missing_value is nan.

    Non-regression test for #24082.
    r6  r7  r@  rS   r   N)r   r   r   r0   r   r4   r	   r   r   rZ  r   r  )r  X_test_trans_expectedX_roundtrip_expectedr  r  r  r8  	n_samplesr\  expected_valvals              r#   9test_ordinal_encoder_unknown_missing_interaction_both_nanrG  }  s    4 
*ff ff
 
c'l	  XXurvvh./F<<'L L"78&&|4K$**1-I9+AqD1!Q$;;<(88C= =,&&& r%   c                  L   t        j                  d      } | j                  ddgddgd      }t               }|j	                  d       d}t        j
                  t        |	      5  |j                  |       d
d
d
       t        d      j	                  d      }t        d      j	                  d      }|j                  |      }|j                  |      }t        |j                         |       t        |j                         |j                         y
# 1 sw Y   xY w)z*Check OneHotEncoder works with set_output.rQ   rR   rS   r   r   rT   r  zCPandas output does not support sparse data. Set sparse_output=Falser.   NFr   default)r1   rY   rZ   r   r  r2   r3   r   r	   to_numpyr
   r   r  )r[   r\   r   r/   ohe_default
ohe_pandas	X_defaultX_pandass           r#   test_one_hot_encoder_set_outputrO    s    			X	&B<<sCj1v67D
/CNNXN&QE	z	/$ 
0  e4??)?TKU3>>>RJ))$/I''-HH%%'3z7798;K;KL 
0	/s   'DD#c                     t        j                  d      } | j                  ddgddgd      }t               j	                  d      }t               j	                  d      }|j                  |      }|j                  |      }t        |j                         |       t        |j                         |j                         y	)
z+Check OrdinalEncoder works with set_output.rQ   rR   rS   r   r   rT   rI  r  N)r1   rY   rZ   r   r  r   r	   rJ  r
   r   r  )r[   r\   ord_default
ord_pandasrM  rN  s         r#   test_ordinal_set_outputrS    s    			X	&B<<sCj1v67D "--	-BK!,,x,@J))$/I''-HH%%'3z7798;K;KLr%   c                     g dddgg} t        |       }|j                  ddgg       t        |       t        |j                        k(  sJ t	        |j                        D ])  \  }}|j
                  t        k(  sJ t        | |   |       + y)zjCheck that the categories_ dtype is `object` for string categories

    Regression test for gh-25171.
    )asmmaseasrasacsr@  2r   rU  N)r   r0   r  r   	enumeraterI   r   r
   )rK   r   nr   s       r#    test_predefined_categories_dtyper]    s    
 6SzBJ
:
.CGGdC[Mz?c#//2222COO,3yyF""":a=#. -r%   c                  `   t        j                  dgdgt         j                  ggt              } t	        d      j                  |       }t        |dgdgdgg       t	        dd	      j                  |       }t        j                  d
gg      }|j                  |      }t        |dgg       y)zBCheck missing value or unknown encoding can equal the cardinality.r  r   rH   r   r  r   r   r6  r8  snakeN)	r   r   r   r   r   r   r	   r0   r4   )r   r   r   r  s       r#   1test_ordinal_encoder_missing_unknown_encoding_maxr`    s    
5'E7RVVH-V<A15CCAFGGqcA3_-
(;1
M
Q
QRS
TCXXyk"FmmF#GGqcU#r%   c                  H   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   dgdz  z   gt              j                  } t	        dd	d
      j                  |       }t        |j                         g d       |j                  d   |j                  d      dk(  sJ t        j                  dgdz  dgdz  z   dgdz  z   gt              j                  } t	        dd	d      j                  |       }t        |j                         dg       |j                  d   |j                  d      dk(  sJ t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   dgdz  z   gt              j                  } t	        dd	dg      j                  |       }t        |j                         g d       |j                  d   |j                  d      dk(  sJ t	        dd	d      j                  |       }t        |j                         g d       |j                  J y)zkCheck drop_idx is defined correctly with infrequent categories.

    Non-regression test for gh-25550.
    rR   r   rS   r*   r   r   r  rH   Fr   )r  r   r   )r  x0_dx0_er  r   rc   r   r  )r  r  rc  r  N)r  r  rb  rc  r  )
r   r   r   rM   r   r0   r
   r   r   r&  )r   r   s     r#   #test_drop_idx_infrequent_categoriesrd    s&   
 	
cUQY	#	*cUQY	6#	BC6	a  au7
K
O
OPQ
RC!!#%V ??1cmmA./3666
3%!)seai'3%"*45VDFFA
au;
O
S
STU
VCs0025L4MN??1cmmA./3666

cUQY	#	*cUQY	6#	BC6	a  auC5
I
M
Ma
PC!!#%V ??1cmmA./3666
au4
H
L
LQ
OC!!#A ==   r%   c                    t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   g      j                  }t        dd	d
d| j	                  |      }t        |j                  g dg       t        |j                  ddgg       dgdgdgdgdgg}dgdgdgdgd
gg}|j                  |      }t        ||       |j                  |      }dgdgdgdgdgg}t        ||       y)zGTest parameters for grouping 'a', and 'd' into the infrequent category.rR   r   rS   r"  r   rc   r   r   r6  r@   r8  r  r  r   r   r   r  Nr   )r   r   rM   r   r0   r
   r   r  r4   r	   r   )r  r  ordinalr  expected_transr   r  expected_inverses           r#   ,test_ordinal_encoder_infrequent_three_levelsri    s$    hh	SEBJ.#;seaiGHIKKG *"@F	c'l  w**-A,BCw55c
|DecUSEC53%0FcA3aS2$/N'GG^,))'2I					 y"23r%   c                     t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   gt        	      j                  } t	        g d
gddd      j                  |       }t        |j                  g d
g       t        |j                  ddgg       dgdgdgdgdgg}dgdgdgdgdgg}|j                  |      }t        ||       |j                  |      }dgdgdgdgdgg}t        ||       y)zTest that the order of the categories provided by a user is respected.

    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.
    rR   r   rS   r"  r   rc   r   r   rH   r  r6  r@   )rK   r  r&   r9  r  r   r   r   r  N)r   r   r   rM   r   r0   r
   r   r  r4   r	   r   )r  rf  r  rg  r   r  rh  s          r#   6test_ordinal_encoder_infrequent_three_levels_user_catsrk  @  s,    hh
cURZ	3%"*	,uqy	89a  ()*	
 
c'l  w**-A,BCw55c
|DecUSEC53%0FcA3aS2$/N'GG^,))'2I					 y"23r%   c                     t        j                  g dg df      } t        d      j                  |       }t	        |j
                  d   ddg       |j
                  d   J ddgddgg}ddgddgg}|j                  |      }t        ||       |j                  |      }t        j                  ddgd	dggt        
      }t	        ||       y)zETest when feature 0 has infrequent categories and feature 1 does not.r  r  r   r  r   r   r   Nr  rH   )r   column_stackr   r0   r
   r  r4   r	   r   r   r   )r   rf  r  rg  r   r  rh  s          r#   %test_ordinal_encoder_infrequent_mixedro  d  s     	46QRSAA.2215Gw55a81a&A))!,444!fq!fF!fq!f%N'GG^,))'2Ixx!Q*>)B C6Ry"23r%   c            	      z   t        j                  d      } | j                  g d      }| j                  g dg d| j	                  dgdz  dgdz  z   d	gz   d
gz   |      dg d      }t        d      j                  |      }t        |j                  d   ddg       t        |j                  d   g d       t        |j                  d   d
d	g       | j                  g dg d| j	                  dgd	gz   d
gz   dgz   |      dg d      }g dg dg dg dg}|j                  |      }t        ||       y)zHTest infrequent categories with a pandas DataFrame with multiple dtypes.rQ   )birdr   r  r_  r  r  r  r*   r   r   r_  rq  rH   )r  rM  r~  r  rm  r   rR   rS   r   r  r   )rR   rS   r  r   )rf   r   rc   r   )r   r   r   )r   r   r   )r   r   r   r  N)r1   rY   CategoricalDtyperZ   r   r   r0   r
   r  r4   r	   )r[   categorical_dtyper   rf  r  rg  r   s          r#   :test_ordinal_encoder_infrequent_multiple_categories_dtypesrt  y  s[    
		X	&B++,KL
@199!ugk)WI5@' % 	
 . 	 
	A A.2215G w55a83*Ew55a8*Ew55a867:KL\\'!997)#vh.%8' % 	
 .  
F  IyAN'GG^,r%   c                     t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   t         j                  gz   gt        	      j                  } t        d
ddd      j                  |       }t        |j                  g dg       t        j                  dgdgdgdgdgt         j                  ggt        	      }dgdgdgdgdgdgg}|j                  |      }t        ||       y)zJCheck behavior of unknown_value and encoded_missing_value with infrequent.rR   r   rS   r"  r   rc   r   r   rH   r6  r   )r&   r9  r  r  r  r  r   r   N)r   r   r   r   rM   r   r0   r
   r  r4   r	   )r  rf  r  rg  r   s        r#   .test_ordinal_encoder_infrequent_custom_mappingrv    s    hh
cURZ	3%"*	,uqy	8BFF8	CDFa  *	
 
c'l  w557HIXXusecUSEC5266(C6RFcA3aS1#s3N'GG^,r%   c                 d   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   gt        	      j                  }t	        di | d
ddj                  |      }t	        d
d      j                  |      }dgdgdgdgdgg}t        |j                  |      |j                  |             y)zMAll categories are considered frequent have same encoding as default encoder.rR   r   rS   r"  r   rc   r   r   rH   r6  r@   r8  r  Nr   r   r   r   rM   r   r0   r	   r4   )r  r  adjusted_encoderdefault_encoderr  s        r#   !test_ordinal_encoder_all_frequentr{    s     hh
cURZ	3%"*	,uqy	89a  & 
!4B	c'l  %*"	c'l  ecUSEC53%0F""6*O,E,Ef,Mr%   d   c                 "   t        j                  dgdz  dgdz  z   dgdz  z   dgdz  z   gt        	      j                  }t	        di | d
ddj                  |      }dgdgdgdgdgg}t        |j                  |      dgdgdgdgdgg       y)zAWhen all categories are infrequent, they are all encoded as zero.rR   r   rS   r"  r   rc   r   r   rH   r6  r@   r8  r  r   Nr   rx  )r  r  r%  r  s       r#   #test_ordinal_encoder_all_infrequentr~    s     hh
cURZ	3%"*	,uqy	89a   
!4B	c'l  ecUSEC53%0FG%%f-aS1#sRD/IJr%   c                     t        j                  t         j                  gdz  dgdz  z   dgdz  z   dgz   dgz   gt              j                  } t        d	
      j                  |       }t        j                  dddt         j                  ggt              j                  }|j                  |      }t        |dgdgdgt         j                  gg       y)z5Check behavior when missing value appears frequently.r"  r  rc   r   r   r_  deerrH   r   rm  r   r   r   N	r   r   r   r   rM   r   r0   r4   r	   r   rf  r  r   s       r#   -test_ordinal_encoder_missing_appears_frequentr    s    

&&B%2	%!	3wi	?6(	JK	 a  A.2215GXXrvv67vFHHF'GGqcA3bffX67r%   c            	         t        j                  t         j                  gdgdz  z   dgdz  z   dgz   dgz   dgdz  d	gdz  z   gt        
      j                  } t        d      j                  |       }t        j                  ddgdd	gt         j                  d	gdd	gddggt        
      }|j                  |      }t        |ddgddgt         j                  dgddgddgg       y)z7Check behavior when missing value appears infrequently.r  rc   r   r   r_  r  redrG  greenrH   r*   )r  r   r   r   Nr  r  s       r#   /test_ordinal_encoder_missing_appears_infrequentr    s    
 	VVHw|#ugk1WI=HGaK7)a-'	
 	 a  1-11!4GXXeWVVWGEN	
 	F 'GGq!fq!frvvqkAq6Aq6JKr%   c                     t        j                  dgdgdggt              } | g dg      }t        j                  t
              5  |j                  |       ddd       y# 1 sw Y   yxY w)a!  Check that we raise a `NotFittedError` by calling transform before fit with
    the encoders.

    One could expect that the passing the `categories` argument to the encoder
    would make it stateless. However, `fit` is making a couple of check, such as the
    position of `np.nan`.
    rU   rV   r`  rH   r_  r   N)r   r   r   r1   r2   r   r4   )r  r   r%  s      r#   test_encoder_not_fittedr  	  sT     	3%#&f5A/!23G	~	&! 
'	&	&r   )r   numpyr   r1   scipyr   sklearn.exceptionsr   sklearn.preprocessingr   r   sklearn.utils._missingr   sklearn.utils._testingr   r	   r
   sklearn.utils.fixesr   r$   markparametrizer:   rD   r  float32r.  rO   r]   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r  r[  str_r   rM   r  r	  r  r  r  r  r  r*  r0  r2  r4  rB  rM  rI  rK  rN  rS  r]  rb  rf  rn  rq  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r
  r  r  r  r  r  r  r  r!  r&  r)  r-  r1  r4  r9  r?  rL   rG  rO  rS  r]  r`  rd  ri  rk  ro  rt  rv  r{  r~  r  r  r  r   r%   r#   <module>r     s   	    - ? 0 
 /@. )H6K+LM# N#. )H6K+LM& N&$ "((BJJ

)KL2::rzz(JK
; L M
; "((BJJ

)KLA MA92xJ$4	9
 	(+z*+#%67vF/C#78G/Cuu#=>fM"O4FC/C#67vF/Cut#<=VL			  .G/.G )H6K+LMUDM2$1,$ 2 3 N,$^ UDM2b'Ar7QG	$y)Y&GHS\E3<'3%Fo?	
	'	 3'*7 !=>'CD
C E ?
C E?#;<1vxrxxc
';<= > = E?#;< = "+r{	#uenrd%;RZZH	Aq6Aq6"	#q!fqc]BJJ?BHHsElS%L1@3Z%!JJ	

 
C<#u.	/3*ug1FP	Aq6BFFA;'	(Arvv;*<bjjIBHHsBFFmdBFF^4FC4[266(#JJ	
 BHHsE%L)D%,+?@O4[5<.)JJ	
*	/   B7C B7" )H6K+LM BHHsCj\022BHHsCj\022JJ		
 BHHq!fXW-//BHHq!fXW-//KHH		
 BHHsCj\022BHHsCj\022RXXo&'JJ		
 BHHtSk]&133BHHtSk]&133		
 BHHsCj\022BHHsBFFm_F355		
 BHHsDk]&133BHHsBFFm_F355		
?%L	Q  0bAc0 NdA($ ]N$CD
 E

?7 	66"#	./	()
 	&  66&0 	(+{+,#%67vF
 	'  22  BHHsCj\022BHHsCj\022JJ		
 BHHq!fXW-//BHHq!fXW-//KHH		
 BHHsCj\022BHHsCj\022RXXo&'JJ		
( 	3-  010"$- 5#,/- 0-6	868,8 2664u*FG!1 H!1H 5!*.A!BCB DB T5M'7JK'=!9?RS. T L.$ ]N$CD= E= 	1	"	$q1r2	 1E0F'GHI I	I6 +w!>?C @C. 3%#0 1" 	1	!	!	$	$q1q1QQ< 'C5!12/ 3/. 3%#0 10 !a8?A:NO;;2,6!,H;$X,v>,B bA$N#OP
$ Q
$ a1$M#NO) P)  !#M 'EF* G*6* 2664.9G :G,. ),A8+LM((;<, = N,< )H6K+LM"B N"BJ )H6K+LM!E N!EH )H6K+LM) N)4( 02662,?" @"" ((;<02662,?% @ =%>  3-7993*V4663RVV,F;<

	 3-7993*V4663RVV,F;<

	 3-

;==3%

3553RVV,-.

	%4	9  !DE!D$ ]N$CD E ! BHHsBFFC()*,,BHHsBFFC()*,,BHHseW	
 BHHo&'))BHHo&'))BHHrvvhZ 	
 BHHsBFFC()8::BHHsBFFC()*,,BHHseWF+	
 BHHo&f577BHHo&'))BHHrvvhZv.	
!24324 .92 :2," 
4+c*4+c* 
s3*S)3*S)	( 	(6(	1'< u6 72 :
 BHHsecU^62S266(RVVH%BJJvv.f=	
 BHHrvvhu-V<S266(RVVH%BJJx"&&2&A	
&''&'BM.M /"	$!!H 	1	!	!	$	$q1q1446!4H4*--`-* 	1	!( 	1	#
K
K
8L8 ]N$CD Er%   