
    {Kg{,                         d dl mZ d dlmZ d dlmZ d dlZddlm	Z	 dddd	Z
dd
Z G d de      Zd Z G d de      Zd Zd ZdddZddZ G d de      Zd Zy)    )Counter)suppress)
NamedTupleN   )is_scalar_nanFreturn_inversereturn_countsc                `    | j                   t        k(  rt        | ||      S t        | ||      S )a  Helper function to find unique values with support for python objects.

    Uses pure python method for object dtype, and numpy method for
    all other dtypes.

    Parameters
    ----------
    values : ndarray
        Values to check for unknowns.

    return_inverse : bool, default=False
        If True, also return the indices of the unique values.

    return_counts : bool, default=False
        If True, also return the number of times each unique item appears in
        values.

    Returns
    -------
    unique : ndarray
        The sorted unique values.

    unique_inverse : ndarray
        The indices to reconstruct the original array from the unique array.
        Only provided if `return_inverse` is True.

    unique_counts : ndarray
        The number of times each of the unique values comes up in the original
        array. Only provided if `return_counts` is True.
    r   )dtypeobject_unique_python
_unique_np)valuesr	   r
   s      Y/home/alanp/www/video.onchill/myenv/lib/python3.12/site-packages/sklearn/utils/_encode.py_uniquer   
   s:    > ||v>
 	
 ~]     c                    t        j                  | ||      }d\  }}|r|^ }}|r|^ }}|s|r|d   }|j                  rit        |d         r[t        j                  |t         j
                        }|d|dz    }|r||||kD  <   |r#t        j                  ||d       ||<   |d|dz    }|f}|r||fz  }|r||fz  }t        |      dk(  r|d   S |S )zHelper function to find unique values for numpy arrays that correctly
    accounts for nans. See `_unique` documentation for details.r   )NNr   Nr   )npuniquesizer   searchsortednansumlen)r   r	   r
   uniquesinversecountsnan_idxrets           r   r   r   3   s     ii~]G !OGV"&#'!* ||gbk2//'2662-GaK()0GGg%& ffVGH%56F7OMgk*F*CzyX]3q6++r   c                   ,    e Zd ZU dZeed<   eed<   d Zy)MissingValuesz'Data class for missing data informationr   nonec                     g }| j                   r|j                  d       | j                  r|j                  t        j                         |S )z3Convert tuple to a list where None is always first.N)r$   appendr   r   )selfoutputs     r   to_listzMissingValues.to_listb   s6    99MM$88MM"&&!r   N)__name__
__module____qualname____doc__bool__annotations__r)    r   r   r#   r#   \   s    1	I
Jr   r#   c                     | D ch c]  }|t        |      s| }}|s| t        dd      fS d|v r*t        |      dk(  rt        dd      }nt        dd      }nt        dd      }| |z
  }||fS c c}w )a.  Extract missing values from `values`.

    Parameters
    ----------
    values: set
        Set of values to extract missing from.

    Returns
    -------
    output: set
        Set with missing values extracted.

    missing_values: MissingValues
        Object with missing value information.
    NF)r   r$   r   T)r   r#   r   )r   valuemissing_values_setoutput_missing_valuesr(   s        r   _extract_missingr5   l   s    " "!%U]mE6J6   }U;;;!!!"a'$1e$$G! %2d$F! -$U C ((F((('s
   A3A3c                   (     e Zd ZdZ fdZd Z xZS )_nandictz!Dictionary with support for nans.c                 |    t         |   |       |j                         D ]  \  }}t        |      s|| _         y  y N)super__init__itemsr   	nan_value)r'   mappingkeyr2   	__class__s       r   r;   z_nandict.__init__   s6    !!--/JCS!!& *r   c                 ^    t        | d      rt        |      r| j                  S t        |      )Nr=   )hasattrr   r=   KeyErrorr'   r?   s     r   __missing__z_nandict.__missing__   '    4%-*<>>!smr   )r*   r+   r,   r-   r;   rE   __classcell__r@   s   @r   r7   r7      s    +r   r7   c                     t        t        |      D ci c]  \  }}||
 c}}      }t        j                  | D cg c]  }||   	 c}      S c c}}w c c}w )z,Map values based on its position in uniques.)r7   	enumerater   array)r   r   ivaltablevs         r   _map_to_integerrP      sT    9W+=>+=Cc1f+=>?E88v.v!U1Xv.// ?.s
   A
Ac                   	 t        |       }t        |      \  }}t        |      }|j                  |j	                                t        j                  || j                        }|f}|r|t        | |      fz  }|r|t        | |      fz  }t        |      dk(  r|d   S |S # t        $ r1 t        d t        d | D              D              }t        d|       w xY w)Nr   c              3   4   K   | ]  }|j                     y wr9   )r,   ).0ts     r   	<genexpr>z!_unique_python.<locals>.<genexpr>   s     L/K!q~~/Ks   c              3   2   K   | ]  }t        |        y wr9   )type)rT   rO   s     r   rV   z!_unique_python.<locals>.<genexpr>   s     2KFq47Fs   zPEncoders require their input argument must be uniformly strings or numbers. Got r   r   )setr5   sortedextendr)   r   rK   r   	TypeErrorrP   _get_countsr   )r   r	   r
   uniques_setmissing_valuesr   typesr!   s           r   r   r      s    
&k&6{&C#^%~--/0((7&,,7 *C022FG,..X]3q6++  
Ls2KF2K/KLL'',g/
 	

s   A$B" ":CT)check_unknownc                   | j                   j                  dv r	 t        | |      S |r%t        | |      }|rt	        dt        |             t        j                  ||       S # t        $ r}t	        dt        |             d}~ww xY w)a  Helper function to encode values into [0, n_uniques - 1].

    Uses pure python method for object dtype, and numpy method for
    all other dtypes.
    The numpy method has the limitation that the `uniques` need to
    be sorted. Importantly, this is not checked but assumed to already be
    the case. The calling method needs to ensure this for all non-object
    values.

    Parameters
    ----------
    values : ndarray
        Values to encode.
    uniques : ndarray
        The unique values in `values`. If the dtype is not object, then
        `uniques` needs to be sorted.
    check_unknown : bool, default=True
        If True, check for values in `values` that are not in `unique`
        and raise an error. This is ignored for object dtype, and treated as
        True in this case. This parameter is useful for
        _BaseEncoder._transform() to avoid calling _check_unknown()
        twice.

    Returns
    -------
    encoded : ndarray
        Encoded values
    OUSz%y contains previously unseen labels: N)	r   kindrP   rC   
ValueErrorstr_check_unknownr   r   )r   r   ra   ediffs        r   _encoderj      s    : ||E!	O"6733 !&'2D #HT!TUUw//  	ODSVHMNN	Os   A# #	B,BBc                 X   d}| j                   j                  dv r
t        |       }t        |      \  }}t        |      t              \  |z
  }|j                  xr j                   }|j
                  xr j
                   }fd}	|rT|s|s|r*t        j                  | D 
cg c]
  }
 |	|
       c}
      }n$t        j                  t        |       t              }t        |      }|r|j                  d       |r|j                  t        j                         nt        j                  |       }t        j                  ||d      }|rG|j                  rt        j                   | |      }n$t        j                  t        |       t              }t        j"                  |      j%                         rSt        j"                  |      }|j%                         r.|j                  r|rt        j"                  |       }d||<   ||    }t        |      }|r||fS |S c c}
w )a  
    Helper function to check for unknowns in values to be encoded.

    Uses pure python method for object dtype, and numpy method for
    all other dtypes.

    Parameters
    ----------
    values : array
        Values to check for unknowns.
    known_values : array
        Known values. Must be unique.
    return_mask : bool, default=False
        If True, return a mask of the same shape as `values` indicating
        the valid values.

    Returns
    -------
    diff : list
        The unique values present in `values` and not in `know_values`.
    valid_mask : boolean array
        Additionally returned if ``return_mask=True``.

    Nrc   c                 j    | v xs- j                   xr | d u xs j                  xr t        |       S r9   )r$   r   r   )r2   missing_in_uniquesr^   s    r   is_validz _check_unknown.<locals>.is_valid  sG    $ )%** "TM) &)) )!%(r   rR   Tassume_uniquer   )r   rd   rY   r5   r   r$   r   rK   onesr   r.   listr&   r   	setdiff1dr   isinisnanany)r   known_valuesreturn_mask
valid_mask
values_setmissing_in_valuesri   nan_in_diffnone_in_diffrn   r2   unique_valuesdiff_is_nanis_nanrm   r^   s                 @@r   rg   rg      s   2 J||E![
(8(D%
%,'*:;*G''K''++J4F4J4J0J(--M6H6M6M2M	 {lXXF&KF5xF&KL
WWS[=
DzKKKK		&)||M<tLyyWWV\:
WWS[=
 88L!%%'((4.K 99XXf-F)*Jv& [L)DzZKC 'Ls   ,H'c                   .     e Zd ZdZ fdZd Zd Z xZS )_NaNCounterz$Counter with support for nan values.c                 B    t         |   | j                  |             y r9   )r:   r;   _generate_items)r'   r<   r@   s     r   r;   z_NaNCounter.__init__D  s    --e45r   c              #      K   |D ]:  }t        |      s| t        | d      sd| _        | xj                  dz  c_        < yw)z>Generate items without nans. Stores the nan counts separately.	nan_countr   r   N)r   rB   r   )r'   r<   items      r   r   z_NaNCounter._generate_itemsG  s?     D &
4-!"NNaN s   AAc                 ^    t        | d      rt        |      r| j                  S t        |      )Nr   )rB   r   r   rC   rD   s     r   rE   z_NaNCounter.__missing__Q  rF   r   )r*   r+   r,   r-   r;   r   rE   rG   rH   s   @r   r   r   A  s    .6 r   r   c                 p   | j                   j                  dv rnt        |       }t        j                  t        |      t        j                        }t        |      D ]%  \  }}t        t              5  ||   ||<   ddd       ' |S t        | d      \  }}t        j                  ||d      }t        j                  |d         rt        j                  |d         rd|d<   t        j                  |||         }	t        j                  |t        j                        }||	   ||<   |S # 1 sw Y   xY w)zGet the count of each of the `uniques` in `values`.

    The counts will use the order passed in by `uniques`. For non-object dtypes,
    `uniques` is assumed to be sorted and `np.nan` is at the end.
    OUrR   NT)r
   ro   r   )r   rd   r   r   zerosr   int64rJ   r   rC   r   rt   ru   r   
zeros_like)
r   r   counterr(   rL   r   r~   r   uniques_in_valuesunique_valid_indicess
             r   r]   r]   W  s
    ||D f%#g,bhh7 )GAt(##DMq	 $# * &vTBM6 dK	xxb!"rxx'< $"??='BS:TU]]7"((3F &'; <FM $#s   2	D,,D5	)FF)F)collectionsr   
contextlibr   typingr   numpyr   _missingr   r   r   r#   r5   dictr7   rP   r   rj   rg   r   r]   r0   r   r   <module>r      sr        # ',5 &R&,RJ  #)Lt  0,4 /3 '0TRj' ,r   