
    {KgJ                         d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z
ddlZddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ dededefdZdedede
j2                  fdZd Z	 ddZ	 ddZ	 	 ddZy)z9Implementation of ARFF parsers: via LIAC-ARFF and pandas.    N)OrderedDict)	Generator)List   )_arff)ArffSparseDataType)chunk_generatorget_chunk_n_rows)check_pandas_support)	pd_fillna	arff_datainclude_columnsreturnc                 N   t               t               t               f}t        |      D ci c]  \  }}||
 }}}t        | d   | d   | d         D ]J  \  }}}||v s|d   j                  |       |d   j                  |       |d   j                  ||          L |S c c}}w )a  Obtains several columns from sparse ARFF representation. Additionally,
    the column indices are re-labelled, given the columns that are not
    included. (e.g., when including [1, 2, 3], the columns will be relabelled
    to [0, 1, 2]).

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size; first list indicating the value,
        second the x coordinate and the third the y coordinate.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of arff data with only the include columns indicated by the
        include_columns argument.
    r      r   )list	enumeratezipappend)	r   r   arff_data_new	array_idx
column_idxreindexed_columnsvalrow_idxcol_idxs	            a/home/alanp/www/video.onchill/myenv/lib/python3.12/site-packages/sklearn/datasets/_arff_parser.py_split_sparse_columnsr      s    . *.(@M;D_;U;U"7)Z
I;U   "%Yq\9Q<1!NWgo%!##C(!##G,!##$5g$>?	 "O
 s   B!c                 0   t        | d         dz   }|t        |      f}t        |      D ci c]  \  }}||
 }}}t        j                  |t        j
                        }t        | d   | d   | d         D ]  \  }}	}
|
|v s|||	||
   f<    |S c c}}w )Nr   dtyper   r   )maxlenr   npemptyfloat64r   )r   r   num_obsy_shaper   r   r   yr   r   r   s              r   _sparse_data_to_arrayr*   6   s    
 )A,!#GO,-G;D_;U;U"7)Z
I;U   	

+A!$Yq\9Q<1!NWgo%58Ag(112 "O Hs   Bc                 z    | |   }t        |      dk\  r	| |   }||fS t        |      dk(  r| |d      }||fS d}||fS )a  Post process a dataframe to select the desired columns in `X` and `y`.

    Parameters
    ----------
    frame : dataframe
        The dataframe to split into `X` and `y`.

    feature_names : list of str
        The list of feature names to populate `X`.

    target_names : list of str
        The list of target names to populate `y`.

    Returns
    -------
    X : dataframe
        The dataframe containing the features.

    y : {series, dataframe} or None
        The series or dataframe containing the target.
    r   r   r   N)r#   )framefeature_namestarget_namesXr)   s        r   _post_process_framer0   H   sh    , 	mA
<A,
 a4K	 
\	a	,q/" a4K a4K    c                 	   d } ||       }|dk(  rt         j                  nt         j                  }|dk(   }	t        j                  |||	      }
||z   }|
d   D ci c]  \  }}t	        |t
              r||v r|| }}}|dk(  rt        d      }t        |
d         }t        |j                               }t        |
d         }|j                  |g|d	      }|j                  d
      j                         }t        |      }|D cg c]	  }||v s| }}||   g}t        |
d   |      D ](  }|j                  |j                  ||d	      |          * t!        |      dk\  r$|d   j#                  |d   j$                        |d<   |j'                  |d
      }t)        ||      }~~i }|j*                  D ]N  }||   d   }|j-                         dk(  rd||<   $|j-                         dk(  rd||<   =|j$                  |   ||<   P |j#                  |      }t/        |||      \  }}nn|
d   }|D  cg c]  } t1        ||    d          }!} |D  cg c]  } t1        ||    d          }"} t	        |t2              rz|t5        d      |d   dk(  rd}#n|d   |d   z  }#t7        j8                  t:        j<                  j?                  |      d|#      } |j@                  | }|dd|!f   }|dd|"f   }nt	        |tB              rtE        ||!      }$tG        |d         dz   }%|%t!        |!      f}&tH        jJ                  jM                  |$d   |$d   |$d   ff|&t6        jN                        }|jQ                         }tS        ||"      }nt5        dtU        |             |D  ch c]  } | |v  }'} |'sntW        |'      rt7        jX                  t[        |      D ( cg c]`  \  }(} t7        j\                  t7        j^                  |ja                  |       d      |dd|(|(dz   f   j#                  t0        d            b c} }(      }ntc        |'      rt5        d       |jd                  d   dk(  r|jA                  d!      }n|jd                  d   dk(  rd}|dk(  r||dfS ||d|fS c c}}w c c}w c c} w c c} w c c} w c c} }(w )"a  ARFF parser using the LIAC-ARFF library coded purely in Python.

    This parser is quite slow but consumes a generator. Currently it is needed
    to parse sparse datasets. For dense datasets, it is recommended to instead
    use the pandas-based parser, although it does not always handles the
    dtypes exactly the same.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    c              3   @   K   | D ]  }|j                  d        y w)Nutf-8)decode)	gzip_filelines     r   _io_to_generatorz+_liac_arff_parser.<locals>._io_to_generator   s     D++g&& s   sparsepandas)return_typeencode_nominal
attributeszfetch_openml with as_frame=TruedataF)columnscopyT)deepr   r   r   )ignore_index	data_typeintegerInt64nominalcategoryindexNz6shape must be provided when arr['data'] is a Generatorr&   )r!   count)shaper!   z-Unexpected type for data obtained from arff: Or    )r@   zAMix of nominal and non-nominal targets is not currently supported)rI   )3r   COO	DENSE_GENload
isinstancer   r   r   keysnext	DataFramememory_usagesumr
   r	   r   r#   astypedtypesconcatr   r?   lowerr0   intr   
ValueErrorr$   fromiter	itertoolschainfrom_iterablereshapetupler   r"   spr9   
coo_matrixr&   tocsrr*   typeallhstackr   takeasarraypopanyrK   ))r6   output_arrays_typeopenml_columns_infofeature_names_to_selecttarget_names_to_selectrK   r8   streamr;   r<   arff_containercolumns_to_selectnamecat
categoriespdcolumns_infocolumns_names	first_rowfirst_df	row_bytes	chunksizecolcolumns_to_keepdfsr>   r,   rW   column_dtyper/   r)   r   col_namefeature_indices_to_selecttarget_indices_to_selectrJ   arff_data_Xr'   X_shapeis_classificationis)                                            r   _liac_arff_parserr   h   sp   n' i(F  2X=%))5??K -89NZZKN 02HH (55ID#c4 T->%> 	c	5  
 X%!"CD">,#?@\..01 /0	<<]<O))t)488:	$Y/	 +8T-33BS;S3-T()#N6$:IFDJJT=uEoV G s8q=V]]3q6==1CF
 		#D	1"e$ MMD.t4[AL!!#y0  't##%2)t$||D1t " V$"*,B
1 #6*	 4%
3 #H-g673 	" %
 3$
2 #H-g672 	! $

 i+} L  Qx2~a58+;;--i8D
  4<<'DQ112AQ001A	5)/	;TUK)A,'!+G$= >?G		$$Q+a.+a.!ABjj % A
 	A%i1IJA ?Y?PQ 
 4J
3IxH
"3I 	 
 !"#		 (11G'H
 (I8	 GG

:>>(#;3G!QQY,..s.? (IA "#S  771:?		% AWWQZ1_AX%!UD  az!!E& UL%
$
N
s+   !R.
	R4R48R9R>)S!A%S
c           
         ddl }| D ]2  }|j                  d      j                         j                  d      s2 n i }|D ]<  }	||	   d   }
|
j                         dk(  rd||	<   $|
j                         dk(  s8d	||	<   > t	        |      D 	ci c]  \  }}	|	|v r|||	    }}}	dd
dgd
dddd|d	}i ||xs i } |j
                  | fi |}	 |D 	cg c]  }	|	 c}	|_        ||z   }|j                  D cg c]	  }||v s| }}||   }t        j                  d      fd}|j                  j                         D 	cg c]  \  }	}t        ||j                        r|	 }}	}|D ]#  }||   j                   j#                  |      ||<   % t%        |||      \  }}|dk(  r|||dfS |j'                         |j'                         }}|j                  j                         D 	ci c]6  \  }	}t        ||j                        r|	|j(                  j+                         8 }}	}||d|fS c c}	}w c c}	w # t        $ r!}|j                  j                  d      |d}~ww xY wc c}w c c}}	w c c}}	w )a^  ARFF parser using `pandas.read_csv`.

    This parser uses the metadata fetched directly from OpenML and skips the metadata
    headers of ARFF file itself. The data is loaded as a CSV file.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The GZip compressed file with the ARFF formatted payload.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected to build `X`.

    target_names_to_select : list of str
        A list of the target names to be selected to build `y`.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    r   Nr4   z@datarC   rD   rE   rF   rG   F?%"T\)	header	index_col	na_valueskeep_default_nacomment	quotecharskipinitialspace
escapecharr!   zwThe number of columns provided by OpenML does not match the number of columns inferred by pandas when reading the file.z^'(?P<contents>.*)'$c                 Z    t        j                  |       }|| S |j                  d      S )Ncontents)researchgroup)input_stringmatchsingle_quote_patterns     r   strip_single_quotesz0_pandas_arff_parser.<locals>.strip_single_quotes  s.    		.=={{:&&r1   r:   )r:   r5   rY   
startswithr   read_csvr?   r[   errorsParserErrorr   compilerW   itemsrP   CategoricalDtypert   rename_categoriesr0   to_numpyru   tolist)r6   rl   rm   rn   ro   read_csv_kwargsrv   r7   rW   rs   r   r   dtypes_positionaldefault_read_csv_kwargsr,   excrr   r}   r~   r   r!   categorical_columnsr/   r)   ru   r   s                            @r   _pandas_arff_parserr   4  s   p  ;;w%%'227;  F#*40=9, #F4L!Y.%F4L $ '':;;MGT6> 	;   U  "
 M0L_5JLOBKK	5_5E

 +>>*=$*=> 02HH&+mmPmss>O7OsmOP/"E ::&=>' !<<--//KD%eR001 	/  
 #3Z^^556IJc
 # u&=?UVDAqX%!UD  zz|QZZ\1 !<<--//KD%eR001 	e%%''/  
 az!!W0 ? ii##@
 	 Q.sH   H-H8 	H3H8 8	I%I%"I*);I03H8 8	I"II"c                 t    |dk(  rt        | |||||      S |dk(  rt        | |||||      S t        d| d      )a6  Load a compressed ARFF file using a given parser.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    parser : {"pandas", "liac-arff"}
        The parser used to parse the ARFF file. "pandas" is recommended
        but only supports loading dense datasets.

    output_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    z	liac-arffr:   zUnknown parser: 'z%'. Should be 'liac-arff' or 'pandas'.)r   r   r[   )r6   parseroutput_typerm   rn   ro   rK   r   s           r   load_arff_from_gzip_filer     sq    v  #"
 	
 
8	"#"
 	
 x'LM
 	
r1   )N)NN)__doc__r]   r   collectionsr   collections.abcr   typingr   numpyr$   scipyrb   	externalsr   externals._arffr   utils._chunkingr	   r
   utils._optional_dependenciesr   utils.fixesr   r   ndarrayr*   r0   r   r   r    r1   r   <module>r      s    ?  	 # %     0 ? ? # ! 48  F!48ZZ$L I"d U"~ P
r1   