
    xKg"                         d dl mZ d dlmZ d dlmZ d dlmZm	Z	m
Z
  G d de      Z G d de
      Z G d	 d
e	      Z G d de      ZdZ G d dej"                        ZdZ G d dej(                        Zy)    )cuda)array)deviceufunc)UFuncMechanismGeneralizedUFuncGUFuncCallStepsc                   *    e Zd ZdZd Zd ZddZd Zy)CUDAUFuncDispatcherzD
    Invoke the CUDA ufunc specialization for the given inputs.
    c                 4    || _         |j                  | _        y N)	functions__name__)selftypes_to_retty_kernelspyfuncs      Z/home/alanp/www/video.onchill/myenv/lib/python3.12/site-packages/numba/cuda/vectorizers.py__init__zCUDAUFuncDispatcher.__init__   s    /    c                 D    t         j                  | j                  ||      S )a  
        *args: numpy arrays or DeviceArrayBase (created by cuda.to_device).
               Cannot mix the two types in one call.

        **kws:
            stream -- cuda stream; when defined, asynchronous mode is used.
            out    -- output array. Can be a numpy array or DeviceArrayBase
                      depending on the input arguments.  Type must match
                      the input arguments.
        )CUDAUFuncMechanismcallr   )r   argskwss      r   __call__zCUDAUFuncDispatcher.__call__   s     "&&t~~tSAAr   c                    t        t        | j                  j                               d         dk(  sJ d       |j                  dk(  sJ d       |j
                  d   }g }|dk(  rt        d      |dk(  r|d   S |xs t        j                         }|j                         5  t        j                  j                  j                  |      r|}nt        j                  ||      }| j                  |||      }t        d|j                         }|j#                  ||	       d d d        |d   S # 1 sw Y   d   S xY w)
Nr      zmust be a binary ufunc   zmust use 1d arrayzReduction on an empty array.)r   )dtypestream)lenlistr   keysndimshape	TypeErrorr   r    auto_synchronizecudadrvdevicearrayis_cuda_ndarray	to_device_CUDAUFuncDispatcher__reducenp_arrayr   copy_to_host)r   argr    ngpu_memsmemoutbufs           r   reducezCUDAUFuncDispatcher.reduce   s'   4++-.q12a7 	A :A 	A7xx1}111}IIaL6:;;!Vq6M (4;;=$$&||''77<nnS&1--Xv6C4syy1CS0 ' 1v ' 1vs   #B D00D=c                    |j                   d   }|dz  dk7  ri|j                  |dz
        \  }}|j                  |       |j                  |       | j                  |||      }|j                  |        | ||||      S |j                  |dz        \  }}	|j                  |       |j                  |	        | ||	||       |dz  dkD  r| j                  |||      S |S )Nr   r   r   )r3   r    )r%   splitappendr,   )
r   r2   r1   r    r0   fatcutthincutr3   leftrights
             r   __reducezCUDAUFuncDispatcher.__reduce;   s    IIaLq5A:!iiA.OFGOOF#OOG$--&9COOC W#f==))AF+KD%OOD!OOE"u$v6Avz}}T8V<<r   Nr   )r   
__module____qualname____doc__r   r   r5   r,    r   r   r
   r
      s    (B:r   r
   c                   H     e Zd ZdgZ fdZd Zd Zd Zd Zd Z	d Z
 xZS )	_CUDAGUFuncCallSteps_streamc                 X    t         |   ||||       |j                  dd      | _        y )Nr    r   )superr   getrE   )r   ninnoutr   kwargs	__class__s        r   r   z_CUDAGUFuncCallSteps.__init__X   s(    dD&1zz(A.r   c                 ,    t        j                  |      S r   r   is_cuda_arrayr   objs     r   is_device_arrayz$_CUDAGUFuncCallSteps.is_device_array\       !!#&&r   c                     t         j                  j                  j                  |      r|S t        j                  |      S r   r   r(   r)   r*   as_cuda_arrayrP   s     r   as_device_arrayz$_CUDAGUFuncCallSteps.as_device_array_   2     <<##33C8J!!#&&r   c                 D    t        j                  || j                        S Nr   )r   r+   rE   )r   hostarys     r   r+   z_CUDAGUFuncCallSteps.to_devicei   s    ~~gdll;;r   c                 @    |j                  || j                        }|S rZ   )r.   rE   )r   devaryr[   r3   s       r   to_hostz_CUDAGUFuncCallSteps.to_hostl   s     !!'$,,!?
r   c                 F    t        j                  ||| j                        S N)r%   r   r    )r   device_arrayrE   )r   r%   r   s      r   allocate_device_arrayz*_CUDAGUFuncCallSteps.allocate_device_arrayp   s      uE$,,OOr   c                 D     |j                  || j                        |  y rZ   )forallrE   )r   kernelnelemr   s       r   launch_kernelz"_CUDAGUFuncCallSteps.launch_kernels   s    1eDLL148r   )r   r?   r@   	__slots__r   rR   rW   r+   r^   rb   rg   __classcell__rL   s   @r   rD   rD   S   s1    I/''<P9r   rD   c                   :     e Zd Z fdZed        Zd Zd Z xZS )CUDAGeneralizedUFuncc                 H    |j                   | _         t        | 	  ||       y r   )r   rG   r   )r   	kernelmapenginer   rL   s       r   r   zCUDAGeneralizedUFunc.__init__x   s    F+r   c                     t         S r   )rD   r   s    r   _call_stepsz CUDAGeneralizedUFunc._call_steps|   s    ##r   c                     t         j                  j                  j                  |d|j                  |j
                        S Nr>   r%   stridesr   gpu_data)r   r(   r)   DeviceNDArrayr   rw   )r   aryr%   s      r   _broadcast_scalar_inputz,CUDAGeneralizedUFunc._broadcast_scalar_input   s9    ||''55E>B<?II?B|| 6 M 	Mr   c                     t        |      t        |j                        z
  }d|z  |j                  z   }t        j                  j
                  j                  |||j                  |j                        S rt   )	r!   r%   rv   r   r(   r)   rx   r   rw   )r   ry   newshapenewax
newstridess        r   _broadcast_add_axisz(CUDAGeneralizedUFunc._broadcast_add_axis   sa    HCII.E\CKK/
||''55H>H<?II?B|| 6 M 	Mr   )	r   r?   r@   r   propertyrr   rz   r   ri   rj   s   @r   rl   rl   w   s(    , $ $MMr   rl   c                   >    e Zd ZdZdZd Zd Zd Zd Zd Z	d Z
d	 Zy
)r   z%
    Provide CUDA specialization
    r   c                 0     |j                  ||      |  y rZ   )rd   )r   funccountr    r   s        r   launchzCUDAUFuncMechanism.launch   s    )E&)40r   c                 ,    t        j                  |      S r   rN   rP   s     r   rR   z"CUDAUFuncMechanism.is_device_array   rS   r   c                     t         j                  j                  j                  |      r|S t        j                  |      S r   rU   rP   s     r   rW   z"CUDAUFuncMechanism.as_device_array   rX   r   c                 0    t        j                  ||      S rZ   )r   r+   )r   r[   r    s      r   r+   zCUDAUFuncMechanism.to_device   s    ~~gf55r   c                 &    |j                  |      S rZ   )r.   )r   r]   r    s      r   r^   zCUDAUFuncMechanism.to_host   s    ""&"11r   c                 2    t        j                  |||      S r`   )r   ra   )r   r%   r   r    s       r   rb   z(CUDAUFuncMechanism.allocate_device_array   s      uE&IIr   c                    t        t        |            D cg c](  }||j                  k\  s|j                  |   ||   k7  r|* }}t        |      t        |j                        z
  }dg|z  t	        |j
                        z   }|D ]  }d||<   	 t        j                  j                  j                  |||j                  |j                        S c c}w )Nr   ru   )ranger!   r$   r%   r"   rv   r   r(   r)   rx   r   rw   )r   ry   r%   ax
ax_differs
missingdimrv   s          r   broadcast_devicez#CUDAUFuncMechanism.broadcast_device   s    #(U#4 5#4Rsxx2%)3 #4
 5 Z#cii.0
#
"T#++%66BGBK  ||''55E>E<?II?B|| 6 M 	M5s   -CN)r   r?   r@   rA   DEFAULT_STREAMr   rR   rW   r+   r^   rb   r   rB   r   r   r   r      s3     N1''62JMr   r   z
def __vectorized_{name}({args}, __out__):
    __tid__ = __cuda__.grid(1)
    if __tid__ < __out__.shape[0]:
        __out__[__tid__] = __core__({argitems})
c                   4    e Zd Zd Zd Zd Zd Zed        Zy)CUDAVectorizec                      t        j                  |dd      | j                        }||j                  |j                     j
                  j                  fS )NT)deviceinline)r   jitr   	overloadsr   	signaturereturn_type)r   sigcudevfns      r   _compile_corezCUDAVectorize._compile_core   sE    9$((3tD9$++F))#((3==IIIIr   c                 ~    | j                   j                  j                         }|j                  t        |d       |S )N__cuda____core__)r   __globals__copyupdater   )r   corefnglbls      r   _get_globalszCUDAVectorize._get_globals   s5    {{&&++-!') 	*r   c                 ,    t        j                  |      S r   r   r   r   fnobjr   s      r   _compile_kernelzCUDAVectorize._compile_kernel   s    xxr   c                 B    t        | j                  | j                        S r   )r
   rn   r   rq   s    r   build_ufunczCUDAVectorize.build_ufunc   s    "4>>4;;??r   c                     t         S r   )vectorizer_stager_sourcerq   s    r   _kernel_templatezCUDAVectorize._kernel_template   s    ''r   N)	r   r?   r@   r   r   r   r   r   r   rB   r   r   r   r      s,    J@ ( (r   r   zy
def __gufunc_{name}({args}):
    __tid__ = __cuda__.grid(1)
    if __tid__ < {checkedarg}:
        __core__({argitems})
c                   .    e Zd Zd Zd Zed        Zd Zy)CUDAGUFuncVectorizec                     t        j                  | j                  | j                        }t	        | j
                  || j                        S )N)rn   ro   r   )r   GUFuncEngineinputsig	outputsigrl   rn   r   )r   ro   s     r   r   zCUDAGUFuncVectorize.build_ufunc   s9    ))$--H#dnn+1+/;;8 	8r   c                 8     t        j                  |      |      S r   r   r   s      r   r   z#CUDAGUFuncVectorize._compile_kernel   s    txx}U##r   c                     t         S r   )_gufunc_stager_sourcerq   s    r   r   z$CUDAGUFuncVectorize._kernel_template   s    $$r   c                      t        j                  |d      | j                        }| j                  j                  j                         }|j                  t         |d       |S )NT)r   r   )r   r   r   py_funcr   r   r   )r   r   r   glblss       r   r   z CUDAGUFuncVectorize._get_globals   sP    +#d+DKK8((--/$"(* 	+r   N)r   r?   r@   r   r   r   r   r   rB   r   r   r   r      s%    8$ % %r   r   N)numbar   numpyr   r-   numba.np.ufuncr   numba.np.ufunc.deviceufuncr   r   r   objectr
   rD   rl   r   r   DeviceVectorizer   r   DeviceGUFuncVectorizer   rB   r   r   <module>r      s     # &9 9H& HV!9? !9HM+ M2-M -M` (K// (2 +;; r   