
import numpy as np
import os
import sys
import ctypes
import functools

from numba.core import config, serialize, sigutils, types, typing, utils
from numba.core.caching import Cache, CacheImpl
from numba.core.compiler_lock import global_compiler_lock
from numba.core.dispatcher import Dispatcher
from numba.core.errors import NumbaPerformanceWarning
from numba.core.typing.typeof import Purpose, typeof

from numba.cuda.api import get_current_device
from numba.cuda.args import wrap_arg
from numba.cuda.compiler import compile_cuda, CUDACompiler
from numba.cuda.cudadrv import driver
from numba.cuda.cudadrv.devices import get_context
from numba.cuda.descriptor import cuda_target
from numba.cuda.errors import (missing_launch_config_msg,
                               normalize_kernel_dimensions)
from numba.cuda import types as cuda_types
from numba import cuda
from numba import _dispatcher

from warnings import warn

cuda_fp16_math_funcs = ['hsin', 'hcos',
                        'hlog', 'hlog10',
                        'hlog2', 'hexp',
                        'hexp10', 'hexp2',
                        'hsqrt', 'hrsqrt',
                        'hfloor', 'hceil',
                        'hrcp', 'hrint',
                        'htrunc', 'hdiv']


class _Kernel(serialize.ReduceMixin):
    '''
    CUDA Kernel specialized for a given set of argument types. When called, this
    object launches the kernel on the device.
    '''

    @global_compiler_lock
    def __init__(self, py_func, argtypes, link=None, debug=False,
                 lineinfo=False, inline=False, fastmath=False, extensions=None,
                 max_registers=None, opt=True, device=False):
        if device:
            raise RuntimeError('Cannot compile a device function as a kernel')

        super().__init__()

        # The dispatcher machinery expects overloads to look like compile
        # results, so mirror the attributes it relies on.
        self.objectmode = False
        self.entry_point = None

        self.py_func = py_func
        self.argtypes = argtypes
        self.debug = debug
        self.lineinfo = lineinfo
        self.extensions = extensions or []

        nvvm_options = {
            'fastmath': fastmath,
            'opt': 3 if opt else 0
        }

        cc = get_current_device().compute_capability
        cres = compile_cuda(self.py_func, types.void, self.argtypes,
                            debug=self.debug,
                            lineinfo=lineinfo,
                            inline=inline,
                            fastmath=fastmath,
                            nvvm_options=nvvm_options,
                            cc=cc)
        tgt_ctx = cres.target_context
        code = self.py_func.__code__
        filename = code.co_filename
        linenum = code.co_firstlineno
        lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
                                                  debug, lineinfo, nvvm_options,
                                                  filename, linenum,
                                                  max_registers)

        if not link:
            link = []

        # A kernel needs a cooperative launch if grid synchronization is used,
        # which in turn requires linking against cudadevrt.
        self.cooperative = 'cudaCGGetIntrinsicHandle' in lib.get_asm_str()
        if self.cooperative:
            lib.needs_cudadevrt = True

        # Link in the fp16 wrapper functions if any of them are referenced.
        res = [fn for fn in cuda_fp16_math_funcs
               if f'__numba_wrapper_{fn}' in lib.get_asm_str()]

        if res:
            basedir = os.path.dirname(os.path.abspath(__file__))
            functions_cu_path = os.path.join(basedir,
                                             'cpp_function_wrappers.cu')
            link.append(functions_cu_path)

        for filepath in link:
            lib.add_linking_file(filepath)

        # Populate members.
        self.entry_name = kernel.name
        self.signature = cres.signature
        self._type_annotation = cres.type_annotation
        self._codelibrary = lib
        self.call_helper = cres.call_helper
        self.environment = cres.environment

        # CUDA kernels have no referenced environments, lifted code or parfors
        # reload_init entries, but the caching machinery expects these
        # attributes to exist.
        self._referenced_environments = []
        self.lifted = []
        self.reload_init = []

    @property
    def library(self):
        return self._codelibrary

    @property
    def type_annotation(self):
        return self._type_annotation

    def _find_referenced_environments(self):
        return self._referenced_environments

    @property
    def codegen(self):
        return self._codelibrary.codegen

    @property
    def argument_types(self):
        return tuple(self.signature.args)

    @classmethod
    def _rebuild(cls, cooperative, name, signature, codelibrary, debug,
                 lineinfo, call_helper, extensions):
        """
        Rebuild an instance.
        """
        instance = cls.__new__(cls)
        # Invoke the parent constructor
        super(cls, instance).__init__()
        # Populate members
        instance.entry_point = None
        instance.cooperative = cooperative
        instance.entry_name = name
        instance.signature = signature
        instance._type_annotation = None
        instance._codelibrary = codelibrary
        instance.debug = debug
        instance.lineinfo = lineinfo
        instance.call_helper = call_helper
        instance.extensions = extensions
        return instance

    def _reduce_states(self):
        """
        Reduce the instance for serialization.
        Compiled definitions are serialized in PTX form.
        Type annotations are discarded.
        Thread, block and shared memory configuration are serialized.
        Stream information is discarded.
        """
        return dict(cooperative=self.cooperative, name=self.entry_name,
                    signature=self.signature, codelibrary=self._codelibrary,
                    debug=self.debug, lineinfo=self.lineinfo,
                    call_helper=self.call_helper, extensions=self.extensions)
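
    # Illustrative note: the reduced state above is what the cache
    # implementation defined later in this module (CUDACacheImpl.reduce)
    # persists to disk when caching is enabled. A minimal sketch, assuming a
    # CUDA-capable device and the public decorator API:
    #
    #     from numba import cuda
    #
    #     @cuda.jit(cache=True)
    #     def axpy(r, a, x, y):
    #         i = cuda.grid(1)
    #         if i < r.size:
    #             r[i] = a * x[i] + y[i]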

    def bind(self):
        """
        Force binding to current CUDA context
        """
        self._codelibrary.get_cufunc()

    @property
    def regs_per_thread(self):
        """
        The number of registers used by each thread for this kernel.
        """
        return self._codelibrary.get_cufunc().attrs.regs

    @property
    def const_mem_size(self):
        """
        The amount of constant memory used by this kernel.
        """
        return self._codelibrary.get_cufunc().attrs.const

    @property
    def shared_mem_per_block(self):
        """
        The amount of shared memory used per block for this kernel.
        """
        return self._codelibrary.get_cufunc().attrs.shared

    @property
    def max_threads_per_block(self):
        """
        The maximum allowable threads per block.
        """
        return self._codelibrary.get_cufunc().attrs.maxthreads

    @property
    def local_mem_per_thread(self):
        """
        The amount of local memory used per thread for this kernel.
        """
        return self._codelibrary.get_cufunc().attrs.local
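
    # These per-kernel attributes are usually reached through the
    # CUDADispatcher helpers defined later in this module
    # (get_regs_per_thread() and friends). Sketch, assuming a compiled
    # kernel ``zero``:
    #
    #     @cuda.jit("void(float32[::1])")
    #     def zero(x):
    #         i = cuda.grid(1)
    #         if i < x.size:
    #             x[i] = 0.0
    #
    #     nregs = zero.get_regs_per_thread()   # per-signature register usage
    #     shmem = zero.get_shared_mem_per_block()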

    def inspect_llvm(self):
        """
        Returns the LLVM IR for this kernel.
        """
        return self._codelibrary.get_llvm_str()

    def inspect_asm(self, cc):
        """
        Returns the PTX code for this kernel.
        """
        return self._codelibrary.get_asm_str(cc=cc)

    def inspect_sass_cfg(self):
        """
        Returns the CFG of the SASS for this kernel.

        Requires nvdisasm to be available on the PATH.
        """
        return self._codelibrary.get_sass_cfg()

    def inspect_sass(self):
        """
        Returns the SASS code for this kernel.

        Requires nvdisasm to be available on the PATH.
        """
        return self._codelibrary.get_sass()
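
    # The helpers above expose the LLVM IR, PTX and (via nvdisasm) SASS of a
    # single specialization; user code normally calls the dispatcher-level
    # methods of the same names defined further down. Sketch, assuming an
    # already-compiled kernel ``axpy``:
    #
    #     ptx = axpy.inspect_asm()       # dict keyed by argument types
    #     llvm_ir = axpy.inspect_llvm()
    #     axpy.inspect_types()           # annotated dump to sys.stdout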

    def inspect_types(self, file=None):
        """
        Produce a dump of the Python source of this function annotated with the
        corresponding Numba IR and type information. The dump is written to
        *file*, or *sys.stdout* if *file* is *None*.
        """
        if self._type_annotation is None:
            raise ValueError("Type annotation is not available")

        if file is None:
            file = sys.stdout

        print("%s %s" % (self.entry_name, self.argument_types), file=file)
        print('-' * 80, file=file)
        print(self._type_annotation, file=file)
        print('=' * 80, file=file)

    def max_cooperative_grid_blocks(self, blockdim, dynsmemsize=0):
        """
        Calculates the maximum number of blocks that can be launched for this
        kernel in a cooperative grid in the current context, for the given block
        and dynamic shared memory sizes.

        :param blockdim: Block dimensions, either as a scalar for a 1D block, or
                         a tuple for 2D or 3D blocks.
        :param dynsmemsize: Dynamic shared memory size in bytes.
        :return: The maximum number of blocks in the grid.
        """
        ctx = get_context()
        cufunc = self._codelibrary.get_cufunc()

        if isinstance(blockdim, tuple):
            blockdim = functools.reduce(lambda x, y: x * y, blockdim)
        active_per_sm = ctx.get_active_blocks_per_multiprocessor(cufunc,
                                                                 blockdim,
                                                                 dynsmemsize)
        sm_count = ctx.device.MULTIPROCESSOR_COUNT
        return active_per_sm * sm_count
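
    # Sizing sketch for a cooperative launch (assumes a kernel
    # ``sequential_rows`` compiled for ``(int32[:, ::1],)`` that calls
    # cuda.cg.this_grid(); the names are placeholders):
    #
    #     overload = sequential_rows.overloads[(int32[:, ::1],)]
    #     blockdim = 256
    #     max_blocks = overload.max_cooperative_grid_blocks(blockdim)
    #     sequential_rows[max_blocks, blockdim](data)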

    def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
        # Prepare kernel
        cufunc = self._codelibrary.get_cufunc()

        if self.debug:
            excname = cufunc.name + "__errcode__"
            excmem, excsz = cufunc.module.get_global_symbol(excname)
            assert excsz == ctypes.sizeof(ctypes.c_int)
            excval = ctypes.c_int()
            excmem.memset(0, stream=stream)

        # Prepare arguments
        retr = []                       # hold functors for writeback

        kernelargs = []
        for t, v in zip(self.argument_types, args):
            self._prepare_args(t, v, stream, retr, kernelargs)

        if driver.USE_NV_BINDING:
            zero_stream = driver.binding.CUstream(0)
        else:
            zero_stream = None

        stream_handle = stream and stream.handle or zero_stream

        # Invoke kernel
        driver.launch_kernel(cufunc.handle,
                             *griddim,
                             *blockdim,
                             sharedmem,
                             stream_handle,
                             kernelargs,
                             cooperative=self.cooperative)

        if self.debug:
            driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
            if excval.value != 0:
                # An error occurred
                def load_symbol(name):
                    mem, sz = cufunc.module.get_global_symbol("%s__%s__" %
                                                              (cufunc.name,
                                                               name))
                    val = ctypes.c_int()
                    driver.device_to_host(ctypes.addressof(val), mem, sz)
                    return val.value

                tid = [load_symbol("tid" + i) for i in 'zyx']
                ctaid = [load_symbol("ctaid" + i) for i in 'zyx']
                code = excval.value
                exccls, exc_args, loc = self.call_helper.get_exception(code)
                # Prefix the exception message with the source location
                if loc is None:
                    locinfo = ''
                else:
                    sym, filepath, lineno = loc
                    filepath = os.path.abspath(filepath)
                    locinfo = 'In function %r, file %s, line %s, ' % (sym,
                                                                      filepath,
                                                                      lineno,)
                # Prefix the exception message with the thread position
                prefix = '%stid=%s ctaid=%s' % (locinfo, tid, ctaid)
                if exc_args:
                    exc_args = ('%s: %s' % (prefix, exc_args[0]),) + \
                        exc_args[1:]
                else:
                    exc_args = prefix,
                raise exccls(*exc_args)

        # Retrieve auto-converted arrays
        for wb in retr:
            wb()

    def _prepare_args(self, ty, val, stream, retr, kernelargs):
        """
        Convert arguments to ctypes and append to kernelargs
        """

        # Map the arguments using any extension you've registered
        for extension in reversed(self.extensions):
            ty, val = extension.prepare_args(ty, val, stream=stream, retr=retr)

        if isinstance(ty, types.Array):
            devary = wrap_arg(val).to_device(retr, stream)

            c_intp = ctypes.c_ssize_t

            meminfo = ctypes.c_void_p(0)
            parent = ctypes.c_void_p(0)
            nitems = c_intp(devary.size)
            itemsize = c_intp(devary.dtype.itemsize)

            ptr = driver.device_pointer(devary)

            if driver.USE_NV_BINDING:
                ptr = int(ptr)

            data = ctypes.c_void_p(ptr)

            kernelargs.append(meminfo)
            kernelargs.append(parent)
            kernelargs.append(nitems)
            kernelargs.append(itemsize)
            kernelargs.append(data)
            for ax in range(devary.ndim):
                kernelargs.append(c_intp(devary.shape[ax]))
            for ax in range(devary.ndim):
                kernelargs.append(c_intp(devary.strides[ax]))

        elif isinstance(ty, types.Integer):
            cval = getattr(ctypes, "c_%s" % ty)(val)
            kernelargs.append(cval)

        elif ty == types.float16:
            cval = ctypes.c_uint16(np.float16(val).view(np.uint16))
            kernelargs.append(cval)

        elif ty == types.float64:
            cval = ctypes.c_double(val)
            kernelargs.append(cval)

        elif ty == types.float32:
            cval = ctypes.c_float(val)
            kernelargs.append(cval)

        elif ty == types.boolean:
            cval = ctypes.c_uint8(int(val))
            kernelargs.append(cval)

        elif ty == types.complex64:
            kernelargs.append(ctypes.c_float(val.real))
            kernelargs.append(ctypes.c_float(val.imag))

        elif ty == types.complex128:
            kernelargs.append(ctypes.c_double(val.real))
            kernelargs.append(ctypes.c_double(val.imag))

        elif isinstance(ty, (types.NPDatetime, types.NPTimedelta)):
            kernelargs.append(ctypes.c_int64(val.view(np.int64)))

        elif isinstance(ty, types.Record):
            devrec = wrap_arg(val).to_device(retr, stream)
            ptr = devrec.device_ctypes_pointer
            if driver.USE_NV_BINDING:
                ptr = ctypes.c_void_p(int(ptr))
            kernelargs.append(ptr)

        elif isinstance(ty, types.BaseTuple):
            assert len(ty) == len(val)
            for t, v in zip(ty, val):
                self._prepare_args(t, v, stream, retr, kernelargs)

        elif isinstance(ty, types.EnumMember):
            try:
                self._prepare_args(ty.dtype, val.value, stream, retr,
                                   kernelargs)
            except NotImplementedError:
                raise NotImplementedError(ty, val)

        else:
            raise NotImplementedError(ty, val)


class ForAll(object):
    def __init__(self, dispatcher, ntasks, tpb, stream, sharedmem):
        if ntasks < 0:
            raise ValueError("Can't create ForAll with negative task count: %s"
                             % ntasks)
        self.dispatcher = dispatcher
        self.ntasks = ntasks
        self.thread_per_block = tpb
        self.stream = stream
        self.sharedmem = sharedmem

    def __call__(self, *args):
        if self.ntasks == 0:
            return

        if self.dispatcher.specialized:
            specialized = self.dispatcher
        else:
            specialized = self.dispatcher.specialize(*args)
        blockdim = self._compute_thread_per_block(specialized)
        griddim = (self.ntasks + blockdim - 1) // blockdim

        return specialized[griddim, blockdim, self.stream,
                           self.sharedmem](*args)

    def _compute_thread_per_block(self, dispatcher):
        tpb = self.thread_per_block
        # Prefer a user-specified config
        if tpb != 0:
            return tpb
        # Else, ask the driver to give a good config
        else:
            ctx = get_context()
            # The dispatcher is specialized, so there is only one definition -
            # get it so we can get the cufunc from the code library
            kernel = next(iter(dispatcher.overloads.values()))
            kwargs = dict(
                func=kernel._codelibrary.get_cufunc(),
                b2d_func=0,     # dynamic-shared memory is constant to blksz
                memsize=self.sharedmem,
                blocksizelimit=1024,
            )
            tpb, _ = ctx.get_max_potential_block_size(**kwargs)
            return tpb


class _LaunchConfiguration:
    def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
        self.dispatcher = dispatcher
        self.griddim = griddim
        self.blockdim = blockdim
        self.stream = stream
        self.sharedmem = sharedmem

        if config.CUDA_LOW_OCCUPANCY_WARNINGS:
            # Warn when the grid is small enough that the GPU is likely to be
            # under-utilized.
            min_grid_size = 128
            grid_size = griddim[0] * griddim[1] * griddim[2]
            if grid_size < min_grid_size:
                msg = (f"Grid size {grid_size} will likely result in GPU "
                       "under-utilization due to low occupancy.")
                warn(NumbaPerformanceWarning(msg))

    def __call__(self, *args):
        return self.dispatcher.call(args, self.griddim, self.blockdim,
                                    self.stream, self.sharedmem)


class CUDACacheImpl(CacheImpl):
    def reduce(self, kernel):
        return kernel._reduce_states()

    def rebuild(self, target_context, payload):
        return _Kernel._rebuild(**payload)

    def check_cachable(self, cres):
        # CUDA kernels are always cachable - the reasons a piece of code may
        # not be cachable (lifted loops, dynamic globals) do not apply to them.
        return True


class CUDACache(Cache):
    """
    Implements a cache that saves and loads CUDA kernels and compile results.
    """
    _impl_class = CUDACacheImpl

    def load_overload(self, sig, target_context):
        # Loading an overload refreshes the context to ensure it is
        # initialized. To initialize the correct (i.e. CUDA) target, we need
        # to enforce that the current target is the CUDA target.
        from numba.core.target_extension import target_override
        with target_override('cuda'):
            return super().load_overload(sig, target_context)


class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
    '''
    CUDA Dispatcher object. When configured and called, the dispatcher will
    specialize itself for the given arguments (if no suitable specialized
    version already exists) & compute capability, and launch on the device
    associated with the current context.

    Dispatcher objects are not to be constructed by the user, but instead are
    created using the :func:`numba.cuda.jit` decorator.
    '''

    # Whether to fold named arguments and default values
    _fold_args = False

    targetdescr = cuda_target

    def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler):
        super().__init__(py_func, targetoptions=targetoptions,
                         pipeline_class=pipeline_class)

        # A specialized CUDADispatcher is one compiled for exactly one set of
        # argument types; it bypasses some argument type checking for faster
        # kernel launches.

        # Whether this dispatcher has been specialized
        self._specialized = False

        # The specializations generated so far, keyed by compute capability
        # and argument types
        self.specializations = {}

    @property
    def _numba_type_(self):
        return cuda_types.CUDADispatcher(self)

    def enable_caching(self):
        self._cache = CUDACache(self.py_func)

    @functools.lru_cache(maxsize=128)
    def configure(self, griddim, blockdim, stream=0, sharedmem=0):
        griddim, blockdim = normalize_kernel_dimensions(griddim, blockdim)
        return _LaunchConfiguration(self, griddim, blockdim, stream, sharedmem)

    def __getitem__(self, args):
        if len(args) not in [2, 3, 4]:
            raise ValueError('must specify at least the griddim and blockdim')
        return self.configure(*args)

    def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):
        """Returns a 1D-configured dispatcher for a given number of tasks.

        This assumes that:

        - the kernel maps the Global Thread ID ``cuda.grid(1)`` to tasks on a
          1-1 basis.
        - the kernel checks that the Global Thread ID is upper-bounded by
          ``ntasks``, and does nothing if it is not.

        :param ntasks: The number of tasks.
        :param tpb: The size of a block. An appropriate value is chosen if this
                    parameter is not supplied.
        :param stream: The stream on which the configured dispatcher will be
                       launched.
        :param sharedmem: The number of bytes of dynamic shared memory required
                          by the kernel.
        :return: A configured dispatcher, ready to launch on a set of
                 arguments."""
        return ForAll(self, ntasks, tpb=tpb, stream=stream,
                      sharedmem=sharedmem)

    @property
    def extensions(self):
        '''
        A list of objects that must have a `prepare_args` function. When a
        specialized kernel is called, each argument will be passed through
        to the `prepare_args` (from the last object in this list to the
        first). The arguments to `prepare_args` are:

        - `ty` the numba type of the argument
        - `val` the argument value itself
        - `stream` the CUDA stream used for the current call to the kernel
        - `retr` a list of zero-arg functions that you may want to append
          post-call cleanup work to.

        The `prepare_args` function must return a tuple `(ty, val)`, which
        will be passed in turn to the next right-most `extension`. After all
        the extensions have been called, the resulting `(ty, val)` will be
        passed into Numba's default argument marshalling logic.
        '''
        return self.targetoptions.get('extensions')
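
    # Sketch of an object implementing the ``prepare_args`` protocol described
    # above (hypothetical extension and user type; the extension list itself
    # comes from the dispatcher's target options):
    #
    #     class WrapMyType:
    #         def prepare_args(self, ty, val, stream, retr):
    #             if isinstance(val, MyType):   # hypothetical user type
    #                 return types.uint64, val.device_ptr
    #             return ty, val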

    def __call__(self, *args, **kwargs):
        # An attempt to launch an unconfigured kernel
        raise ValueError(missing_launch_config_msg)
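
    # Kernels must be given a launch configuration before the call; invoking
    # the bare dispatcher raises the error above. Sketch, assuming a kernel
    # ``axpy`` and arrays of length ``n``:
    #
    #     threadsperblock = 128
    #     blockspergrid = (n + threadsperblock - 1) // threadsperblock
    #     axpy[blockspergrid, threadsperblock](r, a, x, y)
    #
    #     # or let forall() pick the block size for a 1D task mapping:
    #     axpy.forall(n)(r, a, x, y)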

    def call(self, args, griddim, blockdim, stream, sharedmem):
        '''
        Compile if necessary and invoke this kernel with *args*.
        '''
        if self.specialized:
            kernel = next(iter(self.overloads.values()))
        else:
            kernel = _dispatcher.Dispatcher._cuda_call(self, *args)

        kernel.launch(args, griddim, blockdim, stream, sharedmem)

    def _compile_for_args(self, *args, **kws):
        # Based on _DispatcherBase._compile_for_args.
        assert not kws
        argtypes = [self.typeof_pyval(a) for a in args]
        return self.compile(tuple(argtypes))

    def typeof_pyval(self, val):
        # Based on _DispatcherBase.typeof_pyval, but differs from it to support
        # the CUDA Array Interface.
        try:
            return typeof(val, Purpose.argument)
        except ValueError:
            if cuda.is_cuda_array(val):
                # When typing, we don't need to synchronize on the array's
                # stream - this is done when the kernel is launched.
                return typeof(cuda.as_cuda_array(val, sync=False),
                              Purpose.argument)
            else:
                raise

    def specialize(self, *args):
        '''
        Create a new instance of this dispatcher specialized for the given
        *args*.
        '''
        cc = get_current_device().compute_capability
        argtypes = tuple(
            [self.typingctx.resolve_argument_type(a) for a in args])
        if self.specialized:
            raise RuntimeError('Dispatcher already specialized')

        specialization = self.specializations.get((cc, argtypes))
        if specialization:
            return specialization

        targetoptions = self.targetoptions
        specialization = CUDADispatcher(self.py_func,
                                        targetoptions=targetoptions)
        specialization.compile(argtypes)
        specialization.disable_compile()
        specialization._specialized = True
        self.specializations[cc, argtypes] = specialization
        return specialization

    @property
    def specialized(self):
        """
        True if the Dispatcher has been specialized.
        """
        return self._specialized

    def get_regs_per_thread(self, signature=None):
        '''
        Returns the number of registers used by each thread in this kernel for
        the device in the current context.

        :param signature: The signature of the compiled kernel to get register
                          usage for. This may be omitted for a specialized
                          kernel.
        :return: The number of registers used by the compiled variant of the
                 kernel for the given signature and current device.
        '''
        if signature is not None:
            return self.overloads[signature.args].regs_per_thread
        if self.specialized:
            return next(iter(self.overloads.values())).regs_per_thread
        else:
            return {sig: overload.regs_per_thread
                    for sig, overload in self.overloads.items()}

    def get_const_mem_size(self, signature=None):
        '''
        Returns the size in bytes of constant memory used by this kernel for
        the device in the current context.

        :param signature: The signature of the compiled kernel to get constant
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The size in bytes of constant memory allocated by the
                 compiled variant of the kernel for the given signature and
                 current device.
        '''
        if signature is not None:
            return self.overloads[signature.args].const_mem_size
        if self.specialized:
            return next(iter(self.overloads.values())).const_mem_size
        else:
            return {sig: overload.const_mem_size
                    for sig, overload in self.overloads.items()}

    def get_shared_mem_per_block(self, signature=None):
        '''
        Returns the size in bytes of statically allocated shared memory
        for this kernel.

        :param signature: The signature of the compiled kernel to get shared
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The amount of shared memory allocated by the compiled variant
                 of the kernel for the given signature and current device.
        '''
        if signature is not None:
            return self.overloads[signature.args].shared_mem_per_block
        if self.specialized:
            return next(iter(self.overloads.values())).shared_mem_per_block
        else:
            return {sig: overload.shared_mem_per_block
                    for sig, overload in self.overloads.items()}

    def get_max_threads_per_block(self, signature=None):
        '''
        Returns the maximum allowable number of threads per block
        for this kernel. Exceeding this threshold will result in
        the kernel failing to launch.

        :param signature: The signature of the compiled kernel to get the max
                          threads per block for. This may be omitted for a
                          specialized kernel.
        :return: The maximum allowable threads per block for the compiled
                 variant of the kernel for the given signature and current
                 device.
        '''
        if signature is not None:
            return self.overloads[signature.args].max_threads_per_block
        if self.specialized:
            return next(iter(self.overloads.values())).max_threads_per_block
        else:
            return {sig: overload.max_threads_per_block
                    for sig, overload in self.overloads.items()}

    def get_local_mem_per_thread(self, signature=None):
        '''
        Returns the size in bytes of local memory per thread
        for this kernel.

        :param signature: The signature of the compiled kernel to get local
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The amount of local memory allocated by the compiled variant
                 of the kernel for the given signature and current device.
        '''
        if signature is not None:
            return self.overloads[signature.args].local_mem_per_thread
        if self.specialized:
            return next(iter(self.overloads.values())).local_mem_per_thread
        else:
            return {sig: overload.local_mem_per_thread
                    for sig, overload in self.overloads.items()}

    def get_call_template(self, args, kws):
        # Originally based on _DispatcherBase.get_call_template. This version
        # deviates slightly from it in order to force casts when calling
        # device functions.
        """
        Get a typing.ConcreteTemplate for this dispatcher and the given
        *args* and *kws* types.  This allows resolution of the return type.

        A (template, pysig, args, kws) tuple is returned.
        """
        # Ensure an exactly-matching overload is available if we can compile.
        # We proceed with the typing even if we can't compile because we may
        # be able to force a cast on the caller side.
        if self._can_compile:
            self.compile_device(tuple(args))

        # Create function type for typing
        func_name = self.py_func.__name__
        name = "CallTemplate({0})".format(func_name)

        call_template = typing.make_concrete_template(
            name, key=func_name, signatures=self.nopython_signatures)
        pysig = utils.pysignature(self.py_func)

        return call_template, pysig, args, kws

    def compile_device(self, args, return_type=None):
        """Compile the device function for the given argument types.

        Each signature is compiled once by caching the compiled function inside
        this object.

        Returns the `CompileResult`.
        """
        if args not in self.overloads:
            with self._compiling_counter:
                debug = self.targetoptions.get('debug')
                lineinfo = self.targetoptions.get('lineinfo')
                inline = self.targetoptions.get('inline')
                fastmath = self.targetoptions.get('fastmath')

                nvvm_options = {
                    'opt': 3 if self.targetoptions.get('opt') else 0,
                    'fastmath': fastmath
                }

                cc = get_current_device().compute_capability
                cres = compile_cuda(self.py_func, return_type, args,
                                    debug=debug,
                                    lineinfo=lineinfo,
                                    inline=inline,
                                    fastmath=fastmath,
                                    nvvm_options=nvvm_options,
                                    cc=cc)
                self.overloads[args] = cres

                cres.target_context.insert_user_function(cres.entry_point,
                                                         cres.fndesc,
                                                         [cres.library])
        else:
            cres = self.overloads[args]

        return cres

    def add_overload(self, kernel, argtypes):
        c_sig = [a._code for a in argtypes]
        self._insert(c_sig, kernel, cuda=True)
        self.overloads[argtypes] = kernel

    def compile(self, sig):
        '''
        Compile and bind to the current context a version of this kernel
        specialized for the given signature.
        '''
        argtypes, return_type = sigutils.normalize_signature(sig)
        assert return_type is None or return_type == types.none

        # Do we already have an in-memory compiled kernel?
        if self.specialized:
            return next(iter(self.overloads.values()))
        else:
            kernel = self.overloads.get(argtypes)
        if kernel is not None:
            return kernel

        # Can we load from the disk cache?
        kernel = self._cache.load_overload(sig, self.targetctx)

        if kernel is not None:
            self._cache_hits[sig] += 1
        else:
            self._cache_misses[sig] += 1
            if not self._can_compile:
                raise RuntimeError("Compilation disabled")

            kernel = _Kernel(self.py_func, argtypes, **self.targetoptions)
            # We call bind to force codegen, so that there is a cubin to cache
            kernel.bind()
            self._cache.save_overload(sig, kernel)

        self.add_overload(kernel, argtypes)

        return kernel

    def inspect_llvm(self, signature=None):
        '''
        Return the LLVM IR for this kernel.

        :param signature: A tuple of argument types.
        :return: The LLVM IR for the given signature, or a dict of LLVM IR
                 for all previously-encountered signatures.
        '''
        device = self.targetoptions.get('device')
        if signature is not None:
            if device:
                return self.overloads[signature].library.get_llvm_str()
            else:
                return self.overloads[signature].inspect_llvm()
        else:
            if device:
                return {sig: overload.library.get_llvm_str()
                        for sig, overload in self.overloads.items()}
            else:
                return {sig: overload.inspect_llvm()
                        for sig, overload in self.overloads.items()}

    def inspect_asm(self, signature=None):
        '''
        Return this kernel's PTX assembly code for the device in the
        current context.

        :param signature: A tuple of argument types.
        :return: The PTX code for the given signature, or a dict of PTX codes
                 for all previously-encountered signatures.
        '''
        cc = get_current_device().compute_capability
        device = self.targetoptions.get('device')
        if signature is not None:
            if device:
                return self.overloads[signature].library.get_asm_str(cc)
            else:
                return self.overloads[signature].inspect_asm(cc)
        else:
            if device:
                return {sig: overload.library.get_asm_str(cc)
                        for sig, overload in self.overloads.items()}
            else:
                return {sig: overload.inspect_asm(cc)
                        for sig, overload in self.overloads.items()}

    def inspect_sass_cfg(self, signature=None):
        '''
        Return this kernel's CFG for the device in the current context.

        :param signature: A tuple of argument types.
        :return: The CFG for the given signature, or a dict of CFGs
                 for all previously-encountered signatures.

        The CFG for the device in the current context is returned.

        Requires nvdisasm to be available on the PATH.
        '''
        if self.targetoptions.get('device'):
            raise RuntimeError('Cannot get the CFG of a device function')

        if signature is not None:
            return self.overloads[signature].inspect_sass_cfg()
        else:
            return {sig: defn.inspect_sass_cfg()
                    for sig, defn in self.overloads.items()}

    def inspect_sass(self, signature=None):
        '''
        Return this kernel's SASS assembly code for the device in the
        current context.

        :param signature: A tuple of argument types.
        :return: The SASS code for the given signature, or a dict of SASS codes
                 for all previously-encountered signatures.

        SASS for the device in the current context is returned.

        Requires nvdisasm to be available on the PATH.
        '''
        if self.targetoptions.get('device'):
            raise RuntimeError('Cannot inspect SASS of a device function')

        if signature is not None:
            return self.overloads[signature].inspect_sass()
        else:
            return {sig: defn.inspect_sass()
                    for sig, defn in self.overloads.items()}

    def inspect_types(self, file=None):
        '''
        Produce a dump of the Python source of this function annotated with the
        corresponding Numba IR and type information. The dump is written to
        *file*, or *sys.stdout* if *file* is *None*.
        '''
        if file is None:
            file = sys.stdout

        for _, defn in self.overloads.items():
            defn.inspect_types(file=file)

    @classmethod
    def _rebuild(cls, py_func, targetoptions):
        """
        Rebuild an instance.
        """
        instance = cls(py_func, targetoptions)
        return instance

    def _reduce_states(self):
        """
        Reduce the instance for serialization.
        Compiled definitions are discarded.
        """
        return dict(py_func=self.py_func, targetoptions=self.targetoptions)