
    xKg+^                     j   d Z ddlZddlZddlZddlZddlmZmZmZm	Z	m
Z
mZmZ ddlZddlmZ ddlmZmZmZ ddlmZmZmZ ddlmZmZ  ej6                  e      ZdZdZd	Z d
Z!dZ"eZ#eZ$djK                         Z& e'e&      D ]  \  Z(Z) e*ejV                  e   e)e(        dZ,dZ-d Z. ej^                         Z0 G d de1      Z2 G d de1      Z3dZ4dddddddddddddZ5d Z6d Z7d Z8d Z9dZ: G d d e1      Z;d!Z<d"Z=d#Z>d$Z?d%Z@d& ZAd' ZBd( ZCd) ZDd* ZEd+ ZFd, ZG ej                  d-      ZId. ZJd/ ZKd0 ZLy)1z(
This is a direct translation of nvvm.h
    N)c_void_pc_intPOINTERc_char_pc_size_tbyrefc_char)ir   )	NvvmErrorNvvmSupportErrorNvvmWarning)get_libdeviceopen_libdeviceopen_cudalib)cgutilsconfig         a  
NVVM_SUCCESS
NVVM_ERROR_OUT_OF_MEMORY
NVVM_ERROR_PROGRAM_CREATION_FAILURE
NVVM_ERROR_IR_VERSION_MISMATCH
NVVM_ERROR_INVALID_INPUT
NVVM_ERROR_INVALID_PROGRAM
NVVM_ERROR_INVALID_IR
NVVM_ERROR_INVALID_OPTION
NVVM_ERROR_NO_MODULE_IN_PROGRAM
NVVM_ERROR_COMPILATION
ze-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64ze-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64c                  8    	 t                y# t        $ r Y yw xY w)z(
    Return if libNVVM is available
    TF)NVVMr        [/home/alanp/www/video.onchill/myenv/lib/python3.12/site-packages/numba/cuda/cudadrv/nvvm.pyis_availabler   <   s%       s   
 	c                   V   e Zd ZdZe ee       ee      fe ee      fe ee      feeee	efeeee	efeee ee      fee ee	      feeefee ee	      feeefe ee       ee       ee       ee      feee ee      fdZ
dZd Zd Zed        Zed        Zd Zd	 Zdd
Zy)r   zProcess-wide singleton.
    )nvvmVersionnvvmCreateProgramnvvmDestroyProgramnvvmAddModuleToProgramnvvmLazyAddModuleToProgramnvvmCompileProgramnvvmGetCompiledResultSizenvvmGetCompiledResultnvvmGetProgramLogSizenvvmGetProgramLognvvmIRVersionnvvmVerifyProgramNc                    t         5  | j                  t        j                  |       x| _        }	 t	        d      |_        |j                  j                         D ]<  \  }}t        |j
                  |      }|d   |_        |dd  |_        t        |||       > d d d        | j                  S # t        $ r}d | _        d}t        ||z        d }~ww xY w# 1 sw Y   | j                  S xY w)Nnvvmz;libNVVM cannot be found. Do `conda install cudatoolkit`:
%sr   r   )
_nvvm_lock_NVVM__INSTANCEobject__new__r   driverOSErrorr   _PROTOTYPESitemsgetattrrestypeargtypessetattr)clsinsteerrmsgnameprotofuncs          r   r/   zNVVM.__new__   s    ~~%(.s(;;7".v"6DK $(#3#3#9#9#;KD%"4;;5D#(8DL$)!"IDMD$-	 $< $ ~~  7%)CN2F*6A:66	7 $ ~~s/   )CB.AC.	C7CCCC*c                     | j                         }|d   | _        |d   | _        |d   | _        |d   | _        t               | _        y )Nr   r      r   )get_ir_version_majorIR_minorIR	_majorDbg	_minorDbgget_supported_ccs_supported_ccs)selfir_versionss     r   __init__zNVVM.__init__   sG    ))+#A#A$Q$Q/1r   c                 P    | j                   | j                  fdk  rt        S t        S )N)r      )rB   rC   _datalayout_original_datalayout_i128rH   s    r   data_layoutzNVVM.data_layout   s#    MM4==)F2''##r   c                     | j                   S N)rG   rO   s    r   supported_ccszNVVM.supported_ccs   s    """r   c                     t               }t               }| j                  t        |      t        |            }| j                  |d       |j                  |j                  fS )NzFailed to get version.)r   r   r   check_errorvalue)rH   majorminorerrs       r   get_versionzNVVM.get_version   sN    uU|U5\:67{{EKK''r   c                 B   t               }t               }t               }t               }| j                  t        |      t        |      t        |      t        |            }| j                  |d       |j                  |j                  |j                  |j                  fS )NzFailed to get IR version.)r   r(   r   rU   rV   )rH   majorIRminorIRmajorDbgminorDbgrY   s         r   rA   zNVVM.get_ir_version   sx    ''77  ww!&x%/C9:}}gmmX^^X^^KKr   c                 x    |r8t        |t        |         }|r!t        |       t        j                  d       y |y )Nr   )r   RESULT_CODE_NAMESprintsysexit)rH   errormsgrd   excs        r   rU   zNVVM.check_error   s6    C!25!9:Cc
	 r   )F)__name__
__module____qualname____doc__nvvm_resultr   r   nvvm_programr   r   r2   r-   r/   rJ   propertyrP   rS   rZ   rA   rU   r   r   r   r   r   K   s<   
 $WU^WU^D *7<+@A  +GL,AB
 x8#E x8'E ugh.?A wx'8&: #.|X!F #.|WX=N!O *<B &wu~wu~!%.'%.: *<%h/1c3Kl J*2 $ $ # #(Lr   r   c                   6    e Zd Zd Zd Zd Zd Zd Zd Zd Z	y)	CompilationUnitc                     t               | _        t               | _        | j                  j	                  t        | j                              }| j                  j                  |d       y )NzFailed to create CU)r   r0   rm   _handler   r   rU   )rH   rY   s     r   rJ   zCompilationUnit.__init__   sF    f#~kk++E$,,,?@%:;r   c                     t               }|j                  t        | j                              }|j	                  |dd       y )NzFailed to destroy CUT)rd   )r   r    r   rr   rU   )rH   r0   rY   s      r   __del__zCompilationUnit.__del__   s8    ''dll(;<3 6TBr   c                     | j                   j                  | j                  |t        |      d      }| j                   j	                  |d       y)z
         Add a module level NVVM IR to a compilation unit.
         - The buffer should contain an NVVM module IR either in the bitcode
           representation (LLVM3.0) or in the text representation.
        NFailed to add module)r0   r!   rr   lenrU   rH   bufferrY   s      r   
add_modulezCompilationUnit.add_module   s?     kk00v14VdD%;<r   c                     | j                   j                  | j                  |t        |      d      }| j                   j	                  |d       y)z
        Lazily add an NVVM IR module to a compilation unit.
        The buffer should contain NVVM module IR either in the bitcode
        representation or in the text representation.
        Nrv   )r0   r"   rr   rw   rU   rx   s      r   lazy_add_modulezCompilationUnit.lazy_add_module   s?     kk44T\\658[$H%;<r   c           
         d }|j                         D cg c]  \  }} |||       }}}t        t        |      z  |D cg c]  }t        |j                  d             c} }| j                  j                  | j                  t        |      |      }| j                  |d       | j                  j                  | j                  t        |      |      }| j                  |d       t               }| j                  j                  | j                  t        |            }| j                  |d       t        |j                  z         }	| j                  j                  | j                  |	      }| j                  |d       | j                         | _        | j                   r%t#        j$                  | j                   t&               |	dd S c c}}w c c}w )	a  Perform Compilation.

        Compilation options are accepted as keyword arguments, with the
        following considerations:

        - Underscores (`_`) in option names are converted to dashes (`-`), to
          match NVVM's option name format.
        - Options that take a value will be emitted in the form
          "-<name>=<value>".
        - Booleans passed as option values will be converted to integers.
        - Options which take no value (such as `-gen-lto`) should have a value
          of `None` passed in and will be emitted in the form "-<name>".

        For documentation on NVVM compilation options, see the CUDA Toolkit
        Documentation:

        https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc
        c                 z    | j                  dd      } |d|  S t        |t              rt        |      }d|  d| S )N_-=)replace
isinstanceboolint)kvs     r   stringify_optionz1CompilationUnit.compile.<locals>.stringify_option   sG    		#s#Ay1#w!T"Fqc1#;r   utf8zFailed to verify
zFailed to compile
z&Failed to get size of compiled result.zFailed to get compiled result.)categoryN)r3   r   rw   encoder0   r)   rr   
_try_errorr#   r   r$   r   r	   rV   r%   get_loglogwarningswarnr   )
rH   optionsr   r   r   xc_optsrY   reslenoutput_buffers
             r   compilezCompilationUnit.compile   s|   (		 7>mmoFoda#Aq)oFS\)6=-?6= .6ahhv6F-G6=-? @ kk++DLL#g,O12 kk,,T\\3w<P23 kk33DLL%-PEF&,,.1kk//mL=> <<>88MM$(([9Q7 G-?s   G!Gc                 b    | j                   j                  ||d| j                                y )N
)r0   rU   r   )rH   rY   rf   s      r   r   zCompilationUnit._try_error%  s!    dlln%EFr   c                    t               }| j                  j                  | j                  t	        |            }| j                  j                  |d       |j                  dkD  rtt        |j                  z         }| j                  j                  | j                  |      }| j                  j                  |d       |j                  j                  d      S y)Nz#Failed to get compilation log size.r   zFailed to get compilation log.r    )
r   r0   r&   rr   r   rU   rV   r	   r'   decode)rH   r   rY   logbufs       r   r   zCompilationUnit.get_log(  s    kk//eFmL%JK<<!v||+.F++//fECKK##C)IJ<<&&v..r   N)
rh   ri   rj   rJ   rt   rz   r|   r   r   r   r   r   r   rp   rp      s(    <C
==: xGr   rp   )r   r   )r      r   r   )r   r@   )r   r   )   r   )r   r   )r   r@   )r   r   )r   r@   )r   r   )rL   r   rL   r   rL   r   )rL   	   r   r   )r   r   )r   r   )r   r   )r   r   ))   r@   )r   r   )r   r   )r   r   )r   r   )r   r   )r   rL   )   r   )r   r   )r   r@   )r   r   )r   r   c                    	 t         |    \  }}t        t        D cg c]  }||cxk  r|k  rn n| c}      S c c}w # t        $ r9 t        t        D cg c]  }|t        j
                  k\  r| nc c}w c}      cY S w xY wrR   )CTK_SUPPORTEDtupleCOMPUTE_CAPABILITIESKeyErrorr   CUDA_DEFAULT_PTX_CC)ctk_versionmin_ccmax_ccccs       r   ccs_supported_by_ctkr   Q  s    	<&{3#7 1#7R2// #7 1 2 	2 1 < #7 ;#7Rv999 #7 ; < 	<<s'   > 9> > B A1
0B ?B c                      	 ddl m}  | j                         }t        t              }||k  r8d}|d    d|d    }d| d|d    d|d    d}t        j                  |       |S t        |      }|S #  d}|cY S xY w)	Nr   )runtimer   .r   zCUDA Toolkit z is unsupported by Numba - z! is the minimum required version.)numba.cuda.cudadrv.runtimer   rZ   minr   r   r   r   )r   cudart_version_supported_cc
min_cudartctk_verunsupported_vers         r   rF   rF   ^  s    6 ,,. ]#J
"#A&'q):(;<*7)3N(m_Ajm_ =// 	o&(8M% s   A1 1A9c                     t               j                  }|sd}t        |      t        |      D ]5  \  }}|| k(  r|c S || kD  s|dk(  rd| |z   z  }t        |      ||dz
     c S  |d   S )z
    Given a compute capability, return the closest compute capability supported
    by the CUDA toolkit.

    :param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
    :return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
    zmNo supported GPU compute capabilities found. Please check your cudatoolkit version matches your CUDA version.r   z?GPU compute capability %d.%d is not supported(requires >=%d.%d)r   )r   rS   r   	enumerate)myccrS   rf   ir   s        r   find_closest_archr   w  s     F((MQs##=)2:I$YAv+.2Ri9&s++ %QU++ *  r   c                 p    t         j                  rt         j                  }d|z  S t        | |f      }d|z  S )z1Matches with the closest architecture option
    zcompute_%d%d)r   FORCE_CUDA_CCr   )rW   rX   archs      r   get_arch_optionr     s@     ## D   !%0D  r   aN  Missing libdevice file.
Please ensure you have a CUDA Toolkit 11.2 or higher.
For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required:

    $ conda install -c conda-forge cuda-nvcc cuda-nvrtc "cuda-version>=12.0"

For CUDA 11, ``cudatoolkit`` is required:

    $ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0"
c                       e Zd ZdZd Zd Zy)	LibDeviceNc                     | j                   (t               t        t              t	               | _         | j                   | _        y rR   )_cache_r   RuntimeErrorMISSING_LIBDEVICE_FILE_MSGr   bcrO   s    r   rJ   zLibDevice.__init__  s5    <<&"#=>>)+DL,,r   c                     | j                   S rR   )r   rO   s    r   getzLibDevice.get  s    wwr   )rh   ri   rj   r   rJ   r   r   r   r   r   r     s    Gr   r   z
    %cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
    %cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
a  
define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
    %iptr = bitcast {T}* %ptr to {Ti}*
    %old2 = load volatile {Ti}, {Ti}* %iptr
    br label %attempt

attempt:
    %old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ]
    %dold = bitcast {Ti} %old to {T}
    %dnew = {OP} {T} %dold, %val
    %new = bitcast {T} %dnew to {Ti}
    {CAS}
    %repeat = icmp ne {Ti} %cas, %old
    br i1 %repeat, label %attempt, label %done

done:
    %result = bitcast {Ti} %old to {T}
    ret {T} %result
}}
a  
define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
entry:
    %old2 = load volatile {T}, {T}* %iptr
    br label %attempt

attempt:
    %old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
    %bndchk = icmp ult {T} %old, %val
    %inc = add {T} %old, 1
    %new = select i1 %bndchk, {T} %inc, {T} 0
    {CAS}
    %repeat = icmp ne {T} %cas, %old
    br i1 %repeat, label %attempt, label %done

done:
    ret {T} %old
}}
a  
define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
entry:
    %old2 = load volatile {T}, {T}* %iptr
    br label %attempt

attempt:
    %old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
    %dec = add {T} %old, -1
    %bndchk = icmp ult {T} %dec, %val
    %new = select i1 %bndchk, {T} %dec, {T} %val
    {CAS}
    %repeat = icmp ne {T} %cas, %old
    br i1 %repeat, label %attempt, label %done

done:
    ret {T} %old
}}
a  
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
    %ptrval = load volatile {T}, {T}* %ptr
    ; Return early when:
    ; - For nanmin / nanmax when val is a NaN
    ; - For min / max when val or ptr is a NaN
    %early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val
    br i1 %early_return, label %done, label %lt_check

lt_check:
    %dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ]
    ; Continue attempts if dold less or greater than val (depending on whether min or max)
    ; or if dold is NaN (for nanmin / nanmax)
    %cmp = fcmp {OP} {T} %dold, %val
    br i1 %cmp, label %attempt, label %done

attempt:
    ; Attempt to swap in the value
    %old = bitcast {T} %dold to {Ti}
    %iptr = bitcast {T}* %ptr to {Ti}*
    %new = bitcast {T} %val to {Ti}
    {CAS}
    %dcas = bitcast {Ti} %cas to {T}
    br label %lt_check

done:
    ret {T} %ptrval
}}
c                 .    t         j                  |       S )NTi)cas_nvvmformatr   s    r   ir_casr   #  s    ??b?!!r   c           	      ^    t        | |||t        |            }t        j                  di |S )N)Tr   OPFUNCCASr   )dictr   ir_numba_atomic_binary_templater   )r   r   r   r   paramss        r   ir_numba_atomic_binaryr   '  s,    A"$F2J?F*11;F;;r   c                 b    t        | |||||t        |            }t        j                  di |S )N)r   r   NANr   
PTR_OR_VALr   r   r   )r   r   ir_numba_atomic_minmax_templater   )r   r   r   r   r   r   r   s          r   ir_numba_atomic_minmaxr   ,  s5    A"#"-F +11;F;;r   c                 D    t         j                  | |t        |             S N)r   Tur   )ir_numba_atomic_inc_templater   r   r   r   s     r   ir_numba_atomic_incr   3      '..rvay.IIr   c                 D    t         j                  | |t        |             S r   )ir_numba_atomic_dec_templater   r   r   s     r   ir_numba_atomic_decr   7  r   r   c                    dt        dddd      fdt        dd	d
d      fdt        ddd
d      fdt        dd      fdt        dd      fdt        dd	dddd      fdt        dddddd      fdt        dd	dddd      fdt        dddddd      fdt        dd	dddd      fdt        dddddd      fd t        dd	dd!dd      fd"t        dddd!dd      fd#g}|D ]  \  }}| j	                  ||      }  t        |       } | S )$NzIdeclare double @"___numba_atomic_double_add"(double* %".1", double %".2")doublei64faddadd)r   r   r   r   zEdeclare float @"___numba_atomic_float_sub"(float* %".1", float %".2")floati32fsubsubzIdeclare double @"___numba_atomic_double_sub"(double* %".1", double %".2")z=declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")u64r   z=declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")zEdeclare float @"___numba_atomic_float_max"(float* %".1", float %".2")r   znnan oltptrmax)r   r   r   r   r   r   zIdeclare double @"___numba_atomic_double_max"(double* %".1", double %".2")zEdeclare float @"___numba_atomic_float_min"(float* %".1", float %".2")znnan ogtr   zIdeclare double @"___numba_atomic_double_min"(double* %".1", double %".2")zHdeclare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")nanultzLdeclare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")zHdeclare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")ugtzLdeclare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2"))immargr   )r   r   r   r   r   llvm140_to_70_ir)llvmirreplacementsdeclfns       r   llvm_replacer  ;  s   	T	(ue	L	N	P	'eU	K	M	T	(ue	L	N	H	u	/	1	H	u	/	1	P	'e
+0u
>	? 
U	(u"+0u
>	? 
Q	'e
+0u
>	? 
U	(u"+0u
>	? 
T	'e5+-E
;	< 
X	(u%E+-E
;	< 
T	'e5+-E
;	< 
X	(u%E+-E
;	< 	G$LL !bb) ! f%FMr   c                 f   t        | t              r| g} |j                  dd      r|j                  ddddd       t	               }t               }| D ]-  }t        |      }|j                  |j                  d             / |j                  |j                                 |j                  di |S )NfastmathFT)ftzfmaprec_div	prec_sqrtr   r   )r   strpopupdaterp   r   r  rz   r   r|   r   r   )r   optscu	libdevicemods        r   
compile_irr  j  s    &#xx
E"	
 	 
	BI3
cjj()  y}}'2::r   z"^attributes #\d+ = \{ ([\w\s]+)\ }c                 r   g } | j                          D ]  }|j                  d      rmt        j                  |      }|j	                  d      j                         }dj                  d |D              }|j                  |j	                  d      |      }|j                  |        dj                  |      S )z,
    Convert LLVM 14.0 IR for LLVM 7.0.
    zattributes #r    c              3   ,   K   | ]  }|d k7  s	|  yw)
willreturnNr   ).0as     r   	<genexpr>z#llvm140_to_70_ir.<locals>.<genexpr>  s     C1l1BQs   
r   )	
splitlines
startswithre_attributes_defmatchgroupsplitjoinr   append)r
   buflinemattrss        r   r   r     s     C??>*!''-AGGAJ$$&EHHCCCE<<
E2D

4   99S>r   c                 \   | j                   }t        j                  |d      }t        j                  t        j                  d      d      }|j                  | ||f      }t        j                  |d      }|j                  |       t        j                  d      j                         }t        j                  |d      }| j                  |      }t        j                  ||d      }	d|	_        d|	_        t        j                  ||g      |	_        | j                   j#                  d	       y
)al  
    Mark a function as a CUDA kernel. Kernels have the following requirements:

    - Metadata that marks them as a kernel.
    - Addition to the @llvm.used list, so that they will not be discarded.
    - The noinline attribute is not permitted, because this causes NVVM to emit
      a warning, which counts as failing IR verification.

    Presently it is assumed that there is one kernel per module, which holds
    for Numba-jitted functions. If this changes in future or this function is
    to be used externally, this function may need modification to add to the
    @llvm.used list rather than creating it.
    kernel    r   znvvm.annotationsrL   z	llvm.used	appendingzllvm.metadatanoinlineN)moduler
   MetaDataStringConstantIntTypeadd_metadatar   get_or_insert_named_metadatar   
as_pointer	ArrayTypebitcastGlobalVariablelinkagesectioninitializer
attributesdiscard)
functionr)  mdstrmdvaluemdnmdptrtyusedtyfnptr	llvm_useds
             r   set_cuda_kernelrA    s     __F fh/Ekk"**R.!,G			hw7	8B

.
.v7I
JCGGBK JJqM$$&E\\%#FU#E!!&&+>I#I'IKK8I 
+r   c                     t        j                  d      }t               j                         D cg c]
  } ||       }}| j	                  |      }| j                  d|       yc c}w )zAdd NVVM IR version to moduler&  znvvmir.versionN)r
   r,  r   rA   r-  add_named_metadata)r  r   r   rI   md_vers        r   add_ir_versionrE    s\     **R.C#'6#8#8#:;#:a3q6#:K;k*F+V4 <s   A%)Mrk   loggingrerc   r   ctypesr   r   r   r   r   r   r	   	threadingllvmliter
   re   r   r   r   libsr   r   r   
numba.corer   r   	getLoggerrh   loggerADDRSPACE_GENERICADDRSPACE_GLOBALADDRSPACE_SHAREDADDRSPACE_CONSTANTADDRSPACE_LOCALrm   rl   r  ra   r   r   r   r7   modulesrM   rN   r   Lockr,   r.   r   rp   r   r   r   rF   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r  r   rA  rE  r   r   r   <module>rV     s    	 
      ; ; = = & 
		8	$       
EG  '(DAqCKK!1a( )
; 7 
	 Y^^
{6 {|kf k\   
<2D!	  # ,  (  (# @"<
<JJ,^. BJJDE $$,N5r   