
    xKg                     2   d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d
dlmZ d dlmZ d dlmZmZmZ d dlm Z m!Z!  e
       Z"e"jF                  Z#e"jH                  Z%e"jL                  Z&d Z' e% ejP                  e      d      d        Z) e% ejP                  e      d      d        Z* e% ejP                  e      d      d        Z+ e% ejP                  e      d      d        Z, e% ejP                  e      d      d        Z- e%e d      d        Z. e%e d      d        Z/ e%e d      d        Z0 e#ejb                  jd                  ejf                        d         Z4d a5d! Z6 e#ejn                  jp                  ejr                  ejt                        d"        Z; e#ejn                  jp                  ejx                  ejt                         e#ejn                  jp                  ejz                  ejt                        d#               Z> e#ej~                  jp                  ejr                  ejt                        d$        Z@ e#ej~                  jp                  ejx                  ejt                         e#ej~                  jp                  ejz                  ejt                        d%               ZA e#ej                        d&        ZC e#ej                        d'        ZE e#ej                        d(        ZG e#ej                        d)        ZI e#ej                  ej                        d*        ZK e#ej                  ej                  ej                  ej                  ej                  ej                         e#ej                  ej                  ej                  ej                  ej                  ej                         e#ej                  ej                  ej                  ej                  ej                  ej                         e#ej                  ej                  ej                  ej                  ej                  ej                        d+                             ZP e#ej                  ej                  ej                  ej                        d,        ZS e#ej                  ej                  ej                         e#ej                  ej                  ej                         e#ej                  ej                  ej                         e#ej                  ej                  ej                        d-                             ZU e#ej                  ej                  ej                         e#ej                  ej                  ej                         e#ej                  ej                  ej                         e#ej                  ej                  ej                        d.                             ZW e#ej                        d/        ZY e#ej                        d0        Z[ e#ej                  ejt                        d1        Z] e#ej                  ejt                  ejt                  ejt                        d2        Z_d3 Z` eej                  ej                        d4        Zc eej                  ej                        d5        Zdd6 Ze eej                  ej                        d7        Zg eej                  ej                         eejr                  ej                        d8               Zhd9 Zi eiej                  j                  d:        eiej                  d:        eiej                  d:        eiej                  j                  d;        eiej                  d;        eiej                  d;        eiej                  j                  d<        eiej                  d<        eiej                  d<        e#ej                  j                  ej                        d=        Zu e#ej                  ej                        d>        Zw e#ej                  j                  ej                        d?        Zy e#ezej                        d@        Z{ e#ej                  j                  ej                  ej                  ej                        dA        Z} e#ej                  ej                  ej                         e#ej                  ej                  ej                        dB               ZdCZdD Z  e#ej                  j                  ej                  ej                         edE               e#ej                  ej                  ej                         edE               e#ej                  j
                  ej                  ej                         edF               e#ej                  ej                  ej                         edF               e#ej                  j                  ej                  ej                         edG               e#ej                  ej                  ej                         edG               e#ej                  j                  ej                  ej                         edH               e#ej                  ej                  ej                         edH               e#ej                  j                  ej                  ej                         edI               e#ej                  ej                  ej                         edI               e#ej                  j                  ej                  ej                         edJ               e#ej                  ej                  ej                         edJ             dK Z eej                  j                   dLdH        eej                  j"                  dMdJ       ej$                  dNej&                  dOiZ e#ej*                  ej$                         e#ej*                  ej&                        dP               Z e#ej.                  ej0                        dQ        Z e#ej.                  ej4                        dR        Z e#ej8                  ejt                        dS        Z e#ej<                  ej                         e#ej<                  ej0                        dT               Z e#ej<                  ej                         e#ej<                  ej4                        dU               Z e#ejB                  ejt                  ejt                  ejt                        dV        Z e#eej                  ej                        dW        Z e#eej                  ej                         e#eej                  ej                         e#eej                  ej                        dX                      Z e#eej                  ej                        dY        Z e#eej                  ej                         e#eej                  ej                         e#eej                  ej                        dZ                      Z e#eej                         e#eej                        d[               Z e#eej                  ej                         e#eej                  ej                        d\               Zd] ZejZ                  d^z  Zd^ejZ                  z  Z  e#ej`                  ej                         ee               e#ej`                  ej                         ee               e#ejb                  ej                         ee               e#ejb                  ej                         ee             d_ Zd` Z e#ejh                  j                  ejf                  ejj                  ejt                         e#ejh                  j                  ejf                  ejz                  ejt                         e#ejh                  j                  ejf                  ejx                  ejt                        eda                             Z e#ejh                  j                  ejf                  ejj                  ejt                         e#ejh                  j                  ejf                  ejz                  ejt                         e#ejh                  j                  ejf                  ejx                  ejt                        edb                             Z e#ejh                  jp                  ejf                  ejj                  ejt                         e#ejh                  jp                  ejf                  ejz                  ejt                         e#ejh                  jp                  ejf                  ejx                  ejt                        edc                             Z e#ejh                  jt                  ejf                  ejj                  ejt                         e#ejh                  jt                  ejf                  ejz                  ejt                         e#ejh                  jt                  ejf                  ejx                  ejt                        edd                             Zde Z eejh                  jz                  df        eejh                  j|                  dg        eejh                  j~                  dh        e#ejh                  j                  ejf                  ejj                  ejt                         e#ejh                  j                  ejf                  ejz                  ejt                         e#ejh                  j                  ejf                  ejx                  ejt                        edi                             Z e#ejh                  jF                  ejf                  ejj                  ejt                         e#ejh                  jF                  ejf                  ejx                  ejt                         e#ejh                  jF                  ejf                  ejz                  ejt                        edj                             Z e#ejh                  jL                  ejf                  ejj                  ejt                         e#ejh                  jL                  ejf                  ejx                  ejt                         e#ejh                  jL                  ejf                  ejz                  ejt                        edk                             Z e#ejh                  j                  ejf                  ejj                  ejt                         e#ejh                  j                  ejf                  ejx                  ejt                         e#ejh                  j                  ejf                  ejz                  ejt                        edl                             Z e#ejh                  j                  ejf                  ejj                  ejt                         e#ejh                  j                  ejf                  ejx                  ejt                         e#ejh                  j                  ejf                  ejz                  ejt                        edm                             Z e#ejh                  j                  ejf                  ejt                  ejt                        dn        Z e#ejh                  j                  ejf                  ejj                  ejt                  ejt                         e#ejh                  j                  ejf                  ejx                  ejt                  ejt                         e#ejh                  j                  ejf                  ejz                  ejt                  ejt                        do                      Z e#ej                  ej                        dp        Z	 dsdqZ e&e!      dr        Z e ej                         e#       y)t    )reduceN)ir)Registry
lower_cast)parse_dtype)models)typescgutils)ufunc_db)register_ufuncs   )nvvm)cuda)	nvvmutilsstubserrors)dim3CUDADispatcherc                     t        j                  | d|z        }t        j                  | d|z        }t        j                  | d|z        }t        j                  | |||f      S )Nz%s.xz%s.yz%s.z)r   	call_sregr
   pack_struct)builderprefixxyzs        W/home/alanp/www/video.onchill/myenv/lib/python3.12/site-packages/numba/cuda/cudaimpl.pyinitialize_dim3r      s]    GVf_5AGVf_5AGVf_5AwAq	22    	threadIdxc                     t        |d      S )Ntidr   contextr   sigargss       r   cuda_threadIdxr(       s    7E**r   blockDimc                     t        |d      S )Nntidr#   r$   s       r   cuda_blockDimr,   %   s    7F++r   blockIdxc                     t        |d      S )Nctaidr#   r$   s       r   cuda_blockIdxr0   *   s    7G,,r   gridDimc                     t        |d      S )Nnctaidr#   r$   s       r   cuda_gridDimr4   /   s    7H--r   laneidc                 .    t        j                  |d      S )Nr5   )r   r   r$   s       r   cuda_laneidr7   4   s    w11r   r   c                 &    |j                  |d      S Nr   extract_valuer$   s       r   dim3_xr<   9         q))r   r   c                 &    |j                  |d      S )Nr   r:   r$   s       r   dim3_yr?   >   r=   r   r   c                 &    |j                  |d      S )N   r:   r$   s       r   dim3_zrB   C   r=   r   c                     |d   S r9    r$   s       r   cuda_const_array_likerE   J   s     7Nr   c                 @    t         dz  a dj                  | t               S )zDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})_unique_smem_idformatnames    r   _get_unique_smem_idrK   T   s!     qOD/22r   c           	          |j                   d   j                  }t        |j                   d         }t        | ||f|t	        d      t
        j                  d      S Nr   r   _cudapy_smemTshapedtypesymbol_name	addrspacecan_dynsizedr'   literal_valuer   _generic_arrayrK   r   ADDRSPACE_SHAREDr%   r   r&   r'   lengthrQ   s         r   cuda_shared_array_integerr[   ^   sR    XXa[&&F$E'76)5&9.&I$($9$9'+- -r   c           	          |j                   d   D cg c]  }|j                   }}t        |j                   d         }t        | |||t	        d      t
        j                  d      S c c}w rM   rU   r%   r   r&   r'   srP   rQ   s          r   cuda_shared_array_tupler_   h   sd     (+xx{4{!aoo{E4$E'7%u&9.&I$($9$9'+- - 5s   A(c           	          |j                   d   j                  }t        |j                   d         }t        | ||f|dt        j
                  d      S Nr   r   _cudapy_lmemFrO   r'   rV   r   rW   r   ADDRSPACE_LOCALrY   s         r   cuda_local_array_integerre   s   sM    XXa[&&F$E'76)5&4$($8$8',. .r   c           	          |j                   d   D cg c]  }|j                   }}t        |j                   d         }t        | |||dt        j
                  d      S c c}w ra   rc   r]   s          r   ptx_lmem_alloc_arrayrg   }   s_     (+xx{4{!aoo{E4$E'7%u&4$($8$8',. . 5s   Ac                     |rJ d}|j                   }t        j                  t        j                         d      }t	        j
                  |||      }|j                  |d       | j                         S )Nzllvm.nvvm.membar.ctarD   moduler   FunctionTypeVoidTyper
   get_or_insert_functioncallget_dummy_valuer%   r   r&   r'   fnamelmodfntysyncs           r   ptx_threadfence_blockru      ^    O8"E>>D??2;;="-D))$e<DLLr""$$r   c                     |rJ d}|j                   }t        j                  t        j                         d      }t	        j
                  |||      }|j                  |d       | j                         S )Nzllvm.nvvm.membar.sysrD   ri   rp   s           r   ptx_threadfence_systemrx      rv   r   c                     |rJ d}|j                   }t        j                  t        j                         d      }t	        j
                  |||      }|j                  |d       | j                         S )Nzllvm.nvvm.membar.glrD   ri   rp   s           r   ptx_threadfence_devicerz      s^    O8!E>>D??2;;="-D))$e<DLLr""$$r   c                     | j                  t        j                  d      }t        j                  t        j                        }t	        | |||g      S )Nl    )get_constantr	   int32noneptx_syncwarp_mask)r%   r   r&   r'   maskmask_sigs         r   ptx_syncwarpr      s=    Z8Dzz%++&HWgx$@@r   c                    d}|j                   }t        j                  t        j                         t        j                  d      f      }t        j                  |||      }|j                  ||       | j                         S )Nzllvm.nvvm.bar.warp.sync    )	rj   r   rk   rl   IntTyper
   rm   rn   ro   rp   s           r   r   r      sb    %E>>D??2;;=2::b>*;<D))$e<DLLt""$$r   c           
         |\  }}}}}|j                   d   }	|	t        j                  v r/|j                  |t	        j
                  |	j                              }d}
|j                  }t	        j                  t	        j                  t	        j
                  d      t	        j
                  d      f      t	        j
                  d      t	        j
                  d      t	        j
                  d      t	        j
                  d      t	        j
                  d      f      }t        j                  |||
      }|	j                  dk(  r|j                  ||||||f      }|	t        j                  k(  r`|j                  |d      }|j                  |d      }|j                  |t	        j                               }t        j                   |||f      }|S |j#                  |t	        j
                  d            }|j%                  || j'                  t        j(                  d            }|j#                  |t	        j
                  d            }|j                  ||||||f      }|j                  ||||||f      }|j                  |d      }|j                  |d      }|j                  |d      }|j+                  |t	        j
                  d            }|j+                  |t	        j
                  d            }|j-                  || j'                  t        j(                  d            }|j/                  ||      }|	t        j0                  k(  r$|j                  |t	        j2                               }t        j                   |||f      }|S )a  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
    function supports both 32 and 64 bit ints and floats, so for feature parity,
    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
    an int, then shuffling, then bitcasting back. And 64-bit values by packing
    them into 2 32bit values, shuffling thoose, and then packing back together.
    rA   zllvm.nvvm.shfl.sync.i32r   r   r   @   )r'   r	   real_domainbitcastr   r   bitwidthrj   rk   LiteralStructTyper
   rm   rn   float32r;   	FloatTypemake_anonymous_structtrunclshrr|   i8zextshlor_float64
DoubleType)r%   r   r&   r'   r   modevalueindexclamp
value_typerq   rr   rs   funcretrvpredfvvalue1
value_lshrvalue2ret1ret2rv1rv2rv1_64rv2_64rv_shls                               r   ptx_shfl_sync_i32r      s     '+#D$ue!JU&&&rzz*2E2E'FG%E>>D??
bjjnbjjm<=ZZ^RZZ^RZZ^ZZ^RZZ^=D
 ))$e<Db ll4$eUE!BC&&&sA.B((a0DR\\^4B//"dDC" J ubjjn5\\%)=)=ehh)KL
z2::b>:||D4vue"DE||D4vue"DE##D!,##D!,$$T1-c2::b>2c2::b>2VW%9%9%((B%GH[[(&R]]_5B++Gb$Z@Jr   c                    d}|j                   }t        j                  t        j                  t        j                  d      t        j                  d      f      t        j                  d      t        j                  d      t        j                  d      f      }t        j                  |||      }|j                  ||      S )Nzllvm.nvvm.vote.syncr   r   )rj   r   rk   r   r   r
   rm   rn   )r%   r   r&   r'   rq   rr   rs   r   s           r   ptx_vote_syncr      s    !E>>D??2//B13A1@ AJJrNBJJrNBJJqMJLD ))$e<D<<d##r   c                    |\  }}|j                   d   j                  }|j                   d   t        j                  v r%|j	                  |t        j                  |            }dj                  |      }|j                  }t        j                  t        j                  d      t        j                  d      t        j                  |      f      }	t        j                  ||	|      }
|j                  |
||f      S )Nr   zllvm.nvvm.match.any.sync.i{}r   )r'   r   r	   r   r   r   r   rH   rj   rk   r
   rm   rn   r%   r   r&   r'   r   r   widthrq   rr   rs   r   s              r   ptx_match_any_syncr      s    
 KD%HHQK  E
xx{e'''rzz%'89*11%8E>>D??2::b>BJJrNBJJu<M+NOD))$e<D<<tUm,,r   c                 H   |\  }}|j                   d   j                  }|j                   d   t        j                  v r%|j	                  |t        j                  |            }dj                  |      }|j                  }t        j                  t        j                  t        j                  d      t        j                  d      f      t        j                  d      t        j                  |      f      }	t        j                  ||	|      }
|j                  |
||f      S )Nr   zllvm.nvvm.match.all.sync.i{}r   )r'   r   r	   r   r   r   r   rH   rj   rk   r   r
   rm   rn   r   s              r   ptx_match_all_syncr     s    
 KD%HHQK  E
xx{e'''rzz%'89*11%8E>>D??2//B13A1@ AJJrNBJJu,=>@D ))$e<D<<tUm,,r   c                     t        j                  t        j                  t        j                  d      g       ddd      }|j	                  |g       S )Nr   zactivemask.b32 $0;=rTside_effectr   	InlineAsmrk   r   rn   r%   r   r&   r'   
activemasks        r   ptx_activemaskr     s>    boobjjnbA2DdLJ<<
B''r   c                     t        j                  t        j                  t        j                  d      g       ddd      }|j	                  |g       S )Nr   zmov.u32 $0, %lanemask_lt;r   Tr   r   r   s        r   ptx_lanemask_ltr   $  s@    boobjjnbA94*.0J <<
B''r   c                 *    |j                  |d         S r9   )ctpopr$   s       r   ptx_popcr   ,  s    ==a!!r   c                       |j                   | S N)fmar$   s       r   ptx_fmar   1  s    7;;r   c                 h    ddd}	 ||    S # t         $ r d|  d}t        j                  |      w xY w)N)f32f)f64d)r   r   z$Conversion between float16 and float unsupportedKeyErrorr   CudaLoweringErrorr   typemapmsgs      r   float16_float_ty_constraintr   6  sJ    \2G,x   ,4XJlK&&s++,s    %1c                 >   |j                   |j                   k(  r|S t        |j                         \  }}t        j                  | j	                  |      t        j
                  d      g      }t        j                  |d| dd| d      }|j                  ||g      S )N   zcvt..f16 $0, $1;=,h)r   r   r   rk   get_value_typer   r   rn   	r%   r   fromtytotyvalty
constraintrs   asms	            r   float16_to_float_castr   @  s    $--'
0?NB
??711$7"**R.9IJD
,,ttB4|4*R6H
IC<<cU##r   c                 <   |j                   |j                   k(  r|S t        |j                         \  }}t        j                  t        j                  d      | j                  |      g      }t        j                  |d| dd|       }|j                  ||g      S )Nr   cvt.rn.f16. $0, $1;=h,)r   r   r   rk   r   r   r   rn   r   s	            r   float_to_float16_castr   L  s    $--'
0ANB
??2::b>G,B,B6,J+KLD
,,t{2$h73zl9K
LC<<cU##r   c                 l    ddddd}	 ||    S # t         $ r d|  d}t        j                  |      w xY w)Nchrl)   r   r   r   z"Conversion between float16 and intr   r   r   s      r   float16_int_constraintr   X  sN    CSc3G,x   ,28*LI&&s++,s    %3c                 *   |j                   }t        |      }|j                  rdnd}t        j                  | j                  |      t        j                  d      g      }t        j                  |d| | dd| d      }	|j                  |	|g      S )Nr^   ur   zcvt.rni.r   r   r   )	r   r   signedr   rk   r   r   r   rn   
r%   r   r   r   r   r   r   
signednessrs   r   s
             r   float16_to_integer_castr   b  s    }}H'1JJ??711$7"**R.9IJD
,,t!*hZ|D:,b)+C <<cU##r   c                 (   |j                   }t        |      }|j                  rdnd}t        j                  t        j
                  d      | j                  |      g      }t        j                  |d| | dd|       }	|j                  |	|g      S )Nr^   r   r   r   r   r   )	r   r   r   r   rk   r   r   r   rn   r   s
             r   integer_to_float16_castr   o  s     H'1J3J??2::b>#226:;=D
,,t$ZL
(CZL)+C <<cU##r   c                 h    t        | t        j                  t        j                        fd       }y )Nc                     t        j                  t        j                  d      t        j                  d      t        j                  d      g      }t        j                  | dd      }|j	                  ||      S )Nr   z.f16 $0,$1,$2;=h,h,hr   rk   r   r   rn   )r%   r   r&   r'   rs   r   ops         r   ptx_fp16_binaryz*lower_fp16_binary.<locals>.ptx_fp16_binary  s^    rzz"~ "

2

2?All4B4~!6A||C&&r   lowerr	   float16)fnr   r   s    ` r   lower_fp16_binaryr   ~  s&    
2u}}emm,' -'r   addsubmulc                     t        j                  t        j                  d      t        j                  d      g      }t        j                  |dd      }|j	                  ||      S )Nr   zneg.f16 $0, $1;=h,hr   r%   r   r&   r'   rs   r   s         r   ptx_fp16_hnegr    I    ??2::b>BJJrN+;<D
,,t.
7C<<T""r   c                     t        | |||      S r   )r  r$   s       r   operator_hnegr	        '355r   c                     t        j                  t        j                  d      t        j                  d      g      }t        j                  |dd      }|j	                  ||      S )Nr   zabs.f16 $0, $1;r  r   r  s         r   ptx_fp16_habsr    r  r   c                     t        | |||      S r   )r  r$   s       r   operator_habsr    r
  r   c                 "   t        j                  d      t        j                  d      t        j                  d      g}t        j                  t        j                  d      |      }t        j                  |dd      }|j	                  ||      S )Nr   zfma.rn.f16 $0,$1,$2,$3;z=h,h,h,h)r   r   rk   r   rn   )r%   r   r&   r'   argtysrs   r   s          r   ptx_hfmar    sb    jjnbjjnbjjn=F??2::b>62D
,,t6

CC<<T""r   c                 0    d }| j                  ||||      S )Nc                 B    t         j                  j                  | |      S r   )r   fp16hdiv)r   r   s     r   fp16_divzfp16_div_impl.<locals>.fp16_div  s    yy~~a##r   compile_internal)r%   r   r&   r'   r  s        r   fp16_div_implr    s     $ ##GXsDAAr   z{{
          .reg .pred __$$f16_cmp_tmp;
          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
        }}c                       fd}|S )Nc                    t        j                  t        j                  d      t        j                  d      t        j                  d      g      }t        j                  |t        j                  	      d      }|j                  ||      }| j                  t        j                  d      }|j                  |t        j                  d            }|j                  d||      S )Nr   )r   r   r   z!=)r   rk   r   r   	_fp16_cmprH   rn   r|   r	   int16r   icmp_unsigned)
r%   r   r&   r'   rs   r   resultzero
int_resultr   s
            r   ptx_fp16_comparisonz*_gen_fp16_cmp.<locals>.ptx_fp16_comparison  s    rzz"~

2

2/OPll4!1!1R!1!8(Cc4(##EKK3__VRZZ^<
$$T:t<<r   rD   )r   r"  s   ` r   _gen_fp16_cmpr#    s    = r   eqnegegtleltc                 h    t        | t        j                  t        j                        fd       }y )Nc                 ^     t              | |||      }|j                  ||d   |d         S )Nr   r   )r#  select)r%   r   r&   r'   choicer   s        r   ptx_fp16_minmaxz*lower_fp16_minmax.<locals>.ptx_fp16_minmax  s5    "r"7GS$?~~fd1gtAw77r   r   )r   rq   r   r.  s     ` r   lower_fp16_minmaxr/    s&    
2u}}emm,8 -8r   maxmin
__nv_cbrtf	__nv_cbrtc                     |j                   }t        |   }| j                  |      }|j                  }t	        j
                  ||g      }t        j                  |||      }	|j                  |	|      S r   )	return_type
cbrt_funcsr   rj   r   rk   r
   rm   rn   )
r%   r   r&   r'   r   rq   ftyrr   rs   r   s
             r   ptx_cbrtr8    sf     
BrNE

 
 
$C>>D??3&D		'	'dE	:B<<D!!r   c           	          t        j                  |j                  t        j                  t        j
                  d      t        j
                  d      f      d      }|j                  ||      S )Nr   	__nv_brevr
   rm   rj   r   rk   r   rn   r%   r   r&   r'   r   s        r   ptx_brev_u4r=    sR    
 
	'	'


2B(9:
B <<D!!r   c           	          t        j                  |j                  t        j                  t        j
                  d      t        j
                  d      f      d      }|j                  ||      S )Nr   __nv_brevllr;  r<  s        r   ptx_brev_u8r@  	  sR    
 
	'	'


2B(9:
B <<D!!r   c                 h    |j                  |d   | j                  t        j                  d            S r9   )ctlzr|   r	   booleanr$   s       r   ptx_clzrD    s.    <<QU]]A.0 0r   c           	          t        j                  |j                  t        j                  t        j
                  d      t        j
                  d      f      d      }|j                  ||      S )Nr   __nv_ffsr;  r<  s        r   
ptx_ffs_32rG    sR     
	'	'


2B(9:
B <<D!!r   c           	          t        j                  |j                  t        j                  t        j
                  d      t        j
                  d      f      d      }|j                  ||      S )Nr   r   
__nv_ffsllr;  r<  s        r   
ptx_ffs_64rJ  &  sR     
	'	'


2B(9:
B <<D!!r   c                 4    |\  }}}|j                  |||      S r   )r,  )r%   r   r&   r'   testabs          r   ptx_selprO  0  s     JD!Q>>$1%%r   c           	          t        j                  |j                  t        j                  t        j
                         t        j
                         t        j
                         f      d      }|j                  ||      S )N
__nv_fmaxfr
   rm   rj   r   rk   r   rn   r<  s        r   
ptx_max_f4rS  6  Z    		'	'
LLN\\^R\\^,	. 	
B <<D!!r   c           
         t        j                  |j                  t        j                  t        j
                         t        j
                         t        j
                         f      d      }|j                  || j                  ||d   |j                  d   t        j                        | j                  ||d   |j                  d   t        j                        g      S )N	__nv_fmaxr   r   r
   rm   rj   r   rk   r   rn   castr'   r	   doubler<  s        r   
ptx_max_f8rZ  A       
	'	'
MMO]]_bmmo.	0 	
B <<Wd1gsxx{ELLAWd1gsxx{ELLA  r   c           	          t        j                  |j                  t        j                  t        j
                         t        j
                         t        j
                         f      d      }|j                  ||      S )N
__nv_fminfrR  r<  s        r   
ptx_min_f4r^  R  rT  r   c           
         t        j                  |j                  t        j                  t        j
                         t        j
                         t        j
                         f      d      }|j                  || j                  ||d   |j                  d   t        j                        | j                  ||d   |j                  d   t        j                        g      S )N	__nv_fminr   r   rW  r<  s        r   
ptx_min_f8ra  ]  r[  r   c           	      >   t        j                  |j                  t        j                  t        j
                  d      t        j                         f      d      }|j                  || j                  ||d   |j                  d   t        j                        g      S )Nr   __nv_llrintr   )r
   rm   rj   r   rk   r   r   rn   rX  r'   r	   rY  r<  s        r   	ptx_roundrd  n  s     
	'	'
JJrN]]_	  	
B <<Wd1gsxx{ELLA  r   c                 0    d }| j                  ||||      S )Nc                    t        j                  |       st        j                  |       r| S |dk\  r6|dkD  rd|dz
  z  }d}nd|z  }d}| |z  |z  }t        j                  |      r| S d| z  }| |z  }t        |      }t        j                  ||z
        dk(  rdt        |dz        z  }|dk\  r
|z  |z  }|S ||z  }|S )Nr      g      $@gMDg      ?g      ?g       @)mathisinfisnanroundfabs)r   ndigitspow1pow2r   r   s         r   round_ndigitsz$round_to_impl.<locals>.round_ndigits  s    ::a=DJJqMHa<| "-wTT!Azz!} WH%DDA!HIIa!e#eAGn$Aa<TT!A  IAr   r  )r%   r   r&   r'   rp  s        r   round_to_implrq    s!    B ##G]CHHr   c                       fd}|S )Nc                 n    |j                   \  }| j                  |      }|j                  ||d         S r9   )r'   r|   fmul)r%   r   r&   r'   argtyfactorconsts         r   implzgen_deg_rad.<locals>.impl  s5    %%eU3||FDG,,r   rD   )rw  rx  s   ` r   gen_deg_radry    s    - Kr   g     f@c           
         |t         j                  v rt        j                  |d      }|g}n!t        j                  ||t        |            }t        ||      D cg c]'  \  }}| j                  |||t         j                        ) }}}|j                  }	|	|k7  rt        d|	d|      |j                  t        |      k7  r#t        d|j                  t        |      fz        ||fS c c}}w )z4
    Convert integer indices into tuple of intp
    r   )rQ   count)r{  zexpect z	 but got z#indexing %d-D array with %d-D index)r	   integer_domainUniTupler
   unpack_tuplelenziprX  intprQ   	TypeErrorndim)
r%   r   indtyindsarytyvaltyindicestirQ   s
             r   _normalize_indicesr    s     $$$U!4&&&wCJGug.0.41a ||GQ5::6.  0 KKE~%?@@zzSZ=SZ01 2 	2 '>0s   ,C.c                       fd}|S )Nc                     |j                   \  }}}|\  }}}	|j                  }
t        | |||||      \  }} | j                  |      | ||      }t	        j
                  | ||||d      } | ||
||	      S )NT
wraparound)r'   rQ   r  
make_arrayr
   get_item_pointer)r%   r   r&   r'   r  r  r  aryr  r   rQ   r  laryptrdispatch_fns                 r   impz_atomic_dispatcher.<locals>.imp  s    !hhueT3+GWeT,15:w )w!!%('3?&&wg268 7GUC==r   rD   )r  r  s   ` r   _atomic_dispatcherr    s    > Jr   c                 B   |t         j                  k(  r3|j                  }|j                  t	        j
                  |      ||f      S |t         j                  k(  r3|j                  }|j                  t	        j                  |      ||f      S |j                  d||d      S )Nr   	monotonic)	r	   r   rj   rn   r   declare_atomic_add_float32r   declare_atomic_add_float64
atomic_rmwr%   r   rQ   r  r   rr   s         r   ptx_atomic_add_tupler        
 ~~||I@@F #J( 	(	%--	~~||I@@F #J( 	( !!%c;??r   c                 B   |t         j                  k(  r3|j                  }|j                  t	        j
                  |      ||f      S |t         j                  k(  r3|j                  }|j                  t	        j                  |      ||f      S |j                  d||d      S )Nr  r  )	r	   r   rj   rn   r   declare_atomic_sub_float32r   declare_atomic_sub_float64r  r  s         r   ptx_atomic_subr    r  r   c                     |t         j                  j                  v rE|j                  }|j                  }t        t        d|       }|j                   ||      ||f      S t        d| d      )Ndeclare_atomic_inc_intzUnimplemented atomic inc with  array	r   cudadeclunsigned_int_numba_typesr   rj   getattrr   rn   r  r%   r   rQ   r  r   bwrr   r   s           r   ptx_atomic_incr    j    
 666^^~~Y"8 =>||BtHsCj118vFGGr   c                     |t         j                  j                  v rE|j                  }|j                  }t        t        d|       }|j                   ||      ||f      S t        d| d      )Ndeclare_atomic_dec_intzUnimplemented atomic dec with r  r  r  s           r   ptx_atomic_decr    r  r   c                     t         fd       }t        j                  t        j                  t        j                  fD ]2  } t        | t        j                  |t        j                        |       4 y )Nc                     |t         j                  j                  v r|j                  ||d      S t	        d d| d      )Nr  zUnimplemented atomic z with r  r   r  integer_numba_typesr  r  )r%   r   rQ   r  r   r   s        r   impl_ptx_atomicz+ptx_atomic_bitwise.<locals>.impl_ptx_atomic  sG    T]]667%%b#sK@@3B4veWFKLLr   )r  r	   r  r}  Tupler   ArrayAny)stubr   r  r   s    `  r   ptx_atomic_bitwiser    sS    M M zz5>>5;;7/dEKKUYY/@ 8r   andorxorc                     |t         j                  j                  v r|j                  d||d      S t	        d| d      )Nxchgr  zUnimplemented atomic exch with r  r  )r%   r   rQ   r  r   s        r   ptx_atomic_exchr  /  sB    
 223!!&#sK@@9%GHHr   c                    |j                   }|t        j                  k(  r'|j                  t	        j
                  |      ||f      S |t        j                  k(  r'|j                  t	        j                  |      ||f      S |t        j                  t        j                  fv r|j                  d||d      S |t        j                  t        j                  fv r|j                  d||d      S t        d|z        Nr0  r  orderingumaxz&Unimplemented atomic max with %s array)rj   r	   r   rn   r   declare_atomic_max_float64r   declare_atomic_max_float32r}   int64r  uint32uint64r  r  s         r   ptx_atomic_maxr  :      
 >>D||I@@F #J( 	(	%--	||I@@F #J( 	(	5;;,	,!!%cK!HH	5<<.	.!!&#s[!II@5HIIr   c                    |j                   }|t        j                  k(  r'|j                  t	        j
                  |      ||f      S |t        j                  k(  r'|j                  t	        j                  |      ||f      S |t        j                  t        j                  fv r|j                  d||d      S |t        j                  t        j                  fv r|j                  d||d      S t        d|z        Nr1  r  r  uminz&Unimplemented atomic min with %s array)rj   r	   r   rn   r   declare_atomic_min_float64r   declare_atomic_min_float32r}   r  r  r  r  r  r  s         r   ptx_atomic_minr  N  r  r   c                    |j                   }|t        j                  k(  r'|j                  t	        j
                  |      ||f      S |t        j                  k(  r'|j                  t	        j                  |      ||f      S |t        j                  t        j                  fv r|j                  d||d      S |t        j                  t        j                  fv r|j                  d||d      S t        d|z        r  )rj   r	   r   rn   r   declare_atomic_nanmax_float64r   declare_atomic_nanmax_float32r}   r  r  r  r  r  r  s         r   ptx_atomic_nanmaxr  b      
 >>D||ICCDI #J( 	(	%--	||ICCDI #J( 	(	5;;,	,!!%cK!HH	5<<.	.!!&#s[!II@5HIIr   c                    |j                   }|t        j                  k(  r'|j                  t	        j
                  |      ||f      S |t        j                  k(  r'|j                  t	        j                  |      ||f      S |t        j                  t        j                  fv r|j                  d||d      S |t        j                  t        j                  fv r|j                  d||d      S t        d|z        r  )rj   r	   r   rn   r   declare_atomic_nanmin_float64r   declare_atomic_nanmin_float32r}   r  r  r  r  r  r  s         r   ptx_atomic_nanminr  v  r  r   c                 
   |j                  |j                  d   t        j                  |j                  d   |j                  d         }|d   | j	                  t        j                  d      |d   |d   f}t        | |||      S )Nr   r   rA   )r5  r'   r	   r  r|   ptx_atomic_casr$   s       r   ptx_atomic_compare_and_swapr    sn    
//#((1+uzz388A;
LCGW))%**a8$q'47KD'7C66r   c                    |j                   \  }}}}|\  }}	}
}t        | |||	||      \  }} | j                  |      | ||      }t        j                  | ||||d      }|j
                  t        j                  j                  v r<|j                  }|j
                  j                  }t        j                  |||||
|      S t        d|j
                  z        )NTr  z&Unimplemented atomic cas with %s array)r'   r  r  r
   r  rQ   r   r  r  rj   r   r   atomic_cmpxchgr  )r%   r   r&   r'   r  r  oldtyr  r  r  oldr   r  r  r  rr   r   s                    r   r  r    s     "%E5%CsC'%u(-/NE7 %7e$Wgs;D

"
"7GUD'.24C {{t}}889~~;;''''xc3OO@5;;NOOr   c                     t        j                  t        j                  t        j                         t        j                  d      g      ddd      }|d   }|j                  ||g       y )Nr   znanosleep.u32 $0;r   Tr   r   )r   r   rk   rl   r   rn   )r%   r   r&   r'   	nanosleepnss         r   ptx_nanosleepr    sO    R__R[[]RZZ^<LM0#4II	aBLLRD!r   c           
      v   t        t        j                  |d      }|dk  xr |xr t        |      dk(  }|dk  r|st	        d      | j
                  |   }	t        |t        j                  t        j                  f      xs/ t        |	t        j                        xs |t        j                  k(  }
|t        j                  vr|
st        d|z        | j                  |      }t!        j"                  ||      }|t$        j&                  k(  rt)        j*                  |||      }n|j,                  }t)        j.                  ||||      }| j1                  |      }d|dz
  j3                         z  |_        |rd|_        n)t!        j8                  |t         j:                        |_        |j?                  |t!        j@                  t!        jB                  d            d      }tE        jF                  t%        jH                         jJ                        }| j                  |      }|jM                  |      }|}g }tO        tQ        |            D ]  \  }}|jS                  |       ||z  } tQ        |      D cg c]  }| }}|D cg c]"  }| jU                  t        jV                  |      $ }}|rt!        jX                  t!        jZ                  t!        jB                  d	      g       d
dd      }|j]                  |j_                  |g       t!        jB                  d            }| jU                  t        jV                  |      }|ja                  ||      g}n-|D cg c]"  }| jU                  t        jV                  |      $ }}t        |      }t        jb                  ||d      } | je                  |      | |      } | jg                  | |ji                  || jj                  jl                        ||| jU                  t        jV                  |      d        | jo                         S c c}w c c}w c c}w )Nr   r   zarray length <= 0zunsupported type: %srI   externalr   genericr   zmov.u32 $0, %dynamic_smem_size;r   Tr   r   C)rQ   r  layout)datarP   stridesitemsizememinfo)8r   operatorr  r  
ValueErrordata_model_manager
isinstancer	   RecordBooleanr   StructModelr   number_domainr  get_data_typer   	ArrayTyper   rd   r
   alloca_oncerj   add_global_variableget_abi_sizeof
bit_lengthalignlinkageConstant	UndefinedinitializeraddrspacecastPointerTyper   llcreate_target_dataNVVMdata_layoutget_abi_size	enumeratereversedappendr|   r  r   rk   r   rn   udivr  r  populate_arrayr   r  type	_getvalue)!r%   r   rP   rQ   rR   rS   rT   	elemcountdynamic_smem
data_modelother_supported_typelldtypelarytydataptrrr   gvmemr  
targetdatar  
laststriderstridesr  lastsizer^   r  kstridesget_dynshared_sizedynsmem_size	kitemsizekshaper  r  r  s!                                    r   rW   rW     s   x||UA.I >FlFs5zQLA~l,-- ++E2J55<<78 	"j&"4"45	"EMM! 
 E'''0D.677##E*G\\'9-FD((( %%gvKH~~ ++D&+,57 &&w/ EAI2244&EM !#FBLL AE ''r~~bjjm/L(13 &&tyy{'>'>?J##E*G##J/H JH %18
#h
 2 #8,-,Qq,G-=DEW$$UZZ3WHE 
  \\"//"**R."*M*K*.DB ||GLL1CR$H$&JJrN4 ((X>	,,|Y78?DEu!'&&uzz15uE u:DKKe$s;E
#'

U
#GW
5C3 ' G!'#+$+$8$8X$N#'  ) ==?A .E$ Fs   8	P,'P1'P6c                 "    | j                         S r   )ro   )r%   r   r   pyvals       r   cuda_dispatcher_constr%    s    ""$$r   )F)	functoolsr   r  rh  llvmliter   llvmlite.bindingbindingr  numba.core.imputilsr   r   numba.core.typing.npydeclr   numba.core.datamodelr   
numba.corer	   r
   numba.npr   numba.np.npyimplr   cudadrvr   numbar   
numba.cudar   r   r   numba.cuda.typesr   r   registryr   lower_getattr
lower_attrlower_constantr   Moduler(   r,   r0   r4   r7   r<   r?   rB   rw  
array_liker  rE   rG   rK   sharedarrayIntegerLiteralr  r[   r  r}  r_   localre   rg   threadfence_blockru   threadfence_systemrx   threadfencerz   syncwarpr   i4r   shfl_sync_intrinsicr   f4f8r   vote_sync_intrinsicrC  r   match_any_syncr   match_all_syncr   r   r   lanemask_ltr   popcr   r   r   r   r   Floatr   r   r   Integerr   r   r   r  haddr   iaddhsubr  isubhmulr  imulhnegr  negr	  habsr  absr  hfmar  truedivitruedivr  r  r#  heqr$  hner%  hger&  hgtr'  hler(  hltr)  r/  hmaxhminr   r   r6  cbrtr8  brevu4r=  u8r@  clzrD  ffsrG  rJ  selprO  r0  rS  rZ  r1  r^  ra  rk  rd  rq  ry  pi_deg2rad_rad2degradiansdegreesr  r  atomicr  r  r  incr  decr  r  and_r   r  exchr  r  r  nanmaxr  nanminr  compare_and_swapr  casr  r  r  r  rW   r%  
get_ufuncsrD   r   r   <module>rx     sZ        4 1 ' %  ,   / / 1:##
((3 LELL,+ -+ LELL
+, ,, LELL
+- ,- LELL	*. +. LELL)2 *2 D#* * D#* * D#* * tzzekk* + 3 t{{%..		:- ;- t{{%++uyy1t{{%..%))4- 5 2- tzz--uyy9. :. tzzeii0tzz3. 4 1. u%  % u % !% u% % u~~A A u~~uxx % !% u  %((EHHehhxxu  %((EHHehhxxu  %((EHHehhxxu  %((EHHehhxx++\ u  %((EHHemmD$ E$ uUXXuxx0uUXXuxx0uUXXuxx0uUXXuxx0	- 1 1 1 1	- uUXXuxx0uUXXuxx0uUXXuxx0uUXXuxx0- 1 1 1 1- u( ( u( ( uzz599" " uyy%))UYY		2 3, EMM5;;'$ ($ EKK'$ ($, EMM5==)	$ *	$ EMM5==)E  %--0
$ 1 *
$' %**//5 ) (,, & (-- ' %**//5 ) (,, & (-- ' %**//5 ) (,, & (-- ' uzz&# '# x||U]]#6 $6 uzz&# '# sEMM6 6 uzzu}}emmD# E# x6x%--7B 8 7B		 4ejjnnemmU]] 3M$4G H 0hkk5==%-- 0t1D E 3ejjnnemmU]] 3M$4G H 0hkk5==%-- 0t1D E 3ejjnnemmU]] 3M$4G H 0hkk5==%-- 0t1D E 3ejjnnemmU]] 3M$4G H 0hkk5==%-- 0t1D E 3ejjnnemmU]] 3M$4G H 0hkk5==%-- 0t1D E 3ejjnnemmU]] 3M$4G H 0hkk5==%-- 0t1D E8 %**//5$ / %**//5$ / 
MM<	MM;
 uzz5==!uzz5==!" " "" uzz588" " uzz588" " uyy%))0 0 uyy%((uyy%(("  " uyy%((uyy%(("  " uzz599eii3& 4&
 sEHHehh"  " sEHHehhsEHHehhsEHHehh       sEHHehh"  " sEHHehhsEHHehhsEHHehh       uehhuehh	  	  uehh&uehh&"I ' '"IJ 77T>$''> dllEHH k(3 4 dllEHH k(3 4 dllEHH k(3 4 dllEHH k(3 4.$ u||ejj%))<u||enneii@u||ekk599=
@  > A =
@ u||ejj%))<u||enneii@u||ekk599=
@  > A =
@ u||ejj%))<u||enneii@u||ekk599=H  > A =H u||ejj%))<u||enneii@u||ekk599=H  > A =H	A 5<<$$e , 5<<##T * 5<<##U + u||%++uzz599=u||%++u~~uyyAu||%++u{{EII>I  ? B >I u||ejj%))<u||ekk599=u||enneii@J  A > =J  u||ejj%))<u||ekk599=u||enneii@J  A > =J  u||EKKUYY?u||EKKeii@u||EKKCJ  D A @J  u||EKKUYY?u||EKKeii@u||EKKCJ  D A @J  u||$$ekk599eiiH7 I7 u||ejj%))UYYGu||ekk599eiiHu||enneiiKP L I HP* u%" &" !&aH %  % ###%u -r   