
    xKgI                     &   d dl mZ d dlmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d	 Zed
        Zed        Zed        Z e ej*                  e      dd      d        Zed        Zd Zed        Zed        Zed        Zy)    )ir)cudatypes)cgutils)RequireLiteralValue)	signature)overload_attribute)	nvvmutils)	intrinsicc                     | j                   }|dk(  rt        j                  }n4|dv r%t        j                  t        j                  |      }nt	        d      t        |t        j                        S )N   )      zargument can only be 1, 2, 3)literal_valuer   int64UniTuple
ValueErrorr   int32)ndimvalrestypes      Y/home/alanp/www/video.onchill/myenv/lib/python3.12/site-packages/numba/cuda/intrinsics.py_type_grid_functionr      sU    


C
ax++	..c2788Wekk**    c                 p    t        |t        j                        st        |      t	        |      }d }||fS )a  grid(ndim)

    Return the absolute position of the current thread in the entire grid of
    blocks.  *ndim* should correspond to the number of dimensions declared when
    instantiating the kernel. If *ndim* is 1, a single integer is returned.
    If *ndim* is 2 or 3, a tuple of the given number of integers is returned.

    Computation of the first integer is as follows::

        cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

    and is similar for the other two indices, but using the ``y`` and ``z``
    attributes.
    c                    |j                   }|t        j                  k(  rt        j                  |d      S t        |t        j                        r7t        j                  ||j                        }t        j                  ||      S y )Nr   )dim)
return_typer   r   r
   get_global_id
isinstancer   countr   
pack_array)contextbuildersigargsr   idss         r   codegenzgrid.<locals>.codegen1   sf    //ekk!**7::0))'w}}EC%%gs33 1r   r    r   IntegerLiteralr   r   )	typingctxr   r%   r(   s       r   gridr,      s:    " dE001!$''
d
#C4 <r   c                 |    t        |t        j                        st        |      t	        |      }d fd}||fS )a  gridsize(ndim)

    Return the absolute size (or shape) in threads of the entire grid of
    blocks. *ndim* should correspond to the number of dimensions declared when
    instantiating the kernel. If *ndim* is 1, a single integer is returned.
    If *ndim* is 2 or 3, a tuple of the given number of integers is returned.

    Computation of the first integer is as follows::

        cuda.blockDim.x * cuda.gridDim.x

    and is similar for the other two indices, but using the ``y`` and ``z``
    attributes.
    c                     t        j                  d      }t        j                  | d|       }t        j                  | d|       }| j	                  | j                  ||      | j                  ||            S )N@   zntid.znctaid.)r   IntTyper
   	call_sregmulsext)r$   r   i64ntidnctaids        r   _nthreads_for_dimz#gridsize.<locals>._nthreads_for_dimR   sf    jjn""7eC5M:$$Wuo>{{7<<c2GLL4MNNr   c                 R   |j                   } |d      }|t        j                  k(  r|S t        |t        j                        rb |d      }|j
                  dk(  rt        j                  |||f      S |j
                  dk(  r" |d      }t        j                  ||||f      S y y )Nxyr   r   z)r   r   r   r    r   r!   r   r"   )	r#   r$   r%   r&   r   nxnynzr7   s	           r   r(   zgridsize.<locals>.codegenX   s    //w,ekk!I0"7C0B}}!))'B8<<!#&w4))'BB<@@ $ 1r   r)   )r+   r   r%   r(   r7   s       @r   gridsizer?   <   sB    " dE001!$''
d
#COA <r   c                 B    t        t        j                        }d }||fS )Nc                 .    t        j                  |d      S )Nwarpsize)r
   r1   )r#   r$   r%   r&   s       r   r(   z_warpsize.<locals>.codegenn   s    ""7J77r   )r   r   r   r+   r%   r(   s      r   	_warpsizerD   j   s!    
EKK
 C8 <r   rB   r   )targetc                     d }|S )z_
    The size of a warp. All architectures implemented to date have a warp size
    of 32.
    c                     t               S )N)rD   )mods    r   getzcuda_warpsize.<locals>.getz   s
    {r    )rH   rI   s     r   cuda_warpsizerK   t   s    Jr   c                 B    t        t        j                        }d }||fS )a  
    Synchronize all threads in the same thread block.  This function implements
    the same pattern as barriers in traditional multi-threaded programming: this
    function waits until all threads in the block call it, at which point it
    returns control to all its callers.
    c                     d}|j                   }t        j                  t        j                         d      }t	        j
                  |||      }|j                  |d       | j                         S )Nzllvm.nvvm.barrier0rJ   )moduler   FunctionTypeVoidTyper   get_or_insert_functioncallget_dummy_value)r#   r$   r%   r&   fnamelmodfntysyncs           r   r(   zsyncthreads.<locals>.codegen   sW    $~~r{{}b1--dD%@T2&&((r   )r   r   nonerC   s      r   syncthreadsrY      s#     EJJ
C) <r   c                     t        |t        j                        sy t        t        j                  t        j                        }fd}||fS )Nc                     t        j                  t        j                  d      t        j                  d      f      }t        j                  |j
                  |      }|j                  ||      S )N    )r   rO   r0   r   rQ   rN   rR   )r#   r$   r%   r&   rV   rW   rT   s         r   r(   z'_syncthreads_predicate.<locals>.codegen   sO    rzz"~

2/@A--gnndEJ||D$''r   )r    r   Integerr   i4)r+   	predicaterT   r%   r(   s     `  r   _syncthreads_predicater`      s9    i/
EHHehh
'C(
 <r   c                      d}t        | ||      S )z
    syncthreads_count(predicate)

    An extension to numba.cuda.syncthreads where the return value is a count
    of the threads where predicate is true.
    zllvm.nvvm.barrier0.popcr`   r+   r_   rT   s      r   syncthreads_countrd      s     &E!)Y>>r   c                      d}t        | ||      S )z
    syncthreads_and(predicate)

    An extension to numba.cuda.syncthreads where 1 is returned if predicate is
    true for all threads or 0 otherwise.
    zllvm.nvvm.barrier0.andrb   rc   s      r   syncthreads_andrf      s     %E!)Y>>r   c                      d}t        | ||      S )z
    syncthreads_or(predicate)

    An extension to numba.cuda.syncthreads where 1 is returned if predicate is
    true for any thread or 0 otherwise.
    zllvm.nvvm.barrier0.orrb   rc   s      r   syncthreads_orrh      s     $E!)Y>>r   N)llvmliter   numbar   r   
numba.corer   numba.core.errorsr   numba.core.typingr   numba.core.extendingr	   
numba.cudar
   numba.cuda.extendingr   r   r,   r?   rD   ModulerK   rY   r`   rd   rf   rh   rJ   r   r   <module>rr      s       1 ' 3   *	+  @ * *Z   LELL&
6B C  ( ? ? ? ? ? ?r   