
    xKg                     8    d dl mZ d dlmZ d dlZd dlmZ ddZy)    )cuda)driverN)numpy_supportc           	         t        | dd      }|sw| j                  \  }}| j                  j                  |z  | j                  j                  f}t        j
                  j                  j                  ||f|| j                  |      }t        j                  | j                        t        j                         j                  }t        t        j                  dt        j                   |d      dz              }t        ||z        }||dz   ft        j"                  fd       }	t        |j                  d   |z  dz         t        |j                  d   |z  dz         f}
||f} |	|
||f   | |       |S )a  Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    streamr   )dtyper         c                    t         j                  j                  
	      }t         j                  j                  }t         j                  j
                  }t         j                  j                  t         j                  j                  z  }t         j                  j
                  t         j                  j
                  z  }||z   }||z   }||z   | j                  d   k  r'||z   | j                  d   k  r| ||z   ||z   f   |||f<   t        j                          ||j                  d   k  r ||j                  d   k  r|||f   |||f<   y y y )N)shaper   r   r
   )
r   sharedarray	threadIdxxyblockIdxblockDimr   syncthreads)inputoutputtiletxtybxbyr   r   dt
tile_shapes            `/home/alanp/www/video.onchill/myenv/lib/python3.12/site-packages/numba/cuda/kernels/transpose.pykernelztranspose.<locals>.kernel)   s    {{  z <^^^^]]__t}}.]]__t}}.GG7U[[^#R%++a.(@ b"r'!12DRLv||A1v||A#6B<F1a4L $7    )getattrr   r   itemsizer   cudadrvdevicearrayDeviceNDArraynps
from_dtyper   
get_deviceMAX_THREADS_PER_BLOCKintmathpowlogjit)abr   colsrowsstridestpb
tile_widthtile_heightr   blocksthreadsr   r   s               @@r   	transposer9      sZ    Q!$FWW
d''""T)177+;+;;LL$$224L''	 3  
	 B




3
3CTXXa#q!1A!567JcJ&'KzA~.J	XX( ($ k)A-.AGGAJ4Ka4O0PPF:%G#F67F"#Aq)Hr    )N)	numbar   numba.cuda.cudadrv.driverr   r+   numba.npr   r&   r9    r    r   <module>r>      s     ,  ):r    