1
# CUDA_PROFILE_LOG_VERSION 1.3
2
timestamp,method,gputime,cputime,gridSizeX,gridSizeY,blockSizeX,blockSizeY,blockSizeZ,dynSmemPerBlock,staSmemPerBlock,registerPerThread,occupancy,streamID,branch,divergent_branch,instructions,cta_launched,memTransferSize,memTransferDir
3
timestamp=[ 5451.000 ] method=[ memcopy ] gputime=[ 25013.057 ] cputime=[ 35368.000 ] memtransfersize=[ 67108864 ] memtransferdir=[ 0 ]
4
timestamp=[ 40870.000 ] method=[ _Z10transpose3PfS_ii ] gputime=[ 7124.768 ] cputime=[ 7149.000 ] gridSize=[ 256, 256 ] threadCountPerBlock=[ 16, 16, 16 ] dynamicSharedMemPerBlock=[ 1088 ] sharedMemPerBlock=[ 48 ] regPerThread=[ 8 ] occupancy=[ 1.000 ] streamID=[ 0 ]branch=[ 17480 ] divergent_branch=[ 0 ] instructions=[ 419002 ] cta_launched=[ 6554 ]
5
timestamp=[ 48112.000 ] method=[ _Z10transpose3PfS_ii ] gputime=[ 7103.520 ] cputime=[ 7115.000 ] gridSize=[ 256, 256 ] threadCountPerBlock=[ 16, 16, 16 ] dynamicSharedMemPerBlock=[ 0 ] sharedMemPerBlock=[ 48 ] regPerThread=[ 8 ] occupancy=[ 1.000 ] streamID=[ 0 ]branch=[ 17480 ] divergent_branch=[ 0 ] instructions=[ 418631 ] cta_launched=[ 6554 ]
6
timestamp=[ 55284.000 ] method=[ _Z10transpose2PfS_ii ] gputime=[ 7155.072 ] cputime=[ 7168.000 ] gridSize=[ 256, 256 ] threadCountPerBlock=[ 16, 16, 16 ] dynamicSharedMemPerBlock=[ 1088 ] sharedMemPerBlock=[ 48 ] regPerThread=[ 9 ] occupancy=[ 1.000 ] streamID=[ 0 ]branch=[ 17472 ] divergent_branch=[ 0 ] instructions=[ 408268 ] cta_launched=[ 6553 ]
7
timestamp=[ 62506.000 ] method=[ _Z10transpose2PfS_ii ] gputime=[ 7165.632 ] cputime=[ 7177.000 ] gridSize=[ 256, 256 ] threadCountPerBlock=[ 16, 16, 16 ] dynamicSharedMemPerBlock=[ 1088 ] sharedMemPerBlock=[ 48 ] regPerThread=[ 9 ] occupancy=[ 1.000 ] streamID=[ 0 ]branch=[ 17480 ] divergent_branch=[ 0 ] instructions=[ 410378 ] cta_launched=[ 6554 ]
8
timestamp=[ 69752.000 ] method=[ _Z10transpose1PfS_ii ] gputime=[ 91694.977 ] cputime=[ 91711.000 ] gridSize=[ 256, 256 ] threadCountPerBlock=[ 16, 16, 16 ] dynamicSharedMemPerBlock=[ 1088 ] sharedMemPerBlock=[ 48 ] regPerThread=[ 6 ] occupancy=[ 1.000 ] streamID=[ 0 ]branch=[ 0 ] divergent_branch=[ 0 ] instructions=[ 286876 ] cta_launched=[ 6553 ]
9
timestamp=[ 161551.000 ] method=[ _Z10transpose1PfS_ii ] gputime=[ 91657.375 ] cputime=[ 92586.000 ] gridSize=[ 256, 256 ] threadCountPerBlock=[ 16, 16, 16 ] dynamicSharedMemPerBlock=[ 1088 ] sharedMemPerBlock=[ 48 ] regPerThread=[ 6 ] occupancy=[ 1.000 ] streamID=[ 0 ]branch=[ 0 ] divergent_branch=[ 0 ] instructions=[ 286461 ] cta_launched=[ 6554 ]
10
timestamp=[ 254231.000 ] method=[ memcopy ] gputime=[ 79652.961 ] cputime=[ 88761.000 ] memtransfersize=[ 67108864 ] memtransferdir=[ 1 ]