GPU kernel for MC simulations and CUDA host code. More...

#include <cmath>
#include "mcx_core.h"
#include "mcx_tictoc.h"
#include "mcx_const.h"
#include <cuda.h>
#include "cuda_fp16.h"
#include "mcx_rand_xorshift128p.cu"

Include dependency graph for mcx_core.cu:

Macros
#define	_USE_MATH_DEFINES

#define	SHADOWCOUNT 2

#define	ZERO 0.f

#define	CUDA_ASSERT(a) mcx_cu_assess((a),__FILE__,__LINE__)

#define	int2(a, b) make_int2(a,b)

#define	int3(a, b, c) make_int3(a,b,c)

#define	uint2(a, b) make_uint2(a,b)

#define	uint3(a, b, c) make_uint3(a,b,c)

#define	uint4(a, b, c, d) make_uint4(a,b,c,d)

#define	float1(a) make_float1(a)

#define	float2(a, b) make_float2(a,b)

#define	float3(a, b, c) make_float3(a,b,c)

#define	float4(a, b, c, d) make_float4(a,b,c,d)

#define	FL3(f) make_float3(f,f,f)

#define	CUDA_NAN_F (__int_as_float(0x7fffffff))

#define	__CUDA_ARCH_LIST__ 350

Typedefs
typedef float	OutputType

Functions
__device__ float3	operator+ (const float3 &a, const float3 &b)
	Adding two float3 vectors c=a+b.

__device__ void	operator+= (float3 &a, const float3 &b)
	Increatment a float3 vector by another float3, a+=b.

__device__ float3	operator- (const float3 &a, const float3 &b)
	Subtracting two float3 vectors c=a+b.

__device__ float3	operator- (const float3 &a)
	Negating a float3 vector c=-a.

__device__ float3	operator* (const float &a, const float3 &b)
	Front-multiplying a float3 with a scalar c=a*b.

__device__ float3	operator* (const float3 &a, const float &b)
	Post-multiplying a float3 with a scalar c=a*b.

__device__ float3	operator* (const float3 &a, const float3 &b)
	Multiplying two float3 vectors c=a*b.

__device__ float	dot (const float3 &a, const float3 &b)
	Dot-product of two float3 vectors c=a*b.

__device__ OutputType	atomicadd (OutputType *address, OutputType value)
	Texture memory for storing media indices. More...

__device__ void	clearpath (float *p, int maxmediatype)
	Reset shared memory buffer to store photon partial-path data for a new photon. More...

__device__ void	savedebugdata (MCXpos p, uint id, float gdebugdata)
	Saving photon trajectory data for debugging purposes. More...

__device__ float	mcx_nextafterf (float a, int dir)
	A simplified nextafterf() to ensure a photon moves outside of the current voxel after each move. More...

__device__ float	hitgrid (float3 p0, float3 v, float *rv, short id[4])
	Core function for photon-voxel ray-tracing. More...

__device__ void	transmit (MCXdir *v, float n1, float n2, int flipdir)
	Calculating the direction vector after transmission. More...

__device__ float	reflectcoeff (MCXdir *v, float n1, float n2, int flipdir)
	Calculating the reflection coefficient at an interface. More...

template<const int islabel, const int issvmc>
__device__ void	updateproperty (Medium prop, unsigned int &mediaid, RandType t[RAND_BUF_LEN], unsigned int idx1d, uint media[], float3 p, MCXsp *nuvox, short flipdir[4])
	Loading optical properties from constant memory. More...

__device__ int	ray_plane_intersect (float3 p0, MCXdir v, Medium prop, float &len, float &slen, MCXsp nuvox)
	Compute intersection point between a photon path and the intra-voxel interface if present. More...

__device__ int	reflectray (float n1, float3 c0, float3 rv, MCXsp nuvox, Medium prop, RandType t[RAND_BUF_LEN])
	Perform reflection/refraction computation along mismatched intra-voxel interface. More...

__device__ float	getrefractiveidx (unsigned int mediaid)
	Loading optical properties from constant memory. More...

template<const int islabel, const int issvmc>
__device__ int	skipvoid (MCXpos p, MCXdir v, MCXtime f, float3 rv, uint media[], RandType t[RAND_BUF_LEN], MCXsp *nuvox)
	Advance photon to the 1st non-zero voxel if launched in the backgruond. More...

__device__ void	rotatevector2d (MCXdir *v, float stheta, float ctheta)
	Compute 2D-scattering if the domain has a dimension of 1 in x/y or z. More...

__device__ void	rotatevector (MCXdir *v, float stheta, float ctheta, float sphi, float cphi)
	Compute 3D-scattering direction vector. More...

template<const int ispencil, const int isreflect, const int islabel, const int issvmc, const int ispolarized>
__device__ int	launchnewphoton (MCXpos p, MCXdir v, Stokes s, MCXtime f, float3 rv, short flipdir[4], Medium prop, uint idx1d, OutputType field, uint mediaid, OutputType w0, uint isdet, float ppath[], float n_det[], uint dpnum, RandType t[RAND_BUF_LEN], RandType photonseed[RAND_BUF_LEN], uint media[], float srcpattern[], int threadid, RandType rngseed[], RandType seeddata[], float gdebugdata[], volatile int gprogress[], float photontof[], MCXsp nuvox)
	Terminate a photon and launch a new photon according to specified source form. More...

__global__ void	mcx_test_rng (OutputType field[], uint n_seed[])
	A stand-alone kernel to test random number generators. More...

template<const int ispencil, const int isreflect, const int islabel, const int issvmc, const int ispolarized>
__global__ void	mcx_main_loop (uint media[], OutputType field[], float genergy[], uint n_seed[], float4 n_pos[], float4 n_dir[], float4 n_len[], float n_det[], uint detectedphoton[], float srcpattern[], float replayweight[], float photontof[], int photondetid[], RandType seeddata, float gdebugdata, float ginvcdf, float4 gsmatrix, volatile int *gprogress)
	The core Monte Carlo photon simulation kernel (!!!Important!!!) More...

void	mcx_cu_assess (cudaError_t cuerr, const char *file, const int linenum)

int	mcx_corecount (int v1, int v2)
	Utility function to calculate the GPU stream processors (cores) per SM. More...

int	mcx_smxblock (int v1, int v2)
	Utility function to calculate the maximum blocks per SM. More...

int	mcx_threadmultiplier (int v1, int v2)
	Utility function to calculate the maximum blocks per SM. More...

int	mcx_list_gpu (Config cfg, GPUInfo *info)
	Utility function to query GPU info and set active GPU. More...

void	mcx_run_simulation (Config cfg, GPUInfo gpu)
	Master host code for the MCX simulation kernel (!!!Important!!!) More...

Variables
__constant__ float4	gproperty [MAX_PROP_AND_DETECTORS]
	Concatenated optical properties and det positions, stored in constant memory. More...

__constant__ MCXParam	gcfg [1]
	Simulation constant parameters stored in the constant memory. More...

__device__ uint	gjumpdebug [1]
	Global variable to store the number of photon movements for debugging purposes.

__shared__ char	sharedmem []
	Pointer to the shared memory (storing photon data and spilled registers)

Detailed Description

GPU kernel for MC simulations and CUDA host code.

This unit contains both the GPU kernels (running on the GPU device) and host code (running on the host) that initializes GPU buffers, calling kernels, retrieving all computed results (fluence, diffuse reflectance detected photon data) from GPU, and post processing, such as normalization, saving data to file etc. The main function of the GPU kernel is mcx_main_loop and the main function of the host code is mcx_run_simulation.

This unit is written with CUDA-C and shall be compiled using nvcc in cuda-toolkit.

Macro Definition Documentation

◆ CUDA_NAN_F

#define CUDA_NAN_F (__int_as_float(0x7fffffff))

NaN constant in CUDA

◆ float1

#define float1 ( a ) make_float1(a)

float1 constructor

◆ float2

#define float2	(	a,
		b
	)	make_float2(a,b)

float2 constructor

◆ float3

#define float3	(	a,
		b,
		c
	)	make_float3(a,b,c)

float3 constructor

◆ float4

#define float4	(	a,
		b,
		c,
		d
	)	make_float4(a,b,c,d)

float4 constructor

◆ int2

#define int2	(	a,
		b
	)	make_int2(a,b)

int2 constructor

◆ int3

#define int3	(	a,
		b,
		c
	)	make_int3(a,b,c)

int3 constructor

◆ uint2

#define uint2	(	a,
		b
	)	make_uint2(a,b)

uint2 constructor

◆ uint3

#define uint3	(	a,
		b,
		c
	)	make_uint3(a,b,c)

uint3 constructor

◆ uint4

#define uint4	(	a,
		b,
		c,
		d
	)	make_uint4(a,b,c,d)

uint4 constructor

Function Documentation

◆ atomicadd()

__device__ OutputType atomicadd	(	OutputType *	address,
		OutputType	value
	)

inline

Texture memory for storing media indices.

Tested with texture memory for media, only improved 1% speed to keep code portable, use global memory for now also need to change all media[idx1d] to tex1Dfetch() below Floating-point atomic addition

◆ clearpath()

__device__ void clearpath	(	float *	p,
		int	maxmediatype
	)

inline

Reset shared memory buffer to store photon partial-path data for a new photon.

Parameters

[in]	p	pointer to the partial-path buffer
[in]	maxmediatype	length of the buffer to be reset

◆ getrefractiveidx()

__device__ float getrefractiveidx ( unsigned int mediaid )

Loading optical properties from constant memory.

This function parses the media input and load optical properties from GPU memory

Parameters

[in] mediaid the media ID (32 bit) of the current voxel, format is specified in gcfg->mediaformat or cfg->mediabyte

◆ hitgrid()

__device__ float hitgrid	(	float3 *	p0,
		float3 *	v,
		float *	rv,
		short	id[4]
	)

inline

Core function for photon-voxel ray-tracing.

This is the heart of the MCX simulation algorithm. It calculates the nearest intersection of the ray inside the current cubic voxel.

Parameters

[in]	p0	the x/y/z position of the current photon
[in]	v	the direction vector of the photon
[in]	rv	pre-computed reciprocal of the velocity vector (v)
[out]	id	0: intersect with x=x0 plane; 1: intersect with y=y0 plane; 2: intersect with z=z0 plane first

Returns: the distance to the intersection to the voxel bounding box

◆ launchnewphoton()

template<const int ispencil, const int isreflect, const int islabel, const int issvmc, const int ispolarized>

__device__ int launchnewphoton	(	MCXpos *	p,
		MCXdir *	v,
		Stokes *	s,
		MCXtime *	f,
		float3 *	rv,
		short	flipdir[4],
		Medium *	prop,
		uint *	idx1d,
		OutputType *	field,
		uint *	mediaid,
		OutputType *	w0,
		uint	isdet,
		float	ppath[],
		float	n_det[],
		uint *	dpnum,
		RandType	t[RAND_BUF_LEN],
		RandType	photonseed[RAND_BUF_LEN],
		uint	media[],
		float	srcpattern[],
		int	threadid,
		RandType	rngseed[],
		RandType	seeddata[],
		float	gdebugdata[],
		volatile int	gprogress[],
		float	photontof[],
		MCXsp *	nuvox
	)

inline

Terminate a photon and launch a new photon according to specified source form.

This function terminates the current photon and launches a new photon according to the source type selected. Currently, over a dozen source types are supported, including pencil, isotropic, planar, fourierx, gaussian, zgaussian etc.

Parameters

[in,out]	p	the 3D position and weight of the photon
[in,out]	v	the direction vector of the photon
[in,out]	f	the parameter vector of the photon
[in,out]	s	the Stokes vector of the photon
[in,out]	rv	the reciprocal direction vector of the photon (rv[i]=1/v[i])
[out]	prop	the optical properties of the voxel the photon is launched into
[in,out]	idx1d	the linear index of the voxel containing the photon at launch
[in]	field	the 3D array to store photon weights
[in,out]	mediaid	the medium index at the voxel at launch
[in,out]	w0	initial weight, reset here after launch
[in]	isdet	whether the previous photon being terminated lands at a detector
[in,out]	ppath	pointer to the shared-mem buffer to store photon partial-path data
[in,out]	n_det	array in the constant memory where detector positions are stored
[in,out]	dpnum	global-mem variable where the count of detected photons are stored
[in]	t	RNG state
[in,out]	photonseed	RNG state stored at photon's launch time if replay is needed
[in]	media	domain medium index array, read-only
[in]	srcpattern	user-specified source pattern array if pattern source is used
[in]	threadid	the global index of the current thread
[in]	rngseed	in the replay mode, pointer to the saved detected photon seed data
[in,out]	seeddata	pointer to the buffer to save detected photon seeds
[in,out]	gdebugdata	pointer to the buffer to save photon trajectory positions
[in,out]	gprogress	pointer to the host variable to update progress bar

Early termination of simulation when the detphoton buffer is filled if issavedet is set to 3

First, let's terminate the current photon and perform detection calculations

If the thread completes all assigned photons, terminate this thread.

If this is a replay of a detected photon, initilize the RNG with the stored seed here.

Attempt to launch a new photon until success

Only one branch is taken because of template, this can reduce thread divergence

parameter to generate photon path from coordinates at focus (depends on focal distance and rayleigh range)

if beam direction is along +z or -z direction

if beam dir. is not +z or -z, compute photon position and direction after rotation

photon position displacement after rotation

photon direction after rotation

compute final launch position and update medium label

If beam focus is set, determine the incident angle

Compute the reciprocal of the velocity vector

If a photon is launched outside of the box, or inside a zero-voxel, move it until it hits a non-zero voxel

specular reflection of the bbx is taken care of here

if launch attempted for over 1000 times, stop trying and return

Now a photon is successfully launched, perform necssary initialization for a new trajectory

total energy enters the volume. for diverging/converting beams, this is less than nphoton due to specular reflection loss. This is different from the wide-field MMC, where the total launched energy includes the specular reflection loss

If a progress bar is needed, only sum completed photons from the 1st, last and middle threads to determine progress bar

◆ mcx_corecount()

int mcx_corecount	(	int	v1,
		int	v2
	)

Utility function to calculate the GPU stream processors (cores) per SM.

Obtain GPU core number per MP, this replaces ConvertSMVer2Cores() in libcudautils to avoid extra dependency.

Parameters

[in]	v1	the major version of an NVIDIA GPU
[in]	v2	the minor version of an NVIDIA GPU

◆ mcx_cu_assess()

void mcx_cu_assess	(	cudaError_t	cuerr,
		const char *	file,
		const int	linenum
	)

assert cuda memory allocation result

Here is the call graph for this function:

◆ mcx_list_gpu()

int mcx_list_gpu	(	Config *	cfg,
		GPUInfo **	info
	)

Utility function to query GPU info and set active GPU.

This function query and list all available GPUs on the system and print their parameters. This is used when -L or -I is used.

Parameters

[in,out]	cfg	the simulation configuration structure
[out]	info	the GPU information structure

Here is the call graph for this function:

◆ mcx_main_loop()

template<const int ispencil, const int isreflect, const int islabel, const int issvmc, const int ispolarized>

__global__ void mcx_main_loop	(	uint	media[],
		OutputType	field[],
		float	genergy[],
		uint	n_seed[],
		float4	n_pos[],
		float4	n_dir[],
		float4	n_len[],
		float	n_det[],
		uint	detectedphoton[],
		float	srcpattern[],
		float	replayweight[],
		float	photontof[],
		int	photondetid[],
		RandType *	seeddata,
		float *	gdebugdata,
		float *	ginvcdf,
		float4 *	gsmatrix,
		volatile int *	gprogress
	)

The core Monte Carlo photon simulation kernel (!!!Important!!!)

This is the core Monte Carlo simulation kernel, please see Fig. 1 in Fang2009. everything in the GPU kernels is in grid-unit. To convert back to length, use cfg->unitinmm (scattering/absorption coeff, T, speed etc)

Parameters

[in]	media	domain medium index array, read-only
[out]	field	the 3D/4D array where the fluence/energy-deposit are accummulated
[in,out]	genergy	the array storing the total launched and escaped energy for each thread
[in]	n_seed	the seed to the RNG of this thread
[in,out]	n_pos	the initial position state of the photon for each thread
[in,out]	n_dir	the initial direction state of the photon for each thread
[in,out]	n_len	the initial parameter state of the photon for each thread
[in,out]	detectedphoton	the buffer where the detected photon data are stored
[in]	srcpattern	user-specified source pattern array if pattern source is used
[in]	replayweight	the pre-computed detected photon weight for replay
[in]	photontof	the pre-computed detected photon time-of-fly for replay
[in,out]	seeddata	pointer to the buffer to save detected photon seeds
[in,out]	gdebugdata	pointer to the buffer to save photon trajectory positions
[in,out]	gprogress	pointer to the host variable to update progress bar

the 1D index of the current thread

Load use-defined phase function (inversion of CDF) to the shared memory (first gcfg->nphase floats)

Initialize RNG states

Launch the first photon

The following lines initialize photon state variables, RNG states and

The main photon movement loop Each pass of this loop, we advance photon by one step - meaning that it is either moved from one voxel to the immediately next voxel when the scattering path continues, or moved to the end of the scattering path if it ends within the current voxel.

A scattering event When a photon arrives at a scattering site, 3 values are regenrated 1 - a random unitless scattering length f.pscat, 2 - a 0-2pi uniformly random arimuthal angle 3 - a 0-pi random zenith angle based on the Henyey-Greenstein Phase Function

Rejection method to choose azimuthal angle phi and deflection angle theta

Here we use Henyey-Greenstein Phase Function, "Handbook of Optical Biomedical Diagnostics",2002,Chap3,p234, also see Boas2002

Store old direction cosines for polarized photon simulation

Update direction vector with the two random angles

Update stokes parameters

Only compute the reciprocal vector when v is changed, this saves division calculations, which are very expensive on the GPU

Read the optical property of the current voxel

Advance photon 1 step to the next voxel

convert photon movement length to unitless scattering length by multiplying with mus

if the consumed unitless scat length is less than what's left in f.pscat, keep moving; otherwise, stop in this voxel

final length that the photon moves - either the length to move to the next voxel, or the remaining scattering length

perform ray-interface intersection test to consider intra-voxel curvature (SVMC mode)

if photon moves to the next voxel, use the precomputed intersection coord

although the below 3 lines look dumb, if you change it to flipdir[flipdir[3]] += ..., the speed drops by half, likely due to step locking

calculate photon energy loss

remaining unitless scattering length: sum(s_i*mus_i), unit-less

update photon timer to add time-of-flight (unit = s)

read the medium index of the new voxel (current or next)

if photon moves outside of the volume, set mediaid to 0

isdet now stores the boundary condition flag, this will be overwriten before the end of the loop

otherwise, read the optical property index

upper 16bit is the mask of the covered detector

lower 16bit is the medium index

save fluence to the voxel when photon moves out

if t is within the time window, which spans cfg->maxgate*cfg->tstep.wide

calculate the quality to be accummulated

when mua->0, take limit_{mua->0} w0*(1-exp(-mua*len))/mua yields w0*len

accummulate the quality to the volume using non-atomic operations

in SVMC mode, update tissue type when photons cross voxel or intra-voxel boundary

launch new photon when exceed time window or moving from non-zero voxel to zero voxel without reflection

upper 16bit is the mask of the covered detector

lower 16bit is the medium index

perform Russian Roulette

do boundary reflection/transmission

return the accumulated total energyloss and launched energy back to the host

for debugging purposes, we also pass the last photon states back to the host for printing

◆ mcx_nextafterf()

__device__ float mcx_nextafterf	(	float	a,
		int	dir
	)

inline

A simplified nextafterf() to ensure a photon moves outside of the current voxel after each move.

Parameters

[in]	a	a floating point number
[in]	dir	1: change 1 bit in the positive direction; 0: no change, -1: change 1 bit in the negative direction

First, shift coordinate by 1000 to make sure values are always positive

Then make 1 bit difference along the direction indicated by dir

Last, undo the offset, and return

◆ mcx_run_simulation()

void mcx_run_simulation	(	Config *	cfg,
		GPUInfo *	gpu
	)

Master host code for the MCX simulation kernel (!!!Important!!!)

This function is the master host code for the MCX kernel. It is responsible for initializing all GPU variables, copy data from host to GPU, launch the kernel for simulation, wait for competion, and retrieve the results.

Parameters

[in,out]	cfg	the simulation configuration structure
[in]	gpu	the GPU information structure

p0 - initial photon positions for pencil/isotropic/cone beams, used to initialize {p={x,y,z,w}} state in the GPU kernel

c0 - initial photon direction state, used to initialize {MCXdir v={vx,vy,vz,}} state in the GPU kernel

stokesvec - initial photon polarization state, described by Stokes Vector

gpuphoton - number of photons to be simulated per thread, determined by total workload and thread number

photoncount - number of completed photons returned by all thread, output

mcgrid - GPU grid size, only use 1D grid, used when launching the kernel in cuda <<<>>> operator

mcblock - GPU block size, only use 1D block, used when launching the kernel in cuda <<<>>> operator

sharedbuf - shared memory buffer length to be requested, used when launching the kernel in cuda <<<>>> operator

dimxyz - output volume variable field voxel count, Nx*Ny*Nz*Ns where Ns=cfg.srcnum is the pattern number for photon sharing

media - input volume representing the simulation domain, format specified in cfg.mediaformat, read-only

field - output volume to store GPU computed fluence, length is dimxyz

rfimag - imaginary part of the RF Jacobian, length is dimxyz

Ppos - per-thread photon state initialization host buffers

Pseed - per-thread RNG seed initialization host buffers

Pdet - output host buffer to store detected photon partial-path and other per-photon data

energy - output host buffers to accummulate total launched/absorbed energy per thread, needed for normalization

srcpw, energytot, energyabs - output host buffers to accummulate total launched/absorbed energy per pattern in photon sharing, needed for normalization of multi-pattern simulations

seeddata - output buffer to store RNG initial seeds for each detected photon for replay

detected - total number of detected photons, output

progress - pinned-memory variable as the progress bar during simulation, updated in GPU and visible from host

all pointers start with g___ are the corresponding GPU buffers to read/write host variables defined above

                        |----------------------------------------------->  hostdetreclen  <--------------------------------------|
                                  |------------------------>    partialdata   <-------------------|

host detected photon buffer: detid (1), partial_scat (#media), partial_path (#media), momontum (#media), p_exit (3), v_exit(3), w0 (1) |———————————————> w0offset <-————————————||<–— w0 (#srcnum) –—>||<- RF replay (2)->| gpu detected photon buffer: partial_scat (#media), partial_path (#media), momontum (#media), E_escape (1), E_launch (1), w0 (1), w0_photonsharing (#srcnum) cos(w*T),sin(w*T)

param - constants to be used in the GPU, copied to GPU as gcfg, stored in the constant memory

Start multiple CPU threads using OpenMP, one thread for each GPU device to run simultaneously, threadid returns the current thread ID

Use threadid and cfg.deviceid, the compressed list of active GPU devices, to get the desired GPU ID for this thread

Activate the corresponding GPU device

Use the specified GPU's parameters, stored in gpu[gpuid] to determine the maximum time gates that it can hold

Updating host simulation configuration cfg, only allow the master thread to modify cfg, others are read-only

If domain is 2D, the initial photon launch direction in the singleton dimension is expected to be 0

If autopilot mode is not used, the thread and block sizes are determined by user input from cfg.nthread and cfg.nblocksize

If total thread number is not integer multiples of block size, round it to the largest block size multiple

If cfg.respin is positive, the output data have to be accummulated, so we use a double-buffer to retrieve and then accummulate

Master thread computes total workloads, specified by users (or equally by default) for all active devices, stored as cfg.workload[gpuid]

Now we can determine how many photons to be simualated by multiplying the total photon by the relative ratio of per-device workload divided by the total workload

Once per-thread photon number param.threadphoton is known, we distribute the remaiders, if any, to a subset of threads, one extra photon per thread, so we can get the exact total photon number

Total time gate number is computed

Here we determine if the GPU memory of the current device can store all time gates, if not, disabling normalization

Here we decide the total output buffer, field's length. it is Nx*Ny*Nz*Nt*Ns

A 1D grid is determined by the total thread number and block size

A 1D block is determined by the user specified block size, or by default, 64, determined emperically to get best performance

If "-D R" is used, we launch a special RNG testing kernel to retrieve random numbers from RNG and test for randomness

Allocate all host buffers to store input or output data

Ppos: host buffer for initial photon position+weight

Pdir: host buffer for initial photon direction

Plen: host buffer for initial additional photon states

energy: host buffer for retrieving total launched and escaped energy of each thread

Pdet: host buffer for retrieving all detected photon information

Pseed: RNG seed for each thread in non-replay mode, or

Pseed: RNG seeds for photon replay in GPU threads

Allocate all GPU buffers to store input or output data

Allocate pinned memory variable, progress, for real-time update during kernel run-time

Allocate and copy data needed for photon replay, the needed variables include gPseed per-photon seed to be replayed greplayw per-photon initial weight greplaytof per-photon time-of-flight time in s greplaydetid per-photon index for each replayed photon

Allocate and copy source pattern buffer for 2D and 3D pattern sources

Saving detected photon is enabled by default, but in case if a user disabled this feature, a warning is printed

Pre-compute array dimension strides to move in +-x/y/z dimension quickly in the GPU, stored in the constant memory

Inside the GPU kernel, volume is always assumbed to be col-major (like those generated by MATLAB or FORTRAN)

Additional constants to avoid repeated computation inside GPU

Get ready to start GPU simulation here, clock is now ticking ...

Copy all host buffers to the GPU

Copy constants to the constant memory on the GPU

If one has to simulate a lot of time gates, using the GPU global memory requires extra caution. If the total global memory is bigger than the total memory to save all the snapshots, i.e. size(field)*(tend-tstart)/tstep, one simply sets gpu[gpuid].maxgate to the total gate number; this will run GPU kernel once. If the required memory is bigger than the video memory, set gpu[gpuid].maxgate to a number which fits, and the snapshot will be saved with an increment of gpu[gpuid].maxgate snapshots. In this case, the later simulations will restart from photon launching and exhibit redundancies.

The calculation of the energy conservation will only reflect the last simulation.

Outer loop: loop over each time-gate-group, determined by the capacity of the global memory to hold the output data, in most cases, totalgates is 1

Determine the start and end time of the current time-gate-group

Copy param to the constant memory variable gcfg

Inner loop: loop over total number of repetitions specified by cfg.respin, results will be accumulated to field

Each repetition, we have to reset the output buffers, including gfield and gPdet

Start the clock for GPU-kernel only run-time here

Determine template constants for compilers to build specialized binary instances to reduce branching and thread-divergence. If not using template, the performance can take a 20% drop.

ispencil: template constant, if 1, launch photon code is dramatically simplified

isref: template constant, if 1, perform boundary reflection, if 0, total-absorbion boundary, can simplify kernel

issvmc: template constant, if 1, consider the input volume containing split-voxel data, see Yan2020 for details

ispolarized: template constant, if 1, perform polarized light simulations, currently only supports label-based media

Enable reflection flag when c or m flags are used in the cfg.bc boundary condition flags

Launch GPU kernel using template constants. Here, the compiler will create 2^4=16 individually compiled kernel PTX binaries for each combination of template variables. This creates bigger binary and slower compilation time, but brings up to 20%-30% speed improvement on certain simulations.

By now, the GPU kernel has been launched asynchronously, the master thread on the host starts reading a pinned memory variable, gprogress, to realtimely read the completed photon count updated inside the GPU kernel while it is running.

host variable progress is pinned with the GPU variable gprogress, and can be updated by the GPU kernel from the device. We can read this variable to see how many photons are simulated.

Here we use the below formula to compute the 0-100% completion ratio. Only half of the threads updates the progress, and each thread only update the counter 5 times at 0%/25%/50%/75%/100% progress to minimize overhead while still providing a smooth progress bar.

By calling cudaDeviceSynchronize, the host thread now waits for the completion of the kernel, then start retrieving all GPU output data

Here, the GPU kernel is completely executed and returned

now we can estimate and print the GPU-kernel-only runtime

If the GPU kernel crashed or terminated by error during execution, we need to capture it by calling cudaGetLastError and terminate mcx if error happens

Now, we start retrieving all output variables, and copy those to the corresponding host buffers

photoncount returns the actual completely simulated photons returned by GPU threads, no longer used

If '-D M' is specified, we retrieve photon trajectory data and store those to cfg.exportdebugdata and cfg.debugdatalen

If photon detection is enabled and detectors are defined, we retrieve partial-path length data, among others, to cfg.exportdetected and detected

Accumulate volumetric fluence from all threads/devices

If double-precision is used for output, we do not need two buffers; however, by default, we use single-precision output, we need to copy and accumulate two separate floating-point buffers to minimize round-off errors near the source

If respin is used, each repeatition is accumulated to the 2nd half of the buffer

Here is the end of the inner-loop (respin)

If respin is used, copy the accumulated buffer in the 2nd half to the first half

Retrieve accumulated total launched and residual energy from each thread

For MATLAB mex file, the data is copied to a pre-allocated buffer cfg->export* as a return variable

Here is the end of the outer-loop, over time-gate groups

Let the master thread to deal with the normalization and file IO First, if multi-pattern simulation, i.e. photon sharing, is used, we normalize each pattern first

Now we normalize the fluence so that the default output is fluence rate in joule/(s*mm^2) generated by a unitary source (1 joule total).

The raw output directly from GPU is the accumulated energy-loss per photon moving step in joule when cfg.outputtype='fluence', or energy-loss multiplied by mua (1/mm) per voxel (joule/mm) when cfg.outputtype='flux' (default).

If output is flux (J/(s*mm^2), default), raw data (joule*mm) is multiplied by (1/(Nphoton*Vvox*dt)) If output is fluence (J/mm^2), raw data (joule*mm) is multiplied by (1/(Nphoton*Vvox))

If output is energy (joule), raw data is simply multiplied by 1/Nphoton

In photon sharing mode, where multiple pattern sources are simulated, each solution is normalized separately

If not running as a mex file, we need to save volumetric output data, if enabled, as a file, with suffix specifed by cfg.outputformat (mc2,nii, or .jdat or .jbat)

If not running as a mex file, we need to save detected photon data, if enabled, as a file, either as a .mch file, or a .jdat/.jbat file

If not running as a mex file, we need to save photon trajectory data, if enabled, as a file, either as a .mct file, or a .jdat/.jbat file

Copying GPU photon states back to host as Ppos, Pdir and Plen for debugging purpose is depreciated

Report simulation summary, total energy here equals total simulated photons+unfinished photons for all threads

Simulation is complete, now we need clear up all GPU memory buffers

The below call in theory is not needed, but it ensures the device is freed for other programs, especially on Windows

Lastly, free all host buffers, the simulation is complete.

◆ mcx_smxblock()

int mcx_smxblock	(	int	v1,
		int	v2
	)

Utility function to calculate the maximum blocks per SM.

Parameters

[in]	v1	the major version of an NVIDIA GPU
[in]	v2	the minor version of an NVIDIA GPU

◆ mcx_test_rng()

__global__ void mcx_test_rng	(	OutputType	field[],
		uint	n_seed[]
	)

A stand-alone kernel to test random number generators.

This function fills the domain with the random numbers generated from the GPU. One can use this function to dump RNG values and test for quality. use -D R in the command line to enable this output.

Parameters

[out]	field	the array to be filled with RNG values
[in]	n_seed	the seed to the RNG

◆ mcx_threadmultiplier()

int mcx_threadmultiplier	(	int	v1,
		int	v2
	)

Utility function to calculate the maximum blocks per SM.

Parameters

[in]	v1	the major version of an NVIDIA GPU
[in]	v2	the minor version of an NVIDIA GPU

◆ ray_plane_intersect()

__device__ int ray_plane_intersect	(	float3 *	p0,
		MCXdir *	v,
		Medium *	prop,
		float &	len,
		float &	slen,
		MCXsp *	nuvox
	)

Compute intersection point between a photon path and the intra-voxel interface if present.

This function tests if a ray intersects with an in-voxel marching-cube boundary (a plane) in SVMC mode

Parameters

[in]	p0	current position
[in]	v	current photon direction
[in]	prop	optical properties
[in,out]	len	photon movement length, updated if intersection is found
[in,out]	slen	remaining unitless scattering length, updated if intersection is found
[in]	nuvox	a struct storing normal direction (nv) and a point on the plane
[in]	f	photon state including total time-of-flight, number of scattering etc
[in]	flipdir[0,1,2]	current voxel xi/yi/zi; flipdir[3]: 0: transmit through x=x0 plane; 1: through y=y0 plane; 2: through z=z0 plane

◆ reflectcoeff()

__device__ float reflectcoeff	(	MCXdir *	v,
		float	n1,
		float	n2,
		int	flipdir
	)

inline

Calculating the reflection coefficient at an interface.

This function calculates the reflection coefficient at an interface of different refrective indicex (n1/n2)

Parameters

[in]	v	the direction vector of the photon
[in]	n1	the refrective index of the voxel the photon leaves
[in]	n2	the refrective index of the voxel the photon enters
[in]	flipdir	0: transmit through x=x0 plane; 1: through y=y0 plane; 2: through z=z0 plane

Returns: the reflection coefficient R=(Rs+Rp)/2, Rs: R of the perpendicularly polarized light, Rp: parallelly polarized light

1-[n1/n2*sin(si)]^2 = cos(ti)^2

to save one sqrt

Rp

(Rp+Rs)/2

◆ reflectray()

__device__ int reflectray	(	float	n1,
		float3 *	c0,
		float3 *	rv,
		MCXsp *	nuvox,
		Medium *	prop,
		RandType	t[RAND_BUF_LEN]
	)

Perform reflection/refraction computation along mismatched intra-voxel interface.

This function returns a new direction when a ray is reflected/transmitted through an in-voxel marching-cube boundary in the SVMC mode

Parameters

[in]	n1	refractive index of the current subvolume in a split-voxel
[in,out]	c0	current photon direction
[out]	rv	reciprocated direction vector rv={1/c0.x,1/c0.y,1/c0.z}
[in]	prop	optical properties
[in]	nuvox	a struct storing normal direction (nv) and a point on the plane
[in]	t	random number state for deciding reflection/transmission

◆ rotatevector()

__device__ void rotatevector	(	MCXdir *	v,
		float	stheta,
		float	ctheta,
		float	sphi,
		float	cphi
	)

inline

Compute 3D-scattering direction vector.

This function updates the direction vector after a 3D scattering event

Parameters

[in,out]	v	the direction vector of the photon
[in]	stheta	the sine of the azimuthal angle
[in]	ctheta	the cosine of the azimuthal angle
[in]	sphi	the sine of the zenith angle
[in]	cphi	the cosine of the zenith angle

◆ rotatevector2d()

__device__ void rotatevector2d	(	MCXdir *	v,
		float	stheta,
		float	ctheta
	)

inline

Compute 2D-scattering if the domain has a dimension of 1 in x/y or z.

This function performs 2D scattering calculation if the domain is only a sheet of voxels

Parameters

[in,out]	v	the direction vector of the photon
[in]	stheta	the sine of the rotation angle
[in]	ctheta	the cosine of the rotation angle

◆ savedebugdata()

__device__ void savedebugdata	(	MCXpos *	p,
		uint	id,
		float *	gdebugdata
	)

inline

Saving photon trajectory data for debugging purposes.

Parameters

[in]	p	the position/weight of the current photon packet
[in]	id	the global index of the photon
[in]	gdebugdata	pointer to the global-memory buffer to store the trajectory info

◆ skipvoid()

template<const int islabel, const int issvmc>

__device__ int skipvoid	(	MCXpos *	p,
		MCXdir *	v,
		MCXtime *	f,
		float3 *	rv,
		uint	media[],
		RandType	t[RAND_BUF_LEN],
		MCXsp *	nuvox
	)

inline

Advance photon to the 1st non-zero voxel if launched in the backgruond.

This function advances the photon to the 1st non-zero voxel along the direction of v if the photon is launched outside of the cubic domain or in a zero-voxel. To avoid large overhead, photon can only advance gcfg->minaccumtime steps, which can be set using the –maxvoidstep flag; by default, this limit is 1000.

Parameters

[in]	v	the direction vector of the photon
[in]	n1	the refrective index of the voxel the photon leaves
[in]	n2	the refrective index of the voxel the photon enters
[in]	flipdir	0: transmit through x=x0 plane; 1: through y=y0 plane; 2: through z=z0 plane

Returns: the reflection coefficient R=(Rs+Rp)/2, Rs: R of the perpendicularly polarized light, Rp: parallelly polarized light

◆ transmit()

__device__ void transmit	(	MCXdir *	v,
		float	n1,
		float	n2,
		int	flipdir
	)

inline

Calculating the direction vector after transmission.

This function updates the direction vector after the photon passing an interface of different refrective indicex (n1/n2). Because MCX only handles voxelated domain, transmission is applied only to 1 of the components, and then the vector is normalized.

Parameters

[in,out]	v	the direction vector of the photon
[in]	n1	the refrective index of the voxel the photon leaves
[in]	n2	the refrective index of the voxel the photon enters
[in]	flipdir	0: transmit through x=x0 plane; 1: through y=y0 plane; 2: through z=z0 plane

◆ updateproperty()

template<const int islabel, const int issvmc>

__device__ void updateproperty	(	Medium *	prop,
		unsigned int &	mediaid,
		RandType	t[RAND_BUF_LEN],
		unsigned int	idx1d,
		uint	media[],
		float3 *	p,
		MCXsp *	nuvox,
		short	flipdir[4]
	)

Loading optical properties from constant memory.

This function parses the media input and load optical properties from GPU memory

Parameters

[out]	prop	pointer to the current optical properties {mua, mus, g, n}
[in]	mediaid	the media ID (32 bit) of the current voxel, format is specified in gcfg->mediaformat or cfg->mediabyte

The default mcx input volume is assumed to be 4-byte per voxel (SVMC mode requires 2x 4-byte voxels for 8 data points)

The data encoded in the voxel are parsed based on the gcfg->mediaformat flag. Below, we use [s*] to represent 2-byte short integers; [h*] to represent a 2-byte half-precision floating point number; [c*] to represent 1-byte unsigned char integers and [i*] to represent 4-byte integers; [f*] for 4-byte floating point number index 0 starts from the lowest (least significant bit) end

s[1]: half-prec property; s[0]: high 2bits: idx 0-3, low 14bits: tissue label

Extract the reference point of the intra-voxel interface

Extract the normal vector of the intra-voxel interface

Determine tissue label corresponding to the current photon position

Variable Documentation

◆ gcfg

__constant__ MCXParam gcfg[1]

Simulation constant parameters stored in the constant memory.

This variable stores all constants used in the simulation.

◆ gproperty

__constant__ float4 gproperty[MAX_PROP_AND_DETECTORS]

Concatenated optical properties and det positions, stored in constant memory.

The first cfg.maxmedia elements of this array contain the optical properties of the domains. Format: {x}:mua,{y}:mus,{z}:anisotropy (g),{w}:refractive index (n). The following cfg.detnum elements of this array contains the detector information. Format: {x,y,z}: the x/y/z coord. of the detector, and {w}: radius; all in grid unit. The total length (both media properties and detector) is defined by MAX_PROP_AND_DETECTORS, which is 4000 to fully utilize the constant memory space (64kb=4096 float4)

Macros

Typedefs

Functions

Variables

Detailed Description

Macro Definition Documentation

◆ CUDA_NAN_F

◆ float1

◆ float2

◆ float3

◆ float4

◆ int2

◆ int3

◆ uint2

◆ uint3

◆ uint4

Function Documentation

◆ atomicadd()

◆ clearpath()

◆ getrefractiveidx()

◆ hitgrid()

◆ launchnewphoton()

◆ mcx_corecount()

◆ mcx_cu_assess()

◆ mcx_list_gpu()

◆ mcx_main_loop()

◆ mcx_nextafterf()

◆ mcx_run_simulation()

◆ mcx_smxblock()

◆ mcx_test_rng()

◆ mcx_threadmultiplier()

◆ ray_plane_intersect()

◆ reflectcoeff()

◆ reflectray()

◆ rotatevector()

◆ rotatevector2d()

◆ savedebugdata()

◆ skipvoid()

◆ transmit()

◆ updateproperty()

Variable Documentation

◆ gcfg

◆ gproperty