Monte Carlo eXtreme (MCX)
|
GPU kernel for MC simulations and CUDA host code. More...
#include <cmath>
#include "mcx_core.h"
#include "mcx_tictoc.h"
#include "mcx_const.h"
#include <cuda.h>
#include "cuda_fp16.h"
#include "mcx_rand_xorshift128p.cu"
Macros | |
#define | _USE_MATH_DEFINES |
#define | SHADOWCOUNT 2 |
#define | ZERO 0.f |
#define | CUDA_ASSERT(a) mcx_cu_assess((a),__FILE__,__LINE__) |
#define | int2(a, b) make_int2(a,b) |
#define | int3(a, b, c) make_int3(a,b,c) |
#define | uint2(a, b) make_uint2(a,b) |
#define | uint3(a, b, c) make_uint3(a,b,c) |
#define | uint4(a, b, c, d) make_uint4(a,b,c,d) |
#define | float1(a) make_float1(a) |
#define | float2(a, b) make_float2(a,b) |
#define | float3(a, b, c) make_float3(a,b,c) |
#define | float4(a, b, c, d) make_float4(a,b,c,d) |
#define | FL3(f) make_float3(f,f,f) |
#define | CUDA_NAN_F (__int_as_float(0x7fffffff)) |
#define | __CUDA_ARCH_LIST__ 350 |
Typedefs | |
typedef float | OutputType |
Functions | |
__device__ float3 | operator+ (const float3 &a, const float3 &b) |
Adding two float3 vectors c=a+b. | |
__device__ void | operator+= (float3 &a, const float3 &b) |
Increatment a float3 vector by another float3, a+=b. | |
__device__ float3 | operator- (const float3 &a, const float3 &b) |
Subtracting two float3 vectors c=a+b. | |
__device__ float3 | operator- (const float3 &a) |
Negating a float3 vector c=-a. | |
__device__ float3 | operator* (const float &a, const float3 &b) |
Front-multiplying a float3 with a scalar c=a*b. | |
__device__ float3 | operator* (const float3 &a, const float &b) |
Post-multiplying a float3 with a scalar c=a*b. | |
__device__ float3 | operator* (const float3 &a, const float3 &b) |
Multiplying two float3 vectors c=a*b. | |
__device__ float | dot (const float3 &a, const float3 &b) |
Dot-product of two float3 vectors c=a*b. | |
__device__ OutputType | atomicadd (OutputType *address, OutputType value) |
Texture memory for storing media indices. More... | |
__device__ void | clearpath (float *p, int maxmediatype) |
Reset shared memory buffer to store photon partial-path data for a new photon. More... | |
__device__ void | savedebugdata (MCXpos *p, uint id, float *gdebugdata) |
Saving photon trajectory data for debugging purposes. More... | |
__device__ float | mcx_nextafterf (float a, int dir) |
A simplified nextafterf() to ensure a photon moves outside of the current voxel after each move. More... | |
__device__ float | hitgrid (float3 *p0, float3 *v, float *rv, short id[4]) |
Core function for photon-voxel ray-tracing. More... | |
__device__ void | transmit (MCXdir *v, float n1, float n2, int flipdir) |
Calculating the direction vector after transmission. More... | |
__device__ float | reflectcoeff (MCXdir *v, float n1, float n2, int flipdir) |
Calculating the reflection coefficient at an interface. More... | |
template<const int islabel, const int issvmc> | |
__device__ void | updateproperty (Medium *prop, unsigned int &mediaid, RandType t[RAND_BUF_LEN], unsigned int idx1d, uint media[], float3 *p, MCXsp *nuvox, short flipdir[4]) |
Loading optical properties from constant memory. More... | |
__device__ int | ray_plane_intersect (float3 *p0, MCXdir *v, Medium *prop, float &len, float &slen, MCXsp *nuvox) |
Compute intersection point between a photon path and the intra-voxel interface if present. More... | |
__device__ int | reflectray (float n1, float3 *c0, float3 *rv, MCXsp *nuvox, Medium *prop, RandType t[RAND_BUF_LEN]) |
Perform reflection/refraction computation along mismatched intra-voxel interface. More... | |
__device__ float | getrefractiveidx (unsigned int mediaid) |
Loading optical properties from constant memory. More... | |
template<const int islabel, const int issvmc> | |
__device__ int | skipvoid (MCXpos *p, MCXdir *v, MCXtime *f, float3 *rv, uint media[], RandType t[RAND_BUF_LEN], MCXsp *nuvox) |
Advance photon to the 1st non-zero voxel if launched in the backgruond. More... | |
__device__ void | rotatevector2d (MCXdir *v, float stheta, float ctheta) |
Compute 2D-scattering if the domain has a dimension of 1 in x/y or z. More... | |
__device__ void | rotatevector (MCXdir *v, float stheta, float ctheta, float sphi, float cphi) |
Compute 3D-scattering direction vector. More... | |
template<const int ispencil, const int isreflect, const int islabel, const int issvmc, const int ispolarized> | |
__device__ int | launchnewphoton (MCXpos *p, MCXdir *v, Stokes *s, MCXtime *f, float3 *rv, short flipdir[4], Medium *prop, uint *idx1d, OutputType *field, uint *mediaid, OutputType *w0, uint isdet, float ppath[], float n_det[], uint *dpnum, RandType t[RAND_BUF_LEN], RandType photonseed[RAND_BUF_LEN], uint media[], float srcpattern[], int threadid, RandType rngseed[], RandType seeddata[], float gdebugdata[], volatile int gprogress[], float photontof[], MCXsp *nuvox) |
Terminate a photon and launch a new photon according to specified source form. More... | |
__global__ void | mcx_test_rng (OutputType field[], uint n_seed[]) |
A stand-alone kernel to test random number generators. More... | |
template<const int ispencil, const int isreflect, const int islabel, const int issvmc, const int ispolarized> | |
__global__ void | mcx_main_loop (uint media[], OutputType field[], float genergy[], uint n_seed[], float4 n_pos[], float4 n_dir[], float4 n_len[], float n_det[], uint detectedphoton[], float srcpattern[], float replayweight[], float photontof[], int photondetid[], RandType *seeddata, float *gdebugdata, float *ginvcdf, float4 *gsmatrix, volatile int *gprogress) |
The core Monte Carlo photon simulation kernel (!!!Important!!!) More... | |
void | mcx_cu_assess (cudaError_t cuerr, const char *file, const int linenum) |
int | mcx_corecount (int v1, int v2) |
Utility function to calculate the GPU stream processors (cores) per SM. More... | |
int | mcx_smxblock (int v1, int v2) |
Utility function to calculate the maximum blocks per SM. More... | |
int | mcx_threadmultiplier (int v1, int v2) |
Utility function to calculate the maximum blocks per SM. More... | |
int | mcx_list_gpu (Config *cfg, GPUInfo **info) |
Utility function to query GPU info and set active GPU. More... | |
void | mcx_run_simulation (Config *cfg, GPUInfo *gpu) |
Master host code for the MCX simulation kernel (!!!Important!!!) More... | |
Variables | |
__constant__ float4 | gproperty [MAX_PROP_AND_DETECTORS] |
Concatenated optical properties and det positions, stored in constant memory. More... | |
__constant__ MCXParam | gcfg [1] |
Simulation constant parameters stored in the constant memory. More... | |
__device__ uint | gjumpdebug [1] |
Global variable to store the number of photon movements for debugging purposes. | |
__shared__ char | sharedmem [] |
Pointer to the shared memory (storing photon data and spilled registers) | |
GPU kernel for MC simulations and CUDA host code.
This unit contains both the GPU kernels (running on the GPU device) and host code (running on the host) that initializes GPU buffers, calling kernels, retrieving all computed results (fluence, diffuse reflectance detected photon data) from GPU, and post processing, such as normalization, saving data to file etc. The main function of the GPU kernel is mcx_main_loop
and the main function of the host code is mcx_run_simulation
.
This unit is written with CUDA-C and shall be compiled using nvcc in cuda-toolkit.
#define CUDA_NAN_F (__int_as_float(0x7fffffff)) |
NaN constant in CUDA
#define float1 | ( | a | ) | make_float1(a) |
float1 constructor
#define float2 | ( | a, | |
b | |||
) | make_float2(a,b) |
float2 constructor
#define float3 | ( | a, | |
b, | |||
c | |||
) | make_float3(a,b,c) |
float3 constructor
#define float4 | ( | a, | |
b, | |||
c, | |||
d | |||
) | make_float4(a,b,c,d) |
float4 constructor
#define int2 | ( | a, | |
b | |||
) | make_int2(a,b) |
int2 constructor
#define int3 | ( | a, | |
b, | |||
c | |||
) | make_int3(a,b,c) |
int3 constructor
#define uint2 | ( | a, | |
b | |||
) | make_uint2(a,b) |
uint2 constructor
#define uint3 | ( | a, | |
b, | |||
c | |||
) | make_uint3(a,b,c) |
uint3 constructor
#define uint4 | ( | a, | |
b, | |||
c, | |||
d | |||
) | make_uint4(a,b,c,d) |
uint4 constructor
|
inline |
Texture memory for storing media indices.
Tested with texture memory for media, only improved 1% speed to keep code portable, use global memory for now also need to change all media[idx1d] to tex1Dfetch() below Floating-point atomic addition
|
inline |
Reset shared memory buffer to store photon partial-path data for a new photon.
[in] | p | pointer to the partial-path buffer |
[in] | maxmediatype | length of the buffer to be reset |
__device__ float getrefractiveidx | ( | unsigned int | mediaid | ) |
Loading optical properties from constant memory.
This function parses the media input and load optical properties from GPU memory
[in] | mediaid | the media ID (32 bit) of the current voxel, format is specified in gcfg->mediaformat or cfg->mediabyte |
Core function for photon-voxel ray-tracing.
This is the heart of the MCX simulation algorithm. It calculates the nearest intersection of the ray inside the current cubic voxel.
[in] | p0 | the x/y/z position of the current photon |
[in] | v | the direction vector of the photon |
[in] | rv | pre-computed reciprocal of the velocity vector (v) |
[out] | id | 0: intersect with x=x0 plane; 1: intersect with y=y0 plane; 2: intersect with z=z0 plane first |
|
inline |
Terminate a photon and launch a new photon according to specified source form.
This function terminates the current photon and launches a new photon according to the source type selected. Currently, over a dozen source types are supported, including pencil, isotropic, planar, fourierx, gaussian, zgaussian etc.
[in,out] | p | the 3D position and weight of the photon |
[in,out] | v | the direction vector of the photon |
[in,out] | f | the parameter vector of the photon |
[in,out] | s | the Stokes vector of the photon |
[in,out] | rv | the reciprocal direction vector of the photon (rv[i]=1/v[i]) |
[out] | prop | the optical properties of the voxel the photon is launched into |
[in,out] | idx1d | the linear index of the voxel containing the photon at launch |
[in] | field | the 3D array to store photon weights |
[in,out] | mediaid | the medium index at the voxel at launch |
[in,out] | w0 | initial weight, reset here after launch |
[in] | isdet | whether the previous photon being terminated lands at a detector |
[in,out] | ppath | pointer to the shared-mem buffer to store photon partial-path data |
[in,out] | n_det | array in the constant memory where detector positions are stored |
[in,out] | dpnum | global-mem variable where the count of detected photons are stored |
[in] | t | RNG state |
[in,out] | photonseed | RNG state stored at photon's launch time if replay is needed |
[in] | media | domain medium index array, read-only |
[in] | srcpattern | user-specified source pattern array if pattern source is used |
[in] | threadid | the global index of the current thread |
[in] | rngseed | in the replay mode, pointer to the saved detected photon seed data |
[in,out] | seeddata | pointer to the buffer to save detected photon seeds |
[in,out] | gdebugdata | pointer to the buffer to save photon trajectory positions |
[in,out] | gprogress | pointer to the host variable to update progress bar |
Early termination of simulation when the detphoton buffer is filled if issavedet is set to 3
First, let's terminate the current photon and perform detection calculations
If the thread completes all assigned photons, terminate this thread.
If this is a replay of a detected photon, initilize the RNG with the stored seed here.
Attempt to launch a new photon until success
Only one branch is taken because of template, this can reduce thread divergence
parameter to generate photon path from coordinates at focus (depends on focal distance and rayleigh range)
if beam direction is along +z or -z direction
if beam dir. is not +z or -z, compute photon position and direction after rotation
photon position displacement after rotation
photon direction after rotation
compute final launch position and update medium label
If beam focus is set, determine the incident angle
Compute the reciprocal of the velocity vector
If a photon is launched outside of the box, or inside a zero-voxel, move it until it hits a non-zero voxel
specular reflection of the bbx is taken care of here
if launch attempted for over 1000 times, stop trying and return
Now a photon is successfully launched, perform necssary initialization for a new trajectory
total energy enters the volume. for diverging/converting beams, this is less than nphoton due to specular reflection loss. This is different from the wide-field MMC, where the total launched energy includes the specular reflection loss
If a progress bar is needed, only sum completed photons from the 1st, last and middle threads to determine progress bar
int mcx_corecount | ( | int | v1, |
int | v2 | ||
) |
Utility function to calculate the GPU stream processors (cores) per SM.
Obtain GPU core number per MP, this replaces ConvertSMVer2Cores() in libcudautils to avoid extra dependency.
[in] | v1 | the major version of an NVIDIA GPU |
[in] | v2 | the minor version of an NVIDIA GPU |
void mcx_cu_assess | ( | cudaError_t | cuerr, |
const char * | file, | ||
const int | linenum | ||
) |
assert cuda memory allocation result
Utility function to query GPU info and set active GPU.
This function query and list all available GPUs on the system and print their parameters. This is used when -L or -I is used.
[in,out] | cfg | the simulation configuration structure |
[out] | info | the GPU information structure |
__global__ void mcx_main_loop | ( | uint | media[], |
OutputType | field[], | ||
float | genergy[], | ||
uint | n_seed[], | ||
float4 | n_pos[], | ||
float4 | n_dir[], | ||
float4 | n_len[], | ||
float | n_det[], | ||
uint | detectedphoton[], | ||
float | srcpattern[], | ||
float | replayweight[], | ||
float | photontof[], | ||
int | photondetid[], | ||
RandType * | seeddata, | ||
float * | gdebugdata, | ||
float * | ginvcdf, | ||
float4 * | gsmatrix, | ||
volatile int * | gprogress | ||
) |
The core Monte Carlo photon simulation kernel (!!!Important!!!)
This is the core Monte Carlo simulation kernel, please see Fig. 1 in Fang2009. everything in the GPU kernels is in grid-unit. To convert back to length, use cfg->unitinmm (scattering/absorption coeff, T, speed etc)
[in] | media | domain medium index array, read-only |
[out] | field | the 3D/4D array where the fluence/energy-deposit are accummulated |
[in,out] | genergy | the array storing the total launched and escaped energy for each thread |
[in] | n_seed | the seed to the RNG of this thread |
[in,out] | n_pos | the initial position state of the photon for each thread |
[in,out] | n_dir | the initial direction state of the photon for each thread |
[in,out] | n_len | the initial parameter state of the photon for each thread |
[in,out] | detectedphoton | the buffer where the detected photon data are stored |
[in] | srcpattern | user-specified source pattern array if pattern source is used |
[in] | replayweight | the pre-computed detected photon weight for replay |
[in] | photontof | the pre-computed detected photon time-of-fly for replay |
[in,out] | seeddata | pointer to the buffer to save detected photon seeds |
[in,out] | gdebugdata | pointer to the buffer to save photon trajectory positions |
[in,out] | gprogress | pointer to the host variable to update progress bar |
the 1D index of the current thread
Load use-defined phase function (inversion of CDF) to the shared memory (first gcfg->nphase floats)
Initialize RNG states
Launch the first photon
The following lines initialize photon state variables, RNG states and
The main photon movement loop Each pass of this loop, we advance photon by one step - meaning that it is either moved from one voxel to the immediately next voxel when the scattering path continues, or moved to the end of the scattering path if it ends within the current voxel.
A scattering event When a photon arrives at a scattering site, 3 values are regenrated 1 - a random unitless scattering length f.pscat, 2 - a 0-2pi uniformly random arimuthal angle 3 - a 0-pi random zenith angle based on the Henyey-Greenstein Phase Function
Rejection method to choose azimuthal angle phi and deflection angle theta
Here we use Henyey-Greenstein Phase Function, "Handbook of Optical Biomedical Diagnostics",2002,Chap3,p234, also see Boas2002
Store old direction cosines for polarized photon simulation
Update direction vector with the two random angles
Update stokes parameters
Only compute the reciprocal vector when v is changed, this saves division calculations, which are very expensive on the GPU
Read the optical property of the current voxel
Advance photon 1 step to the next voxel
convert photon movement length to unitless scattering length by multiplying with mus
if the consumed unitless scat length is less than what's left in f.pscat, keep moving; otherwise, stop in this voxel
final length that the photon moves - either the length to move to the next voxel, or the remaining scattering length
perform ray-interface intersection test to consider intra-voxel curvature (SVMC mode)
if photon moves to the next voxel, use the precomputed intersection coord
although the below 3 lines look dumb, if you change it to flipdir[flipdir[3]] += ..., the speed drops by half, likely due to step locking
calculate photon energy loss
remaining unitless scattering length: sum(s_i*mus_i), unit-less
update photon timer to add time-of-flight (unit = s)
read the medium index of the new voxel (current or next)
if photon moves outside of the volume, set mediaid to 0
isdet now stores the boundary condition flag, this will be overwriten before the end of the loop
otherwise, read the optical property index
upper 16bit is the mask of the covered detector
lower 16bit is the medium index
save fluence to the voxel when photon moves out
if t is within the time window, which spans cfg->maxgate*cfg->tstep.wide
calculate the quality to be accummulated
when mua->0, take limit_{mua->0} w0*(1-exp(-mua*len))/mua yields w0*len
accummulate the quality to the volume using non-atomic operations
in SVMC mode, update tissue type when photons cross voxel or intra-voxel boundary
launch new photon when exceed time window or moving from non-zero voxel to zero voxel without reflection
upper 16bit is the mask of the covered detector
lower 16bit is the medium index
perform Russian Roulette
do boundary reflection/transmission
return the accumulated total energyloss and launched energy back to the host
for debugging purposes, we also pass the last photon states back to the host for printing
|
inline |
A simplified nextafterf() to ensure a photon moves outside of the current voxel after each move.
[in] | a | a floating point number |
[in] | dir | 1: change 1 bit in the positive direction; 0: no change, -1: change 1 bit in the negative direction |
First, shift coordinate by 1000 to make sure values are always positive
Then make 1 bit difference along the direction indicated by dir
Last, undo the offset, and return
Master host code for the MCX simulation kernel (!!!Important!!!)
This function is the master host code for the MCX kernel. It is responsible for initializing all GPU variables, copy data from host to GPU, launch the kernel for simulation, wait for competion, and retrieve the results.
[in,out] | cfg | the simulation configuration structure |
[in] | gpu | the GPU information structure |
p0
- initial photon positions for pencil/isotropic/cone beams, used to initialize {p={x,y,z,w}} state in the GPU kernel
c0
- initial photon direction state, used to initialize {MCXdir v={vx,vy,vz,}} state in the GPU kernel
stokesvec
- initial photon polarization state, described by Stokes Vector
gpuphoton
- number of photons to be simulated per thread, determined by total workload and thread number
photoncount
- number of completed photons returned by all thread, output
mcgrid
- GPU grid size, only use 1D grid, used when launching the kernel in cuda <<<>>> operator
mcblock
- GPU block size, only use 1D block, used when launching the kernel in cuda <<<>>> operator
sharedbuf
- shared memory buffer length to be requested, used when launching the kernel in cuda <<<>>> operator
dimxyz
- output volume variable field
voxel count, Nx*Ny*Nz*Ns where Ns=cfg.srcnum is the pattern number for photon sharing
media
- input volume representing the simulation domain, format specified in cfg.mediaformat, read-only
field
- output volume to store GPU computed fluence, length is dimxyz
rfimag
- imaginary part of the RF Jacobian, length is dimxyz
Ppos
- per-thread photon state initialization host buffers
Pseed
- per-thread RNG seed initialization host buffers
Pdet
- output host buffer to store detected photon partial-path and other per-photon data
energy
- output host buffers to accummulate total launched/absorbed energy per thread, needed for normalization
srcpw
, energytot
, energyabs
- output host buffers to accummulate total launched/absorbed energy per pattern in photon sharing, needed for normalization of multi-pattern simulations
seeddata
- output buffer to store RNG initial seeds for each detected photon for replay
detected
- total number of detected photons, output
progress
- pinned-memory variable as the progress bar during simulation, updated in GPU and visible from host
all pointers start with g___ are the corresponding GPU buffers to read/write host variables defined above
|-----------------------------------------------> hostdetreclen <--------------------------------------| |------------------------> partialdata <-------------------|
host detected photon buffer: detid (1), partial_scat (#media), partial_path (#media), momontum (#media), p_exit (3), v_exit(3), w0 (1) |———————————————> w0offset <-————————————||<–— w0 (#srcnum) –—>||<- RF replay (2)->| gpu detected photon buffer: partial_scat (#media), partial_path (#media), momontum (#media), E_escape (1), E_launch (1), w0 (1), w0_photonsharing (#srcnum) cos(w*T),sin(w*T)
param
- constants to be used in the GPU, copied to GPU as gcfg
, stored in the constant memory
Start multiple CPU threads using OpenMP, one thread for each GPU device to run simultaneously, threadid
returns the current thread ID
Use threadid
and cfg.deviceid, the compressed list of active GPU devices, to get the desired GPU ID for this thread
Activate the corresponding GPU device
Use the specified GPU's parameters, stored in gpu[gpuid] to determine the maximum time gates that it can hold
Updating host simulation configuration cfg
, only allow the master thread to modify cfg, others are read-only
If domain is 2D, the initial photon launch direction in the singleton dimension is expected to be 0
If autopilot mode is not used, the thread and block sizes are determined by user input from cfg.nthread and cfg.nblocksize
If total thread number is not integer multiples of block size, round it to the largest block size multiple
If cfg.respin is positive, the output data have to be accummulated, so we use a double-buffer to retrieve and then accummulate
Master thread computes total workloads, specified by users (or equally by default) for all active devices, stored as cfg.workload[gpuid]
Now we can determine how many photons to be simualated by multiplying the total photon by the relative ratio of per-device workload divided by the total workload
Once per-thread photon number param.threadphoton
is known, we distribute the remaiders, if any, to a subset of threads, one extra photon per thread, so we can get the exact total photon number
Total time gate number is computed
Here we determine if the GPU memory of the current device can store all time gates, if not, disabling normalization
Here we decide the total output buffer, field's length. it is Nx*Ny*Nz*Nt*Ns
A 1D grid is determined by the total thread number and block size
A 1D block is determined by the user specified block size, or by default, 64, determined emperically to get best performance
If "-D R" is used, we launch a special RNG testing kernel to retrieve random numbers from RNG and test for randomness
Allocate all host buffers to store input or output data
Ppos:
host buffer for initial photon position+weight
Pdir:
host buffer for initial photon direction
Plen:
host buffer for initial additional photon states
energy:
host buffer for retrieving total launched and escaped energy of each thread
Pdet:
host buffer for retrieving all detected photon information
Pseed:
RNG seed for each thread in non-replay mode, or
Pseed:
RNG seeds for photon replay in GPU threads
Allocate all GPU buffers to store input or output data
Allocate pinned memory variable, progress, for real-time update during kernel run-time
Allocate and copy data needed for photon replay, the needed variables include gPseed
per-photon seed to be replayed greplayw
per-photon initial weight greplaytof
per-photon time-of-flight time in s greplaydetid
per-photon index for each replayed photon
Allocate and copy source pattern buffer for 2D and 3D pattern sources
Saving detected photon is enabled by default, but in case if a user disabled this feature, a warning is printed
Pre-compute array dimension strides to move in +-x/y/z dimension quickly in the GPU, stored in the constant memory
Inside the GPU kernel, volume is always assumbed to be col-major (like those generated by MATLAB or FORTRAN)
Additional constants to avoid repeated computation inside GPU
Get ready to start GPU simulation here, clock is now ticking ...
Copy all host buffers to the GPU
Copy constants to the constant memory on the GPU
If one has to simulate a lot of time gates, using the GPU global memory requires extra caution. If the total global memory is bigger than the total memory to save all the snapshots, i.e. size(field)*(tend-tstart)/tstep, one simply sets gpu[gpuid].maxgate to the total gate number; this will run GPU kernel once. If the required memory is bigger than the video memory, set gpu[gpuid].maxgate to a number which fits, and the snapshot will be saved with an increment of gpu[gpuid].maxgate snapshots. In this case, the later simulations will restart from photon launching and exhibit redundancies.
The calculation of the energy conservation will only reflect the last simulation.
Outer loop: loop over each time-gate-group, determined by the capacity of the global memory to hold the output data, in most cases, totalgates
is 1
Determine the start and end time of the current time-gate-group
Copy param to the constant memory variable gcfg
Inner loop: loop over total number of repetitions specified by cfg.respin, results will be accumulated to field
Each repetition, we have to reset the output buffers, including gfield
and gPdet
Start the clock for GPU-kernel only run-time here
Determine template constants for compilers to build specialized binary instances to reduce branching and thread-divergence. If not using template, the performance can take a 20% drop.
ispencil:
template constant, if 1, launch photon code is dramatically simplified
isref:
template constant, if 1, perform boundary reflection, if 0, total-absorbion boundary, can simplify kernel
issvmc:
template constant, if 1, consider the input volume containing split-voxel data, see Yan2020 for details
ispolarized:
template constant, if 1, perform polarized light simulations, currently only supports label-based media
Enable reflection flag when c or m flags are used in the cfg.bc boundary condition flags
Launch GPU kernel using template constants. Here, the compiler will create 2^4=16 individually compiled kernel PTX binaries for each combination of template variables. This creates bigger binary and slower compilation time, but brings up to 20%-30% speed improvement on certain simulations.
By now, the GPU kernel has been launched asynchronously, the master thread on the host starts reading a pinned memory variable, gprogress
, to realtimely read the completed photon count updated inside the GPU kernel while it is running.
host variable progress
is pinned with the GPU variable gprogress
, and can be updated by the GPU kernel from the device. We can read this variable to see how many photons are simulated.
Here we use the below formula to compute the 0-100% completion ratio. Only half of the threads updates the progress, and each thread only update the counter 5 times at 0%/25%/50%/75%/100% progress to minimize overhead while still providing a smooth progress bar.
By calling cudaDeviceSynchronize
, the host thread now waits for the completion of the kernel, then start retrieving all GPU output data
Here, the GPU kernel is completely executed and returned
now we can estimate and print the GPU-kernel-only runtime
If the GPU kernel crashed or terminated by error during execution, we need to capture it by calling cudaGetLastError
and terminate mcx if error happens
Now, we start retrieving all output variables, and copy those to the corresponding host buffers
photoncount
returns the actual completely simulated photons returned by GPU threads, no longer used
If '-D M' is specified, we retrieve photon trajectory data and store those to cfg.exportdebugdata
and cfg.debugdatalen
If photon detection is enabled and detectors are defined, we retrieve partial-path length data, among others, to cfg.exportdetected
and detected
Accumulate volumetric fluence from all threads/devices
If double-precision is used for output, we do not need two buffers; however, by default, we use single-precision output, we need to copy and accumulate two separate floating-point buffers to minimize round-off errors near the source
If respin is used, each repeatition is accumulated to the 2nd half of the buffer
Here is the end of the inner-loop (respin)
If respin is used, copy the accumulated buffer in the 2nd half to the first half
Retrieve accumulated total launched and residual energy from each thread
For MATLAB mex file, the data is copied to a pre-allocated buffer cfg->export*
as a return variable
Here is the end of the outer-loop, over time-gate groups
Let the master thread to deal with the normalization and file IO First, if multi-pattern simulation, i.e. photon sharing, is used, we normalize each pattern first
Now we normalize the fluence so that the default output is fluence rate in joule/(s*mm^2) generated by a unitary source (1 joule total).
The raw output directly from GPU is the accumulated energy-loss per photon moving step in joule when cfg.outputtype='fluence', or energy-loss multiplied by mua (1/mm) per voxel (joule/mm) when cfg.outputtype='flux' (default).
If output is flux (J/(s*mm^2), default), raw data (joule*mm) is multiplied by (1/(Nphoton*Vvox*dt)) If output is fluence (J/mm^2), raw data (joule*mm) is multiplied by (1/(Nphoton*Vvox))
If output is energy (joule), raw data is simply multiplied by 1/Nphoton
In photon sharing mode, where multiple pattern sources are simulated, each solution is normalized separately
If not running as a mex file, we need to save volumetric output data, if enabled, as a file, with suffix specifed by cfg.outputformat (mc2,nii, or .jdat or .jbat)
If not running as a mex file, we need to save detected photon data, if enabled, as a file, either as a .mch file, or a .jdat/.jbat file
If not running as a mex file, we need to save photon trajectory data, if enabled, as a file, either as a .mct file, or a .jdat/.jbat file
Copying GPU photon states back to host as Ppos, Pdir and Plen for debugging purpose is depreciated
Report simulation summary, total energy here equals total simulated photons+unfinished photons for all threads
Simulation is complete, now we need clear up all GPU memory buffers
The below call in theory is not needed, but it ensures the device is freed for other programs, especially on Windows
Lastly, free all host buffers, the simulation is complete.
int mcx_smxblock | ( | int | v1, |
int | v2 | ||
) |
Utility function to calculate the maximum blocks per SM.
[in] | v1 | the major version of an NVIDIA GPU |
[in] | v2 | the minor version of an NVIDIA GPU |
__global__ void mcx_test_rng | ( | OutputType | field[], |
uint | n_seed[] | ||
) |
A stand-alone kernel to test random number generators.
This function fills the domain with the random numbers generated from the GPU. One can use this function to dump RNG values and test for quality. use -D R in the command line to enable this output.
[out] | field | the array to be filled with RNG values |
[in] | n_seed | the seed to the RNG |
int mcx_threadmultiplier | ( | int | v1, |
int | v2 | ||
) |
Utility function to calculate the maximum blocks per SM.
[in] | v1 | the major version of an NVIDIA GPU |
[in] | v2 | the minor version of an NVIDIA GPU |
__device__ int ray_plane_intersect | ( | float3 * | p0, |
MCXdir * | v, | ||
Medium * | prop, | ||
float & | len, | ||
float & | slen, | ||
MCXsp * | nuvox | ||
) |
Compute intersection point between a photon path and the intra-voxel interface if present.
This function tests if a ray intersects with an in-voxel marching-cube boundary (a plane) in SVMC mode
[in] | p0 | current position |
[in] | v | current photon direction |
[in] | prop | optical properties |
[in,out] | len | photon movement length, updated if intersection is found |
[in,out] | slen | remaining unitless scattering length, updated if intersection is found |
[in] | nuvox | a struct storing normal direction (nv) and a point on the plane |
[in] | f | photon state including total time-of-flight, number of scattering etc |
[in] | flipdir[0,1,2] | current voxel xi/yi/zi; flipdir[3]: 0: transmit through x=x0 plane; 1: through y=y0 plane; 2: through z=z0 plane |
|
inline |
Calculating the reflection coefficient at an interface.
This function calculates the reflection coefficient at an interface of different refrective indicex (n1/n2)
[in] | v | the direction vector of the photon |
[in] | n1 | the refrective index of the voxel the photon leaves |
[in] | n2 | the refrective index of the voxel the photon enters |
[in] | flipdir | 0: transmit through x=x0 plane; 1: through y=y0 plane; 2: through z=z0 plane |
1-[n1/n2*sin(si)]^2 = cos(ti)^2
to save one sqrt
Rp
(Rp+Rs)/2
__device__ int reflectray | ( | float | n1, |
float3 * | c0, | ||
float3 * | rv, | ||
MCXsp * | nuvox, | ||
Medium * | prop, | ||
RandType | t[RAND_BUF_LEN] | ||
) |
Perform reflection/refraction computation along mismatched intra-voxel interface.
This function returns a new direction when a ray is reflected/transmitted through an in-voxel marching-cube boundary in the SVMC mode
[in] | n1 | refractive index of the current subvolume in a split-voxel |
[in,out] | c0 | current photon direction |
[out] | rv | reciprocated direction vector rv={1/c0.x,1/c0.y,1/c0.z} |
[in] | prop | optical properties |
[in] | nuvox | a struct storing normal direction (nv) and a point on the plane |
[in] | t | random number state for deciding reflection/transmission |
|
inline |
Compute 3D-scattering direction vector.
This function updates the direction vector after a 3D scattering event
[in,out] | v | the direction vector of the photon |
[in] | stheta | the sine of the azimuthal angle |
[in] | ctheta | the cosine of the azimuthal angle |
[in] | sphi | the sine of the zenith angle |
[in] | cphi | the cosine of the zenith angle |
|
inline |
Compute 2D-scattering if the domain has a dimension of 1 in x/y or z.
This function performs 2D scattering calculation if the domain is only a sheet of voxels
[in,out] | v | the direction vector of the photon |
[in] | stheta | the sine of the rotation angle |
[in] | ctheta | the cosine of the rotation angle |
Saving photon trajectory data for debugging purposes.
[in] | p | the position/weight of the current photon packet |
[in] | id | the global index of the photon |
[in] | gdebugdata | pointer to the global-memory buffer to store the trajectory info |
|
inline |
Advance photon to the 1st non-zero voxel if launched in the backgruond.
This function advances the photon to the 1st non-zero voxel along the direction of v if the photon is launched outside of the cubic domain or in a zero-voxel. To avoid large overhead, photon can only advance gcfg->minaccumtime steps, which can be set using the –maxvoidstep flag; by default, this limit is 1000.
[in] | v | the direction vector of the photon |
[in] | n1 | the refrective index of the voxel the photon leaves |
[in] | n2 | the refrective index of the voxel the photon enters |
[in] | flipdir | 0: transmit through x=x0 plane; 1: through y=y0 plane; 2: through z=z0 plane |
|
inline |
Calculating the direction vector after transmission.
This function updates the direction vector after the photon passing an interface of different refrective indicex (n1/n2). Because MCX only handles voxelated domain, transmission is applied only to 1 of the components, and then the vector is normalized.
[in,out] | v | the direction vector of the photon |
[in] | n1 | the refrective index of the voxel the photon leaves |
[in] | n2 | the refrective index of the voxel the photon enters |
[in] | flipdir | 0: transmit through x=x0 plane; 1: through y=y0 plane; 2: through z=z0 plane |
__device__ void updateproperty | ( | Medium * | prop, |
unsigned int & | mediaid, | ||
RandType | t[RAND_BUF_LEN], | ||
unsigned int | idx1d, | ||
uint | media[], | ||
float3 * | p, | ||
MCXsp * | nuvox, | ||
short | flipdir[4] | ||
) |
Loading optical properties from constant memory.
This function parses the media input and load optical properties from GPU memory
[out] | prop | pointer to the current optical properties {mua, mus, g, n} |
[in] | mediaid | the media ID (32 bit) of the current voxel, format is specified in gcfg->mediaformat or cfg->mediabyte |
The default mcx input volume is assumed to be 4-byte per voxel (SVMC mode requires 2x 4-byte voxels for 8 data points)
The data encoded in the voxel are parsed based on the gcfg->mediaformat flag. Below, we use [s*] to represent 2-byte short integers; [h*] to represent a 2-byte half-precision floating point number; [c*] to represent 1-byte unsigned char integers and [i*] to represent 4-byte integers; [f*] for 4-byte floating point number index 0 starts from the lowest (least significant bit) end
s[1]: half-prec property; s[0]: high 2bits: idx 0-3, low 14bits: tissue label
Extract the reference point of the intra-voxel interface
Extract the normal vector of the intra-voxel interface
Determine tissue label corresponding to the current photon position
__constant__ MCXParam gcfg[1] |
Simulation constant parameters stored in the constant memory.
This variable stores all constants used in the simulation.
__constant__ float4 gproperty[MAX_PROP_AND_DETECTORS] |
Concatenated optical properties and det positions, stored in constant memory.
The first cfg.maxmedia elements of this array contain the optical properties of the domains. Format: {x}:mua,{y}:mus,{z}:anisotropy (g),{w}:refractive index (n). The following cfg.detnum elements of this array contains the detector information. Format: {x,y,z}: the x/y/z coord. of the detector, and {w}: radius; all in grid unit. The total length (both media properties and detector) is defined by MAX_PROP_AND_DETECTORS, which is 4000 to fully utilize the constant memory space (64kb=4096 float4)