| Here is the source of the program:
---------------------------------
/* md-smp.c */
/********************************************************************
Basic 2-D & 3-D molecular dynamics code - adapted for SMP
Copyright (c) 1997 D.C. Rapaport
Compile:
cc CFLAGS -o md-smp md-smp.c [-DSMP] [-DD3] -lm
where CFLAGS = {-n32 -r5000 -O3}, or {-migrate -O4}, etc.
Define SMP to get SMP code for 4 procs.
Define D3 to get 3-D version (run both 2-D and 3-D versions).
Edited output from "small" runs (1-2 minutes) on 6 SPECfp95 workstation:
> cc ...
> md-smp 200
2-D test: 46000 atoms, 200 steps
20 0.641 1.000
40 0.683 1.000
...
180 0.669 1.000
200 0.666 1.000
time/atom-step 3.68 musec <<- this is the important number!
> cc ... -DD3 ...
> md-smp 24
3-D test: 55296 atoms, 200 steps
20 0.934 1.500
40 0.982 1.500
...
180 0.967 1.500
200 0.966 1.500
time/atom-step 8.73 musec <<- this is the important number!
(Near-linear speedup is obtained on message-passing architecture using
another version of this program; hope for at least 75% efficiency on
4-node SMP since code is fully parallelizable in principle.)
Note: the SMP version has not been tested on a multi-cpu machine, and
may require further changes.
********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#ifndef SMP
#define SMP 0
#endif
#ifdef D3
#define NDIM 3
#else
#define NDIM 2
#endif
/* vector definitions and macro ops, etc */
typedef struct {double x, y;} VecD2;
typedef struct {double x, y, z;} VecD3;
typedef struct {int x, y;} VecI2;
typedef struct {int x, y, z;} VecI3;
#if NDIM == 2
typedef VecD2 VecD;
typedef VecI2 VecI;
#define VSet(v, sx, sy) v.x = sx, v.y = sy
#define VZero(v) VSet (v, 0, 0)
#define VScale(v, s) v.x *= s, v.y *= s
#define VSCopy(v2, s1, v1) \
v2.x = (s1) * v1.x, v2.y = (s1) * v1.y
#define VAdd(v1, v2, v3) \
v1.x = v2.x + v3.x, v1.y = v2.y + v3.y
#define VSub(v1, v2, v3) \
v1.x = v2.x - v3.x, v1.y = v2.y - v3.y
#define VMul(v1, v2, v3) \
v1.x = v2.x * v3.x, v1.y = v2.y * v3.y
#define VDiv(v1, v2, v3) \
v1.x = v2.x / v3.x, v1.y = v2.y / v3.y
#define VSAdd(v1, v2, s3, v3) \
v1.x = v2.x + (s3) * v3.x, v1.y = v2.y + (s3) * v3.y
#define VProd(v) (v.x * v.y)
#define VLinear(p, s) p.y * s.x + p.x
#define VLenSq(v) (SQ (v.x) + SQ (v.y))
#define OFFSET_VALS {0,0, 1,0, 1,1, 0,1, -1,1}
#define DO_OFFSET for (offset = 0; offset < 5; offset ++)
#elif NDIM == 3
typedef VecD3 VecD;
typedef VecI3 VecI;
#define VSet(v, sx, sy, sz) v.x = sx, v.y = sy, v.z = sz
#define VZero(v) VSet (v, 0, 0, 0)
#define VScale(v, s) v.x *= s, v.y *= s, v.z *= s
#define VSCopy(v2, s1, v1) \
v2.x = (s1) * v1.x, v2.y = (s1) * v1.y, v2.z = (s1) * v1.z
#define VAdd(v1, v2, v3) \
v1.x = v2.x + v3.x, v1.y = v2.y + v3.y, v1.z = v2.z + v3.z
#define VSub(v1, v2, v3) \
v1.x = v2.x - v3.x, v1.y = v2.y - v3.y, v1.z = v2.z - v3.z
#define VMul(v1, v2, v3) \
v1.x = v2.x * v3.x, v1.y = v2.y * v3.y, v1.z = v2.z * v3.z
#define VDiv(v1, v2, v3) \
v1.x = v2.x / v3.x, v1.y = v2.y / v3.y, v1.z = v2.z / v3.z
#define VSAdd(v1, v2, s3, v3) \
v1.x = v2.x + (s3) * v3.x, v1.y = v2.y + (s3) * v3.y, \
v1.z = v2.z + (s3) * v3.z
#define VProd(v) (v.x * v.y * v.z)
#define VLinear(p, s) (p.z * s.y + p.y) * s.x + p.x
#define VLenSq(v) (SQ (v.x) + SQ (v.y) + SQ (v.z))
#define OFFSET_VALS {0,0,0, 1,0,0, 1,1,0, 0,1,0, -1,1,0, 0,0,1, \
1,0,1, 1,1,1, 0,1,1, -1,1,1, -1,0,1, -1,-1,1, 0,-1,1, 1,-1,1}
#define DO_OFFSET for (offset = 0; offset < 14; offset ++)
#endif
#define SQ(x) ((x) * (x))
#define AllocT(x, t) (t *) malloc ((x) * sizeof (t))
#define TdiffD(tv2, tv1) \
(1e6 * (tv2.tv_sec - tv1.tv_sec) + tv2.tv_usec - tv1.tv_usec)
/* prototypes */
void BuildNebrList ();
void ComputeForces ();
void LeapfrogIntegrate ();
void CheckPeriodicWrap ();
void CalcEnergy ();
void InitialState ();
void VRand ();
/* globals */
static VecD *r, *rv, *ra, region;
static VecI cells, initUcell;
static double deltaT, rCut, temperature, density, uSum, kinEnergy,
potEnergy, rNebrShell, dispHi, vvMax;
static int *cellList, nAtom, stepCount, stepMax, stepPrint,
nUcellEdge, nebrTabMax, nebrTabFac, nebrNow;
#if SMP
#define MAX_PROC 4
static VecD *raP[MAX_PROC];
static int *nebrTab[MAX_PROC], nebrTabLen[MAX_PROC], nProc;
#else
static int *nebrTab, nebrTabLen;
#endif
/* do not parallelize main program */
void main (int argc, char **argv)
{
FILE *fp;
struct timeval tv1, tv2;
#if SMP
int ip;
#endif
#if NDIM == 2
nUcellEdge = 100;
nebrTabFac = 5;
#elif NDIM == 3
nUcellEdge = 16;
nebrTabFac = 10;
#endif
if (argc > 1) nUcellEdge = atol (argv[1]);
temperature = 1.;
density = 0.8;
rCut = pow (2., 1./6.);
rNebrShell = 0.4;
stepMax = 200;
stepPrint = 20;
deltaT = 0.005;
#if NDIM == 2
VSet (initUcell, nUcellEdge, nUcellEdge / sqrt (3.));
region.x = initUcell.x / sqrt (density * sqrt (3.) / 2.);
region.y = region.x;
nAtom = 2 * VProd (initUcell);
density = nAtom / VProd (region);
#elif NDIM == 3
VSet (initUcell, nUcellEdge, nUcellEdge, nUcellEdge);
nAtom = 4 * VProd (initUcell);
VSCopy (region, 1. / pow (density / 4., 1./3.), initUcell);
#endif
nebrTabMax = nebrTabFac * nAtom;
VSCopy (cells, 1. / (rCut + rNebrShell), region);
r = AllocT (nAtom, VecD);
rv = AllocT (nAtom, VecD);
ra = AllocT (nAtom, VecD);
cellList = AllocT (nAtom + VProd (cells), int);
#if SMP
nProc = MAX_PROC;
for (ip = 0; ip < nProc; ip ++) {
raP[ip] = AllocT (nAtom, VecD);
nebrTab[ip] = AllocT (2 * nebrTabMax / nProc, int);
}
#else
nebrTab = AllocT (2 * nebrTabMax, int);
#endif
InitialState ();
nebrNow = 1;
fp = stdout;
#if SMP
fprintf (fp, "SMP %d procs -- ", nProc);
#endif
fprintf (fp, "%d-D test: %d atoms, %d steps\n", NDIM, nAtom, stepMax);
/* start timing */
gettimeofday (&tv1, NULL);
/* main loop */
for (stepCount = 1; stepCount <= stepMax; stepCount ++) {
if (nebrNow) {
nebrNow = 0;
dispHi = 0.;
BuildNebrList ();
}
ComputeForces ();
LeapfrogIntegrate ();
CheckPeriodicWrap ();
CalcEnergy ();
dispHi += sqrt (vvMax) * deltaT;
if (dispHi > 0.5 * rNebrShell) nebrNow = 1;
if (stepCount % stepPrint == 0) {
fprintf (fp, "%5d %.3f %.3f\n", stepCount, kinEnergy,
kinEnergy + potEnergy);
fflush (fp);
}
}
/* end */
gettimeofday (&tv2, NULL);
fprintf (fp, " time/atom-step %.2f musec\n",
TdiffD (tv2, tv1) / (stepMax * nAtom));
exit (0);
}
#define WrapC(t) \
if (m2v.t >= cells.t) m2v.t = 0, shift.t = region.t; \
else if (m2v.t < 0) m2v.t = cells.t - 1, shift.t = - region.t
/* function should be automatically parallelized -
can be 2nd largest cpu user, need to work on local variables,
perhaps better way to organize first part of function */
void BuildNebrList ()
{
VecD dr, invWid, rs, shift;
VecI cc, m1v, m2v, iof[] = OFFSET_VALS;
double rrNebr;
int c, j1, j2, m1, m1x, m1y, m1z, m2, n, offset;
#if SMP
int ip;
#endif
VDiv (invWid, cells, region);
rrNebr = SQ (rCut + rNebrShell);
/* start parallel section */
for (n = nAtom; n < nAtom + VProd (cells); n ++) cellList[n] = -1;
/* end parallel section */
/* barrier here */
/* start parallel section - this is safe to parallelize (better ways?) */
#if SMP
for (ip = 0; ip < nProc; ip ++) {
#endif
for (n = 0; n < nAtom; n ++) {
VSAdd (rs, r[n], 0.5, region);
VMul (cc, rs, invWid);
c = VLinear (cc, cells) + nAtom;
#if SMP
if (c % nProc == ip) {
#endif
cellList[n] = cellList[c];
cellList[c] = n;
#if SMP
}
#endif
}
#if SMP
}
#endif
/* end parallel section */
/* barrier here */
/* start parallel section */
#if SMP
for (ip = 0; ip < nProc; ip ++) {
nebrTabLen[ip] = 0;
#else
nebrTabLen = 0;
#endif
#if NDIM == 3
for (m1z = 0; m1z < cells.z; m1z ++) {
#endif
for (m1y = 0; m1y < cells.y; m1y ++) {
#if SMP
if (m1y % nProc == ip) {
#endif
for (m1x = 0; m1x < cells.x; m1x ++) {
#if NDIM == 2
VSet (m1v, m1x, m1y);
#elif NDIM == 3
VSet (m1v, m1x, m1y, m1z);
#endif
m1 = VLinear (m1v, cells) + nAtom;
DO_OFFSET {
VAdd (m2v, m1v, iof[offset]);
VZero (shift);
WrapC (x);
WrapC (y);
#if NDIM == 3
WrapC (z);
#endif
m2 = VLinear (m2v, cells) + nAtom;
for (j1 = cellList[m1]; j1 >= 0; j1 = cellList[j1]) {
for (j2 = cellList[m2]; j2 >= 0; j2 = cellList[j2]) {
if (m1 != m2 || j2 < j1) {
VSub (dr, r[j1], r[j2]);
VSub (dr, dr, shift);
if (VLenSq (dr) < rrNebr) {
#if SMP
nebrTab[ip][2 * nebrTabLen[ip]] = j1;
nebrTab[ip][2 * nebrTabLen[ip] + 1] = j2;
++ nebrTabLen[ip];
#else
nebrTab[2 * nebrTabLen] = j1;
nebrTab[2 * nebrTabLen + 1] = j2;
++ nebrTabLen;
#endif
}
}
}
}
}
}
#if SMP
}
#endif
}
#if NDIM == 3
}
#endif
#if SMP
}
#endif
/* end parallel section */
}
#define WrapR(v, t) if (v.t >= 0.5 * region.t) v.t -= region.t; \
else if (v.t < - 0.5 * region.t) v.t += region.t
/* function should be automatically parallelized -
largest (> 60%) cpu user, need to work on local variables */
void ComputeForces ()
{
VecD dr;
double fcVal, rr, rrCut, rri, rri3, uVal;
int j1, j2, n;
#if SMP
double uSumP[MAX_PROC];
int ip;
#endif
rrCut = SQ (rCut);
/* start parallel section */
#if SMP
for (ip = 0; ip < nProc; ip ++) {
for (n = 0; n < nAtom; n ++) VZero (raP[ip][n]);
uSumP[ip] = 0.;
#else
for (n = 0; n < nAtom; n ++) VZero (ra[n]);
uSum = 0.;
#endif
#if SMP
for (n = 0; n < nebrTabLen[ip]; n ++) {
j1 = nebrTab[ip][2 * n];
j2 = nebrTab[ip][2 * n + 1];
#else
for (n = 0; n < nebrTabLen; n ++) {
j1 = nebrTab[2 * n];
j2 = nebrTab[2 * n + 1];
#endif
VSub (dr, r[j1], r[j2]);
WrapR (dr, x);
WrapR (dr, y);
#if NDIM == 3
WrapR (dr, z);
#endif
rr = VLenSq (dr);
if (rr < rrCut) {
rri = 1. / rr;
rri3 = rri * rri * rri;
fcVal = 48. * rri3 * (rri3 - 0.5) * rri;
uVal = 4. * rri3 * (rri3 - 1.) + 1.;
#if SMP
VSAdd (raP[ip][j1], raP[ip][j1], fcVal, dr);
VSAdd (raP[ip][j2], raP[ip][j2], - fcVal, dr);
uSumP[ip] += uVal;
#else
VSAdd (ra[j1], ra[j1], fcVal, dr);
VSAdd (ra[j2], ra[j2], - fcVal, dr);
uSum += uVal;
#endif
}
}
#if SMP
}
#endif
/* end parallel section */
#if SMP
/* two alternative ways of combining partial results (first is new) */
#if 1
/* barrier here */
/* start parallel section */
for (n = 0; n < nAtom; n ++) {
ra[n] = raP[0][n];
for (ip = 1; ip < nProc; ip ++) VAdd (ra[n], ra[n], raP[ip][n]);
}
uSum = 0.;
for (ip = 0; ip < nProc; ip ++) uSum += uSumP[ip];
/* end parallel section */
#else
/* start serial section */
for (n = 0; n < nAtom; n ++) VZero (ra[n]);
uSum = 0.;
for (ip = 0; ip < nProc; ip ++) {
for (n = 0; n < nAtom; n ++) VAdd (ra[n], ra[n], raP[ip][n]);
uSum += uSumP[ip];
}
/* end serial section */
#endif
#endif
}
/* function should be automatically parallelized */
void LeapfrogIntegrate ()
{
int n;
for (n = 0; n < nAtom; n ++) {
VSAdd (rv[n], rv[n], deltaT, ra[n]);
VSAdd (r[n], r[n], deltaT, rv[n]);
}
}
/* function should be automatically parallelized */
void CheckPeriodicWrap ()
{
int n;
for (n = 0; n < nAtom; n ++) {
WrapR (r[n], x);
WrapR (r[n], y);
#if NDIM == 3
WrapR (r[n], z);
#endif
}
}
/* function should be automatically parallelized -
but note reduction ops */
void CalcEnergy ()
{
VecD v;
double vv, vvSum;
int n;
vvSum = 0.;
vvMax = 0.;
for (n = 0; n < nAtom; n ++) {
VSAdd (v, rv[n], -0.5 * deltaT, ra[n]);
vv = VLenSq (v);
vvSum += vv;
if (vv > vvMax) vvMax = vv;
}
kinEnergy = 0.5 * vvSum / nAtom;
potEnergy = uSum / nAtom;
}
/* function only called once (not part of timing) - do not parallelize */
void InitialState ()
{
VecD p, gap, vSum;
double vMag;
int j, n, nx, ny, nz;
VDiv (gap, region, initUcell);
n = 0;
#if NDIM == 3
for (nz = 0; nz < initUcell.z; nz ++) {
#endif
for (ny = 0; ny < initUcell.y; ny ++) {
for (nx = 0; nx < initUcell.x; nx ++) {
#if NDIM == 2
VSet (p, nx + 0.25, ny + 0.25);
#elif NDIM == 3
VSet (p, nx + 0.25, ny + 0.25, nz + 0.25);
#endif
VMul (p, p, gap);
VSAdd (p, p, -0.5, region);
#if NDIM == 2
for (j = 0; j < 2; j ++) {
r[n] = p;
if (j != 0) VSAdd (r[n], r[n], 0.5, gap);
++ n;
}
#elif NDIM == 3
for (j = 0; j < 4; j ++) {
r[n] = p;
if (j != 3) {
if (j != 0) r[n].x += 0.5 * gap.x;
if (j != 1) r[n].y += 0.5 * gap.y;
if (j != 2) r[n].z += 0.5 * gap.z;
}
++ n;
}
#endif
}
}
#if NDIM == 3
}
#endif
vMag = sqrt (NDIM * temperature);
VZero (vSum);
for (n = 0; n < nAtom; n ++) {
VRand (&rv[n]);
VScale (rv[n], vMag);
VAdd (vSum, vSum, rv[n]);
}
for (n = 0; n < nAtom; n ++) VSAdd (rv[n], rv[n], -1. / nAtom, vSum);
}
#if NDIM == 2
void VRand (VecD2 *p)
{
double s;
s = 2. * M_PI * drand48 ();
p->x = cos (s); p->y = sin (s);
}
#elif NDIM == 3
void VRand (VecD3 *p)
{
double s, x, y;
s = 2.;
while (s > 1.) {
x = 2. * drand48 () - 1.; y = 2. * drand48 () - 1.;
s = SQ (x) + SQ (y);
}
p->z = 1. - 2. * s;
s = 2. * sqrt (1. - s);
p->x = s * x; p->y = s * y;
}
#endif
/* not used */
#if 0
void ComputeForces ()
{
VecD dr, invWid, rs, shift;
VecI cc, m1v, m2v, iof[] = OFFSET_VALS;
double fcVal, rr, rrCut, rri, rri3, uVal;
int c, j1, j2, m1, m1x, m1y, m1z, m2, n, offset;
rrCut = SQ (rCut);
for (n = nAtom; n < nAtom + VProd (cells); n ++) cellList[n] = -1;
VDiv (invWid, cells, region);
for (n = 0; n < nAtom; n ++) {
VSAdd (rs, r[n], 0.5, region);
VMul (cc, rs, invWid);
c = VLinear (cc, cells) + nAtom;
cellList[n] = cellList[c];
cellList[c] = n;
}
for (n = 0; n < nAtom; n ++) VZero (ra[n]);
uSum = 0.;
#if NDIM == 3
for (m1z = 0; m1z < cells.z; m1z ++) {
#endif
for (m1y = 0; m1y < cells.y; m1y ++) {
for (m1x = 0; m1x < cells.x; m1x ++) {
#if NDIM == 2
VSet (m1v, m1x, m1y);
#elif NDIM == 3
VSet (m1v, m1x, m1y, m1z);
#endif
m1 = VLinear (m1v, cells) + nAtom;
DO_OFFSET {
VAdd (m2v, m1v, iof[offset]);
VZero (shift);
WrapC (x);
WrapC (y);
#if NDIM == 3
WrapC (z);
#endif
m2 = VLinear (m2v, cells) + nAtom;
for (j1 = cellList[m1]; j1 >= 0; j1 = cellList[j1]) {
for (j2 = cellList[m2]; j2 >= 0; j2 = cellList[j2]) {
if (m1 != m2 || j2 < j1) {
VSub (dr, r[j1], r[j2]);
VSub (dr, dr, shift);
rr = VLenSq (dr);
if (rr < rrCut) {
rri = 1. / rr;
rri3 = rri * rri * rri;
fcVal = 48. * rri3 * (rri3 - 0.5) * rri;
uVal = 4. * rri3 * (rri3 - 1.) + 1.;
VSAdd (ra[j1], ra[j1], fcVal, dr);
VSAdd (ra[j2], ra[j2], - fcVal, dr);
uSum += uVal;
}
}
}
}
}
}
}
#if NDIM == 3
}
#endif
}
#endif
/* end of program */
|