| >Can you give us some declarations? For example, are tim(*,*) and
>tau(*,*) arrays or functions? What datatype do they use? What are
>the expected values of nsub, ntau, ntimept(*), nwldet(*)?
tim and tau are real*4 arrays
yfit is a real array (because I use -r8 as a comp. option it will be real*8)
>At first glance, it seems you would do better by moving the j loop to
>be innermost, so that exp(...k,m,i,n) can be moved outside the loop.
>This looks like it may have more payoff (~100 cycles per call) than
>reordering the loops for optimal memory behavior.
GREAT ! the elapsed time for the run drops from 54" to 34" !!!
>What operating system and version are you running this program on? On Unix,
>several versions back, there was a severe performance problem when doing
>exp on some (very large negative?) values.
digital unix 4.0 564 ; f77 v4.1-92
I use f77 -O5 -fast -tune host -speculate_all -r8
I also did manual blocking on Bill's code, by introducing additional
loops to have the l,i,n,j,k loops varying on a range sized nb, and I found
the best result (about 28" ) using nb=20 and compiling with
f77 -O4 -fast -tune host -r8
(Here -O5 slows it down to one minute, perhaps because it interferes
with the manual unrolling ?? )
By the way I was kind of surprised that the min. lies on nb=20;
it is an nb**5 domain (l, k, j, i, n) there are 2 doubles (crap and yfit )
and 4 real*4 (tau, tim, das ). To fit D-cache it should go like this :
8000 = (8+4+4+4+8)* (nb**5)
which should give nb=3 ???
Here follows the modified subroutine (i.e. Bill's idea+blocking )
Thanks to all of you, in particular Bill for the grand idea!
-Joseph
subroutine calc_decay
c
c include global values
c
include 'kfdimdef.inc'
include 'kfgbldef.inc'
c
c define variables
c
c real*8 crap(DIM_TAU,DIM_SUB,DIM_TIMEPNT)
real*8 crap
integer*4 i, j, k, l, m, n
c
nb =20
do l = 1, nwlexc
m = dataset(2,l)
do k=1,ntimepnt(m)
do j=1,nwldet(l)
yfit(k,j,l) = 0.0
end do
end do
end do
c
c
do ll=1,nwlexc,nb
do l=ll,min(nwlexc,ll+nb-1)
m = dataset(2,l)
do kk=1, ntimepnt(m),nb
do nn=1,nsub,nb
do ii=1,ntau,nb
do jj=1,nwldet(l),nb
do k=kk,min( ntimepnt(m),kk+nb-1)
do n=nn,min(nsub,nn+nb-1)
do i=ii,min(ntau,ii+nb-1)
crap=exp(-tim(k,m)/tau(i,n))
do j = jj,min(nwldet(l),jj+nb-1)
yfit(k,j,l) = yfit(k,j,l)
1 + das(j,i,l,n) * crap
end do ! end do j
end do ! end do i
end do ! end do n
end do ! end do k
end do !end do jj
end do ! end do ii
end do ! end do nn
end do ! end do kk
end do ! end do l
end do ! end do ll
c
return
end
c
c Filename: kfgbldef.inc
c Author: I. Martin
c Date: V1.0 01-dec-1995
c Description: global values
c
c -----
c
c define variables in common blocks
c
integer*4 ierr, lenlis
character*80 fillis, fillst
integer*4 mode, nabs, nflu, dataset(DIM_SASTYP,DIM_WLEXC),
1 nwlexc, nwldet(DIM_WLEXC), mwldet(DIM_SASTYP), ntau,
1 nsub, nsas(DIM_TAU,DIM_SASTYP),
1 ngf(DIM_TAU,DIM_SASTYP), ntimepnt(DIM_SASTYP)
real*4 wlexc(DIM_WLEXC),
1 wldet(DIM_WLDET*DIM_WLEXC,DIM_SASTYP)
integer*4 inddas(DIM_WLDET,DIM_WLEXC,DIM_SASTYP)
real amplinpsum(DIM_WLDET,DIM_WLEXC)
logical*4 exccalc
real*4 excfacdec(DIM_WLDET,DIM_WLEXC),
1 prob(DIM_TAU), problb(DIM_TAU), probub(DIM_TAU)
integer*4 probfit(DIM_TAU)
real*4 ratmat(DIM_TAU,DIM_TAU,DIM_SUB),
1 ratlb(DIM_TAU,DIM_TAU), ratub(DIM_TAU,DIM_TAU)
integer*4 ratfit(DIM_TAU,DIM_TAU)
real*4 excfac(DIM_WLEXC), excrel(DIM_TAU,DIM_WLEXC),
1 excmat(DIM_TAU,DIM_WLEXC,DIM_SUB),
1 exclb(DIM_WLEXC), excub(DIM_WLEXC)
integer*4 excfit(DIM_WLEXC)
integer*4 sasmode
real*4 gfamp(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfpos(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfhw(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfint(DIM_TAU,DIM_SASTYP), ! intercept and
1 gfsl(DIM_TAU,DIM_SASTYP), ! slope for subground
1 gfamplb(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfposlb(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfhwlb(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfintlb(DIM_TAU,DIM_SASTYP),
1 gfsllb(DIM_TAU,DIM_SASTYP),
1 gfampub(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfposub(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfhwub(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfintub(DIM_TAU,DIM_SASTYP),
1 gfslub(DIM_TAU,DIM_SASTYP)
integer*4 gfampfit(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfposfit(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfhwfit(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfintfit(DIM_TAU,DIM_SASTYP),
1 gfslfit(DIM_TAU,DIM_SASTYP)
real*4 gfbeg(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfend(DIM_GF,DIM_TAU,DIM_SASTYP),
1 gfbegl(DIM_TAU,DIM_SASTYP), gfendl(DIM_TAU,DIM_SASTYP),
1 sas(DIM_WLDET*DIM_WLEXC,DIM_TAU,DIM_SASTYP),
1 saslb(DIM_TAU,DIM_SASTYP), sasub(DIM_TAU,DIM_SASTYP)
integer*4 sasfit(DIM_TAU,DIM_SASTYP), g
|
| I had the best result using the following silicon-graphics
code, compiled with -O5 -fast -tune host -speculate-all -r8
It is about 20 "
------------------------------------------------
subroutine calc_decay
c
c include global values
c
include 'kfdimdef.inc'
include 'kfgbldef.inc'
c
c define variables
c
integer*4 i, j, k, l, m, n
c
real*4 tau_inv(DIM_SUB,DIM_TAU)
real*4 exp_vec(DIM_SASTYP,DIM_TIMEPNT,DIM_SUB,DIM_TAU)
real*4 exp_arg(DIM_TIMEPNT,DIM_SUB,DIM_TAU)
do n = 1, nsub
do i = 1, ntau
tau_inv(n,i) = 1.0/tau(i,n)
enddo
enddo
do l = 1, nwlexc
m = dataset(2,l)
do k = 1, ntimepnt(m)
do n = 1, nsub
do i = 1, ntau
exp_vec(m,k,n,i) = exp(-tim(k,m)*tau_inv(n,i))
end do
end do
end do
end do
do l = 1, nwlexc
m = dataset(2,l)
do j = 1, nwldet(l)
do k = 1, ntimepnt(m)
yfit(k,j,l) = 0.0
Cum do n = 1, nsub
Cum do i = 1, ntau
Cum yfit(k,j,l) = yfit(k,j,l)
Cum 1 + das(j,i,l,n) * exp(-tim(k,m)/tau(i,n))
Cum end do
Cum end do
do n = 1, nsub
do i = 1, ntau
yfit(k,j,l) = yfit(k,j,l)
1 + das(j,i,l,n) * exp_vec(m,k,n,i)
end do
end do
end do
end do
end do
c
return
end
|
|
Joseph,
I'd be curious to know how the code below performs. It does essentially the
same thing your best code does, but uses less memory and accesses temporary
data arrays contiguously.
Also, if there's any way you can redefine "das" so that memory is accessed
contiguously, [ make das(j,i,l,n) into das(i,n,j,l) ] do it. It can only
help.
- Dwight -
--------------------------------------------------------------------------------
subroutine calc_decay
c
c include global values
c
include 'kfdimdef.inc'
include 'kfgbldef.inc'
c
c define variables
c
integer*4 i, j, k, l, m, n
c
real*4 tau_inv(DIM_TAU,DIM_SUB)
real*4 exp_vec(DIM_TAU,DIM_SUB)
real*4 temp
c
do n = 1, nsub
do i = 1, ntau
tau_inv(i,n) = 1.0/tau(i,n)
enddo
enddo
do l = 1, nwlexc
m = dataset(2,l)
do k = 1, ntimepnt(m)
do n = 1, nsub
do i = 1, ntau
exp_vec(i,n) = exp(-tim(k,m)*tau_inv(i,n))
end do
end do
do j = 1, nwldet(l)
temp = 0.0
do n = 1, nsub
do i = 1, ntau
temp = temp + das(j,i,l,n) * exp_vec(i,n)
end do
end do
yfit(k,j,l) = temp
end do
end do
end do
c
return
end
|