I am seeing what appears to be a memory leak when running FEniCS 1.6 (installed from HashDist) on more than a couple of processes on our cluster (GCC 4.8.4 and OpenMPI 1.8.6). As an example, the following slightly modified version of the advection-diffusion demo reports about 3 GB of total memory (summed over all processes) after timestep 100 and about 8 GB after timestep 10000 when run on 24 cores, but stays steady at around 300 MB when run on 2 cores. Has anyone seen this kind of issue before? And why does the total memory scale with the number of processes at all? For a fixed problem size I would expect it to be roughly independent of the process count.
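(For reference, I launch the script with something along the lines of the commands below; the script name is just what I happened to save the listing as.)

mpirun -n 2 python advection_diffusion_memory.py    # stays at roughly 300 MB total
mpirun -n 24 python advection_diffusion_memory.py   # grows to several GB total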
# Copyright (C) 2007 Kristian B. Oelgaard
#
# This file is part of DOLFIN.
#
# DOLFIN is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# DOLFIN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with DOLFIN. If not, see <http://www.gnu.org/licenses/>.
#
# Modified by Anders Logg, 2008
# Modified by Johan Hake, 2008
# Modified by Garth N. Wells, 2009
#
# This demo solves the time-dependent convection-diffusion equation by
# a SUPG stabilized method. The velocity field used in the simulation
# is the output from the Stokes (Taylor-Hood) demo. The sub domains
# for the different boundary conditions are computed by the demo
# program in src/demo/subdomains.
from dolfin import *
import numpy as np
set_log_level(50)
parameters['form_compiler']['representation'] = 'quadrature'
parameters['form_compiler']['optimize'] = True
parameters['form_compiler']['cpp_optimize'] = True
rank = MPI.rank(mpi_comm_world())
# Memory analysis code borrowed from fenicstools
memory_code = '''
#include "dolfin.h"
#include <sys/types.h>
#include <unistd.h>
#include <fstream>
#include <sstream>

namespace dolfin
{
  std::size_t getMemoryUsage(bool rss=true)
  {
    // Get process ID and page size
    const std::size_t pid = getpid();
    const std::size_t page_size = getpagesize();

    // Prepare statm filename for this process
    std::stringstream filename;
    filename << "/proc/" << pid << "/statm";
    std::ifstream statm;

    // Read number of pages from statm (second field is the resident set)
    statm.open(filename.str().c_str());
    if (!statm)
      std::cout << "Unable to open statm file for process." << std::endl;
    std::size_t num_pages;
    statm >> num_pages;
    if (rss)
      statm >> num_pages;
    statm.close();

    // Convert pages to kB and report memory usage
    const std::size_t num_kb = num_pages*page_size / 1024;
    return num_kb;
  }
}
'''
compiled_memory_module = compile_extension_module(code=memory_code)
get_memory_usage = compiled_memory_module.getMemoryUsage
nx = 10
mesh = UnitCubeMesh(nx, nx, nx)
h = CellSize(mesh)
# Create FunctionSpaces
Q = FunctionSpace(mesh, "CG", 2)
V = VectorFunctionSpace(mesh, "CG", 1)
velocity = interpolate(Expression(('x[1]', '0.', '0.')), V)
u0 = interpolate(Expression('0.'), Q)
u_sol = interpolate(Expression('0.'), Q)
# Parameters
T = 10.
n_tsteps = 10000
dt = T / n_tsteps
c = 1e-2
# Test and trial functions
u, v = TrialFunction(Q), TestFunction(Q)
# Mid-point solution
u_mid = 0.5*(u0 + u)
# Galerkin variational problem
F = v*(u - u0)*dx + dt*(v*dot(velocity, grad(u_mid))*dx
                        + c*dot(grad(v), grad(u_mid))*dx)
# Create bilinear and linear forms
a = lhs(F)
L = rhs(F)
# Set up boundary condition
class Inflow(SubDomain):
    def inside(self, x, on_boundary):
        return (x[0] < DOLFIN_EPS and on_boundary)

class Dirichlet(SubDomain):
    def inside(self, x, on_boundary):
        return (x[0] > .25 and x[0] < .75 and x[1] < DOLFIN_EPS and on_boundary)
facet_domains = MeshFunction("size_t", mesh, mesh.topology().dim()-1)
facet_domains.set_all(0)
inflow_bnd = Inflow()
dirichlet_bnd = Dirichlet()
inflow_bnd.mark(facet_domains, 1)
dirichlet_bnd.mark(facet_domains, 2)
bc1 = DirichletBC(Q, 0., facet_domains, 1)
bc2 = DirichletBC(Q, 1., facet_domains, 2)
bcs = [bc1, bc2]
bc1.apply(u0.vector())
bc2.apply(u0.vector())
bc1.apply(u_sol.vector())
bc2.apply(u_sol.vector())
# Assemble matrix
A = assemble(a)
bc1.apply(A)
bc2.apply(A)
mem_usage = MPI.sum(mpi_comm_world(), get_memory_usage())
if rank == 0:
    print 'Memory usage before timestepping loop:', mem_usage
tstep = 0
# Time-stepping
while tstep < n_tsteps:
    tstep += 1
    # Assemble vector and apply boundary conditions
    b = assemble(L)
    bc1.apply(b)
    bc2.apply(b)
    solve(A, u0.vector(), b, 'gmres')
    # u0.vector().set_local(u_sol.vector().get_local())
    # u0.vector().apply('')
    mem_usage = MPI.sum(mpi_comm_world(), get_memory_usage())
    if rank == 0 and tstep % 100 == 0:
        print 'Memory usage after tstep', tstep, ':', mem_usage
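For what it's worth, below is a variant of the time-stepping loop that reuses a single KrylovSolver and right-hand-side vector instead of calling solve() every step. I have not verified whether it behaves any differently; it is only a sketch meant to help rule out per-step solver construction as the source of the growth, and the 'gmres'/'default' solver and preconditioner names are just the obvious choices rather than anything from the original demo.

# Sketch only: reuse one KrylovSolver and one RHS vector across timesteps
solver = KrylovSolver('gmres', 'default')
solver.set_operator(A)
b = None
tstep = 0
while tstep < n_tsteps:
    tstep += 1
    b = assemble(L, tensor=b)     # reuse the same vector after the first step
    bc1.apply(b)
    bc2.apply(b)
    solver.solve(u0.vector(), b)  # same GMRES solve, but with a reused solver
    mem_usage = MPI.sum(mpi_comm_world(), get_memory_usage())
    if rank == 0 and tstep % 100 == 0:
        print 'Memory usage after tstep', tstep, ':', mem_usage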