Direct solvers produce answers all over the place

When solving a mixed problem I observe extremely strange results with a variety of direct linear solvers. It is not surprising that serial ones (UMFPACK/SuperLU/Petsc) eventually run out of memory, but what worries me is (1) SuperLU_dist results are completely nonsensical and (2) MUMPS results seem to deteriorate (i.e. produce NaNs earlier) with increasing number of cores
(3) Petsc solver just produces NaNs. Am I missing something/doing something incorrectly? How does one debug issues like these?

The example is unfortunately somewhat long, but this is just a particular Laplace discretization (and one cannot go much simpler than Laplace, can one?). The output is below the code.

Code:


from __future__ import print_function
from dolfin import *
import numpy as np

# Laplace problem using discontinuous Petrov-Galerkin method

parameters["ghost_mode"] = "shared_facet"
set_log_level(WARNING)

method = ('umfpack','petsc','superlu','mumps','superlu_dist')[3]

comm = mpi_comm_world()
mpiRank = MPI.rank(comm)

# Create mesh and define function space
degree  = 0
delta_p = 2
# Test case
u_exact = Expression("cos(pi*x[0])*cos(2*pi*x[1])", degree=degree+2)
f_exact = Expression("cos(pi*x[0])*cos(2*pi*x[1])*5*pi*pi", degree=degree+2)

# bilinear forms
# inner product in the test space
def inner_V(V1,V2,mesh):
    v1,tau1 = split(V1)
    v2,tau2 = split(V2)
    return inner(grad(v1)+tau1,grad(v2)+tau2)*dx + inner(div(tau1),div(tau2))*dx + inner(v1,v2)*dx + inner(tau1,tau2)*dx

# linear differential operator
def b(U,V,mesh):
    u,sigma,uhat,sighat = split(U)
    v,tau               = split(V)
    n       = FacetNormal(mesh)
    sighatn = dot(sighat,n)
    taun    = dot(tau, n)
    # eq1:  -div(sigma) = f
    eq1 = inner(sigma,grad(v))*dx - inner(sighatn('+'),v('+')-v('-'))*dS - inner(dot(sighat,n),v)*ds
    # eq2:  sigma - grad(u) = 0
    eq2 = inner(sigma,tau)*dx + inner(u,div(tau))*dx - inner(uhat('+'),taun('+')+taun('-'))*dS - inner(uhat,taun)*ds
    #
    return eq1+eq2

def a(UV,dUV,mesh):
    U,V   = split(UV)
    dU,dV = split(dUV)
    return inner_V(V,dV,mesh) + b(U,dV,mesh) + b(dU,V,mesh)

def L(UV):
    U,V = split(UV)
    v,tau = split(V)
    return inner(f_exact,v)*dx

def laplace_dpg(N):
    mesh = UnitSquareMesh(N,N)
    # Trial spaces
    u_element = FiniteElement("DG", mesh.ufl_cell(), degree) # solution
    s_element = VectorElement("DG", mesh.ufl_cell(), degree) # gradient
    uh_element= FiniteElement("CG", mesh.ufl_cell(), degree+1) #traces
    sh_element= FiniteElement("RT", mesh.ufl_cell(), degree+1) #fluxes
    U_element = MixedElement([u_element,s_element,uh_element,sh_element])
    #uh_element= FiniteElement("CG", mesh.ufl_cell(), degree+1, restriction="facet") #traces - This is not possible in the latest version of FENICS... use lowest order degree=0
    #sh_element= FiniteElement("RT", mesh.ufl_cell(), degree+1, restriction="facet") #fluxes
    # Test spaces
    v_element = FiniteElement("DG", mesh.ufl_cell(), degree+delta_p)
    t_element = VectorElement("DG", mesh.ufl_cell(), degree+delta_p)
    V_element = MixedElement([v_element,t_element])
    #
    UV_element = MixedElement([U_element,V_element])
    E = FunctionSpace(mesh,UV_element)
    
    # boundary conditions on uhat
    bc = DirichletBC(E.sub(0).sub(2), u_exact, 'on_boundary')

    # Solve the system
    uSol = Function(E)

    UV  = TrialFunction(E)
    dUV = TestFunction(E)

    solve(a(UV,dUV,mesh)==L(dUV), uSol, bcs=bc, solver_parameters = {'linear_solver': method})

    U_,V_ = uSol.split()
    u_,sigma_,uhat_,sighat_ = U_.split()

    # Estimate the error
    L2_err = sqrt(assemble((u_-u_exact)*(u_-u_exact)*dx))
    V_err  = sqrt(assemble(inner_V(V_,V_,mesh)))

    return L2_err, V_err

if __name__=='__main__':
    u_error0 = np.Inf
    v_error0 = np.Inf
    
    for N in 10*2**np.arange(5):
        u_error,v_error = laplace_dpg(N)
        if mpiRank==0:
            print('N = %4d, u_error = %e, v_error = %e, slope_u = %e, slope_v = %e' \
                 % (N,u_error,v_error,u_error0/u_error,v_error0/v_error))
        u_error0,v_error0 = u_error,v_error

Output:

UMFPACK:
N =   10, u_error = 8.499598e-02, v_error = 1.005355e+00, slope_u = inf, slope_v = inf
N =   20, u_error = 4.170205e-02, v_error = 5.167252e-01, slope_u = 2.038173e+00, slope_v = 1.945627e+00
N =   40, u_error = 2.073663e-02, v_error = 2.604149e-01, slope_u = 2.011033e+00, slope_v = 1.984238e+00
out of memory

PETSC:
N =   10, u_error = nan, v_error = nan, slope_u = nan, slope_v = nan
N =   20, u_error = nan, v_error = nan, slope_u = nan, slope_v = nan
N =   40, u_error = nan, v_error = nan, slope_u = nan, slope_v = nan

SuperLU:
N =   10, u_error = 8.499598e-02, v_error = 1.005355e+00, slope_u = inf, slope_v = inf
N =   20, u_error = 4.170205e-02, v_error = 5.167252e-01, slope_u = 2.038173e+00, slope_v = 1.945627e+00
N =   40, u_error = 2.073663e-02, v_error = 2.604149e-01, slope_u = 2.011033e+00, slope_v = 1.984238e+00
N =   80, u_error = 1.035351e-02, v_error = 1.304945e-01, slope_u = 2.002860e+00, slope_v = 1.995600e+00
Not enough memory to perform factorization.

MUMPS:
mpirun -np 1 python laplace_dpg.py
N =   10, u_error = 8.499598e-02, v_error = 1.005355e+00, slope_u = inf, slope_v = inf
N =   20, u_error = 4.170205e-02, v_error = 5.167252e-01, slope_u = 2.038173e+00, slope_v = 1.945627e+00
N =   40, u_error = 2.073663e-02, v_error = 2.604149e-01, slope_u = 2.011033e+00, slope_v = 1.984238e+00
N =   80, u_error = 1.035351e-02, v_error = 1.304945e-01, slope_u = 2.002860e+00, slope_v = 1.995600e+00
N =  160, u_error = nan, v_error = nan, slope_u = nan, slope_v = nan

mpirun -np 2 python laplace_dpg.py
N =   10, u_error = 8.499598e-02, v_error = 1.005355e+00, slope_u = inf, slope_v = inf
N =   20, u_error = 4.170205e-02, v_error = 5.167252e-01, slope_u = 2.038173e+00, slope_v = 1.945627e+00
N =   40, u_error = 2.073663e-02, v_error = 2.604149e-01, slope_u = 2.011033e+00, slope_v = 1.984238e+00
N =   80, u_error = 1.035351e-02, v_error = 1.304945e-01, slope_u = 2.002860e+00, slope_v = 1.995600e+00
N =  160, u_error = nan, v_error = nan, slope_u = nan, slope_v = nan

mpirun -np 4 python laplace_dpg.py
N =   10, u_error = 8.499598e-02, v_error = 1.005355e+00, slope_u = inf, slope_v = inf
N =   20, u_error = 4.170205e-02, v_error = 5.167252e-01, slope_u = 2.038173e+00, slope_v = 1.945627e+00
N =   40, u_error = 2.073663e-02, v_error = 2.604149e-01, slope_u = 2.011033e+00, slope_v = 1.984238e+00
N =   80, u_error = nan, v_error = nan, slope_u = nan, slope_v = nan
...

mpirun -np 24 nice -n 15 python laplace_dpg.py
N =   10, u_error = 8.499598e-02, v_error = 1.005355e+00, slope_u = inf, slope_v = inf
N =   20, u_error = 4.170205e-02, v_error = 5.167252e-01, slope_u = 2.038173e+00, slope_v = 1.945627e+00
N =   40, u_error = nan, v_error = nan, slope_u = nan, slope_v = nan
...

SuperLU_dist:
mpirun -np 2  python laplace_dpg.py
N =   10, u_error = 2.512400e-01, v_error = 2.091658e+00, slope_u = inf, slope_v = inf
N =   20, u_error = 1.559787e+09, v_error = 2.141536e+01, slope_u = 1.610732e-10, slope_v = 9.767091e-02
N =   40, u_error = 4.527557e+01, v_error = 6.164801e+03, slope_u = 3.445097e+07, slope_v = 3.473813e-03
???