Hi,
There are quite many ways to speed this up. If you are going to compute this projection many times, then much work can be preassembled. Some suggestions are:
from dolfin import *
mesh = RectangleMesh(0,0,1,1,40,40)
V = FunctionSpace(mesh,'CG',2)
u = interpolate(Expression('sin(x[1])*cos(x[0])'), V)
S = TensorFunctionSpace(mesh,'DG',0)
t0 = Timer("Original")
H = project(grad(grad(u)), S)
print norm(H)
H.vector().zero()
t0 = Timer("Compress diagonal coefficient matrix for faster solve")
A = assemble(inner(TrialFunction(S), TestFunction(S))*dx)
b = assemble(inner(grad(grad(u)), TestFunction(S))*dx)
A.compress()
solve(A, H.vector(), b, "cg", "default")
print norm(H)
H.vector().zero()
t0 = Timer("Extract diagonal from coefficient matrix - solve directly")
A = assemble(inner(TrialFunction(S), TestFunction(S))*dx)
b = assemble(inner(grad(grad(u)), TestFunction(S))*dx)
ones = Function(S)
ones.vector()[:] = 1
A_diag = A * ones.vector()
A_diag.set_local(1.0/A_diag.array())
H = Function(S)
H.vector()[:] = b * A_diag
t0.stop()
print norm(H)
H.vector().zero()
###
M = assemble(inner(grad(grad(TrialFunction(V))), TestFunction(S))*dx)
M.compress()
A = assemble(inner(TrialFunction(S), TestFunction(S))*dx)
ones = Function(S)
ones.vector()[:] = 1
A_diag = A * ones.vector()
A_diag.set_local(1.0/A_diag.array())
t0 = Timer("Extract diagonal + precompute all required")
b = M*u.vector()
H.vector()[:] = b * A_diag
t0.stop()
print norm(H)
t0 = Timer("Local Solver")
ls = LocalSolver()
ls.solve(H.vector(), Form(inner(TrialFunction(S), TestFunction(S))*dx), Form(inner(grad(grad(u)), TestFunction(S))*dx))
print norm(H)
list_timings()