/usr/include/dune/istl/repartition.hh

#ifndef DUNE_REPARTITION_HH
#define DUNE_REPARTITION_HH

#include <cassert>
#include <map>
#include <utility>

#if HAVE_PARMETIS
#include <parmetis.h>
#endif

#include <dune/common/timer.hh>
#include <dune/common/enumset.hh>
#include <dune/common/mpitraits.hh>
#include <dune/common/stdstreams.hh>
#include <dune/common/parallel/communicator.hh>
#include <dune/common/parallel/indexset.hh>
#include <dune/common/parallel/indicessyncer.hh>
#include <dune/common/parallel/remoteindices.hh>

#include <dune/istl/owneroverlapcopy.hh>
#include <dune/istl/paamg/graph.hh>

/**
 * @file
 * @brief Functionality for redistributing a parallel index set using graph partitioning.
 *
 * Refactored version of an intern.
 * @author Markus Blatt
 */

namespace Dune 
{
#if HAVE_MPI
    /**
     * @brief Fills the holes in an index set.
     *
     * In general the index set only needs to know those indices 
     * where communication my occur. In usual FE computations these 
     * are just those near the processor boundaries.
     *
     * For the repartitioning we need to know all all indices for which data is stored.
     * The missing indices will be created in this method.
     *
     * @param graph The graph to reparition.
     * @param oocomm The communication information.
     */
    template<class G, class T1, class T2>
    void fillIndexSetHoles(const G& graph, Dune::OwnerOverlapCopyCommunication<T1,T2>& oocomm)
    {
      typedef typename Dune::OwnerOverlapCopyCommunication<T1,T2>::ParallelIndexSet IndexSet;
      typedef typename IndexSet::LocalIndex::Attribute Attribute;
      
      IndexSet& indexSet = oocomm.indexSet();
      const typename Dune::OwnerOverlapCopyCommunication<T1,T2>::GlobalLookupIndexSet& lookup =oocomm.globalLookup();

      // The type of the const vertex iterator.
      typedef typename G::ConstVertexIterator VertexIterator;
      
      
      std::size_t sum=0, needed = graph.noVertices()-indexSet.size();
      std::vector<std::size_t> neededall(oocomm.communicator().size(), 0);
      
      MPI_Allgather(&needed, 1, MPITraits<std::size_t>::getType() , &(neededall[0]), 1, MPITraits<std::size_t>::getType(), oocomm.communicator());
      for(int i=0; i<oocomm.communicator().size(); ++i)
	sum=sum+neededall[i]; // MAke this for generic
      
      if(sum==0)
	// Nothing to do
	return;
      
      //Compute Maximum Global Index
      T1 maxgi=0;
      typedef typename IndexSet::const_iterator Iterator;
      Iterator end;
      end = indexSet.end();
      for(Iterator it = indexSet.begin(); it != end; ++it) 
	maxgi=std::max(maxgi,it->global());

      //Process p creates global indices consecutively
      //starting atmaxgi+\sum_{i=1}^p neededall[i]
      // All created indices are owned by the process
      maxgi=oocomm.communicator().max(maxgi);
      ++maxgi;//Sart with the next free index.
      
      for(int i=0; i<oocomm.communicator().rank(); ++i)
	maxgi=maxgi+neededall[i]; // TODO: make this more generic
      
      // Store the global index information for repairing the remote index information
      std::map<int,SLList<std::pair<T1,Attribute> > > globalIndices;
      storeGlobalIndicesOfRemoteIndices(globalIndices, oocomm.remoteIndices(), indexSet);
      indexSet.beginResize();
      
      for(VertexIterator vertex = graph.begin(), vend=graph.end(); vertex != vend; ++vertex){
	const typename IndexSet::IndexPair* pair=lookup.pair(*vertex);
	if(pair==0){
	  // No index yet, add new one
	  indexSet.add(maxgi, typename IndexSet::LocalIndex(*vertex, OwnerOverlapCopyAttributeSet::owner, false));
	  ++maxgi;
	}
      }
      
      indexSet.endResize();

      repairLocalIndexPointers(globalIndices, oocomm.remoteIndices(), indexSet);
            
      oocomm.freeGlobalLookup();
      oocomm.buildGlobalLookup();
#ifdef DEBUG_REPART
      std::cout<<"Holes are filled!"<<std::endl;
      std::cout<<oocomm.communicator().rank()<<": "<<oocomm.indexSet()<<std::endl;
#endif
    }

  namespace 
  {
    
    /**
     * @brief Translates between local Dune vertex indices and ParMETIS vertex numbers.
     *
     * The tables are filled by the constructor, which numbers the owner
     * vertices of each process consecutively.
     */
    class ParmetisDuneIndexMap
    {
    public:
      template<class Graph, class OOComm>
      ParmetisDuneIndexMap(const Graph& graph, const OOComm& com);

      /** @brief ParMETIS global number stored for the local Dune index i. */
      int toParmetis(int i) const { return duneToParmetis[i]; }

      /** @brief Same as toParmetis(), but relative to the first number of this process. */
      int toLocalParmetis(int i) const { return toParmetis(i) - base_; }

      /** @brief Shorthand for toParmetis(). */
      int operator[](int i) const { return toParmetis(i); }

      /** @brief Local Dune index stored at position i of the own ParMETIS vertices. */
      int toDune(int i) const { return parmetisToDune[i]; }

      /** @brief Number of entries in the ParMETIS-to-Dune table. */
      std::vector<int>::size_type numOfOwnVtx() const { return parmetisToDune.size(); }

      /** @brief Pointer to the first entry of the vertex distribution array. */
      int* vtxDist() { return &vtxDist_[0]; }

      /** @brief Last entry of the vertex distribution, set by the constructor. */
      int globalOwnerVertices;
    private:
      // first ParMETIS number used by this process
      int base_;
      // indexed by local Dune index
      std::vector<int> duneToParmetis;
      // indexed by ParMETIS number minus base_
      std::vector<int> parmetisToDune;
      // range of vertices for processor i: vtxdist[i] to vtxdist[i+1] (parmetis global)
      std::vector<int> vtxDist_;
    };
    
    /**
     * @brief Builds the mapping for a graph and its communication object.
     *
     * Counts the owner indices of this process, exchanges the counts with all
     * processes to set up the vertex distribution, numbers the owner vertices of
     * the graph consecutively and finally fetches the numbers of the non-owner
     * vertices via copyOwnerToAll().
     *
     * @param graph The local graph; every vertex needs an entry in the global lookup.
     * @param oocomm The communication object providing index set, global lookup
     * and communicator.
     */
    template<class G, class OOComm>
    ParmetisDuneIndexMap::ParmetisDuneIndexMap(const G& graph, const OOComm& oocomm)
      : duneToParmetis(graph.noVertices(), -1), vtxDist_(oocomm.communicator().size()+1)
    {
      typedef typename OOComm::ParallelIndexSet IndexSet;
      typedef typename IndexSet::const_iterator IndexIterator;
      typedef typename IndexSet::IndexPair IndexPair;
      typedef typename OOComm::OwnerSet OwnerSet;
      typedef typename G::ConstVertexIterator VertexIterator;

      const int noProcs = oocomm.communicator().size();
      const int myRank  = oocomm.communicator().rank();

      // Count the owner indices of this process.
      int ownCount = 0;
      for(IndexIterator entry = oocomm.indexSet().begin(), last = oocomm.indexSet().end();
          entry != last; ++entry)
        if(OwnerSet::contains(entry->local().attribute()))
          ++ownCount;
      parmetisToDune.resize(ownCount);

      // Make the counts of all processes available everywhere.
      std::vector<int> ownCounts(noProcs);
      MPI_Allgather(&ownCount, 1, MPI_INT, &(ownCounts[0]), 1, MPI_INT, oocomm.communicator());

      // Prefix sums of the counts form the vertex distribution; our own
      // numbers start where the ranges of the lower ranks end.
      vtxDist_[0] = 0;
      for(int p = 0; p < noProcs; ++p)
        vtxDist_[p+1] = vtxDist_[p] + ownCounts[p];
      globalOwnerVertices = vtxDist_[noProcs];
      base_ = vtxDist_[myRank];

#ifdef DEBUG_REPART
      std::cout << oocomm.communicator().rank()<<" vtxDist: ";
      for(int i=0; i<= noProcs;++i)
        std::cout << vtxDist_[i]<<" ";
      std::cout<<std::endl;
#endif

      // Traverse the graph and give every owner vertex the next free number,
      // starting at base_. This number is the ParMETIS global index.
      int nextOwn = 0;
      for(VertexIterator vertex = graph.begin(), vend = graph.end(); vertex != vend; ++vertex) {
        const IndexPair* index = oocomm.globalLookup().pair(*vertex);
        assert(index);
        if(!OwnerSet::contains(index->local().attribute()))
          continue;
        parmetisToDune[nextOwn] = index->local();
        duneToParmetis[index->local()] = base_ + nextOwn;
        ++nextOwn;
      }

      // Every process now knows the ParMETIS numbers of its owner vertices.
      // The numbers of the remaining vertices are obtained from the
      // processes owning them.
#ifdef DEBUG_REPART
      std::cout <<oocomm.communicator().rank()<<": before ";
      for(std::size_t i=0; i<duneToParmetis.size(); ++i)
        std::cout<<duneToParmetis[i]<<" ";
      std::cout<<std::endl;
#endif
      oocomm.copyOwnerToAll(duneToParmetis,duneToParmetis);
#ifdef DEBUG_REPART
      std::cout <<oocomm.communicator().rank()<<": after ";
      for(std::size_t i=0; i<duneToParmetis.size(); ++i)
        std::cout<<duneToParmetis[i]<<" ";
      std::cout<<std::endl;
#endif
    }
  }
  
  /**
   * @brief Interface whose send and receive index lists are set up by hand.
   *
   * For every process the send list is kept in the first and the receive
   * list in the second entry of interfaces()[process].
   */
  struct RedistributeInterface
    : public Interface
  {
    /** @brief Sets the communicator stored in the base class. */
    void setCommunicator(MPI_Comm comm)
    {
      communicator_=comm;
    }

    /**
     * @brief Builds the send lists from a target partition.
     *
     * Every index of idxset whose attribute is contained in Flags is added
     * to the send list of process toPart[local index].
     *
     * @param toPart Target process for each local index.
     * @param idxset The index set to traverse.
     */
    template<class Flags,class IS>
    void buildSendInterface(const std::vector<int>& toPart, const IS& idxset)
    {
      typedef typename IS::const_iterator IndexIterator;
      typedef std::map<int,int>::const_iterator CountIterator;

      // First pass: count the indices sent to each process.
      std::map<int,int> noIndicesTo;
      for(IndexIterator entry=idxset.begin(), last=idxset.end(); entry!=last; ++entry) {
        if(!Flags::contains(entry->local().attribute()))
          continue;
        ++noIndicesTo[toPart[entry->local()]];
      }

      // Allocate the send lists.
      for(CountIterator count=noIndicesTo.begin(), last=noIndicesTo.end(); count!=last; ++count)
        interfaces()[count->first].first.reserve(count->second);

      // Second pass: fill the send lists.
      for(IndexIterator entry=idxset.begin(), last=idxset.end(); entry!=last; ++entry) {
        if(!Flags::contains(entry->local().attribute()))
          continue;
        interfaces()[toPart[entry->local()]].first.add(entry->local());
      }
    }

    /** @brief Reserves size entries in the receive list of process proc. */
    void reserveSpaceForReceiveInterface(int proc, int size)
    {
      interfaces()[proc].second.reserve(size);
    }

    /** @brief Appends idx to the receive list of process proc. */
    void addReceiveIndex(int proc, std::size_t idx)
    {
      interfaces()[proc].second.add(idx);
    }

    /**
     * @brief Fills the receive lists from (global index, source process) pairs.
     *
     * The position of a pair within indices is added to the receive list
     * of the process stored in the pair.
     */
    template<typename TG>
    void buildReceiveInterface(std::vector<std::pair<TG,int> >& indices)
    {
      const std::size_t noIndices = indices.size();
      for(std::size_t pos=0; pos<noIndices; ++pos)
        interfaces()[indices[pos].second].second.add(pos);
    }

    ~RedistributeInterface()
    {}

  };

  namespace
  {
    /**
     * @brief Packs owner indices, overlap indices and neighbour ranks into a send buffer.
     *
     * The buffer contains, in this order, the number of owner indices followed
     * by the indices, the number of overlap indices followed by the indices and
     * the number of neighbours followed by their ranks.
     *
     * @param ownerVec The owner indices to send. If it is empty it is resized to
     * one element so that the address of its first entry can be taken.
     * @param overlapVec The overlap indices to send.
     * @param neighbors The ranks of the neighbouring processes to send.
     * @param sendBuf The send buffer.
     * @param buffersize The size of the send buffer.
     * @param comm Communicator for the send.
     */
    template<class GI>
    void createSendBuf(std::vector<GI>& ownerVec, std::set<GI>& overlapVec, std::set<int>& neighbors, char *sendBuf, int buffersize, MPI_Comm comm) {
      typedef typename std::set<GI>::iterator OverlapIterator;
      typedef std::set<int>::iterator NeighborIterator;

      int position = 0;

      // owner indices: count, then all values in one go
      std::size_t count = ownerVec.size();
      if(count == 0)
        ownerVec.resize(1); // otherwise would read beyond the memory bound
      MPI_Pack(&count, 1, MPITraits<std::size_t>::getType(), sendBuf, buffersize, &position, comm);
      MPI_Pack(&(ownerVec[0]), count, MPITraits<GI>::getType(), sendBuf, buffersize, &position, comm);

      // overlap indices: count, then one value after the other
      count = overlapVec.size();
      MPI_Pack(&count, 1, MPITraits<std::size_t>::getType(), sendBuf, buffersize, &position, comm);
      for(OverlapIterator gi = overlapVec.begin(); gi != overlapVec.end(); ++gi)
        MPI_Pack(const_cast<GI*>(&(*gi)), 1, MPITraits<GI>::getType(), sendBuf, buffersize, &position, comm);

      // neighbour ranks: count, then one value after the other
      count = neighbors.size();
      MPI_Pack(&count, 1, MPITraits<std::size_t>::getType(), sendBuf, buffersize, &position, comm);
      for(NeighborIterator rank = neighbors.begin(); rank != neighbors.end(); ++rank)
        MPI_Pack(const_cast<int*>(&(*rank)), 1, MPI_INT, sendBuf, buffersize, &position, comm);
    }
    /**
     * @brief Unpacks a received buffer into the owner/overlap/neighbour containers.
     *
     * The buffer is expected to hold a count followed by that many owner
     * indices, a count followed by overlap indices and a count followed by
     * neighbour ranks.
     *
     * @param recvBuf The receive buffer.
     * @param bufferSize The size of the receive buffer.
     * @param ownerVec The vector the owner indices are appended to, each paired with from.
     * @param overlapVec The set the overlap indices are inserted into.
     * @param neighbors The set the neighbour ranks are inserted into.
     * @param inf The interface in which receive space for from is reserved.
     * @param from The rank the buffer was received from.
     * @param comm The communicator used in the receive.
     */
    template<class GI>
    void saveRecvBuf(char *recvBuf, int bufferSize, std::vector<std::pair<GI,int> >& ownerVec,
                     std::set<GI>& overlapVec, std::set<int>& neighbors, RedistributeInterface& inf, int from, MPI_Comm comm) {
      int position = 0;
      std::size_t count;

      // owner indices
      MPI_Unpack(recvBuf, bufferSize, &position, &count, 1, MPITraits<std::size_t>::getType(), comm);
      inf.reserveSpaceForReceiveInterface(from, count);
      ownerVec.reserve(ownerVec.size()+count);
      for(std::size_t k=0; k<count; ++k) {
        GI globalIndex;
        MPI_Unpack(recvBuf, bufferSize, &position, &globalIndex, 1, MPITraits<GI>::getType(), comm);
        ownerVec.push_back(std::make_pair(globalIndex,from));
      }

      // overlap indices; the last insert position is passed on as a hint
      MPI_Unpack(recvBuf, bufferSize, &position, &count, 1, MPITraits<std::size_t>::getType(), comm);
      typename std::set<GI>::iterator overlapHint = overlapVec.begin();
      Dune::dverb << "unpacking "<<count<<" overlap"<<std::endl;
      for(std::size_t k=0; k<count; ++k) {
        GI globalIndex;
        MPI_Unpack(recvBuf, bufferSize, &position, &globalIndex, 1, MPITraits<GI>::getType(), comm);
        overlapHint = overlapVec.insert(overlapHint, globalIndex);
      }

      // neighbour ranks
      MPI_Unpack(recvBuf, bufferSize, &position, &count, 1,  MPITraits<std::size_t>::getType(), comm);
      Dune::dverb << "unpacking "<<count<<" neighbors"<<std::endl;
      std::set<int>::iterator neighborHint = neighbors.begin();
      for(std::size_t k=0; k<count; ++k) {
        int rank;
        MPI_Unpack(recvBuf, bufferSize, &position, &rank, 1, MPI_INT, comm);
        neighborHint = neighbors.insert(neighborHint, rank);
      }
    }

    /**
     * @brief Find the optimal domain number for a given process
     *
     * The estimation is necessary because the result of ParMETIS for
     * the new partition is only a domain/set number and not a process number.
     *
     * Every process counts how many of its owner vertices go to each domain.
     * The counts are passed around all processes in a ring. Afterwards the
     * domains are handed out one after the other: a domain goes to the not yet
     * assigned process holding the most vertices of it. A domain for which no
     * unassigned process holds any vertex stays unmapped (-1).
     *
     * @param comm the MPI communicator
     * @param part the result array of the ParMETIS repartition; only the first
     * numOfOwnVtx entries are read and each must be a valid domain number
     * @param numOfOwnVtx the number of owner vertices
     * @param nparts the number of target partitions/processes
     * @param myDomain the optimal output domain number, -1 if this process got none
     * @param domainMapping output: for each domain the process it was assigned to
     * or -1; needs at least nparts entries
     */
    template<typename T>
    void getDomain(const MPI_Comm& comm, T *part, int numOfOwnVtx, int nparts, int *myDomain, std::vector<int> &domainMapping) {
      int npes, mype;
      MPI_Comm_size(comm, &npes);
      MPI_Comm_rank(comm, &mype);
      MPI_Status status;

      *myDomain = -1;
      for(int d=0; d<nparts; ++d)
        domainMapping[d] = -1;

      // count the occurrence of each domain among the own vertices
      std::vector<int> domain(nparts, 0);
      for(int v=0; v<numOfOwnVtx; ++v)
        ++domain[part[v]];

      // domainMatrix[p*nparts+d]: number of vertices of domain d on process p.
      // Containers instead of new[]/delete[] so nothing leaks on an exception.
      std::vector<int> domainMatrix(npes*nparts, -1);
      for(int d=0; d<nparts; ++d)
        domainMatrix[mype*nparts+d] = domain[d];

      // ring communication, we need n-1 communications for n processors
      std::vector<int> buf(domain);
      const int src  = (mype-1+npes)%npes;
      const int dest = (mype+1)%npes;
      for(int step=0; step<npes-1; ++step) {
        MPI_Sendrecv_replace(buf.empty() ? 0 : &buf[0], nparts, MPI_INT, dest, 0, src, 0, comm, &status);
        // the buffer received in this step originates from process sender
        const int sender = ((mype-1-step)+npes)%npes;
        for(int d=0; d<nparts; ++d)
          domainMatrix[sender*nparts+d] = buf[d];
      }

      // Start the domain calculation.
      // The process which contains the maximum number of vertices of a
      // particular domain is selected to choose its favourite domain
      std::vector<int> assigned(npes, 0);
      for(int d=0; d<nparts; ++d) {
        int maxOccurance = 0;
        int pe = -1;
        for(int p=0; p<npes; ++p) {
          // only processes without a domain compete
          if(assigned[p]==0 && maxOccurance < domainMatrix[p*nparts+d]) {
            maxOccurance = domainMatrix[p*nparts+d];
            pe = p;
          }
        }
        if(pe == -1)
          continue; // nobody left who holds vertices of this domain

        domainMapping[d] = pe;
        assigned[pe] = 1;
        if(pe == mype)
          *myDomain = d;
      }
    }
  
    /**
     * @brief Comparison functor forwarding to operator< of the compared type.
     */
    struct SortFirst
    {
      template<class T>
      bool operator()(const T& t1, const T& t2) const
      {
        const bool firstIsSmaller = (t1 < t2);
        return firstIsSmaller;
      }
    };
    

    /**
     * @brief Remove all owner indices from the overlap set.
     *
     * Both containers are traversed once in ascending order. Every global
     * index of the overlap set that also appears as first entry of a pair in
     * the owner vector is erased from the overlap set. The owner vector itself
     * is not changed.
     *
     * @param ownerVec The owner vertices as (global index, process) pairs,
     * sorted according to the global index.
     * @param overlapSet The global indices of the overlap vertices; owner
     * indices are removed from it.
     */
    template<class GI>
    void mergeVec(std::vector<std::pair<GI, int> >& ownerVec, std::set<GI>& overlapSet) {

      typedef typename std::vector<std::pair<GI,int> >::const_iterator OwnerIterator;
      typedef typename std::set<GI>::iterator OverlapIterator;

#ifdef DEBUG_REPART
      // Safety check: neighbouring entries must not carry the same global index.
      for(std::size_t k=1; k<ownerVec.size(); ++k)
        {
          const std::pair<GI,int>& previous = ownerVec[k-1];
          const std::pair<GI,int>& current  = ownerVec[k];
          if(current.first==previous.first)
            {
              std::cerr<<"Value at indes"<<(k-1)<<" is the same as at index "
                       <<k<<" ["<<previous.first<<","<<previous.second<<"]==["
                       <<current.first<<","<<current.second<<"]"<<std::endl;
              throw "Huch!";
            }
        }
#endif

      OwnerIterator owner = ownerVec.begin();
      const OwnerIterator ownerEnd = ownerVec.end();
      OverlapIterator overlap = overlapSet.begin();

      while(overlap != overlapSet.end())
        {
          // advance to the first owner index that is not smaller
          while(owner != ownerEnd && owner->first < *overlap)
            ++owner;

          if(owner == ownerEnd)
            break; // no owner index left, nothing more can be erased

          if(owner->first == *overlap)
            // post-increment moves on before the element is erased
            overlapSet.erase(overlap++);
          else
            ++overlap;
        }
    }


    /**
     * @brief Collect the neighbours of a vertex that will not be owner vertices on the target process.
     *
     * All edges of vtx are visited. A target vertex is skipped if it moves to
     * toPe and its attribute is contained in OwnerSet. For every other target
     * the global index and the process it moves to are recorded.
     *
     * @param g The local graph.
     * @param part For each local index the process the vertex moves to.
     * @param vtx The vertex whose edges are visited.
     * @param indexSet Lookup from a local index to its index pair.
     * @param toPe The process vtx moves to.
     * @param neighbor Output: global indices of the recorded neighbours.
     * @param neighborProcs Output: target processes of the recorded neighbours.
     */
    template<class OwnerSet, class Graph, class IS, class GI>
    void getNeighbor(const Graph& g, std::vector<int>& part,
                     typename Graph::VertexDescriptor vtx, const IS& indexSet,
                     int toPe, std::set<GI>& neighbor, std::set<int>& neighborProcs) {
      typedef typename Graph::ConstEdgeIterator EdgeIterator;
      typedef typename IS::IndexPair IndexPair;

      const EdgeIterator lastEdge = g.endEdges(vtx);
      for(EdgeIterator edge = g.beginEdges(vtx); edge != lastEdge; ++edge)
        {
          const IndexPair* target = indexSet.pair(edge.target());
          assert(target);

          const int targetPart = part[target->local()];
          if(targetPart == toPe && OwnerSet::contains(target->local().attribute()))
            continue; // stays an owner vertex on the same process

          // is sent to another process and therefore becomes overlap
          neighbor.insert(target->global());
          neighborProcs.insert(targetPart);
        }
    }

    /**
     * @brief Appends an index to a plain vector; the process number is ignored.
     */
    template<class T, class I>
    void my_push_back(std::vector<T>& ownerVec, const I& index, int /*proc*/)
    {
      ownerVec.push_back(index);
    }

    /**
     * @brief Appends an index together with the process number to a vector of pairs.
     */
    template<class T, class I>
    void my_push_back(std::vector<std::pair<T,int> >& ownerVec, const I& index, int proc)
    {
      const std::pair<T,int> entry(index, proc);
      ownerVec.push_back(entry);
    }
    /**
     * @brief Overload for plain index vectors: nothing has to be reserved.
     */
    template<class T>
    void reserve(std::vector<T>& /*ownerVec*/, RedistributeInterface& /*redist*/, int /*proc*/)
    {}

    /**
     * @brief Reserves receive space for process proc, one entry per element of ownerVec.
     */
    template<class T>
    void reserve(std::vector<std::pair<T,int> >& ownerVec, RedistributeInterface& redist, int proc)
    {
      const typename std::vector<std::pair<T,int> >::size_type noEntries = ownerVec.size();
      redist.reserveSpaceForReceiveInterface(proc, noEntries);
    }
    

    /**
     * @brief Collect the owner and overlap vertices that move to a given process.
     *
     * Every index whose attribute is contained in OwnerSet and whose target
     * process is toPe is appended to ownerVec. The neighbours of such a vertex
     * that do not become owner vertices on toPe are added to overlapSet and
     * their target processes to neighborProcs (see getNeighbor).
     *
     * @param graph The local graph.
     * @param part The target process of the local vertices.
     * @param indexSet The index set of the given graph.
     * @param myPe The source process number (not used by the implementation).
     * @param toPe The target process number.
     * @param ownerVec The output vector the owner vertices are appended to.
     * @param overlapSet The output set the overlap vertices are inserted into.
     * @param redist The interface handed to reserve() together with ownerVec and toPe.
     * @param neighborProcs The output set of processes the overlap vertices move to.
     */
    template<class OwnerSet, class G, class IS, class T, class GI>
    void getOwnerOverlapVec(const G& graph, std::vector<int>& part, IS& indexSet,
                            int myPe, int toPe, std::vector<T>& ownerVec, std::set<GI>& overlapSet,
                            RedistributeInterface& redist, std::set<int>& neighborProcs) {

      typedef typename IS::const_iterator IndexIterator;

      for(IndexIterator entry = indexSet.begin(); entry != indexSet.end(); ++entry) {
        // Only owner vertices are part of the parmetis graph.
        if(!OwnerSet::contains(entry->local().attribute()))
          continue;
        // Only vertices moving to toPe are of interest here.
        if(part[entry->local()] != toPe)
          continue;

        getNeighbor<OwnerSet>(graph, part, entry->local(), indexSet,
                              toPe, overlapSet, neighborProcs);
        my_push_back(ownerVec, entry->global(), toPe);
      }

      reserve(ownerVec, redist, toPe);
    }
    

    /**
     * @brief Check whether the attribute of a vertex is contained in the flag set F.
     *
     * @param indexSet The index set used to look up the index pair; it has to
     * know the given index.
     * @param index The local index of the vertex.
     * @return The result of F::contains() for the attribute of the vertex.
     */
    template<class F, class IS>
    inline bool isOwner(IS& indexSet, int index) {
      typedef typename IS::IndexPair IndexPair;

      const IndexPair* const entry = indexSet.pair(index);
      assert(entry);

      return F::contains(entry->local().attribute());
    }

    
    /**
     * @brief Functor writing the ParMETIS numbers of edge targets into an adjacency array.
     *
     * Each call stores one entry at the current position and advances the position.
     */
    class BaseEdgeFunctor
    {
    public:
      /**
       * @param adj The adjacency array to fill.
       * @param data The mapping from Dune indices to ParMETIS numbers.
       */
      BaseEdgeFunctor(int* adj,const ParmetisDuneIndexMap& data)
        : i_(0), adj_(adj), data_(data)
      {}

      /** @brief Stores the ParMETIS number of the target of edge. */
      template<class T>
      void operator()(const T& edge)
      {
        const int target = data_.toParmetis(edge.target());
        adj_[i_] = target;
        ++i_;
      }

      /** @brief Number of entries written so far. */
      std::size_t index()
      {
        return i_;
      }

    private:
      // position of the next entry
      std::size_t i_;
      // the array that is filled
      int* adj_;
      // Dune to ParMETIS mapping
      const ParmetisDuneIndexMap& data_;
    };
    
    /**
     * @brief Edge functor for graphs without edge properties: no edge weights.
     */
    template<typename G>
    struct EdgeFunctor
      : public BaseEdgeFunctor
    {
      /** @brief The third argument (number of edges) is not needed here. */
      EdgeFunctor(int* adj, const ParmetisDuneIndexMap& data, std::size_t /*s*/)
        : BaseEdgeFunctor(adj, data)
      {}

      /** @brief There are no weights, always returns a null pointer. */
      int* getWeights()
      {
        return 0;
      }

      /** @brief Nothing was allocated, nothing to release. */
      void free()
      {}
    };
      
    /**
     * @brief Edge functor for properties graphs: additionally records edge weights.
     *
     * An edge gets weight 3 if its properties report depends(), otherwise 1.
     * The weight array is allocated in the constructor and has to be released
     * explicitly with free(); there is no destructor doing so.
     */
    template<class G, class V, class E, class VM, class EM>
    class EdgeFunctor<Dune::Amg::PropertiesGraph<G,V,E,VM,EM> >
      :  public BaseEdgeFunctor
    {
    public:
      /**
       * @param adj The adjacency array to fill.
       * @param data The mapping from Dune indices to ParMETIS numbers.
       * @param s The number of weights to allocate.
       */
      EdgeFunctor(int* adj, const ParmetisDuneIndexMap& data, std::size_t s)
        : BaseEdgeFunctor(adj, data), weight_(new int[s])
      {}

      /** @brief Stores the weight of edge, then the ParMETIS number of its target. */
      template<class T>
      void operator()(const T& edge)
      {
        weight_[index()]=edge.properties().depends()?3:1;
        BaseEdgeFunctor::operator()(edge);
      }

      /** @brief The weight array, null after free(). */
      int* getWeights()
      {
        return weight_;
      }

      /** @brief Releases the weight array; calling it repeatedly is harmless. */
      void free(){
        // The memory stems from new[], so it has to be released with delete[].
        delete[] weight_;
        weight_=0;
      }
    private:
      int* weight_;
    };
    
      

    /**
     * @brief Create the "adjncy" and "xadj" arrays for using ParMETIS
     *
     * The graph is traversed once. For each vertex accepted by isOwner<F>()
     * the current position of the edge functor is stored in xadj and the
     * functor is called for every edge of the vertex. After the last vertex the
     * final position is stored, so xadj gets one entry more than there are
     * accepted vertices.
     *
     * @param graph The local graph.
     * @param indexSet The index set used for the owner check.
     * @param xadj The ParMETIS xadj array to fill.
     * @param ew Functor that sets up the adjacency information of an edge and
     * reports the number of edges processed so far via index().
     */
    template<class F, class G, class IS, class EW>
    void getAdjArrays(G& graph, IS& indexSet, int *xadj,
                      EW& ew)
    {
      typedef typename G::ConstVertexIterator VertexIterator;
      typedef typename G::ConstEdgeIterator EdgeIterator;

      int row = 0;
      const VertexIterator lastVertex = graph.end();
      for(VertexIterator vertex = graph.begin(); vertex != lastVertex; ++vertex) {
        if(!isOwner<F>(indexSet,*vertex))
          continue;

        // the edges of this vertex start at the current position
        xadj[row] = ew.index();
        ++row;

        const EdgeIterator lastEdge = vertex.end();
        for(EdgeIterator edge = vertex.begin(); edge != lastEdge; ++edge)
          ew(edge);
      }
      // closing entry: one past the last edge
      xadj[row] = ew.index();
    }
  }// end anonymous namespace

  // Forward declaration only; the definition is not part of this section of
  // the file. The last parameter defaults to false.
  // NOTE(review): judging by the parameter names, realparts holds the target
  // partition of the vertices, outcomm receives the newly built communication
  // object and redistInf the redistribution interface -- confirm against the
  // definition.
  template<class G, class T1, class T2>
  bool buildCommunication(const G& graph, std::vector<int>& realparts, 
                          Dune::OwnerOverlapCopyCommunication<T1,T2>& oocomm, 
                          Dune::OwnerOverlapCopyCommunication<T1,T2>*& outcomm, 
                          RedistributeInterface& redistInf,
                          bool verbose=false);
#if HAVE_PARMETIS
  // Prototypes of two METIS partitioning routines, declared here with C
  // linkage. Both take the same argument list.
  extern "C"{
  void METIS_PartGraphKway(int *nvtxs, idxtype *xadj, idxtype *adjncy, idxtype *vwgt, 
                           idxtype *adjwgt, int *wgtflag, int *numflag, int *nparts, 
                           int *options, int *edgecut, idxtype *part);
  
  void METIS_PartGraphRecursive(int *nvtxs, idxtype *xadj, idxtype *adjncy, idxtype *vwgt, 
                           idxtype *adjwgt, int *wgtflag, int *numflag, int *nparts, 
                           int *options, int *edgecut, idxtype *part);
  }
#endif

  /**
   * @brief Writes the first l entries of a C array to a stream, each followed by a blank.
   *
   * @param os The output stream.
   * @param array Pointer to the first entry.
   * @param l The number of entries to print.
   */
  template<class S, class T>
  inline void print_carray(S& os, T* array, std::size_t l)
  {
    for(std::size_t pos=0; pos<l; ++pos)
      os<<array[pos]<<" ";
  }
#if !HAVE_PARMETIS
  // Fallback index type when the ParMETIS header is not available.
  typedef std::size_t idxtype;
#endif

  /**
   * @brief Sanity check of a graph in compressed adjacency (xadj/adjncy) format.
   *
   * For each of the noVtx local vertices it is checked that
   * - xadj[vtx] and xadj[vtx+1] lie within [0, noEdges],
   * - every edge target lies within [0, gnoVtx),
   * - if checkSymmetry is set, every edge (vtx, target) has exactly one
   *   reverse edge (target, vtx).
   *
   * All problems found are reported on std::cerr. Adjacency entries are only
   * inspected for vertices whose xadj range is valid, and the reverse edge is
   * only searched for targets that have a valid local xadj entry, so that
   * broken input does not lead to out-of-bounds reads.
   *
   * @param noVtx Number of local vertices; xadj has noVtx+1 entries.
   * @param gnoVtx Global number of vertices, the upper bound for edge targets.
   * @param noEdges Number of entries of adjncy.
   * @param xadj Offsets into adjncy.
   * @param adjncy The edge targets.
   * @param checkSymmetry Whether to check for the reverse edges.
   * @return true if no problem was found.
   */
  inline bool isValidGraph(std::size_t noVtx, std::size_t gnoVtx, idxtype noEdges, idxtype* xadj,
                  idxtype* adjncy, bool checkSymmetry)
  {
    bool correct=true;

    for(idxtype vtx=0; vtx<static_cast<idxtype>(noVtx); ++vtx){
      bool rangeValid=true;
      if(xadj[vtx]>noEdges||xadj[vtx]<0){
        std::cerr <<"Check graph: xadj["<<vtx<<"]="<<xadj[vtx]<<" (>"
                  <<noEdges<<") out of range!"<<std::endl;
        correct=false;
        rangeValid=false;
      }
      if(xadj[vtx+1]>noEdges||xadj[vtx+1]<0){
        std::cerr <<"Check graph: xadj["<<vtx+1<<"]="<<xadj[vtx+1]<<" (>"
                  <<noEdges<<") out of range!"<<std::endl;
        correct=false;
        rangeValid=false;
      }
      if(!rangeValid)
        // the offsets cannot be used to index adjncy
        continue;

      // Check numbers in adjncy: valid targets are 0,...,gnoVtx-1
      for(idxtype i=xadj[vtx]; i< xadj[vtx+1];++i){
        if(adjncy[i]<0||static_cast<std::size_t>(adjncy[i])>=gnoVtx){
          std::cerr<<" Edge "<<adjncy[i]<<" out of range ["<<0<<","<<gnoVtx<<")"
                   <<std::endl;
          correct=false;
        }
      }
      if(checkSymmetry){
        for(idxtype i=xadj[vtx]; i< xadj[vtx+1];++i){
          const idxtype target=adjncy[i];
          if(target<0||static_cast<std::size_t>(target)>=noVtx)
            // no local xadj entry for this target, nothing to search in
            continue;
          const idxtype targetBegin=xadj[target], targetEnd=xadj[target+1];
          if(targetBegin<0||targetEnd<0||targetBegin>noEdges||targetEnd>noEdges)
            // broken offsets, reported when the target itself is checked
            continue;
          // search for symmetric edge
          int found=0;
          for(idxtype j=targetBegin; j< targetEnd;++j)
            if(adjncy[j]==vtx)
              found++;
          if(found!=1){
            std::cerr<<"Edge ("<<target<<","<<vtx<<") "<<found<<" time"<<std::endl;
            correct=false;
          }
        }
      }
    }
    return correct;
  }
  
  template<class M, class T1, class T2>
  bool commGraphRepartition(const M& mat, Dune::OwnerOverlapCopyCommunication<T1,T2>& oocomm, int nparts,
			    Dune::OwnerOverlapCopyCommunication<T1,T2>*& outcomm,
			    RedistributeInterface& redistInf,
			    bool verbose=false)
  {
    if(verbose && oocomm.communicator().rank()==0)
      std::cout<<"Repartitioning from "<<oocomm.communicator().size()
	       <<" to "<<nparts<<" parts"<<std::endl;
    Timer time;
    int rank = oocomm.communicator().rank();
#if !HAVE_PARMETIS
    int* part = new int[1];
    part[0]=0;
#else
    idxtype* part = new idxtype[1]; // where all our data moves to

    if(nparts>1){
        
      part[0]=rank;

      { // sublock for automatic memory deletion
      
        // Build the graph of the communication scheme and create an appropriate indexset.
        // calculate the neighbour vertices
        int noNeighbours = oocomm.remoteIndices().neighbours();
        typedef typename  Dune::OwnerOverlapCopyCommunication<T1,T2>::RemoteIndices RemoteIndices;
        typedef typename RemoteIndices::const_iterator 
          NeighbourIterator;
    
        for(NeighbourIterator n= oocomm.remoteIndices().begin(); n !=  oocomm.remoteIndices().end();
            ++n)
          if(n->first==rank){
            //do not include ourselves.
            --noNeighbours;
            break;
          }
    
        // A parmetis graph representing the communication graph.
        // The diagonal entries are the number of nodes on the process.
        // The offdiagonal entries are the number of edges leading to other processes.

        idxtype *xadj=new idxtype[2], *vwgt = 0;
        idxtype *vtxdist=new idxtype[oocomm.communicator().size()+1];
        idxtype * adjncy=new idxtype[noNeighbours], *adjwgt = 0;
        
        // each process has exactly one vertex!
        for(int i=0; i<oocomm.communicator().size(); ++i)
          vtxdist[i]=i;
        vtxdist[oocomm.communicator().size()]=oocomm.communicator().size();
      
        xadj[0]=0;
        xadj[1]=noNeighbours;

        // count edges to other processor
        // a vector mapping the index to the owner
        // std::vector<int> owner(mat.N(), oocomm.communicator().rank());
        // for(NeighbourIterator n= oocomm.remoteIndices().begin(); n !=  oocomm.remoteIndices().end();
        //     ++n)
        //   {
        //     if(n->first!=oocomm.communicator().rank()){
        //       typedef typename RemoteIndices::RemoteIndexList RIList;
        //       const RIList& rlist = *(n->second.first);
        //       typedef typename RIList::const_iterator LIter;
        //       for(LIter entry=rlist.begin(); entry!=rlist.end(); ++entry){
        //         if(entry->attribute()==OwnerOverlapCopyAttributeSet::owner)
        //           owner[entry->localIndexPair().local()] = n->first;
        //       }
        //     }
        //   }

        // std::map<int,idxtype> edgecount; // edges to other processors
        // typedef typename M::ConstRowIterator RIter;
        // typedef typename M::ConstColIterator CIter;
    
        // // calculate edge count
        // for(RIter row=mat.begin(), endr=mat.end(); row != endr; ++row)
        //   if(owner[row.index()]==OwnerOverlapCopyAttributeSet::owner)
        //     for(CIter entry= row->begin(), ende = row->end(); entry != ende; ++entry)
        //       ++edgecount[owner[entry.index()]];
    
        // setup edge and weight pattern
        typedef typename  RemoteIndices::const_iterator NeighbourIterator;
        typedef typename  Dune::OwnerOverlapCopyCommunication<T1,T2>::ParallelIndexSet IndexSet;
        typedef typename  IndexSet::LocalIndex LocalIndex;
    
        idxtype* adjp=adjncy;

#ifdef USE_WEIGHTS
        vwgt   = new idxtype[1];
        vwgt[0]= mat.N(); // weight is numer of rows TODO: Should actually be the nonzeros.

        adjwgt = new idxtype[noNeighbours];
        idxtype* adjwp=adjwgt;
#endif

        for(NeighbourIterator n= oocomm.remoteIndices().begin(); n !=  oocomm.remoteIndices().end();
            ++n)
          if(n->first != rank){
            *adjp=n->first;
            ++adjp;
#ifdef USE_WEIGHTS
            *adjwp=1;//edgecount[n->first];
            ++adjwp;
#endif
          }
        assert(isValidGraph(vtxdist[rank+1]-vtxdist[rank], 
                            vtxdist[oocomm.communicator().size()],
                            noNeighbours, xadj, adjncy, false));
        
        int wgtflag=0, numflag=0, edgecut;
#ifdef USE_WEIGHTS
        wgtflag=3;
#endif
        float *tpwgts = new float[nparts];
        for(int i=0; i<nparts; ++i)
          tpwgts[i]=1.0/nparts;
        int options[5] ={ 0,1,15,0,0};
        MPI_Comm comm=oocomm.communicator();

        Dune::dinfo<<rank<<" vtxdist: ";
        print_carray(Dune::dinfo, vtxdist, oocomm.communicator().size()+1);
        Dune::dinfo<<std::endl<<rank<<" xadj: ";
        print_carray(Dune::dinfo, xadj, 2);
        Dune::dinfo<<std::endl<<rank<<" adjncy: ";
        print_carray(Dune::dinfo, adjncy, noNeighbours);

#ifdef USE_WEIGHTS
        Dune::dinfo<<std::endl<<rank<<" vwgt: ";
        print_carray(Dune::dinfo, vwgt, 1);
        Dune::dinfo<<std::endl<<rank<<" adwgt: ";
        print_carray(Dune::dinfo, adjwgt, noNeighbours);
#endif
        Dune::dinfo<<std::endl;
        oocomm.communicator().barrier();
	if(verbose && oocomm.communicator().rank()==0)
	std::cout<<"Creating comm graph took "<<time.elapsed()<<std::endl;
	time.reset();

#ifdef PARALLEL_PARTITION
        float ubvec = 1.15;
        int ncon=1;

        //=======================================================
        // ParMETIS_V3_PartKway
        //=======================================================
        ParMETIS_V3_PartKway(vtxdist, xadj, adjncy,
                             vwgt, adjwgt, &wgtflag,
                             &numflag, &ncon, &nparts, tpwgts, &ubvec, options, &edgecut, part, 
                             &comm);
	if(verbose && oocomm.communicator().rank()==0)
	std::cout<<"ParMETIS took "<<time.elapsed()<<std::endl;
	time.reset();
#else
        Timer time1;
        std::size_t gnoedges=0;
        int* noedges = 0;
          noedges = new int[oocomm.communicator().size()];
        Dune::dverb<<"noNeighbours: "<<noNeighbours<<std::endl;
        // gather number of edges for each vertex.
        MPI_Allgather(&noNeighbours,1,MPI_INT,noedges,1, MPI_INT,oocomm.communicator());
        
        if(verbose && oocomm.communicator().rank()==0)
	    std::cout<<"Gathering noedges took "<<time1.elapsed()<<std::endl;
	  time1.reset();

        int noVertices = vtxdist[oocomm.communicator().size()];
        idxtype *gxadj = 0;
        idxtype *gvwgt = 0;
        idxtype *gadjncy = 0;
        idxtype *gadjwgt = 0;
        idxtype *gpart = 0;
        int* displ = 0;
        int* noxs = 0;
        int* xdispl = 0;  // displacement for xadj
        int* novs = 0;
        int* vdispl=0; // real vertex displacement
        std::size_t localNoVtx=vtxdist[rank+1]-vtxdist[rank];
        std::size_t gxadjlen = vtxdist[oocomm.communicator().size()]-vtxdist[0]+oocomm.communicator().size();
      
        {
          Dune::dinfo<<"noedges: ";
          print_carray(Dune::dinfo, noedges, oocomm.communicator().size());
          Dune::dinfo<<std::endl;
          displ = new int[oocomm.communicator().size()];
          xdispl = new int[oocomm.communicator().size()];
          noxs = new int[oocomm.communicator().size()];
          vdispl = new int[oocomm.communicator().size()];
          novs = new int[oocomm.communicator().size()];
        
          for(int i=0; i < oocomm.communicator().size(); ++i){
            noxs[i]=vtxdist[i+1]-vtxdist[i]+1;
            novs[i]=vtxdist[i+1]-vtxdist[i];
          }
        
          idxtype *so= vtxdist;
          int offset = 0;
          for(int *xcurr = xdispl, *vcurr = vdispl, *end=vdispl+oocomm.communicator().size(); 
              vcurr!=end; ++vcurr, ++xcurr, ++so, ++offset){
            *vcurr = *so;
            *xcurr = offset + *so;
          }
        
          int *pdispl =displ;
          int cdispl = 0;
          *pdispl = 0;
          for(int *curr=noedges, *end=noedges+oocomm.communicator().size()-1; 
              curr!=end; ++curr){
            ++pdispl; // next displacement
            cdispl += *curr; // next value
            *pdispl = cdispl; 
          }
          Dune::dinfo<<"displ: ";
          print_carray(Dune::dinfo, displ, oocomm.communicator().size());
          Dune::dinfo<<std::endl;
        
          // calculate global number of edges
          // It is bigger than the actual one as we habe size-1 additional end entries
          for(int *curr=noedges, *end=noedges+oocomm.communicator().size(); 
              curr!=end; ++curr)
            gnoedges += *curr;

          // alocate gobal graph
          Dune::dinfo<<"gxadjlen: "<<gxadjlen<<" noVertices: "<<noVertices
                   <<" gnoedges: "<<gnoedges<<std::endl;
          gxadj = new idxtype[gxadjlen];
          gpart = new idxtype[noVertices];
#ifdef USE_WEIGHTS
          gvwgt = new idxtype[noVertices];
          gadjwgt = new idxtype[gnoedges];
#endif
          gadjncy = new idxtype[gnoedges];
        }

        if(verbose && oocomm.communicator().rank()==0)
	    std::cout<<"Preparing global graph took "<<time1.elapsed()<<std::endl;
	  time1.reset();
        // Communicate data
      
        MPI_Allgatherv(xadj,2,MPITraits<idxtype>::getType(),
                    gxadj,noxs,xdispl,MPITraits<idxtype>::getType(),
                    comm);
        MPI_Allgatherv(adjncy,noNeighbours,MPITraits<idxtype>::getType(),
                    gadjncy,noedges,displ,MPITraits<idxtype>::getType(),
                    comm);
#ifdef USE_WEIGHTS
        MPI_Allgatherv(adjwgt,noNeighbours,MPITraits<idxtype>::getType(),
                    gadjwgt,noedges,displ,MPITraits<idxtype>::getType(),
                    comm);
        MPI_Allgatherv(vwgt,localNoVtx,MPITraits<idxtype>::getType(),
                    gvwgt,novs,vdispl,MPITraits<idxtype>::getType(),
                    comm);
#endif
        if(verbose && oocomm.communicator().rank()==0)
	    std::cout<<"Gathering global graph data took "<<time1.elapsed()<<std::endl;
	  time1.reset();

        {
          // create the real gxadj array
          // i.e. shift entries and add displacements.

          print_carray(Dune::dinfo, gxadj, gxadjlen);
        
          int offset = 0;
          idxtype increment = vtxdist[1];
          idxtype *start=gxadj+1;
          for(int i=1; i<oocomm.communicator().size(); ++i){
            offset+=1;
            int lprev = vtxdist[i]-vtxdist[i-1];
            int l = vtxdist[i+1]-vtxdist[i];
            start+=lprev;
            assert((start+l+offset)-gxadj<=static_cast<idxtype>(gxadjlen));
            increment = *(start-1);
            std::transform(start+offset, start+l+offset, start, std::bind2nd(std::plus<idxtype>(), increment));
          }
          Dune::dinfo<<std::endl<<"shifted xadj:";
          print_carray(Dune::dinfo, gxadj, noVertices+1);
          Dune::dinfo<<std::endl<<" gadjncy: ";
          print_carray(Dune::dinfo, gadjncy, gnoedges);
#ifdef USE_WEIGHTS
          Dune::dinfo<<std::endl<<" gvwgt: ";
          print_carray(Dune::dinfo, gvwgt, noVertices);
          Dune::dinfo<<std::endl<<"adjwgt: ";
          print_carray(Dune::dinfo, gadjwgt, gnoedges);
          Dune::dinfo<<std::endl;
#endif
          // everything should be fine now!!!
          if(verbose && oocomm.communicator().rank()==0)
	    std::cout<<"Postprocesing global graph data took "<<time1.elapsed()<<std::endl;
	  time1.reset();
#ifndef NDEBUG
          assert(isValidGraph(noVertices, noVertices, gnoedges,
                       gxadj, gadjncy, true));
#endif

	  if(verbose && oocomm.communicator().rank()==0)
	    std::cout<<"Creating grah one 1 process took "<<time.elapsed()<<std::endl;
	  time.reset();
	  options[0]=0; options[1]=1; options[2]=1; options[3]=3; options[4]=3;
          // Call metis
          METIS_PartGraphRecursive(&noVertices, gxadj, gadjncy, gvwgt, gadjwgt, &wgtflag,
                              &numflag, &nparts, options, &edgecut, gpart);

	  if(verbose && oocomm.communicator().rank()==0)
	    std::cout<<"METIS took "<<time.elapsed()<<std::endl;
	  time.reset();

          Dune::dinfo<<std::endl<<"part:";
          print_carray(Dune::dinfo, gpart, noVertices);

          delete[] gxadj;
          delete[] gadjncy;
#ifdef USE_WEIGHTS
          delete[] gvwgt;
          delete[] gadjwgt;
#endif
        }  
        // Scatter result
        MPI_Scatter(gpart, 1, MPITraits<idxtype>::getType(), part, 1,
                    MPITraits<idxtype>::getType(), 0, comm);

        {
          // release remaining memory
          delete[] gpart;
          delete[] noedges;
          delete[] displ;
        }
    
      
#endif
        delete[] xadj;
        delete[] vtxdist;
        delete[] adjncy;
#ifdef USE_WEIGHTS
        delete[] vwgt;
        delete[] adjwgt;
#endif
        delete[] tpwgts;
      }
    }else{
      part[0]=0;
    }
#endif
    Dune::dinfo<<" repart "<<rank <<" -> "<< part[0]<<std::endl;

    std::vector<int> realpart(mat.N(), part[0]);
    delete[] part;
    
    oocomm.copyOwnerToAll(realpart, realpart);
    
    if(verbose && oocomm.communicator().rank()==0)
      std::cout<<"Scattering repartitioning took "<<time.elapsed()<<std::endl;
    time.reset();


    oocomm.buildGlobalLookup(mat.N());
    Dune::Amg::MatrixGraph<M> graph(const_cast<M&>(mat));
    fillIndexSetHoles(graph, oocomm);
    if(verbose && oocomm.communicator().rank()==0)
      std::cout<<"Filling index set took "<<time.elapsed()<<std::endl;
    time.reset();
    
    if(verbose){
      int noNeighbours=oocomm.remoteIndices().neighbours();
      noNeighbours = oocomm.communicator().sum(noNeighbours)
	/ oocomm.communicator().size();
      if(oocomm.communicator().rank()==0)
	std::cout<<"Average no neighbours was "<<noNeighbours<<std::endl;
    }
    bool ret = buildCommunication(graph, realpart, oocomm, outcomm, redistInf,
                                  verbose);
    if(verbose && oocomm.communicator().rank()==0)
      std::cout<<"Building index sets took "<<time.elapsed()<<std::endl;
    time.reset();
	  

    return ret;
    
  }
  
  /**
   * @brief Execute a graph repartition for a given graph and index set.
   *
   * This function provides repartition functionality using the
   * ParMETIS library. If ParMETIS is not available (HAVE_PARMETIS unset)
   * or only one domain is requested, every owner vertex is assigned to
   * domain 0 instead of calling ParMETIS.
   *
   * @param graph The given graph to repartition
   * @param oocomm The parallel information about the graph.
   * @param nparts The number of domains the repartitioning should achieve.
   * Asserted to be at most the number of processes of oocomm's communicator.
   * @param[out] outcomm Pointer to store the parallel information of the
   * redistributed domains in.
   * @param redistInf Redistribute interface
   * @param verbose Verbosity flag to give out additional information.
   * @return The result of buildCommunication() for the computed partition.
   */
  template<class G, class T1, class T2>
  bool graphRepartition(const G& graph, Dune::OwnerOverlapCopyCommunication<T1,T2>& oocomm, int nparts,
                        Dune::OwnerOverlapCopyCommunication<T1,T2>*& outcomm,
                        RedistributeInterface& redistInf,
                        bool verbose=false)
  {
    Timer time;

    MPI_Comm comm=oocomm.communicator();
    // The repartitioning needs an index for every vertex of the graph,
    // not only for those taking part in communication.
    oocomm.buildGlobalLookup(graph.noVertices());
    fillIndexSetHoles(graph, oocomm);

    if(verbose && oocomm.communicator().rank()==0)
      std::cout<<"Filling holes took "<<time.elapsed()<<std::endl;
    time.reset();

    // simple precondition checks

#ifdef PERF_REPART
    // Profiling variables
    double t1=0.0, t2=0.0, t3=0.0, t4=0.0, tSum=0.0;
#endif


    // MPI variables
    int mype = oocomm.communicator().rank();

    assert(nparts<=oocomm.communicator().size());

    // Initialised because getDomain() is skipped when only one domain is
    // built, while the DEBUG_REPART output below prints the value
    // unconditionally.
    int myDomain = -1;

    //
    // 1) Prepare the required parameters for using ParMETIS
    //    Especially the arrays that represent the graph must be
    //    generated by the DUNE Graph and IndexSet input variables.
    //    These are the arrays:
    //    - vtxdist
    //    - xadj
    //    - adjncy
    //
    //
#ifdef PERF_REPART
    // reset timer for step 1)
    t1=MPI_Wtime();
#endif


    typedef typename  Dune::OwnerOverlapCopyCommunication<T1,T2> OOComm;
    typedef typename  OOComm::OwnerSet OwnerSet;

    // Create the vtxdist array and parmetisVtxMapping.
    // Global communications are necessary
    // The parmetis global identifiers for the owner vertices.
    ParmetisDuneIndexMap indexMap(graph,oocomm);
#if HAVE_PARMETIS
    idxtype *part = new idxtype[indexMap.numOfOwnVtx()];
#else
    std::size_t *part = new std::size_t[indexMap.numOfOwnVtx()];
#endif
    // Start from the identity partition: every owner vertex stays here.
    for(std::size_t i=0; i < indexMap.numOfOwnVtx(); ++i)
      part[i]=mype;

#if !HAVE_PARMETIS
    if(oocomm.communicator().rank()==0 && nparts>1)
      std::cerr<<"ParMETIS not activated. Will repartition to 1 domain instead of requested "
               <<nparts<<" domains."<<std::endl;
    nparts=1; // No parmetis available, fallback to agglomerating to 1 process

#else

    if(nparts>1){
      // Create the xadj and adjncy arrays
      idxtype *xadj = new  idxtype[indexMap.numOfOwnVtx()+1];
      idxtype *adjncy = new idxtype[graph.noEdges()];
      EdgeFunctor<G> ef(adjncy, indexMap, graph.noEdges());
      getAdjArrays<OwnerSet>(graph, oocomm.globalLookup(), xadj, ef);

      //
      // 2) Call ParMETIS
      //
      //
      int numflag=0, wgtflag=0, options[3], edgecut=0, ncon=1;
      //float *tpwgts = NULL;
      // Request equally sized target domains.
      float *tpwgts = new float[nparts];
      for(int i=0; i<nparts; ++i)
        tpwgts[i]=1.0/nparts;
      float ubvec[1];
      options[0] = 0; // 0=default, 1=options are defined in [1]+[2]
#ifdef DEBUG_REPART
      options[1] = 3; // show info: 0=no message
#else
      options[1] = 0; // show info: 0=no message
#endif
      options[2] = 1; // random number seed, default is 15
      // Use edge weights only if the edge functor provides them.
      wgtflag = (ef.getWeights()!=NULL) ? 1 : 0;
      numflag = 0;
      edgecut = 0;
      ncon=1;
      ubvec[0]=1.05; // recommended by ParMETIS

#ifdef DEBUG_REPART
      if (mype == 0) {
        std::cout<<std::endl;
        std::cout<<"Testing ParMETIS_V3_PartKway with options[1-2] = {"
                 <<options[1]<<" "<<options[2]<<"}, Ncon: "
                 <<ncon<<", Nparts: "<<nparts<<std::endl;
      }
#endif
#ifdef PERF_REPART
      // stop the time for step 1)
      t1=MPI_Wtime()-t1;
      // reset timer for step 2)
      t2=MPI_Wtime();
#endif

      if(verbose){
        oocomm.communicator().barrier();
        if(oocomm.communicator().rank()==0)
          std::cout<<"Preparing for parmetis took "<<time.elapsed()<<std::endl;
      }
      time.reset();

      //=======================================================
      // ParMETIS_V3_PartKway
      //=======================================================
      ParMETIS_V3_PartKway(indexMap.vtxDist(), xadj, adjncy,
                           NULL, ef.getWeights(), &wgtflag,
                           &numflag, &ncon, &nparts, tpwgts, ubvec, options, &edgecut, part, &const_cast<MPI_Comm&>(comm));


      delete[] xadj;
      delete[] adjncy;
      delete[] tpwgts;

      ef.free();

#ifdef DEBUG_REPART
      if (mype == 0) {
        std::cout<<std::endl;
        std::cout<<"ParMETIS_V3_PartKway reported a cut of "<<edgecut<<std::endl;
        std::cout<<std::endl;
      }
      std::cout<<mype<<": PARMETIS-Result: ";
      for(int i=0; i < indexMap.vtxDist()[mype+1]-indexMap.vtxDist()[mype]; ++i) {
        std::cout<<part[i]<<" ";
      }
      std::cout<<std::endl;
      std::cout<<"Testing ParMETIS_V3_PartKway with options[1-2] = {"
               <<options[1]<<" "<<options[2]<<"}, Ncon: "
               <<ncon<<", Nparts: "<<nparts<<std::endl;
#endif
#ifdef PERF_REPART
      // stop the time for step 2)
      t2=MPI_Wtime()-t2;
      // reset timer for step 3)
      t3=MPI_Wtime();
#endif


      if(verbose){
        oocomm.communicator().barrier();
        if(oocomm.communicator().rank()==0)
          std::cout<<"Parmetis took "<<time.elapsed()<<std::endl;
      }
      time.reset();
    }else
#endif
      {
        // Everything goes to process 0!
        for(std::size_t i=0; i<indexMap.numOfOwnVtx(); ++i)
          part[i]=0;
      }


    //
    // 3) Find an optimal domain based on the ParMETIS repartitioning
    //    result
    //

    std::vector<int> domainMapping(nparts);
    if(nparts>1)
      getDomain(comm, part, indexMap.numOfOwnVtx(), nparts, &myDomain, domainMapping);
    else
      domainMapping[0]=0;

#ifdef DEBUG_REPART
    std::cout<<mype<<": myDomain: "<<myDomain<<std::endl;
    std::cout<<mype<<": DomainMapping: ";
    for(int j=0; j<nparts; j++) {
      std::cout<<" do: "<<j<<" pe: "<<domainMapping[j]<<" ";
    }
    std::cout<<std::endl;
#endif

    // Make a domain mapping for the indexset and translate
    // domain number to real process number
    // domainMapping is the one of parmetis, that is without
    // the overlap/copy vertices
    std::vector<int> setPartition(oocomm.indexSet().size(), -1);

    typedef typename  OOComm::ParallelIndexSet::const_iterator Iterator;
    std::size_t i=0; // parmetis index, advanced for owner vertices only
    for(Iterator index = oocomm.indexSet().begin(); index != oocomm.indexSet().end(); ++index)
      if(OwnerSet::contains(index->local().attribute())){
        setPartition[index->local()]=domainMapping[part[i++]];
      }

    delete[] part;
    // Propagate the owners' decision to the non-owner copies of each index.
    oocomm.copyOwnerToAll(setPartition, setPartition);
    // communication only needed for ALU
    // (ghosts with same global id as owners on the same process)
    if (oocomm.getSolverCategory() ==
        static_cast<int>(SolverCategory::nonoverlapping))
      oocomm.copyCopyToAll(setPartition, setPartition);
    bool ret = buildCommunication(graph, setPartition, oocomm, outcomm, redistInf,
                                  verbose);
    if(verbose){
      oocomm.communicator().barrier();
      if(oocomm.communicator().rank()==0)
        std::cout<<"Creating indexsets took "<<time.elapsed()<<std::endl;
    }
    return ret;
  }
  


  /**
   * @brief Build the communication object of the redistributed problem.
   *
   * The distinct entries of setPartition are used as the ranks this process
   * sends to. For each of them the owner/overlap indices and neighbour ranks
   * obtained from getOwnerOverlapVec() are packed and sent; the messages
   * addressed to this process are received and stored. Afterwards the
   * communicator is split, a new communication object is allocated with
   * `new` and its index set is filled with the received owner indices
   * (attribute owner) followed by the remaining overlap indices
   * (attribute copy).
   *
   * @param graph The graph that is redistributed.
   * @param setPartition Target process ranks; handed to
   * buildSendInterface() together with oocomm's index set.
   * @param oocomm The parallel information of the current distribution.
   * @param[out] outcomm Set to the newly allocated communication object.
   * It is allocated on every process, also on those that are not part of
   * the new communicator.
   * @param redistInf The redistribute interface; its send interface,
   * receive indices and communicator are set up here.
   * @param verbose Whether to print timing information.
   * @return true if at least one process sends data to this process, i.e.
   * this process is part of the new communicator.
   */
  template<class G, class T1, class T2>
  bool buildCommunication(const G& graph,
                          std::vector<int>& setPartition, Dune::OwnerOverlapCopyCommunication<T1,T2>& oocomm,
                          Dune::OwnerOverlapCopyCommunication<T1,T2>*& outcomm,
                          RedistributeInterface& redistInf,
                          bool verbose)
  {
    typedef typename  Dune::OwnerOverlapCopyCommunication<T1,T2> OOComm;
    typedef typename  OOComm::OwnerSet OwnerSet;

    Timer time;

    // Build the send interface
    redistInf.buildSendInterface<OwnerSet>(setPartition, oocomm.indexSet());

    // NOTE(review): t1..t4 and tSum used in the PERF_REPART sections of this
    // function are not declared here (they are locals of graphRepartition),
    // so building with PERF_REPART presumably fails -- confirm and rework.
#ifdef PERF_REPART
    // stop the time for step 3)
    t3=MPI_Wtime()-t3;
    // reset timer for step 4)
    t4=MPI_Wtime();
#endif


    //
    // 4) Create the output IndexSet and RemoteIndices
    //    4.1) Determine the "send to" and "receive from" relation
    //         according to the new partition by gathering the
    //         send targets of all processes.
    //
    //    4.2) Depending on the "send to" and "receive from" vectors,
    //         the processes exchange the vertices with each other
    //
    //    4.3) Create the IndexSet, RemoteIndices and the new MPI
    //         communicator
    //

    //
    // 4.1) Let's start...
    //
    int npes = oocomm.communicator().size();
    int *sendTo = 0;
    int noSendTo = 0;
    std::set<int> recvFrom;

    typedef typename std::vector<int>::const_iterator VIter;
    int mype = oocomm.communicator().rank();

    {
      // Collect the distinct target ranks appearing in the partition.
      std::set<int> tsendTo;
      for(VIter i=setPartition.begin(), iend = setPartition.end(); i!=iend; ++i)
        tsendTo.insert(*i);

      noSendTo = tsendTo.size();
      sendTo = new int[noSendTo];
      typedef std::set<int>::const_iterator iterator;
      int idx=0;
      for(iterator i=tsendTo.begin(); i != tsendTo.end(); ++i, ++idx)
        sendTo[idx]=*i;
    }

    // Number of send targets of every process and the matching displacements.
    int* gnoSend= new int[oocomm.communicator().size()];
    int* gsendToDispl =  new int[oocomm.communicator().size()+1];

    MPI_Allgather(&noSendTo, 1, MPI_INT, gnoSend, 1,
                  MPI_INT, oocomm.communicator());

    // calculate total receive message size
    int totalNoRecv = 0;
    for(int i=0; i<npes; ++i)
      totalNoRecv += gnoSend[i];

    int *gsendTo = new int[totalNoRecv];

    // calculate displacement for allgatherv
    gsendToDispl[0]=0;
    for(int i=0; i<npes; ++i)
      gsendToDispl[i+1]=gsendToDispl[i]+gnoSend[i];

    // gather the data
    MPI_Allgatherv(sendTo, noSendTo, MPI_INT, gsendTo, gnoSend, gsendToDispl,
                   MPI_INT, oocomm.communicator());

    // Extract from which processes we will receive data
    for(int proc=0; proc < npes; ++proc)
      for(int i=gsendToDispl[proc]; i < gsendToDispl[proc+1]; ++i)
        if(gsendTo[i]==mype)
          recvFrom.insert(proc);

    // A process that receives nothing holds no data after redistribution.
    bool existentOnNextLevel = recvFrom.size()>0;

    // Delete memory
    delete[] gnoSend;
    delete[] gsendToDispl;
    delete[] gsendTo;


#ifdef DEBUG_REPART
    if(recvFrom.size()){
      std::cout<<mype<<": recvFrom: ";
      typedef typename std::set<int>::const_iterator siter;
      for(siter i=recvFrom.begin(); i!= recvFrom.end(); ++i) {
        std::cout<<*i<<" ";
      }
    }

    std::cout<<std::endl<<std::endl;
    std::cout<<mype<<": sendTo: ";
    for(int i=0; i<noSendTo; i++) {
      std::cout<<sendTo[i]<<" ";
    }
    std::cout<<std::endl<<std::endl;
#endif

    if(verbose)
      if(oocomm.communicator().rank()==0)
        std::cout<<" Communicating the receive information took "<<
          time.elapsed()<<std::endl;
    time.reset();

    //
    // 4.2) Start the communication
    //

    // The received owner and overlap vertices are collected in
    // myOwnerVec and myOverlapSet, the received neighbour ranks
    // in myNeighbors.
    //


    typedef typename OOComm::ParallelIndexSet::GlobalIndex GI;
    typedef std::vector<GI> GlobalVector;
    std::vector<std::pair<GI,int> > myOwnerVec;
    std::set<GI> myOverlapSet;
    GlobalVector sendOwnerVec;
    std::set<GI> sendOverlapSet;
    std::set<int> myNeighbors;

    //    getOwnerOverlapVec<OwnerSet>(graph, setPartition, oocomm.globalLookup(),
    //				 mype, mype, myOwnerVec, myOverlapSet, redistInf, myNeighbors);

    char **sendBuffers=new char*[noSendTo];
    MPI_Request *requests = new MPI_Request[noSendTo];

    // Create all messages to be sent
    for(int i=0; i < noSendTo; ++i){
      // clear the vector for sending
      sendOwnerVec.clear();
      sendOverlapSet.clear();
      // get all owner and overlap vertices for process sendTo[i] and save
      // these in sendOwnerVec and sendOverlapSet
      std::set<int> neighbors;
      getOwnerOverlapVec<OwnerSet>(graph, setPartition, oocomm.globalLookup(),
                                   mype, sendTo[i], sendOwnerVec, sendOverlapSet, redistInf,
                                   neighbors);
      // The buffer holds three parts (owner, overlap, neighbours), each
      // preceded by one std::size_t for its length.
      int buffersize=0;
      int tsize;
      MPI_Pack_size(1, MPITraits<std::size_t>::getType(), oocomm.communicator(), &buffersize);
      MPI_Pack_size(sendOwnerVec.size(), MPITraits<GI>::getType(), oocomm.communicator(), &tsize);
      buffersize +=tsize;
      MPI_Pack_size(1, MPITraits<std::size_t>::getType(), oocomm.communicator(), &tsize);
      buffersize +=tsize;
      MPI_Pack_size(sendOverlapSet.size(), MPITraits<GI>::getType(), oocomm.communicator(), &tsize);
      buffersize += tsize;
      MPI_Pack_size(1, MPITraits<std::size_t>::getType(), oocomm.communicator(), &tsize);
      buffersize += tsize;
      MPI_Pack_size(neighbors.size(), MPI_INT, oocomm.communicator(), &tsize);
      buffersize += tsize;

      sendBuffers[i] = new char[buffersize];

#ifdef DEBUG_REPART
      std::cout<<mype<<" sending "<<sendOwnerVec.size()<<" owner and "<<
        sendOverlapSet.size()<<" overlap to "<<sendTo[i]<<" buffersize="<<buffersize<<std::endl;
#endif
      createSendBuf(sendOwnerVec, sendOverlapSet, neighbors, sendBuffers[i], buffersize, oocomm.communicator());
      MPI_Issend(sendBuffers[i], buffersize, MPI_PACKED, sendTo[i], 99, oocomm.communicator(), requests+i);
    }

    if(verbose){
      oocomm.communicator().barrier();
      if(oocomm.communicator().rank()==0)
        std::cout<<" Creating sends took "<<
          time.elapsed()<<std::endl;
    }
    time.reset();

    // Receive Messages
    int noRecv = recvFrom.size();
    int oldbuffersize=0;
    char* recvBuf = 0;
    while(noRecv>0){
      // probe for an incoming message to learn its size
      MPI_Status stat;
      MPI_Probe(MPI_ANY_SOURCE, 99,  oocomm.communicator(), &stat);
      int buffersize;
      MPI_Get_count(&stat, MPI_PACKED, &buffersize);

      if(oldbuffersize<buffersize){
        // buffer too small, reallocate
        delete[] recvBuf;
        recvBuf = new char[buffersize];
        oldbuffersize = buffersize;
      }
      MPI_Recv(recvBuf, buffersize, MPI_PACKED, stat.MPI_SOURCE, 99, oocomm.communicator(), &stat);
      saveRecvBuf(recvBuf, buffersize, myOwnerVec, myOverlapSet, myNeighbors, redistInf,
                  stat.MPI_SOURCE, oocomm.communicator());
      --noRecv;
    }

    // delete[] on a null pointer is a no-op, no check needed
    delete[] recvBuf;

    // NOTE(review): this reset discards the time spent in the receive loop,
    // so the "Receiving and saving" figure below only covers MPI_Waitall.
    time.reset();
    // Wait for sending messages to complete
    MPI_Status *statuses = new MPI_Status[noSendTo];
    int send = MPI_Waitall(noSendTo, requests, statuses);

    // check for errors
    if(send==MPI_ERR_IN_STATUS){
      std::cerr<<mype<<": Error in sending :"<<std::endl;
      // Search for the error
      for(int i=0; i< noSendTo; i++)
        if(statuses[i].MPI_ERROR!=MPI_SUCCESS){
          // NOTE(review): the MPI standard asks for a buffer of at least
          // MPI_MAX_ERROR_STRING characters; 300 may be too small for some
          // implementations -- confirm.
          char message[300];
          int messageLength;
          MPI_Error_string(statuses[i].MPI_ERROR, message, &messageLength);
          std::cerr<<" source="<<statuses[i].MPI_SOURCE<<" message: ";
          // Write the text to the same stream as the rest of the diagnostic.
          for(int c=0; c< messageLength; c++)
            std::cerr<<message[c];
        }
      std::cerr<<std::endl;
    }

    if(verbose){
      oocomm.communicator().barrier();
      if(oocomm.communicator().rank()==0)
        std::cout<<" Receiving and saving took "<<
          time.elapsed()<<std::endl;
    }
    time.reset();

    for(int i=0; i < noSendTo; ++i)
      delete[] sendBuffers[i];

    delete[] sendBuffers;
    delete[] statuses;
    delete[] requests;

    redistInf.setCommunicator(oocomm.communicator());

    //
    // 4.3) Create the IndexSet etc.
    //

    // build the new outputIndexSet


    int color=0;

    if (!existentOnNextLevel) {
      // this process is not used anymore
      color= MPI_UNDEFINED;
    }
    MPI_Comm outputComm;

    MPI_Comm_split(oocomm.communicator(), color, oocomm.communicator().rank(), &outputComm);
    outcomm = new OOComm(outputComm,oocomm.getSolverCategory(),true);

    // translate neighbor ranks from the old to the new communicator.
    int newrank=outcomm->communicator().rank();
    int *newranks=new int[oocomm.communicator().size()];
    std::vector<int> tneighbors;
    tneighbors.reserve(myNeighbors.size());

    typename OOComm::ParallelIndexSet& outputIndexSet = outcomm->indexSet();

    MPI_Allgather(&newrank, 1, MPI_INT, newranks, 1,
                  MPI_INT, oocomm.communicator());
    typedef typename std::set<int>::const_iterator IIter;

#ifdef DEBUG_REPART
    std::cout<<oocomm.communicator().rank()<<" ";
    for(IIter i=myNeighbors.begin(), end=myNeighbors.end();
        i!=end; ++i){
      assert(newranks[*i]>=0);
      std::cout<<*i<<"->"<<newranks[*i]<<" ";
      tneighbors.push_back(newranks[*i]);
    }
    std::cout<<std::endl;
#else
    for(IIter i=myNeighbors.begin(), end=myNeighbors.end();
        i!=end; ++i){
      tneighbors.push_back(newranks[*i]);
    }
#endif
    delete[] newranks;
    myNeighbors.clear();

    if(verbose){
      oocomm.communicator().barrier();
      if(oocomm.communicator().rank()==0)
        std::cout<<" Calculating new neighbours ("<<tneighbors.size()<<") took "<<
          time.elapsed()<<std::endl;
    }
    time.reset();


    outputIndexSet.beginResize();
    // 1) add the owner vertices
    // Sort the owners
    std::sort(myOwnerVec.begin(), myOwnerVec.end(), SortFirst());
    // The owners are sorted according to their global index.
    // Therefore the entries of ownerVec are the same as the
    // ones in the resulting index set.
    typedef typename OOComm::ParallelIndexSet::LocalIndex LocalIndex;
    typedef typename std::vector<std::pair<GI,int> >::const_iterator VPIter;
    // Running local index; continues to count for the overlap vertices below.
    int i=0;
    for(VPIter g=myOwnerVec.begin(), end =myOwnerVec.end(); g!=end; ++g, ++i ) {
      outputIndexSet.add(g->first,LocalIndex(i, OwnerOverlapCopyAttributeSet::owner, true));
      redistInf.addReceiveIndex(g->second, i);
    }

    if(verbose){
      oocomm.communicator().barrier();
      if(oocomm.communicator().rank()==0)
        std::cout<<" Adding owner indices took "<<
          time.elapsed()<<std::endl;
    }
    time.reset();


    // After all the vertices are received, the vectors must
    // be "merged" together to create the final vectors.
    // Because some vertices that are sent as overlap could now
    // already be included as owner vertices in the new partition
    mergeVec(myOwnerVec, myOverlapSet);

    // Free the memory: swapping with an empty temporary hands the storage
    // to the temporary, which releases it (a self-swap would keep it).
    std::vector<std::pair<GI,int> >().swap(myOwnerVec);

    if(verbose){
      oocomm.communicator().barrier();
      if(oocomm.communicator().rank()==0)
        std::cout<<" Merging indices took "<<
          time.elapsed()<<std::endl;
    }
    time.reset();


    // 2) add the overlap vertices
    typedef typename std::set<GI>::const_iterator SIter;
    for(SIter g=myOverlapSet.begin(), end=myOverlapSet.end(); g!=end; ++g, i++) {
      outputIndexSet.add(*g,LocalIndex(i, OwnerOverlapCopyAttributeSet::copy, true));
    }
    myOverlapSet.clear();
    outputIndexSet.endResize();

#ifdef DUNE_ISTL_WITH_CHECKING
    int numOfOwnVtx =0;
    typedef typename OOComm::ParallelIndexSet::const_iterator Iterator;
    Iterator end = outputIndexSet.end();
    for(Iterator index = outputIndexSet.begin(); index != end; ++index) {
      if (OwnerSet::contains(index->local().attribute())) {
        numOfOwnVtx++;
      }
    }
    numOfOwnVtx = oocomm.communicator().sum(numOfOwnVtx);
    // if(numOfOwnVtx!=indexMap.globalOwnerVertices)
    //   {
    //     std::cerr<<numOfOwnVtx<<"!="<<indexMap.globalOwnerVertices<<" owners missing or additional ones!"<<std::endl;
    //     DUNE_THROW(ISTLError, numOfOwnVtx<<"!="<<indexMap.globalOwnerVertices<<" owners missing or additional ones"
    //     	   <<" during repartitioning.");
    //   }
    // Check that the global indices of the new index set are sorted.
    Iterator index=outputIndexSet.begin();
    if(index!=end){
      ++index;
      for(Iterator old = outputIndexSet.begin(); index != end; old=index++) {
        if(old->global()>index->global())
          DUNE_THROW(ISTLError, "Index set's globalindex not sorted correctly");
      }
    }
#endif
    if(verbose){
      oocomm.communicator().barrier();
      if(oocomm.communicator().rank()==0)
        std::cout<<" Adding overlap indices took "<<
          time.elapsed()<<std::endl;
    }
    time.reset();


    // Only processes that are part of the new communicator build remote indices.
    if(color != MPI_UNDEFINED){
      outcomm->remoteIndices().setNeighbours(tneighbors);
      outcomm->remoteIndices().template rebuild<true>();

    }

    // release the memory
    delete[] sendTo;

    if(verbose){
      oocomm.communicator().barrier();
      if(oocomm.communicator().rank()==0)
        std::cout<<" Storing indexsets took "<<
          time.elapsed()<<std::endl;
    }

#ifdef PERF_REPART
    // stop the time for step 4) and print the results
    t4=MPI_Wtime()-t4;
    tSum = t1 + t2 + t3 + t4;
    std::cout<<std::endl
             <<mype<<": WTime for step 1): "<<t1
             <<" 2): "<<t2
             <<" 3): "<<t3
             <<" 4): "<<t4
             <<" total: "<<tSum
             <<std::endl;
#endif

    return color!=MPI_UNDEFINED;

  }
#else
  /**
   * @brief Placeholder for graphRepartition() in builds without MPI.
   *
   * No repartitioning is performed. The arguments graph, outcomm, redistInf
   * and v are not used.
   *
   * NOTE(review): T1 and T2 do not appear in the parameter list, so they
   * cannot be deduced and have to be given explicitly by the caller.
   *
   * @param oocomm Object providing size().
   * @param nparts The requested number of domains.
   * @return false, as no new communication object is created.
   * @throws NotImplemented if nparts differs from oocomm.size().
   */
  template<class G, class P,class T1, class T2, class R>
  bool graphRepartition(const G& graph, P& oocomm, int nparts,
                        P*& outcomm,
                        R& redistInf,
                        bool v=false)
  {
    if(nparts!=oocomm.size())
      DUNE_THROW(NotImplemented, "only available for MPI programs");
    // Nothing was redistributed and outcomm is untouched. Return a defined
    // value instead of running off the end of a non-void function.
    return false;
  }


  /**
   * @brief Placeholder for commGraphRepartition() in builds without MPI.
   *
   * No repartitioning is performed. The arguments graph, outcomm, redistInf
   * and v are not used.
   *
   * NOTE(review): T1 and T2 do not appear in the parameter list, so they
   * cannot be deduced and have to be given explicitly by the caller.
   *
   * @param oocomm Object providing size().
   * @param nparts The requested number of domains.
   * @return false, as no new communication object is created.
   * @throws NotImplemented if nparts differs from oocomm.size().
   */
  template<class G, class P,class T1, class T2, class R>
  bool commGraphRepartition(const G& graph, P& oocomm, int nparts,
                            P*& outcomm,
                            R& redistInf,
                            bool v=false)
  {
    if(nparts!=oocomm.size())
      DUNE_THROW(NotImplemented, "only available for MPI programs");
    // Nothing was redistributed and outcomm is untouched. Return a defined
    // value instead of running off the end of a non-void function.
    return false;
  }
#endif // HAVE_MPI
} // end of namespace Dune
#endif
// libdune-istl-dev 2.2.1-2 / usr / include / dune / istl / repartition.hh