This file is indexed.

/usr/include/trilinos/Ifpack2_Container.hpp is in libtrilinos-ifpack2-dev 12.10.1-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
/*@HEADER
// ***********************************************************************
//
//       Ifpack2: Tempated Object-Oriented Algebraic Preconditioner Package
//                 Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ***********************************************************************
//@HEADER
*/

#ifndef IFPACK2_CONTAINER_HPP
#define IFPACK2_CONTAINER_HPP

/// \file Ifpack2_Container.hpp
/// \brief Ifpack2::Container class declaration

#include "Ifpack2_ConfigDefs.hpp"
#include "Tpetra_RowMatrix.hpp"
#include "Teuchos_Describable.hpp"
#include <Ifpack2_Partitioner.hpp>
#include <Tpetra_Map.hpp>
#include <Tpetra_Experimental_BlockCrsMatrix_decl.hpp>
#include <Teuchos_ParameterList.hpp>
#include <Teuchos_Time.hpp>
#include <iostream>
#include <type_traits>

#ifndef DOXYGEN_SHOULD_SKIP_THIS
namespace Teuchos {
  // Forward declaration to avoid include.
  class ParameterList;
} // namespace Teuchos
#endif // DOXYGEN_SHOULD_SKIP_THIS

namespace Ifpack2 {

/// \class Container
/// \brief Interface for creating and solving a local linear problem.
/// \tparam MatrixType A specialization of Tpetra::RowMatrix.
///
/// This class is mainly useful for the implementation of
/// BlockRelaxation, and other preconditioners that need to solve
/// linear systems with diagonal blocks of a sparse matrix.
///
/// Users of BlockRelaxation (and any analogous preconditioners) do
/// not normally need to interact with the Container interface.
/// However, they do need to specify a specific Container subclass to
/// use, for example as the second template parameter
/// (<tt>ContainerType</tt>) of BlockRelaxation.  Implementations of
/// Container specify
/// - the kind of data structure used to store the local matrix, and
/// - how to solve a linear system with the local matrix.
///
/// For example, the SparseContainer subclass uses a sparse matrix (in
/// particular, Tpetra::CrsMatrix) to store each diagonal block, and
/// can use any given Ifpack2 Preconditioner subclass to solve linear
/// systems.
///
/// A Container can create, populate, and solve a local linear
/// system. The local linear system matrix, B, is a submatrix of the
/// local components of a distributed matrix, A. The idea of Container
/// is to specify the rows of A that are contained in B, then extract
/// B from A, and compute all it is necessary to solve a linear system
/// in B.  Then, set the initial guess (if necessary) and right-hand
/// side for B, and solve the linear system in B.
///
/// If you are writing a class (comparable to BlockRelaxation) that
/// uses Container, you should use it in the following way:
/// <ol>
/// <li> Create a Container object, specifying the global matrix A and
///      the indices of the local rows of A that are contained in B.
///      The latter indices come from a Partitioner object.</li>
/// <li> Optionally, set linear solve parameters using setParameters().</li>
/// <li> Initialize the container by calling initialize().</li>
/// <li> Prepare the linear system solver using compute().</li>
/// <li> Solve the linear system using apply().</li>
/// </ol>
/// For an example of Steps 1-5 above, see the implementation of
/// BlockRelaxation::ExtractSubmatrices() in
/// Ifpack2_BlockRelaxation_def.hpp.
template<class MatrixType>
class Container : public Teuchos::Describable {
  //! @name Internal typedefs (protected)
  //@{
protected:
  typedef typename MatrixType::scalar_type scalar_type;
  typedef typename MatrixType::local_ordinal_type local_ordinal_type;
  typedef typename MatrixType::global_ordinal_type global_ordinal_type;
  typedef typename MatrixType::node_type node_type;
  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type> mv_type;
  typedef Tpetra::Vector<scalar_type, local_ordinal_type, global_ordinal_type, node_type> vector_type;
  typedef Tpetra::Map<local_ordinal_type, global_ordinal_type, node_type> map_type;
  typedef Teuchos::ScalarTraits<scalar_type> STS;
  typedef Tpetra::Import<local_ordinal_type, global_ordinal_type, node_type> import_type;
  typedef Partitioner<Tpetra::RowGraph<local_ordinal_type, global_ordinal_type, node_type> > partitioner_type;
  typedef Tpetra::Experimental::BlockCrsMatrix<scalar_type, local_ordinal_type, global_ordinal_type, node_type> block_crs_matrix_type;
  typedef Tpetra::RowMatrix<scalar_type, local_ordinal_type, global_ordinal_type, node_type> row_matrix_type;

  static_assert(std::is_same<MatrixType, row_matrix_type>::value,
                "Ifpack2::Container: Please use MatrixType = Tpetra::RowMatrix.");

  //! Internal representation of Scalar in Kokkos::View
  typedef typename Kokkos::Details::ArithTraits<scalar_type>::val_type impl_scalar_type;
  //@}

public:
  typedef typename mv_type::dual_view_type::t_host HostView;

  /// \brief Constructor.
  ///
  /// \brief matrix [in] The original input matrix.  This Container
  ///   will construct local diagonal blocks from the rows given by
  ///   <tt>partitioner</tt>.
  ///
  /// \param partitioner [in] The Partitioner object that assigns
  ///   local rows of the input matrix to blocks.
  Container (const Teuchos::RCP<const row_matrix_type>& matrix,
             const Teuchos::Array<Teuchos::Array<local_ordinal_type> >& partitions,
             const Teuchos::RCP<const import_type>& importer,
             int OverlapLevel,
             scalar_type DampingFactor) :
    inputMatrix_ (matrix),
    OverlapLevel_ (OverlapLevel),
    DampingFactor_ (DampingFactor),
    Importer_ (importer)
  {
    using Teuchos::Ptr;
    using Teuchos::RCP;
    using Teuchos::Array;
    using Teuchos::ArrayView;
    using Teuchos::Comm;
    // typedef Tpetra::Import<local_ordinal_type, global_ordinal_type, node_type> import_type; // unused
    NumLocalRows_ = inputMatrix_->getNodeNumRows();
    NumGlobalRows_ = inputMatrix_->getGlobalNumRows();
    NumGlobalNonzeros_ = inputMatrix_->getGlobalNumEntries();
    IsParallel_ = inputMatrix_->getRowMap()->getComm()->getSize() != 1;
    auto global_bcrs =
      Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(inputMatrix_);
    hasBlockCrs_ = !global_bcrs.is_null();
    if(hasBlockCrs_)
      bcrsBlockSize_ = global_bcrs->getBlockSize();
    else
      bcrsBlockSize_ = 1;
    setBlockSizes(partitions);
  }

  /// \brief Constructor for single block.
  ///
  /// \brief matrix [in] The original input matrix.  This Container
  ///   will construct a local diagonal block from the rows given by
  ///   <tt>localRows</tt>.
  ///
  /// \param localRows [in] The set of (local) rows assigned to this
  ///   container.  <tt>localRows[i] == j</tt>, where i (from 0 to
  ///   <tt>getNumRows() - 1</tt>) indicates the Container's row, and
  ///   j indicates the local row in the calling process.  Subclasses
  ///   must always pass along these indices to the base class.
  Container (const Teuchos::RCP<const row_matrix_type>& matrix,
             const Teuchos::Array<local_ordinal_type>& localRows) :
    inputMatrix_ (matrix),
    numBlocks_ (1),
    partitions_ (localRows.size()),
    blockRows_ (1),
    partitionIndices_ (1),
    OverlapLevel_ (0),
    DampingFactor_ (STS::one()),
    Importer_ (Teuchos::null)
  {
    NumLocalRows_ = inputMatrix_->getNodeNumRows();
    NumGlobalRows_ = inputMatrix_->getGlobalNumRows();
    NumGlobalNonzeros_ = inputMatrix_->getGlobalNumEntries();
    IsParallel_ = inputMatrix_->getRowMap()->getComm()->getSize() > 1;
    blockRows_[0] = localRows.size();
    partitionIndices_[0] = 0;
    const block_crs_matrix_type* global_bcrs =
      Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(inputMatrix_).get();
    hasBlockCrs_ = global_bcrs;
    if(hasBlockCrs_)
      bcrsBlockSize_ = global_bcrs->getBlockSize();
    else
      bcrsBlockSize_ = 1;
    for(local_ordinal_type i = 0; i < localRows.size(); i++)
      partitions_[i] = localRows[i];
  }

  //! Destructor.
  virtual ~Container() {};

  /// \brief Local indices of the rows of the input matrix that belong to this block.
  ///
  /// The set of (local) rows assigned to this Container is defined by
  /// passing in a set of indices <tt>localRows[i] = j</tt> to the
  /// constructor, where i (from 0 to <tt>getNumRows() - 1</tt>)
  /// indicates the Container's row, and j indicates the local row in
  /// the calling process.  Subclasses must always pass along these
  /// indices to the base class.
  ///
  /// The indices are usually used to reorder the local row index (on
  /// the calling process) of the i-th row in the Container.
  ///
  /// For an example of how to use these indices, see the
  /// implementation of BlockRelaxation::ExtractSubmatrices() in
  /// Ifpack2_BlockRelaxation_def.hpp.
  Teuchos::ArrayView<const local_ordinal_type> getLocalRows(int blockIndex) const
  {
    return Teuchos::ArrayView<const local_ordinal_type>
      (&partitions_[partitionIndices_[blockIndex]], blockRows_[blockIndex]);
  }

  /// \brief Do all set-up operations that only require matrix structure.
  ///
  /// If the input matrix's structure changes, you must call this
  /// method before you may call compute().  You must then call
  /// compute() before you may call apply() or weightedApply().
  ///
  /// "Structure" refers to the graph of the matrix: the local and
  /// global dimensions, and the populated entries in each row.
  virtual void initialize () = 0;

  //! Initialize arrays with information about block sizes.
  void setBlockSizes(const Teuchos::Array<Teuchos::Array<local_ordinal_type> >& partitions)
  {
    //First, create a grand total of all the rows in all the blocks
    //Note: If partitioner allowed overlap, this could be greater than the # of local rows
    local_ordinal_type totalBlockRows = 0;
    numBlocks_ = partitions.size();
    blockRows_.resize(numBlocks_);
    partitionIndices_.resize(numBlocks_);
    for(int i = 0; i < numBlocks_; i++)
    {
      local_ordinal_type rowsInBlock = partitions[i].size();
      blockRows_[i] = rowsInBlock;
      partitionIndices_[i] = totalBlockRows;
      totalBlockRows += rowsInBlock;
    }
    partitions_.resize(totalBlockRows);
    //set partitions_: each entry is the partition/block of the row
    local_ordinal_type iter = 0;
    for(int i = 0; i < numBlocks_; i++)
    {
      for(int j = 0; j < blockRows_[i]; j++)
      {
        partitions_[iter++] = partitions[i][j];
      }
    }
  }

  void getMatDiag() const
  {
    if(Diag_.is_null())
    {
      Diag_ = rcp(new vector_type(inputMatrix_->getDomainMap()));
      inputMatrix_->getLocalDiagCopy(*Diag_);
    }
  }

  /// \brief Extract the local diagonal block and prepare the solver.
  ///
  /// If any entries' values in the input matrix have changed, you
  /// must call this method before you may call apply() or
  /// weightedApply().
  virtual void compute () = 0;

  void DoJacobi(HostView& X, HostView& Y, int stride) const;
  void DoOverlappingJacobi(HostView& X, HostView& Y, HostView& W, int stride) const;
  void DoGaussSeidel(HostView& X, HostView& Y, HostView& Y2, int stride) const;
  void DoSGS(HostView& X, HostView& Y, HostView& Y2, int stride) const;

  //! Set parameters.
  virtual void setParameters (const Teuchos::ParameterList& List) = 0;

  //! Return \c true if the container has been successfully initialized.
  virtual bool isInitialized () const = 0;

  //! Return \c true if the container has been successfully computed.
  virtual bool isComputed () const = 0;

  /// \brief Compute <tt>Y := alpha * M^{-1} X + beta*Y</tt>.
  ///
  /// X is in the domain Map of the original matrix (the argument to
  /// compute()), and Y is in the range Map of the original matrix.
  /// This method only reads resp. modifies the permuted subset of
  /// entries of X resp. Y related to the diagonal block M.  That
  /// permuted subset is defined by the indices passed into the
  /// constructor.
  ///
  /// This method is marked \c const for compatibility with
  /// Tpetra::Operator's method of the same name.  This might require
  /// subclasses to mark some of their instance data as \c mutable.
  virtual void
  apply (HostView& X,
         HostView& Y,
         int blockIndex,
         int stride,
         Teuchos::ETransp mode = Teuchos::NO_TRANS,
         scalar_type alpha = Teuchos::ScalarTraits<scalar_type>::one(),
         scalar_type beta = Teuchos::ScalarTraits<scalar_type>::zero()) const = 0;

  //! Wrapper for apply with MVs, used in unit tests (never called by BlockRelaxation)
  void applyMV (mv_type& X, mv_type& Y) const
  {
    HostView XView = X.template getLocalView<Kokkos::HostSpace>();
    HostView YView = Y.template getLocalView<Kokkos::HostSpace>();
    this->apply (XView, YView, 0, X.getStride());
  }

  /// \brief Compute <tt>Y := alpha * diag(D) * M^{-1} (diag(D) * X) + beta*Y</tt>.
  ///
  /// X is in the domain Map of the original matrix (the argument to
  /// compute()), and Y is in the range Map of the original matrix.
  /// This method only reads resp. modifies the permuted subset of
  /// entries of X resp. Y related to the diagonal block M.  That
  /// permuted subset is defined by the indices passed into the
  /// constructor.  The D scaling vector must have the same number of
  /// entries on each process as X and Y, but otherwise need not have
  /// the same Map.  (For example, D could be locally replicated, or
  /// could be a different object on each process with a local (\c
  /// MPI_COMM_SELF) communicator.)
  ///
  /// This method supports overlap techniques, such as those used in
  /// Schwarz methods.
  ///
  /// This method is marked \c const by analogy with apply(), which
  /// itself is marked \c const for compatibility with
  /// Tpetra::Operator's method of the same name.  This might require
  /// subclasses to mark some of their instance data as \c mutable.
  virtual void
  weightedApply (HostView& X,
                 HostView& Y,
                 HostView& W,
                 int blockIndex,
                 int stride,
                 Teuchos::ETransp mode = Teuchos::NO_TRANS,
                 scalar_type alpha = Teuchos::ScalarTraits<scalar_type>::one(),
                 scalar_type beta = Teuchos::ScalarTraits<scalar_type>::zero()) const = 0;

  //! Wrapper for weightedApply with MVs, used in unit tests (never called by BlockRelaxation)
  void weightedApplyMV (mv_type& X,
                        mv_type& Y,
                        vector_type& W)
  {
    HostView XView = X.template getLocalView<Kokkos::HostSpace>();
    HostView YView = Y.template getLocalView<Kokkos::HostSpace>();
    HostView WView = W.template getLocalView<Kokkos::HostSpace>();
    weightedApply (XView, YView, WView, 0, X.getStride());
  }

  virtual void clearBlocks();

  //! Print basic information about the container to \c os.
  virtual std::ostream& print (std::ostream& os) const = 0;

  //! Returns string describing the container.
  //! See <tt>Details::ContainerFactory</tt>.
  static std::string getName()
  {
    return "Generic";
  }

protected:
  //! The input matrix to the constructor.
  Teuchos::RCP<const row_matrix_type> inputMatrix_;

  //! The number of blocks (partitions) in the container.
  int numBlocks_;
  //! Local indices of the rows of the input matrix that belong to this block.
  Teuchos::Array<local_ordinal_type> partitions_;      //size: total # of local rows (in all local blocks)
  //! Number of rows in each block.
  Teuchos::Array<local_ordinal_type> blockRows_;     //size: # of blocks
  //! Starting index in partitions_ of local row indices for each block.
  Teuchos::Array<local_ordinal_type> partitionIndices_;
  //! Diagonal elements.
  mutable Teuchos::RCP<vector_type> Diag_;
  //! Whether the problem is distributed across multiple MPI processes.
  bool IsParallel_;
  //! Number of rows of overlap for adjacent blocks.
  int OverlapLevel_;
  //! Damping factor, passed to apply() as alpha.
  scalar_type DampingFactor_;
  //! Importer for importing off-process elements of MultiVectors.
  Teuchos::RCP<const Tpetra::Import<local_ordinal_type, global_ordinal_type, node_type>> Importer_;
  //! Number of local rows in input matrix.
  local_ordinal_type NumLocalRows_;
  //! Number of global rows in input matrix.
  global_ordinal_type NumGlobalRows_;
  //! Number of nonzeros in input matrix.
  global_ordinal_type NumGlobalNonzeros_;
  //! Whether the input matrix is a BlockCRS matrix.
  bool hasBlockCrs_;
  //! If hasBlockCrs_, the number of DOFs per vertex. Otherwise 1.
  int bcrsBlockSize_;
};

//! Print information about the given Container to the output stream \c os.
template <class MatrixType>
inline std::ostream&
operator<< (std::ostream& os, const Ifpack2::Container<MatrixType>& obj)
{
  return obj.print (os);
}

template <class MatrixType>
void Container<MatrixType>::DoJacobi(HostView& X, HostView& Y, int stride) const
{
  const scalar_type one = STS::one();
  // Note: Flop counts copied naively from Ifpack.
  // use partitions_ and blockRows_
  size_t numVecs = X.dimension_1();
  // Non-overlapping Jacobi
  for (local_ordinal_type i = 0; i < numBlocks_; i++)
  {
    // may happen that a partition is empty
    if(blockRows_[i] != 1 || hasBlockCrs_)
    {
      if(blockRows_[i] == 0 )
        continue;
      apply(X, Y, i, stride, Teuchos::NO_TRANS, DampingFactor_, one);
    }
    else    // singleton, can't access Containers_[i] as it was never filled and may be null.
    {
      local_ordinal_type LRID = partitions_[partitionIndices_[i]];
      getMatDiag();
      HostView diagView = Diag_->template getLocalView<Kokkos::HostSpace>();
      impl_scalar_type d = (impl_scalar_type) one / diagView(LRID, 0);
      for(size_t nv = 0; nv < numVecs; nv++)
      {
        impl_scalar_type x = X(LRID, nv);
        Y(LRID, nv) = x * d;
      }
    }
  }
}

template <class MatrixType>
void Container<MatrixType>::DoOverlappingJacobi(HostView& X, HostView& Y, HostView& W, int stride) const
{
  // Overlapping Jacobi
  for(local_ordinal_type i = 0; i < numBlocks_; i++)
  {
    // may happen that a partition is empty
    if(blockRows_[i] == 0)
      continue;
    if(blockRows_[i] != 1)
      weightedApply(X, Y, W, i, stride, Teuchos::NO_TRANS, DampingFactor_, STS::one());
  }
}

template<class MatrixType>
void Container<MatrixType>::DoGaussSeidel(HostView& X, HostView& Y, HostView& Y2, int stride) const
{
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::Ptr;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  // Note: Flop counts copied naively from Ifpack.
  const scalar_type one = STS::one();
  const size_t Length = inputMatrix_->getNodeMaxNumRowEntries();
  auto numVecs = X.dimension_1();
  Array<scalar_type> Values;
  Array<local_ordinal_type> Indices;
  Indices.resize(Length);

  Values.resize(bcrsBlockSize_ * bcrsBlockSize_ * Length);    //note: if A was not a BlockCRS, bcrsBlockSize_ is 1
  // I think I decided I need two extra vectors:
  // One to store the sum of the corrections (initialized to zero)
  // One to store the temporary residual (doesn't matter if it is zeroed or not)
  // My apologies for making the names clear and meaningful. (X=RHS, Y=guess?! Nice.)
  HostView Resid("", X.dimension_0(), X.dimension_1());
  for(local_ordinal_type i = 0; i < numBlocks_; i++)
  {
    if(blockRows_[i] > 1 || hasBlockCrs_)
    {
      if (blockRows_[i] == 0)
        continue;
      // update from previous block
      ArrayView<const local_ordinal_type> localRows = getLocalRows (i);
      const size_t localNumRows = blockRows_[i];
      for(size_t j = 0; j < localNumRows; j++)
      {
        const local_ordinal_type LID = localRows[j]; // Containers_[i]->ID (j);
        size_t NumEntries;
        inputMatrix_->getLocalRowCopy(LID, Indices(), Values(), NumEntries);
        for(size_t m = 0; m < numVecs; m++)
        {
          if(hasBlockCrs_)
          {
            for (int localR = 0; localR < bcrsBlockSize_; localR++)
              Resid(LID * bcrsBlockSize_ + localR, m) = X(LID * bcrsBlockSize_ + localR, m);
            for (size_t k = 0; k < NumEntries; ++k)
            {
              const local_ordinal_type col = Indices[k];
              for (int localR = 0; localR < bcrsBlockSize_; localR++)
              {
                for(int localC = 0; localC < bcrsBlockSize_; localC++)
                {
                  Resid(LID * bcrsBlockSize_ + localR, m) -=
                    Values[k * bcrsBlockSize_ * bcrsBlockSize_ + localR + localC * bcrsBlockSize_]
                    * Y2(col * bcrsBlockSize_ + localC, m);
                }
              }
            }
          }
          else
          {
            Resid(LID, m) = X(LID, m);
            for (size_t k = 0; k < NumEntries; ++k)
            {
              const local_ordinal_type col = Indices[k];
              Resid(LID, m) -= Values[k] * Y2(col, m);
            }
          }
        }
      }
      // solve with this block
      //
      // Note: I'm abusing the ordering information, knowing that X/Y
      // and Y2 have the same ordering for on-proc unknowns.
      //
      // Note: Add flop counts for inverse apply
      apply(Resid, Y2, i, stride, Teuchos::NO_TRANS, DampingFactor_, one);
    }
    else if(blockRows_[i] == 1)
    {
      // singleton, can't access Containers_[i] as it was never filled and may be null.
      // a singleton calculation is exact, all residuals should be zero.
      local_ordinal_type LRID = partitionIndices_[i];  // by definition, a singleton 1 row in block.
      getMatDiag();
      HostView diagView = Diag_->template getLocalView<Kokkos::HostSpace>();
      impl_scalar_type d = (impl_scalar_type) one / diagView(LRID, 0);
      for(size_t m = 0; m < numVecs; m++)
      {
        impl_scalar_type x = X(LRID, m);
        impl_scalar_type newy = x * d;
        Y2(LRID, m) = newy;
     }
    } // end else
  } // end for numBlocks_
  if(IsParallel_)
  {
    auto numMyRows = inputMatrix_->getNodeNumRows();
    for (size_t m = 0; m < numVecs; ++m)
    {
      for (size_t i = 0; i < numMyRows * bcrsBlockSize_; ++i)
      {
        Y(i, m) = Y2(i, m);
      }
    }
  }
}

template<class MatrixType>
void Container<MatrixType>::DoSGS(HostView& X, HostView& Y, HostView& Y2, int stride) const
{
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::Ptr;
  using Teuchos::ptr;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  const scalar_type one = STS::one();
  auto numVecs = X.dimension_1();
  const size_t Length = inputMatrix_->getNodeMaxNumRowEntries();
  Array<scalar_type> Values;
  Array<local_ordinal_type> Indices(Length);
  Values.resize(bcrsBlockSize_ * bcrsBlockSize_ * Length);
  // I think I decided I need two extra vectors:
  // One to store the sum of the corrections (initialized to zero)
  // One to store the temporary residual (doesn't matter if it is zeroed or not)
  // My apologies for making the names clear and meaningful. (X=RHS, Y=guess?! Nice.)
  HostView Resid("", X.dimension_0(), X.dimension_1());
  // Forward Sweep
  for(local_ordinal_type i = 0; i < numBlocks_; i++)
  {
    if(blockRows_[i] != 1 || hasBlockCrs_)
    {
      if(blockRows_[i] == 0)
        continue; // Skip empty partitions
      // update from previous block
      ArrayView<const local_ordinal_type> localRows = getLocalRows(i);
      for(local_ordinal_type j = 0; j < blockRows_[i]; j++)
      {
        const local_ordinal_type LID = localRows[j]; // Containers_[i]->ID (j);
        size_t NumEntries;
        inputMatrix_->getLocalRowCopy(LID, Indices(), Values(), NumEntries);
        //set tmpresid = initresid - A*correction
        for(size_t m = 0; m < numVecs; m++)
        {
          if(hasBlockCrs_)
          {
            for(int localR = 0; localR < bcrsBlockSize_; localR++)
              Resid(LID * bcrsBlockSize_ + localR, m) = X(LID * bcrsBlockSize_ + localR, m);
            for(size_t k = 0; k < NumEntries; ++k)
            {
              const local_ordinal_type col = Indices[k];
              for (int localR = 0; localR < bcrsBlockSize_; localR++)
              {
                for(int localC = 0; localC < bcrsBlockSize_; localC++)
                  Resid(LID * bcrsBlockSize_ + localR, m) -=
                    Values[k * (bcrsBlockSize_ * bcrsBlockSize_) + (localR + localC * bcrsBlockSize_)]
                    * Y2(col * bcrsBlockSize_ + localC, m);
              }
            }
          }
          else
          {
            Resid(LID, m) = X(LID, m);
            for(size_t k = 0; k < NumEntries; k++)
            {
              local_ordinal_type col = Indices[k];
              Resid(LID, m) -= Values[k] * Y2(col, m);
            }
          }
        }
      }
      // solve with this block
      //
      // Note: I'm abusing the ordering information, knowing that X/Y
      // and Y2 have the same ordering for on-proc unknowns.
      //
      // Note: Add flop counts for inverse apply
      apply(Resid, Y2, i, stride, Teuchos::NO_TRANS, DampingFactor_, one);
      // operations for all getrow's
    }
    else // singleton, can't access Containers_[i] as it was never filled and may be null.
    {
      local_ordinal_type LRID  = partitions_[partitionIndices_[i]];
      getMatDiag();
      HostView diagView = Diag_->template getLocalView<Kokkos::HostSpace>();
      impl_scalar_type d = (impl_scalar_type) one / diagView(LRID, 0);
      for(size_t m = 0; m < numVecs; m++)
      {
        impl_scalar_type x = X(LRID, m);
        impl_scalar_type newy = x * d;
        Y2(LRID, m) = newy;
      }
    } // end else
  } // end forward sweep over NumLocalBlocks
  // Reverse Sweep
  //
  // mfh 12 July 2013: The unusual iteration bounds, and the use of
  // i-1 rather than i in the loop body, ensure correctness even if
  // local_ordinal_type is unsigned.  "i = numBlocks_-1; i >= 0;
  // i--" will loop forever if local_ordinal_type is unsigned, because
  // unsigned integers are (trivially) always nonnegative.
  for(local_ordinal_type i = numBlocks_; i > 0; --i)
  {
    if(hasBlockCrs_ || blockRows_[i-1] != 1)
    {
      if(blockRows_[i - 1] == 0)
        continue;
      // update from previous block
      ArrayView<const local_ordinal_type> localRows = getLocalRows(i-1);
      for(local_ordinal_type j = 0; j < blockRows_[i-1]; j++)
      {
        const local_ordinal_type LID = localRows[j]; // Containers_[i-1]->ID (j);
        size_t NumEntries;
        inputMatrix_->getLocalRowCopy(LID, Indices(), Values(), NumEntries);
        //set tmpresid = initresid - A*correction
        for (size_t m = 0; m < numVecs; m++)
        {
          if(hasBlockCrs_)
          {
            for(int localR = 0; localR < bcrsBlockSize_; localR++)
              Resid(LID * bcrsBlockSize_ + localR, m) = X(LID * bcrsBlockSize_ + localR, m);
            for(size_t k = 0; k < NumEntries; ++k)
            {
              const local_ordinal_type col = Indices[k];
              for(int localR = 0; localR < bcrsBlockSize_; localR++)
              {
                for(int localC = 0; localC < bcrsBlockSize_; localC++)
                  Resid(LID*bcrsBlockSize_+localR, m) -=
                    Values[k * (bcrsBlockSize_ * bcrsBlockSize_) + (localR + localC * bcrsBlockSize_)]
                    * Y2(col * bcrsBlockSize_ + localC, m);
              }
            }
          }
          else
          {
            Resid(LID, m) = X(LID, m);
            for(size_t k = 0; k < NumEntries; ++k)
            {
              local_ordinal_type col = Indices[k];
              Resid(LID, m) -= Values[k] * Y2(col, m);
            }
          }
        }
      }
      // solve with this block
      //
      // Note: I'm abusing the ordering information, knowing that X/Y
      // and Y2 have the same ordering for on-proc unknowns.
      //
      // Note: Add flop counts for inverse apply
      apply(Resid, Y2, i - 1, stride, Teuchos::NO_TRANS, DampingFactor_, one);
      // operations for all getrow's
    } // end  Partitioner_->numRowsInPart (i) != 1 )
    // else do nothing, as by definition with a singleton, the residuals are zero.
  } //end reverse sweep
  if(IsParallel_)
  {
    auto numMyRows = inputMatrix_->getNodeNumRows();
    for (size_t m = 0; m < numVecs; ++m)
    {
      for (size_t i = 0; i < numMyRows * bcrsBlockSize_; ++i)
      {
        Y(i, m) = Y2(i, m);
      }
    }
  }
}

template<class MatrixType>
void Container<MatrixType>::clearBlocks()
{
  numBlocks_ = 0;
  partitions_.clear();
  blockRows_.clear();
  partitionIndices_.clear();
  Diag_ = Teuchos::null;      //Diag_ will be recreated if needed
}

} // namespace Ifpack2

namespace Teuchos {

/// \brief Partial specialization of TypeNameTraits for Ifpack2::Container.
///
/// \tparam MatrixType The template parameter of Ifpack2::Container.
///   Must be a Tpetra::RowMatrix specialization.
template<class MatrixType>
class TEUCHOSCORE_LIB_DLL_EXPORT TypeNameTraits< ::Ifpack2::Container<MatrixType> >
{
 public:
  static std::string name () {
    return std::string ("Ifpack2::Container<") +
      TypeNameTraits<MatrixType>::name () + " >";
  }

  static std::string concreteName (const ::Ifpack2::Container<MatrixType>&) {
    return name ();
  }
};

} // namespace Teuchos

#endif // IFPACK2_CONTAINER_HPP