This file is indexed.

/usr/include/trilinos/Tsqr_CombineBenchmark.hpp is in libtrilinos-tpetra-dev 12.10.1-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
//@HEADER
// ************************************************************************
//
//          Kokkos: Node API and Parallel Node Kernels
//              Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER

#ifndef __TSQR_Test_CombineBenchmark_hpp
#define __TSQR_Test_CombineBenchmark_hpp

#include <Tsqr_ConfigDefs.hpp>
#include <Tsqr_CombineBenchmarker.hpp>
#include <Tsqr_CombineDefault.hpp>
#include <Tsqr_CombineNative.hpp>
#ifdef HAVE_KOKKOSTSQR_FORTRAN
#  include <Tsqr_CombineFortran.hpp>
#endif // HAVE_KOKKOSTSQR_FORTRAN

#include <algorithm>
#include <complex>
#include <iostream>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>


namespace TSQR {
  namespace Test {

    /// \struct CombineBenchmarkParameters
    /// \brief Parameters for the TSQR::Combine benchmarks.
    /// \author Mark Hoemmen
    ///
    /// Plain collection of options read by benchmarkCombine() and the
    /// helper functions it calls.  The struct declares no constructor
    /// and no default member initializers, so a default-initialized
    /// instance has indeterminate values in its scalar fields; set
    /// every field before running a benchmark.
    ///
    struct CombineBenchmarkParameters {
      /// Number of rows in the cache block A.
      int numRows;

      /// Number of columns in the cache block A, and number of rows
      /// and columns in the upper triangular matrices R, R1, and R2.
      int numCols;

      /// Whether to test real-arithmetic routines (float and double).
      bool testReal;

      /// Whether to test complex-arithmetic routines.
      bool testComplex;

      /// If calibrate is false: the number of trials to run each of
      /// the benchmarks.  Ignored if calibrate is true.
      int numTrials;

      /// Whether to calibrate the number of trials according to the
      /// computed timer resolution.
      bool calibrate;

      /// Whether to print average (true) or cumulative (false)
      /// timings over all trials.
      bool averageTimings;

      /// Whether to require that the ratio of CombineNative run time
      /// to CombineDefault run time not exceed allowance.  "Require"
      /// means that we throw an exception (and the test fails)
      /// otherwise.  CombineFortran is tested similarly, if
      /// applicable.
      bool strictPerfTests;

      /// Allowed slowdown factor for strictPerfTests (if applicable).
      double allowance;

      /// If useSeedValues is true: the four-integer seed for the
      /// random number generator.  See the documentation of LAPACK's
      /// _LARNV routines for requirements.  If useSeedValues is
      /// false, the incoming contents are not used; benchmarkCombine()
      /// overwrites the first four entries with a default seed.
      std::vector<int> seed;

      /// Whether seed (see above) is read.
      bool useSeedValues;

      /// Field names for any additional data to print after each row.
      /// May be an empty string, in which case the number of
      /// additional fields is zero and no additional data is printed
      /// after each row.
      std::string additionalFieldNames;

      /// Any additional data to print after each row.  Same number of
      /// additional data per row as fields in additionalFieldNames.
      std::string additionalData;

      /// Whether to print a "%" - commented row of comma-delimited
      /// field names before the first row of data.
      bool printFieldNames;

      /// Whether to print copious debugging output to stderr.
      bool debug;
    };

    template<class CombineType, class TimerType>
    static std::vector<double>
    benchmarkCombineType (std::ostream& out,
                          std::vector<int>& iseed,
                          const std::string& dataTypeName,
                          const std::string& combineTypeName,
                          const typename CombineType::ordinal_type numRows,
                          const typename CombineType::ordinal_type numCols,
                          const int cacheBlockNumTrials,
                          const int pairNumTrials,
                          const bool averageTimings,
                          const std::string& additionalData)
    {
      using std::endl;

      typedef typename CombineType::ordinal_type ordinal_type;
      typedef typename CombineType::scalar_type scalar_type;
      typedef CombineBenchmarker<ordinal_type, scalar_type, CombineType, TimerType>
        benchmarker_type;

      TEUCHOS_TEST_FOR_EXCEPTION(cacheBlockNumTrials < 1, std::invalid_argument,
                         "The number of trials for the cache block benchmark "
                         "must be positive, but you specified cacheBlockNum"
                         "Trials = " << cacheBlockNumTrials << ".");
      TEUCHOS_TEST_FOR_EXCEPTION(pairNumTrials < 1, std::invalid_argument,
                         "The number of trials for the pair benchmark must be "
                         "positive, but you specified pairNumTrials = "
                         << pairNumTrials << ".");

      benchmarker_type b (iseed);
      std::pair<double, double> results;
      results.first =
        b.benchmarkPair (numCols, pairNumTrials);
      results.second =
        b.benchmarkCacheBlock (numRows, numCols, cacheBlockNumTrials);

      // Whether or not we should print the "additional data"
      // (originally supplied at command-line invocation of this
      // benchmark) after the benchmark results.  The additional data
      // option makes it easier to write parsers for benchmark
      // results, since we can include data that are known outside the
      // benchmark (when invoking the benchmark as an executable), but
      // not known (easily or at all) inside the benchmark.  A good
      // example would be environment variables, like OMP_NUM_THREADS,
      // or (for a benchmark that uses MPI, which this is not) the
      // number of MPI processes per node ("ppn").
      const bool printAdditionalData = (! additionalData.empty());

      const double pairTime = averageTimings ?
        results.first / static_cast<double>(pairNumTrials) :
        results.first;
      const double cacheBlockTime = averageTimings ?
        results.second / static_cast<double>(cacheBlockNumTrials) :
        results.second;

      out << combineTypeName
          << "," << "R1R2"
          << "," << dataTypeName
          << "," << (2*numCols)
          << "," << numCols
          << "," << pairNumTrials
          << "," << pairTime;
      if (printAdditionalData)
        out << "," << additionalData;
      out << endl;
      out << combineTypeName
          << "," << "RA"
          << "," << dataTypeName
          << "," << numRows
          << "," << numCols
          << "," << cacheBlockNumTrials
          << "," << cacheBlockTime;
      if (printAdditionalData)
        out << "," << additionalData;
      out << endl;

      std::vector<double> timings (2);
      timings[0] = pairTime;
      timings[1] = cacheBlockTime;
      return timings;
    }

    template<class Scalar, class TimerType>
    static void
    benchmarkAllCombineTypes (std::ostream& out,
                              const std::string& dataTypeName,
                              CombineBenchmarkParameters& params,
                              const double timerResolution)
    {
      using std::cerr;
      using std::endl;
      const bool debug = params.debug;
      const int numRows = params.numRows;
      const int numCols = params.numCols;

      TEUCHOS_TEST_FOR_EXCEPTION(timerResolution <= static_cast<double>(0),
                         std::invalid_argument,
                         "The timer resolution must be a positive number, "
                         "but you specified timerResolution = "
                         << timerResolution << ".");

      // If no calibration is performed, then the number of trials is
      // the same for both the cache block [R; A] benchmark and the
      // pair [R1; R2] benchmark.  Otherwise, we calibrate the number
      // of trials for each separately.  This is because we expect the
      // [R1; R2] benchmark to take much less time than the [R; A]
      // benchmark, so [R1; R2] should have more trials, in order to
      // get comparable timing accuracy without requiring too many [R;
      // A] trials.
      int pairNumTrials = params.numTrials;
      int cacheBlockNumTrials = params.numTrials;
      if (params.calibrate)
        { // We calibrate the number of trials using the default
          // Combine implementation.  We don't expect CombineNative or
          // CombineFortran to be much faster than that.
          if (debug)
            cerr << "Calibrating..." << endl;

          // Calibrater gets the timer resolution.
          typedef CombineDefault<int, Scalar> combine_type;
          typedef CombineBenchmarker<int, Scalar, combine_type, TimerType>
            benchmarker_type;
          benchmarker_type c (timerResolution, params.seed);

          // Accuracy factor of 1000 gives us 3 digits of timer accuracy.
          const double accuracyFactor = static_cast<double> (1000);

          // Number of trials for factor_pair() and apply_pair().
          std::pair<int, double> result;
          result = c.calibratePair (numCols, accuracyFactor);
          if (debug)
            {
              cerr << "- Pair number of trials: " << result.first << endl;
              cerr << "- Pair calibration time: " << result.second << endl;
            }
          pairNumTrials = result.first;

          // Number of trials for factor_inner() and apply_inner().
          result = c.calibrateCacheBlock (numRows, numCols, accuracyFactor);
          if (debug)
            {
              cerr << "- Cache block number of trials: " << result.first << endl;
              cerr << "- Cache block calibration time: " << result.second << endl;
            }
          cacheBlockNumTrials = result.first;

          // Store the updated PRNG seed in the benchmark parameters.
          c.getSeed (params.seed);
        }

      // Always benchmark CombineDefault.  We use its timings as the
      // standard by which the other Combine implementations' timings
      // are compared.  The returned vector contains two timings: for
      // [R1; R2], and for [R; A], in that order.
      std::vector<double> defaultTimings;
      {
        typedef CombineDefault< int, Scalar > combine_type;
        std::string combineTypeName ("Default");
        defaultTimings =
          benchmarkCombineType<combine_type, TimerType> (out, params.seed,
                                                         dataTypeName,
                                                         combineTypeName,
                                                         numRows,
                                                         numCols,
                                                         cacheBlockNumTrials,
                                                         pairNumTrials,
                                                         params.averageTimings,
                                                         params.additionalData);
      }

      // If we're doing strict performance tests, then CombineNative
      // (and CombineFortran, if applicable) may be no slower than the
      // given allowance factor times CombineDefault's time.  For now,
      // we only look at cache block performance, since that is where
      // most of the time should be going.
      std::vector<double> nativeTimings;
      {
        typedef CombineNative<int, Scalar> combine_type;
        std::string combineTypeName ("Native");
        nativeTimings =
          benchmarkCombineType<combine_type, TimerType> (out, params.seed,
                                                         dataTypeName,
                                                         combineTypeName,
                                                         numRows,
                                                         numCols,
                                                         cacheBlockNumTrials,
                                                         pairNumTrials,
                                                         params.averageTimings,
                                                         params.additionalData);
        const double slowdown = nativeTimings[1] / defaultTimings[1];
        const bool tooSlow = slowdown > params.allowance;
        // FIXME (mfh 24 May 2011) Replace std::runtime_error with a
        // more appropriately named exception.
        TEUCHOS_TEST_FOR_EXCEPTION(params.strictPerfTests && tooSlow,
                           std::runtime_error,
                           "CombineNative is too slow!  For cache block "
                           "benchmark with numRows=" << numRows << " and numCols="
                           << numCols << ", CombineNative time (= "
                           << nativeTimings[1] << ") / CombineDefault time (= "
                           << defaultTimings[1] << ") = " << slowdown
                           << " > the allowed fraction " << params.allowance
                           << ".");
      }

#ifdef HAVE_KOKKOSTSQR_FORTRAN
      std::vector<double> fortranTimings;
      {
        typedef CombineFortran<Scalar> combine_type;
        std::string combineTypeName ("Fortran");
        fortranTimings =
          benchmarkCombineType<combine_type, TimerType> (out, params.seed,
                                                         dataTypeName,
                                                         combineTypeName,
                                                         numRows,
                                                         numCols,
                                                         cacheBlockNumTrials,
                                                         pairNumTrials,
                                                         params.averageTimings,
                                                         params.additionalData);
        const double slowdown = fortranTimings[1] / defaultTimings[1];
        const bool tooSlow = slowdown > params.allowance;
        // FIXME (mfh 24 May 2011) Replace std::runtime_error with a
        // more appropriately named exception.
        TEUCHOS_TEST_FOR_EXCEPTION(params.strictPerfTests && tooSlow,
                           std::runtime_error,
                           "CombineFortran is too slow!  For cache block "
                           "benchmark with numRows=" << numRows << " and numCols="
                           << numCols << ", CombineFortran time (= "
                           << fortranTimings[1] << ") / CombineDefault time (= "
                           << defaultTimings[1] << ") = " << slowdown
                           << " > the allowed fraction " << params.allowance
                           << ".");
      }
#endif // HAVE_KOKKOSTSQR_FORTRAN
    }


    /// \brief Benchmark all Combine implementations over all requested
    ///   Scalar types.
    ///
    /// Computes the timer resolution once, then runs
    /// benchmarkAllCombineTypes() for float and double if
    /// params.testReal is true, and for complex<float> and
    /// complex<double> if params.testComplex is true.
    ///
    /// \tparam TimerType Timer type used by the benchmarks.
    ///
    /// \param out [out] Output stream to which to write results.
    /// \param params [in/out] Benchmark parameters, passed on to
    ///   benchmarkAllCombineTypes().
    ///
    /// \throws std::logic_error if params.testComplex is true but
    ///   HAVE_KOKKOSTSQR_COMPLEX is not defined.  Any real-arithmetic
    ///   benchmarks requested have already run by then.
    template<class TimerType>
    static void
    benchmarkAllCombineTypesAndScalars (std::ostream& out,
                                        CombineBenchmarkParameters& params)
    {
      // The timer resolution is shared by every Scalar type below.
      const double timerResolution = computeTimerResolution<TimerType> ();
      if (params.debug)
        std::cerr << "Timer resolution: " << timerResolution << " seconds"
                  << std::endl;

      if (params.testReal)
        {
          benchmarkAllCombineTypes<float, TimerType> (out, std::string ("float"),
                                                      params, timerResolution);
          benchmarkAllCombineTypes<double, TimerType> (out, std::string ("double"),
                                                       params, timerResolution);
        }

      if (params.testComplex)
        {
#ifdef HAVE_KOKKOSTSQR_COMPLEX
          benchmarkAllCombineTypes<std::complex<float>, TimerType>
            (out, std::string ("complex<float>"), params, timerResolution);
          benchmarkAllCombineTypes<std::complex<double>, TimerType>
            (out, std::string ("complex<double>"), params, timerResolution);
#else // Don't HAVE_KOKKOSTSQR_COMPLEX
          throw std::logic_error("TSQR not built with complex arithmetic support");
#endif // HAVE_KOKKOSTSQR_COMPLEX
        }
    }

    /// \fn benchmarkCombine
    /// \brief Benchmark TSQR::Combine, using a timer of type TimerType.
    /// \author Mark Hoemmen
    ///
    /// Benchmarks test cache block and pair operations for all
    /// Combine implementations, over all Scalar types (modulated by
    /// testReal and testComplex).
    ///
    /// Before running anything, this function validates the
    /// parameters, fills in the default seed (0, 0, 0, 1) if
    /// params.useSeedValues is false, and prints the optional
    /// '%'-prefixed row of field names.
    ///
    /// \tparam TimerType Timer type used by the benchmarks.
    ///
    /// \param out [out] Output stream to which to write results.
    /// \param params [in/out] Benchmark parameters.  params.seed is
    ///   modified if params.useSeedValues is false (default seed is
    ///   written), and may also be updated by calibration.
    ///
    /// \throws std::invalid_argument if params.numRows or
    ///   params.numCols is less than 1; if calibration is disabled
    ///   and params.numTrials is less than 1; or if
    ///   params.useSeedValues is true but params.seed has fewer than
    ///   four entries.
    template<class TimerType>
    void
    benchmarkCombine (std::ostream& out,
                      CombineBenchmarkParameters& params)
    {
      TEUCHOS_TEST_FOR_EXCEPTION(params.numRows < 1 || params.numCols < 1,
                         std::invalid_argument,
                         "The test matrix must have a positive number of rows "
                         "and columns, but you specified numRows = "
                         << params.numRows << " and numCols = "
                         << params.numCols << ".");
      TEUCHOS_TEST_FOR_EXCEPTION(! params.calibrate && params.numTrials < 1,
                         std::invalid_argument,
                         "Since you specified no calibration is to be performed, "
                         "the number of trials must be positive, but you specified "
                         "numTrials = " << params.numTrials << ".");

      if (params.useSeedValues)
        {
          // A user-supplied seed must hold the four integers that the
          // random number generator expects.  Reject a short seed
          // here, with a clear message, rather than handing it on to
          // the benchmarks.
          TEUCHOS_TEST_FOR_EXCEPTION(params.seed.size() < 4,
                             std::invalid_argument,
                             "Since you specified that seed values are to be "
                             "used, the seed must contain at least four "
                             "integers, but it contains "
                             << params.seed.size() << ".");
        }
      else
        { // Fill in default seed values.
          if (params.seed.size() < 4)
            params.seed.resize (4);
          params.seed[0] = 0;
          params.seed[1] = 0;
          params.seed[2] = 0;
          params.seed[3] = 1;
        }

      if (params.printFieldNames)
        {
          // The row of field names begins with a '%' character, in
          // order to help out the benchmark results parser.
          out << "%" << "method"
              << "," << "kernel"
              << "," << "scalarType"
              << "," << "numRows"
              << "," << "numCols"
              << "," << "numTrials"
              << "," << "timing";
          if (! params.additionalFieldNames.empty())
            // The additionalFieldNames string should be a
            // comma-delimited list of additional field name(s).
            out << "," << params.additionalFieldNames;
          out << std::endl;
        }
      benchmarkAllCombineTypesAndScalars<TimerType> (out, params);
    }

  } // namespace Test
} // namespace TSQR

#endif // __TSQR_Test_CombineBenchmark_hpp