This file is indexed.

/usr/include/mlpack/core/kernels/pspectrum_string_kernel.hpp is in libmlpack-dev 1.0.10-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/**
 * @file pspectrum_string_kernel.hpp
 * @author Ryan Curtin
 *
 * Implementation of the p-spectrum string kernel, created for use with FastMKS.
 * Instead of passing a data matrix to FastMKS which stores the points
 * themselves, pass a two-row "fake" data matrix which stores, for each string,
 * the index of its dataset and its index within that dataset; the actual
 * strings are given to the PSpectrumStringKernel at construction time, and the
 * kernel maps the indices back to the strings.
 *
 * This file is part of MLPACK 1.0.10.
 *
 * MLPACK is free software: you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 *
 * MLPACK is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 * details (LICENSE.txt).
 *
 * You should have received a copy of the GNU General Public License along with
 * MLPACK.  If not, see <http://www.gnu.org/licenses/>.
 */
#ifndef __MLPACK_CORE_KERNELS_PSPECTRUM_STRING_KERNEL_HPP
#define __MLPACK_CORE_KERNELS_PSPECTRUM_STRING_KERNEL_HPP

#include <map>
#include <sstream>
#include <string>
#include <vector>

#include <mlpack/core.hpp>

namespace mlpack {
namespace kernel {

/**
 * The p-spectrum string kernel.  Given a length p, the p-spectrum kernel finds
 * the contiguous subsequence match count between two strings.  The kernel will
 * take every possible substring of length p of one string and count how many
 * times it appears in the other string.
 *
 * The string kernel, when created, must be passed a reference to a series of
 * string datasets (std::vector<std::vector<std::string> >&).  This is because
 * MLPACK only supports datasets which are Armadillo matrices -- and a dataset
 * of variable-length strings cannot be easily cast into an Armadillo matrix.
 *
 * Therefore, once the PSpectrumStringKernel is created with a reference to the
 * string datasets, a "fake" Armadillo data matrix must be created, which simply
 * holds indices to the strings they represent.  This "fake" matrix has two rows
 * and n columns (where n is the number of strings in the dataset).  The first
 * row holds the index of the dataset (remember, the kernel can have multiple
 * datasets), and the second row holds the index of the string.  A fake matrix
 * containing only strings from dataset 0 might look like this:
 *
 * [[0 0 0 0 0 0 0 0 0]
 *  [0 1 2 3 4 5 6 7 8]]
 *
 * This fake matrix is then given to the machine learning method, which will
 * eventually call PSpectrumStringKernel::Evaluate(a, b), where a and b are two
 * columns of the fake matrix.  The string kernel will then map these fake
 * columns back to the strings they represent, and then correctly evaluate the
 * kernel.
 *
 * Unfortunately, not every machine learning method will work with this kernel.
 * Only machine learning methods which do not ever operate on the explicit
 * representation of points can use this kernel.  So, for instance, one cannot
 * build a kd-tree on strings, because the BinarySpaceTree<> class will split
 * the data according to the fake data matrix -- resulting in a meaningless
 * tree.  This kernel was originally written for the FastMKS method; so, at the
 * very least, it will work with that.
 */
class PSpectrumStringKernel
{
 public:
  /**
   * Initialize the PSpectrumStringKernel with the given string datasets.  For
   * more information on this, see the general class documentation.
   *
   * NOTE(review): the datasets member of this class is a const reference, not a
   * copy.  Presumably it is bound to the datasets argument given here; if so,
   * the caller must keep that vector alive (and must not pass a temporary) for
   * as long as the kernel is used -- confirm against the constructor
   * definition, which is not part of this header.
   *
   * @param datasets Sets of string data.
   * @param p The length of substrings to search.
   */
  PSpectrumStringKernel(const std::vector<std::vector<std::string> >& datasets,
                        const size_t p);

  /**
   * Evaluate the kernel for the string indices given.  As mentioned in the
   * class documentation, a and b should be 2-element vectors, where the first
   * element contains the index of the dataset and the second element contains
   * the index of the string.  Therefore, if [2 3] is passed for a, the string
   * used will be datasets[2][3] (datasets is of type
   * std::vector<std::vector<std::string> >&).
   *
   * @param a Index of string and dataset for first string.
   * @param b Index of string and dataset for second string.
   * @return The kernel evaluation for the two referenced strings.
   */
  template<typename VecType>
  double Evaluate(const VecType& a, const VecType& b) const;

  //! Access the lists of substrings.  The nesting mirrors that of the datasets
  //! (presumably counts[i][j] belongs to datasets[i][j] -- TODO confirm).
  const std::vector<std::vector<std::map<std::string, int> > >& Counts() const
  { return counts; }
  //! Modify the lists of substrings.
  std::vector<std::vector<std::map<std::string, int> > >& Counts()
  { return counts; }

  //! Access the value of p.
  size_t P() const { return p; }
  //! Modify the value of p.
  //! NOTE(review): nothing visible in this header refreshes counts when p is
  //! changed through this accessor -- confirm whether callers must do so.
  size_t& P() { return p; }

  /**
   * Returns a string representation of this object: its address, the value of
   * p, the number of datasets, and the number of strings held by each dataset
   * (one dataset per line).
   */
  std::string ToString() const
  {
    std::ostringstream convert;
    convert << "PSpectrumStringKernel [" << this << "]" << '\n';
    convert << "  p used: " << p << '\n';
    convert << "  Dataset:" << datasets.size() << '\n';

    // Write one dataset size per line.  Writing them back to back (as was done
    // previously) made the output ambiguous: sizes 12 and 3 printed exactly
    // like sizes 1 and 23.
    std::ostringstream sizes;
    for (size_t ind = 0; ind < datasets.size(); ++ind)
      sizes << datasets[ind].size() << '\n';
    convert << mlpack::util::Indent(sizes.str(), 2);

    return convert.str();
  }

 private:
  //! The datasets.  Held by reference: this object does not own the strings.
  const std::vector<std::vector<std::string> >& datasets;

  //! Mappings of the datasets to counts of substrings.  Such a huge structure
  //! is not wonderful...
  std::vector<std::vector<std::map<std::string, int> > > counts;

  //! The value of p to use in calculation.
  size_t p;
};

}; // namespace kernel
}; // namespace mlpack

// Include implementation of templated Evaluate().
#include "pspectrum_string_kernel_impl.hpp"

#endif