/usr/include/shogun/kernel/string/StringSubsequenceKernel.h is in libshogun-dev 3.2.0-7.3build4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | /*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2014 Soumyajit De
*/
#ifndef STRING_SUBSEQUENCE_KERNEL_H_
#define STRING_SUBSEQUENCE_KERNEL_H_
#include <shogun/lib/common.h>
#include <shogun/kernel/string/StringKernel.h>
namespace shogun
{
/**
* @brief class StringSubsequenceKernel that implements String Subsequence Kernel
* (SSK) discussed by Lodhi et. al.[1]. A subsequence is any ordered sequence
* of \f$n\f$ characters occurring in the text, though not necessarily contiguous.
* More formally, string \f$u\f$ is a subsequence of string \f$s\f$, iff there
* exists indices \f$\mathbf{i}=(i_{1},\dots,i_{|u|})\f$, with
* \f$1\le i_{1} \le \cdots \le i_{|u|} \le |s|\f$, such that
* \f$u_{j}=s_{i_{j}}\f$ for \f$j=1,\dots,|u|\f$, written as \f$u=s[\mathbf{i}]\f$.
* The feature mapping \f$\phi\f$ in this scenario is given by
* \f[
* \phi_{u}(s)=\sum_{\mathbf{i}:u=s[\mathbf{i}]}\lambda^{l(\mathbf{i})}
* \f]
* for some \f$lambda\le 1\f$, where \f$l(\mathbf{i})\f$ is the length of the
* subsequence in \f$s\f$, given by \f$i_{|u|}-i_{1}+1\f$.
* The kernel here is an inner product in the feature space generated by all
* subsequences of length \f$n\f$.
* \f[
* K_{n}(s,t)=\sum_{u\in\Sigma^{n}}\langle \phi_{u}(s), \phi_{u}(t)\rangle
* = \sum_{u\in\Sigma^{n}}\sum_{\mathbf{i}:u=s[\mathbf{i}]}
* \sum_{\mathbf{j}:u=t[\mathbf{j}]}\lambda^{l(\mathbf{i})+l(\mathbf{j})}
* \f]
* Since the subsequences are weighted by the exponentially decaying factor
* \f$\lambda\f$ of their full length in the text, more weight is given to those
* occurrences that are nearly contiguous. A direct computation is infeasible
* since the dimension of the feature space grows exponentially with \f$n\f$.
* The paper describes an efficient computation approach using a dynamic
* programming technique.
*
* The implementation is inspired from Razvan C. Bunescu's
* (http://ace.cs.ohiou.edu/~razvan/) SSK core library.
*
* Reference:
* [1] Text Classification using String Kernels, Lodhi et. al. Journal of Machine
* Learning Research 2(2002), 419-444.
*/
class CStringSubsequenceKernel: public CStringKernel<char>
{
public:
/** default constructor */
CStringSubsequenceKernel();
/**
* constructor
*
* @param size cache size
* @param maxlen maximum length of the subsequence
* @param lambda the penalty parameter
*/
CStringSubsequenceKernel(int32_t size, int32_t maxlen, float64_t lambda);
/**
* constructor
*
* @param lhs features of left-hand side
* @param rhs features of right-hand side
* @param maxlen maximum length of the subsequence
* @param lambda the penalty parameter
*/
CStringSubsequenceKernel(CStringFeatures<char>* lhs, CStringFeatures<char>* rhs,
int32_t maxlen, float64_t lambda);
/** destructor */
virtual ~CStringSubsequenceKernel();
/**
* initialize kernel
*
* @param lhs features of left-hand side
* @param rhs features of right-hand side
* @return true if initialization was successful, false otherwise
*/
virtual bool init(CFeatures* lhs, CFeatures* rhs);
/** clean up kernel */
virtual void cleanup();
/** @return the kernel type */
virtual EKernelType get_kernel_type()
{
return K_POLYMATCH;
}
/** @return name */
virtual const char* get_name() const
{
return "StringSubsequenceKernel";
}
/** register the parameters */
virtual void register_params();
/**
* compute kernel function for features a and b.
* idx_{a,b} denote the index of the feature vectors
* in the corresponding feature object.
*
* The computation is achieved using two auxiliary functions \f$K'\f$ and
* \f$K''\f$ which are defined as follows -
* \f[
* K'_{i}(s,t)=\sum_{u\in\Sigma^{i}}\sum_{\mathbf{i}:u=s[\mathbf{i}]}
* \sum_{\mathbf{j}:u=t[\mathbf{j}]}\lambda^{|s|+|t|-i_{1}-j_{1}+2}
* \f]
* for \f$i=1,\dots,n\f$, and
* \f[
* K''_{i}(sx,t)=\sum_{j:t_{j}=x}K'_{i-1}(s,t[1:j-1])\lambda^{|t|-j+2}
* \f]
* where \f$K'_{0}(s,t)=0\f$ for all \f$s\f$ and \f$t\f$ (see reference for
* details).
*
* @param idx_a index a
* @param idx_b index b
* @return computed kernel function at indices a,b
*/
virtual float64_t compute(int32_t idx_a, int32_t idx_b);
protected:
/** maximum length of common subsequences */
int32_t m_maxlen;
/** gap penalty */
float64_t m_lambda;
};
}
#endif // STRING_SUBSEQUENCE_KERNEL_H_
|