This file is indexed.

/usr/include/ngram/ngram-katz.h is in libngram-dev 1.3.2-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2016 Brian Roark and Google, Inc.
// Katz backoff derived class for smoothing.

#ifndef NGRAM_NGRAM_KATZ_H_
#define NGRAM_NGRAM_KATZ_H_

#include <vector>

#include <ngram/ngram-count-of-counts.h>
#include <ngram/ngram-make.h>
#include <ngram/util.h>

namespace ngram {

template <class Arc>
class NGramKatz : public NGramMake<Arc> {
 public:
  typedef typename Arc::StateId StateId;
  typedef typename Arc::Label Label;
  typedef typename Arc::Weight Weight;

  using NGramMake<Arc>::HiOrder;
  using NGramMake<Arc>::MakeNGramModel;
  using NGramMake<Arc>::ScalarValue;

  // Construct an NGramKatz object, consisting of the FST and some
  // information about the states under the assumption that the FST is a model.
  // Ownership of the FST is retained by the caller.
  explicit NGramKatz(MutableFst<Arc> *infst, bool backoff = true,
                     Label backoff_label = 0, double norm_eps = kNormEps,
                     bool check_consistency = false, int bins = -1)
      : NGramMake<Arc>(infst, BackOffOrMixed(backoff), backoff_label, norm_eps,
                       check_consistency),
        bins_(bins <= 0 ? 5 : bins),
        count_of_counts_(bins_) {}

  // Normalize n-gram counts and smooth them to create an n-gram model
  // using Katz smoothing; the number of 'bins' used by Katz (>= 1) is
  // set in the constructor.
  bool MakeNGramModel() {
    count_of_counts_.CalculateCounts(*this);
    CalculateDiscounts();
    if (FLAGS_v > 0) count_of_counts_.ShowCounts(discount_, "Katz discounts");
    return NGramMake<Arc>::MakeNGramModel();
  }

  // Pass in count of counts (rather than computing them)
  void SetCountOfCounts(const StdFst &fst) { count_of_counts_.SetCounts(fst); }

 protected:
  // Return negative log discounted count for provided negative log count
  double GetDiscount(Weight neglogcount, int order) override;

 private:
  // Katz discount for count r: (r*/r - rnorm) / (1 - rnorm)
  // r* = (r+1) n_{r+1} / n_r   and   rnorm = (bins + 1) n_{bins + 1} / n_1
  // stored with indices starting at 0, so bin 0 is count 1
  void CalculateKatzDiscount(int order, int bin, double rnorm) {
    if (bin < bins_) {
      double rstar = Offset(bin) + 1, denom = 1.0 - rnorm;
      if (count_of_counts_.Count(order, bin + 1) > 0.0)
        rstar *= count_of_counts_.Count(order, bin + 1);
      if (count_of_counts_.Count(order, bin) > 0.0)
        rstar /= count_of_counts_.Count(order, bin);
      discount_[order][bin] = rstar;
      if (Offset(bin) > 0) {
        discount_[order][bin] /= Offset(bin);
        discount_[order][bin] -= rnorm;
        if (denom > 0.0) {
          discount_[order][bin] /= denom;
        }
      }
    }
    if (bin == bins_ || discount_[order][bin] <= 0.0 ||
        discount_[order][bin] >= 1.0) {  // need to provide epsilon discount
      if (bin < bins_ && discount_[order][bin] != 1.0)
        VLOG(1) << "Histograms violating Good-Turing assumptions";
      discount_[order][bin] = 1.0;
      discount_[order][bin] -= kNormEps;  // everything discounted epsilon
    }
  }
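
  // Illustrative worked example for CalculateKatzDiscount above, using
  // hypothetical counts of counts (not taken from any corpus): with bins_ = 5
  // and n_1 = 1000, n_2 = 400, n_3 = 200, n_4 = 120, n_5 = 80, n_6 = 60,
  //   rnorm = (bins_ + 1) n_6 / n_1 = 6 * 60 / 1000 = 0.36.
  // Then for count r = 1 (bin 0):
  //   r* = 2 n_2 / n_1 = 0.8,   d_1 = (0.8/1 - 0.36) / (1 - 0.36) = 0.6875,
  // and for count r = 2 (bin 1):
  //   r* = 3 n_3 / n_2 = 1.5,   d_2 = (1.5/2 - 0.36) / (1 - 0.36) ~= 0.61.
  // The last bin (bin == bins_) gets the epsilon discount 1 - kNormEps.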

  // Calculate Katz discounts for each order
  void CalculateDiscounts() {
    discount_.clear();
    discount_.resize(HiOrder());

    for (int order = 0; order < HiOrder(); ++order) {
      discount_[order].resize(bins_ + 1, 0.0);  // space for bins + 1
      double rnorm = SaneKatzRNorm(order);
      for (int bin = 0; bin <= bins_; ++bin)
        CalculateKatzDiscount(order, bin, rnorm);
    }
  }

  // Ratio of count mass for the lowest undiscounted value to singleton count
  // mass. We generalize to allow for count pruning, which can lead to zero
  // singletons.
  // standard form: rnorm = ( bins_ + 1 ) n_{bins_ + 1} / n_1
  // generalized:   rnorm = ( bins_ + 1 ) n_{bins_ + 1} / ( k n_k )
  //                for k = lowest count with non-zero observations
  double SaneKatzRNorm(int order) {
    double rnorm_numerator = Offset(bins_), rnorm_denom = 1;
    rnorm_numerator *= count_of_counts_.Count(order, bins_);
    // NB: Offset(0) = 0 means that first count is for n_0, which we skip
    int rlevel = 1 - Offset(0);

    while (rlevel < bins_ && count_of_counts_.Count(order, rlevel) <= 0) {
      ++rlevel;
      ++rnorm_denom;
    }
    rnorm_denom *= count_of_counts_.Count(order, rlevel);
    return rnorm_numerator / rnorm_denom;
  }
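
  // Illustrative example for SaneKatzRNorm, using hypothetical counts of
  // counts: with bins_ = 5 and n_6 = 60, if n_1 = 1000 the standard form
  // applies and
  //   rnorm = 6 * 60 / (1 * 1000) = 0.36;
  // if count pruning has removed all singletons (n_1 = 0) and n_2 = 400, the
  // loop advances to k = 2 and
  //   rnorm = 6 * 60 / (2 * 400) = 0.45.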

  // Offset between index of the count of counts and the count.
  // For usual count of counts, we store n_1 n_2 n_3 etc., so index
  // of n_k is k-1 and Offset(k-1) = k.
  // For count of histograms, we store n_0 n_1 n_2 etc. and so Offset(k) = k.
  int Offset(int bin);

  // The user has a choice to make the model purely backoff or mixed.
  // Histogram count models are always mixed and the user's choice is ignored.
  // This is because the histogram method requires a mixed model to correctly
  // account for the probability of n-grams that are observed in the lattice
  // but may occur 0 times with positive probability.
  static bool BackOffOrMixed(bool backoff);

  int bins_;                                 // number of bins for discounting
  NGramCountOfCounts<Arc> count_of_counts_;  // count bins for orders
  std::vector<std::vector<double> > discount_;         // discount for bins
};

template <typename Arc>
double NGramKatz<Arc>::GetDiscount(typename Arc::Weight neglogcount_weight,
                                   int order) {
  // Returns the negative log discounted count of the ngram.
  // This is -log(kd_k) = -log(k) - log(d_k) for count k and discount d_k.
  double neglogcount = ScalarValue(neglogcount_weight);  // -log(k)
  double neglogdiscount;
  if (neglogcount == ScalarValue(Arc::Weight::Zero()))  // count = 0
    return neglogcount;
  int bin = count_of_counts_.GetCountBin(neglogcount, bins_, true);
  if (bin >= 0) {
    neglogdiscount = -log(discount_[order][bin]);
  } else {
    NGRAMERROR() << "NGramKatz: No discount bin for discounting";
    NGramModel<Arc>::SetError();
    neglogdiscount = ScalarValue(Arc::Weight::Zero());
  }
  return neglogcount + neglogdiscount;  // -log(k) - log(d_k) = -log(kd_k).
}
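
// Illustrative example (hypothetical values): for a count k = 2, the input is
// neglogcount = -log(2) ~= -0.693; if the bin's discount is d_2 ~= 0.61, then
// neglogdiscount = -log(0.61) ~= 0.494 and the function returns
// -log(2) - log(0.61) ~= -0.199, i.e. -log(2 * 0.61) = -log(1.22).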

template <>
inline double NGramKatz<HistogramArc>::GetDiscount(
    HistogramArc::Weight neglogcount_weight, int order) {
  // Returns the negative log discounted expected count of the ngram.
  // This is -log[sum_{k > 0} q(k)c^*(k)] where q(k) is the probability of
  // observing k instances of the ngram and c^*(k) is the discounted count
  // for count k.  Note that  c^*(k) = kd_k for some discount
  // factor d_k. For k > K, d_k = 1, i.e., above the threshold K, counts are
  // undiscounted.  Let lambda = sum_{k > 0} q(k)k, i.e., the expected count
  // of the ngram.  Then
  // sum_{k > 0} q(k)c^*(k) = sum_{k > 0} q(k)kd_k
  //                        = sum_{k > 0} q(k)k(1 + (d_k - 1))
  //                        = lambda + sum_{k > 0} q(k)k(d_k - 1)
  //                        = lambda + sum_{k=1}^K q(k)k(d_k - 1)
  //                        = lambda - sum_{k=1}^K q(k)k(1 - d_k)
  // since d_k = 1 for k > K, the last sum only needs to run up to K.
  // The scalar value of the histogram arc weight is the expected count.
  double neg_log_lambda = ScalarValue(neglogcount_weight);

  // Accumulates neg log of the final term above: sum_{k=1}^K q(k)k(1 - d_k).
  // To sum over k from 1 to K, we initialize value to -log(0.0).
  double neg_log_discounted = ScalarValue(HistogramArc::Weight::Zero());
  int cutoff = neglogcount_weight.Length() - 1;
  int length = (cutoff > bins_) ? bins_ : cutoff;  // length = K + 1
  for (int bin = 1; bin < length; bin++) {
    // Accumulates -log(q(k)) - log(k) - log(1 - d_k).
    neg_log_discounted = NegLogSum(
        neg_log_discounted, neglogcount_weight.Value(bin + 1).Value() -
                                log(1 - discount_[order][bin]) - log(bin));
  }

  // Truncate result for additional numerical stability
  if (neg_log_discounted > 99.0) {
    neg_log_discounted = ScalarValue(HistogramArc::Weight::Zero());
  }

  // Reserves epsilon from higher order probs if no mass was discounted.
  if (neg_log_discounted == ScalarValue(HistogramArc::Weight::Zero())) {
    neg_log_discounted = -log(1 - discount_[order][bins_]) + neg_log_lambda;
  }

  // Returns -log[ lambda - sum_{k=1}^K q(k)k(1 - d_k) ].
  return NegLogDiff(neg_log_lambda, neg_log_discounted);
}
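
// Illustrative example for the histogram case (hypothetical distribution): if
// the histogram gives q(0) = 0.25, q(1) = 0.5, q(2) = 0.25, then the expected
// count is lambda = 0.5 * 1 + 0.25 * 2 = 1.0. With (made-up) discounts
// d_1 = 0.6875 and d_2 = 0.61, the correction term is
//   sum_{k=1}^{2} q(k) k (1 - d_k) = 0.5 * 1 * 0.3125 + 0.25 * 2 * 0.39
//                                  ~= 0.351,
// so the expected discounted count is about 1.0 - 0.351 = 0.649, and the
// function returns its negative log, roughly 0.43.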

template <typename Arc>
int NGramKatz<Arc>::Offset(int bin) {
  return bin + 1;
}

template <>
inline int NGramKatz<HistogramArc>::Offset(int bin) {
  return bin;
}

template <typename Arc>
bool NGramKatz<Arc>::BackOffOrMixed(bool backoff) {
  return backoff;
}

template <>
inline bool NGramKatz<HistogramArc>::BackOffOrMixed(bool backoff) {
  return false;
}

}  // namespace ngram

#endif  // NGRAM_NGRAM_KATZ_H_
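
A minimal usage sketch follows, assuming a count FST produced by a tool such as ngramcount; the file names are hypothetical, and the model is assumed to be built in place on the input FST, which the caller continues to own (as the constructor comment above notes).

#include <fst/fstlib.h>
#include <ngram/ngram-katz.h>

int main() {
  // Read an n-gram count FST (assumed to have been written by ngramcount).
  fst::StdMutableFst *counts = fst::StdMutableFst::Read("counts.fst", true);
  if (!counts) return 1;

  // Build a Katz-smoothed model in place; ownership of the FST stays with
  // the caller.
  ngram::NGramKatz<fst::StdArc> katz(counts, /*backoff=*/true,
                                     /*backoff_label=*/0, ngram::kNormEps,
                                     /*check_consistency=*/true, /*bins=*/5);
  if (!katz.MakeNGramModel()) {
    delete counts;
    return 1;
  }

  counts->Write("katz.mod");
  delete counts;
  return 0;
}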