/usr/include/ngram/ngram-absolute.h is in libngram-dev 1.3.2-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2016 Brian Roark and Google, Inc.
// Absolute Discounting derived class for smoothing.
#ifndef NGRAM_NGRAM_ABSOLUTE_H_
#define NGRAM_NGRAM_ABSOLUTE_H_
#include <vector>
#include <ngram/ngram-count-of-counts.h>
#include <ngram/ngram-make.h>
namespace ngram {
class NGramAbsolute : public NGramMake<StdArc> {
 public:
  // Builds an absolute-discounting smoother on top of NGramMake.  The FST and
  // the backoff / normalization / consistency options are handed straight to
  // the base class; the caller keeps ownership of the FST.
  //   parameter: discount D to use for every bin; a negative value means
  //              "not supplied", in which case a discount is derived from
  //              the count-of-counts histogram.
  //   bins:      number of discount bins; any value <= 0 becomes 1.
  explicit NGramAbsolute(StdMutableFst *infst, bool backoff = false,
                         Label backoff_label = 0, double norm_eps = kNormEps,
                         bool check_consistency = false,
                         double parameter = -1.0, int bins = -1)
      : NGramMake(infst, backoff, backoff_label, norm_eps, check_consistency),
        parameter_(parameter),
        bins_(bins > 0 ? bins : 1),
        count_of_counts_(bins_) {}

  // Smooths the model using absolute discounting.
  bool MakeNGramModel();

  // Loads the count-of-counts histogram from 'fst' instead of having it
  // computed.
  void SetCountOfCounts(const StdFst &fst) {
    count_of_counts_.SetCounts(fst);
  }

 protected:
  // Maps a negative log count to its negative log discounted count.
  double GetDiscount(Weight neglogcount, int order) override;

 private:
  // Fills in discount_[order][bin].
  // Bins are zero-based: bin k holds the discount for count k + 1.
  void CalculateAbsoluteDiscount(int order, int bin) {
    double chosen;
    if (parameter_ >= 0) {
      // An explicit discount from the user always wins.
      chosen = parameter_;
    } else {
      // Otherwise fall back on the rule of thumb; when it cannot produce a
      // usable value, settle for a fixed default.
      const double rule_of_thumb = AbsDiscountRuleOfThumb(order);
      if (rule_of_thumb <= 0.0) {
        chosen = 0.6;
      } else {
        chosen = AbsoluteDiscountFormula(order, bin, rule_of_thumb);
      }
    }
    discount_[order][bin] = chosen;
  }

  // Histogram-based discount, following equation (26) of Chen and Goodman.
  // For count i and a given Y the discount is
  //   i - ( (i+1) * Y * n_{i+1} / n_{i} )
  // where count i lives in bin i - 1.
  double AbsoluteDiscountFormula(int order, int bin, double Y) {
    const double count = bin + 1;  // bin (k-1) corresponds to count k
    double reduction = bin + 2;
    reduction *= Y * count_of_counts_.Count(order, bin + 1);
    // An empty next bin would otherwise discount the whole count away.
    if (reduction == 0.0) reduction += 1.0;
    // Only divide when the current bin actually has mass.
    if (count_of_counts_.Count(order, bin) > 0.0) {
      reduction /= count_of_counts_.Count(order, bin);
    }
    const double discount = count - reduction;
    // Non-positive results are replaced by kNormEps.
    return discount <= 0 ? kNormEps : discount;
  }

  // Generalized rule of thumb:
  //   Y = k * n_k / ( k * n_k + (k+1) * n_{k+1} )
  // with n_k read from histogram bin k - 1.  Low bins may be empty (e.g.
  // after count pruning), so the lowest adjacent pair of non-empty bins is
  // used.  Returns 0.0 when no such pair exists among the bins.
  double AbsDiscountRuleOfThumb(int order) {
    for (int candidate = 1; candidate <= bins_; ++candidate) {
      // Skip this pair if either of its two bins is empty.
      if (count_of_counts_.Count(order, candidate - 1) <= 0.0 ||
          count_of_counts_.Count(order, candidate) <= 0.0) {
        continue;
      }
      const double k = candidate;
      const double low_mass = k * count_of_counts_.Count(order, candidate - 1);
      const double high_mass = (k + 1) * count_of_counts_.Count(order, candidate);
      return low_mass / (low_mass + high_mass);
    }
    // The histogram does not hold enough non-zero data.
    return 0.0;
  }

  // Computes the discounts for every order.
  void CalculateDiscounts();

  double parameter_;                            // absolute discounting D
  int bins_;                                    // number of discount bins
  NGramCountOfCounts<StdArc> count_of_counts_;  // per-order count histogram
  std::vector<std::vector<double> > discount_;  // discount per order and bin
};
} // namespace ngram
#endif // NGRAM_NGRAM_ABSOLUTE_H_
|