This file is indexed.

/usr/include/ngram/ngram-context.h is in libngram-dev 1.3.2-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2016 Brian Roark and Google, Inc.
// Class to parse and maintain context specifications.

#ifndef NGRAM_NGRAM_CONTEXT_H_
#define NGRAM_NGRAM_CONTEXT_H_

#include <algorithm>
#include <cstring>
#include <string>
#include <vector>

#include <fst/fst.h>
#include <ngram/ngram-model.h>

namespace ngram {

using fst::StdArc;
using std::ostringstream;

// Represents a context interval.
class NGramContext {
 public:
  typedef StdArc::Label Label;
  typedef StdArc::StateId StateId;

  // Constructs a context specification from begin and end context
  // vectors.  If the context is less than the n-gram order - 1, it is
  // padded with 0 on the left. The begin and end context vectors
  // specify a (half-open) interval of highest-order state contexts in
  // an LM with the interval defined using the reverse lexicographic
  // order (i.e., on the reverse of the context). All suffixes of
  // these contexts are also included for proper backoff (when
  // include_all_suffixes = true).
  //
  // Example1: context_begin = {1,1,1,1} and context_end = {1,1,1,5} with
  // a 5-gram:
  //   specifies states that have a rightmost context in [1,5).
  //
  // Example2: context_begin = {1} and context_end = {5,6} with a 5-gram:
  //   same as context_begin = {0,0,0,1} and context_end = {0,0,5,6}.
  NGramContext(const std::vector<Label> &context_begin,
               const std::vector<Label> &context_end, int hi_order)
      : hi_order_(hi_order),
        context_begin_(context_begin),
        context_end_(context_end) {
    Init();
  }

  // Constructs a context specification from context pattern string.
  // Expected format: "w_1 ...w_m : v_1 ... v_n" where
  // the w_i and v_i are numeric word IDs and m,n are typically less than
  // the n-gram order. A word ID 0 signfies the initial word.
  //
  // Example: "1 1 1 1 : 1 1 1 5" signifies a begin context vector of
  //   {1,1,1,1} and and end context vector of {1,1,1,5}. See next constructor
  //   for the behavior with these vectors.
  NGramContext(const string &context_pattern, int hi_order)
      : hi_order_(hi_order) {
    ParseContextInterval(context_pattern, &context_begin_, &context_end_);
    Init();
  }

  // Null context
  NGramContext() : hi_order_(0) {}

  // Is ngram in context?  If 'include_all_suffixes' is true, then all
  // suffixes of the begin and end contexts are considered in
  // context. When false, true (reverse) lexicographic order is used.
  bool HasContext(std::vector<Label> ngram,
                  bool include_all_suffixes = true) const;

  // No/empty context requested?
  int NullContext() const { return context_begin_.empty(); }

  // Derives begin and end context vectors from input context pattern
  // string.
  static void ParseContextInterval(const string &context_pattern,
                                   std::vector<Label> *context_begin,
                                   std::vector<Label> *context_end);

  // Generates context string from begin and end context vectors
  static string GetContextString(const std::vector<Label> &context_begin,
                                 const std::vector<Label> &context_end) {
    ostringstream context_pattern_strm;
    for (int i = 0; i < context_begin.size(); ++i)
      context_pattern_strm << context_begin[i] << " ";
    context_pattern_strm << ":";
    for (int i = 0; i < context_end.size(); ++i)
      context_pattern_strm << " " << context_end[i];
    return context_pattern_strm.str();
  }

  // Given a n-gram model, returns 'ncontext' contexts balanced for
  // size.  Arg 'bigram_thresh' determines how overfull a context
  // bin has to be to force a split at a bigram context.  The model
  // must have state n-grams enabled.
  template <class Arc>
  static void FindContexts(const NGramModel<Arc> &model, int ncontexts,
                           std::vector<string> *contexts,
                           float bigram_thresh = 1.1) {
    std::vector<std::vector<typename Arc::Label>> begin_contexts;
    std::vector<std::vector<typename Arc::Label>> end_contexts;
    FindContexts(model, ncontexts, &begin_contexts, &end_contexts,
                 bigram_thresh);

    for (int i = 0; i < begin_contexts.size(); ++i)
      contexts->push_back(GetContextString(begin_contexts[i], end_contexts[i]));
  }

  // Given a n-gram model, returns 'ncontext' contexts balanced for
  // size.  Arg 'bigram_thresh' determines how overfull a context
  // bin has to be to force a split at a bigram context.  The model
  // must have state n-grams enabled.
  template <class Arc>
  static void FindContexts(const NGramModel<Arc> &model, int ncontexts,
                           vector<vector<typename Arc::Label>> *begin_contexts,
                           vector<vector<typename Arc::Label>> *end_contexts,
                           float bigram_thresh = 1.1);

  // Begin context as could be passed to class constructor
  std::vector<Label> GetContextBegin() const {
    std::vector<Label> ngram(context_begin_);
    while (ngram.size() > 1 && ngram.back() == 0) ngram.pop_back();
    reverse(ngram.begin(), ngram.end());
    return ngram;
  }

  // End context as could be passed to class constructor
  std::vector<Label> GetContextEnd() const {
    std::vector<Label> ngram(context_end_);
    while (ngram.size() > 1 && ngram.back() == 0) ngram.pop_back();
    reverse(ngram.begin(), ngram.end());
    return ngram;
  }

  // Context is reversed and padded to high-order
  std::vector<Label> GetReverseContextBegin() const { return context_begin_; }

  // Context is reversed and padded to high-order
  std::vector<Label> GetReverseContextEnd() const { return context_end_; }

  // Note order is wrt transitions not states in the model;
  // so state ngram.size() == 1 has order 2
  int GetHiOrder() const { return hi_order_; }

  // Changes hi order (which affects context padding)
  // Used by NGramExtendedContext to put several
  // NGramContexts on the same hi-order.
  void SetHiOrder(int hi_order) {
    if (hi_order > hi_order_) {
      if (!NullContext()) {
        context_begin_.resize(hi_order - 1, 0);
        context_end_.resize(hi_order - 1, 0);
      }
      hi_order_ = hi_order;
    }
  }

 private:
  void Init();

  int hi_order_;
  std::vector<Label> context_begin_;
  std::vector<Label> context_end_;
};

// Represents a set of disjoint context intervals.
class NGramExtendedContext {
 public:
  typedef StdArc::Label Label;

  // Constructs a context specification om begin and end context vectors.
  // See the corresponding NGramContext constructor.
  NGramExtendedContext(const std::vector<Label> &context_begin,
                       const std::vector<Label> &context_end, int hi_order) {
    contexts_.push_back(NGramContext(context_begin, context_end, hi_order));
    Init(false);
  }

  // Constructs a context specification from an extended context
  // pattern string.  An extended context pattern is a comma-separated
  // set of NGramContext context patterns that must be disjoint.
  // If 'merge_contexts' is true, adjacent contexts will be merged.
  NGramExtendedContext(const string &extended_context_pattern, int hi_order,
                       bool merge_contexts = true) {
    ParseContextIntervals(extended_context_pattern, hi_order, &contexts_);
    Init(merge_contexts);
  }

  // Constructs a context specification from a NGramContext vector.
  // If 'merge_contexts' is true, adjacent contexts will be merged.
  explicit NGramExtendedContext(const std::vector<NGramContext> &contexts,
                                bool merge_contexts = true)
      : contexts_(contexts) {
    Init(merge_contexts);
  }

  // Null context
  NGramExtendedContext() {}

  // No/empty context requested?
  int NullContext() const { return contexts_.empty(); }

  // Is ngram in context?  If 'include_all_suffixes' is true, then all
  // suffixes of the begin and end contexts are considered in
  // context. When false, true (reverse) lexicographic order is used.
  bool HasContext(const std::vector<Label> &ngram,
                  bool include_all_suffixes = true) const;

  // Find NGramContext that matches context. Returns a null pointer
  // if no match or if the input is the null context.  If
  // 'include_all_suffixes' is true, then all suffixes of the begin
  // and end contexts are considered in context. When false, true
  // (reverse) lexicographic order is used.
  const NGramContext *GetContext(const std::vector<Label> &ngram,
                                 bool include_all_suffixes = true) const;

  // Derives NGramContext vector from input extended context pattern string.
  static void ParseContextIntervals(const string &extended_context_pattern,
                                    int hi_order,
                                    std::vector<NGramContext> *contexts);

  // Generates an extended context string from a vector of NGramContexts
  static string GetExtendedContextString(
      const std::vector<NGramContext> &contexts) {
    ostringstream extended_context_pattern_strm;
    for (size_t i = 0; i < contexts.size(); ++i) {
      if (i > 0) extended_context_pattern_strm << ",";
      const std::vector<Label> &context_begin = contexts[i].GetContextBegin();
      const std::vector<Label> &context_end = contexts[i].GetContextEnd();
      extended_context_pattern_strm
          << NGramContext::GetContextString(context_begin, context_end);
    }
    return extended_context_pattern_strm.str();
  }

  const std::vector<NGramContext> &GetContexts() const { return contexts_; }

 private:
  // Ensures disjoint, same hi-order and canonicalizes.
  void Init(bool merge_contexts);

  // Ensures contexts are non-empty, non-overlapping and that
  // the high orders made to match.
  bool CheckContexts();

  // Merges contexts_ in range [i, j] and write to k.
  void MergeContexts(size_t i, size_t j, size_t k);

  // Comparison function object on contexts
  struct ContextCompare {
    bool operator()(const NGramContext &c1, const NGramContext &c2) {
      // Sorts by beginning of the context interval. Will be a total
      // order assuming the context intervals are disjoint.
      const std::vector<Label> &b1 = c1.GetReverseContextBegin();
      const std::vector<Label> &b2 = c2.GetReverseContextBegin();
      return lexicographical_compare(b1.begin(), b1.end(), b2.begin(),
                                     b2.end());
    };
  };

  std::vector<NGramContext> contexts_;
};

// Reads (possibly extended) context specifications form a file into a vector.
bool NGramReadContexts(const string &file, std::vector<string> *contexts);
// Writes (possibly extended) context specifications from a vector to a file.
bool NGramWriteContexts(const string &file,
                        const std::vector<string> &contexts);

template <class Arc>
void NGramContext::FindContexts(
    const NGramModel<Arc> &model, int ncontexts,
    std::vector<std::vector<typename Arc::Label>> *begin_contexts,
    std::vector<std::vector<typename Arc::Label>> *end_contexts,
    float bigram_thresh) {
  // state n-gram counts with given unigram suffix
  std::map<typename Arc::Label, size_t> suffix1_counts;
  // state n-gram counts with given (reversed) bigram suffix
  std::map<std::pair<typename Arc::Label, typename Arc::Label>, size_t>
      suffix2_counts;
  // state n-gram counts at a bigram state
  std::unordered_map<typename Arc::Label, size_t> bigram_counts;
  size_t total_count = 0;
  typename Arc::Label max_label = kNoLabel;

  for (StateId s = 0; s < model.NumStates(); ++s) {
    for (ArcIterator<Fst<Arc>> aiter(model.GetFst(), s); !aiter.Done();
         aiter.Next()) {
      const Arc &arc = aiter.Value();
      if (arc.ilabel == kNoLabel || arc.ilabel > max_label)
        max_label = arc.ilabel;
    }
    const auto &ngram = model.StateNGram(s);
    typename Arc::Label l1 =
        ngram.size() > 0 ? ngram[ngram.size() - 1] : kNoLabel;
    typename Arc::Label l2 =
        ngram.size() > 1 ? ngram[ngram.size() - 2] : kNoLabel;
    if (l1 == kNoLabel) continue;
    suffix1_counts[l1] += model.GetFst().NumArcs(s);
    total_count += model.GetFst().NumArcs(s);

    if (l2 != kNoLabel) {
      suffix2_counts[std::make_pair(l1, l2)] += model.GetFst().NumArcs(s);
    } else {
      bigram_counts[l1] += model.GetFst().NumArcs(s);
    }
  }
  std::vector<typename Arc::Label> context;
  begin_contexts->clear();
  end_contexts->clear();
  begin_contexts->push_back(context);
  begin_contexts->back().push_back(0);
  ssize_t bin_count = 0;
  auto it1 = suffix1_counts.begin();

  while (it1 != suffix1_counts.end()) {
    auto suffix1 = it1->first;
    ssize_t delta1 = it1->second;
    ssize_t deltab = bigram_counts[suffix1];
    ++it1;
    if (it1 != suffix1_counts.end() &&
        (bin_count + delta1) * ncontexts < total_count) {
      // Continues to fill bin
      bin_count += delta1;
    } else if ((bin_count + delta1 - deltab) * ncontexts <
               bigram_thresh * total_count) {
      // Splits at a unigram state suffix when bin not overfull
      if (it1 == suffix1_counts.end()) {
        // Finalizes at end of contexts
        end_contexts->push_back(context);
        end_contexts->back().push_back(max_label + 1);
      } else {
        // Splits and continues with next unigram state suffix
        auto next_suffix1 = it1->first;
        end_contexts->push_back(context);
        begin_contexts->push_back(context);
        end_contexts->back().push_back(next_suffix1);
        begin_contexts->back().push_back(next_suffix1);
        total_count -= bin_count + delta1;
        --ncontexts;
        bin_count = 0;
      }
    } else {
      // Splits at a bigram state context
      total_count -= deltab;
      auto it2 = suffix2_counts.find(std::make_pair(suffix1, 0));
      while (it2 != suffix2_counts.end() && it2->first.first <= suffix1) {
        ssize_t delta2 = it2->second;
        ++it2;
        if (it1 != suffix1_counts.end() &&
            (bin_count + delta2) * ncontexts < total_count) {
          // Continues to fill bin
          bin_count += delta2;
        } else if (it1 == suffix1_counts.end()) {
          // Finalizes at end of contexts
          end_contexts->push_back(context);
          end_contexts->back().push_back(max_label + 1);
        } else if (it2 == suffix2_counts.end() || it2->first.first > suffix1) {
          // Splits and continues with next unigram state suffix
          auto next_suffix1 = it1->first;
          end_contexts->push_back(context);
          begin_contexts->push_back(context);
          end_contexts->back().push_back(next_suffix1);
          begin_contexts->back().push_back(next_suffix1);
          total_count -= bin_count + delta2;
          --ncontexts;
          bin_count = 0;
        } else {
          // Splits and continues with next bigram state suffix
          auto next_suffix2 = it2->first;
          end_contexts->push_back(context);
          begin_contexts->push_back(context);
          end_contexts->back().push_back(next_suffix2.second);
          end_contexts->back().push_back(next_suffix2.first);
          begin_contexts->back().push_back(next_suffix2.second);
          begin_contexts->back().push_back(next_suffix2.first);
          total_count -= bin_count + delta2;
          --ncontexts;
          bin_count = 0;
        }
      }
    }
  }
}

}  // namespace ngram

#endif  // NGRAM_NGRAM_CONTEXT_H_