This file is indexed.

/usr/include/cld2/internal/getonescriptspan.h is in libcld2-dev 0.0.0-git20150806-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
//


#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_

#include "integral_types.h"
#include "langspan.h"
#include "offsetmap.h"

namespace CLD2 {

// Size limits used when copying script spans.
// NOTE(review): the names suggest these size ScriptScanner's script_buffer_
// and script_buffer_lower_; the allocations are not visible in this header --
// confirm against the implementation.

// Base buffer size: 40960 (40 * 1024).
static const int kMaxScriptBuffer = 40960;
// 1.5x the base buffer size: 61440.
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
// Base buffer size minus 32 bytes of headroom: 40928.
static const int kMaxScriptBytes = kMaxScriptBuffer - 32;   // Leave some room
static const int kWithinScriptTail = 32;    // Stop at word space in last
                                            // N bytes of script buffer


// Returns true when c is a UTF-8 continuation (trailing) byte, i.e. its bit
// pattern is 10xxxxxx (0x80..0xBF); false for ASCII and for lead bytes.
static inline bool IsContinuationByte(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  return (byte & 0xC0) == 0x80;
}

// Gets the lscript (letter script) number for the character at src when it
// is a letter; always returns 0 (common script) for non-letters.
// NOTE(review): src presumably must point at the first byte of a UTF-8
// character -- confirm against the implementation.
int GetUTF8LetterScriptNum(const char* src);

// Advance to the next quadgram, which starts +2..+5 bytes past src.
// src is passed by value, so the caller's pointer is not modified; the
// advanced position is presumably the return value -- confirm against the
// implementation.
// Looks at src[0..4]
const char* AdvanceQuad(const char* src);


// Scans an input text buffer and returns it piece by piece as LangSpans,
// either one same-script run of letters at a time or one run of non-tag text
// at a time (see the GetOne...Span methods below).
//
// NOTE(review): this class holds raw char* buffers and declares a destructor
// but no copy constructor or copy assignment. If the destructor frees those
// buffers (not visible in this header), copying a ScriptScanner would be
// unsafe -- confirm before copying instances.
class ScriptScanner {
 public:
  // buffer/buffer_length give the input to scan.
  // is_plain_text is true for text, false for HTML.
  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
  // As above, with two extra flags.
  // NOTE(review): any_text and any_script presumably select "any mixture of
  // text" and "any mixture of scripts" (the opposites of letters_marks_only_
  // and one_script_only_) -- confirm against the implementation.
  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
                bool any_text, bool any_script);
  ~ScriptScanner();

  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
  bool GetOneScriptSpan(LangSpan* span);

  // Force Latin and Cyrillic scripts to be lowercase
  void LowerScriptSpan(LangSpan* span);

  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
  // Force Latin and Cyrillic scripts to be lowercase
  bool GetOneScriptSpanLower(LangSpan* span);

  // Copy next run of non-tag characters to buffer [NUL terminated]
  // This just removes tags and removes entities
  // Buffer has leading space
  bool GetOneTextSpan(LangSpan* span);

  // Maps byte offset in most recent GetOneScriptSpan/Lower
  // span->text [0..text_bytes] into an additional byte offset from
  // span->offset, to get back to corresponding text in the original
  // input buffer.
  // text_offset must be the first byte
  // of a UTF-8 character, or just beyond the last character. Normally this
  // routine is called with the first byte of an interesting range and
  // again with the first byte of the following range.
  int MapBack(int text_offset);

  // Returns the starting byte of the buffer being scanned. Read-only
  // accessor, so it is usable on const ScriptScanner objects as well.
  const char* GetBufferStart() const {return start_byte_;}

 private:
  // Skip over tags and non-letters
  int SkipToFrontOfSpan(const char* src, int len, int* script);

  const char* start_byte_;        // Starting byte of buffer to scan
  const char* next_byte_;         // First unscanned byte
  int byte_length_;               // Bytes left

  bool is_plain_text_;            // true for text, false for HTML
  char* script_buffer_;           // Holds text with expanded entities
  char* script_buffer_lower_;     // Holds lowercased text
  bool letters_marks_only_;       // To distinguish scriptspan of one
                                  // letters/marks vs. any mixture of text
  bool one_script_only_;          // To distinguish scriptspan of one
                                  // script vs. any mixture of scripts
  int exit_state_;                // For tag parser kTagParseTbl_0, based
                                  // on letters_marks_only_
 public :
  // Expose for debugging
  OffsetMap map2original_;    // map from script_buffer_ to buffer
  OffsetMap map2uplow_;       // map from script_buffer_lower_ to script_buffer_
};

}  // namespace CLD2

#endif  // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_