/usr/include/CLucene/analysis/AnalysisHeader.h is in libclucene-dev 2.3.3.4+dfsg-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_analysis_AnalysisHeader_
#define _lucene_analysis_AnalysisHeader_
#include "CLucene/index/Payload.h"
#include "CLucene/util/VoidList.h"
#include "CLucene/LuceneThreads.h"
CL_CLASS_DEF(util,Reader)
CL_NS_DEF(analysis)
typedef CL_NS(util)::CLSetList<TCHAR*, CL_NS(util)::Compare::TChar, CL_NS(util)::Deletor::tcArray> CLTCSetList;
/** A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
<p>
The start and end offsets permit applications to re-associate a token with
its source text, e.g., to display highlighted query terms in a document
browser, or to show matching text fragments in a KWIC (KeyWord In Context)
display, etc.
<p>
The type is an interned string, assigned by a lexical analyzer
(a.k.a. tokenizer), naming the lexical or syntactic class that the token
belongs to. For example, an end-of-sentence marker token might be implemented
with type "eos". The default token type is "word".
<p>
A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
length byte array. Use {@link lucene::index::TermPositions#getPayloadLength()} and
{@link lucene::index::TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
<br><br>
<p><font color="#FF0000">
WARNING: The status of the <b>Payloads</b> feature is experimental.
The APIs introduced here might change in the future and will not be
supported anymore in such a case.</font>
<br><br>
<p>Tokenizers and filters should try to re-use a Token
instance when possible for best performance, by
implementing the {@link TokenStream#next(Token)} API.
Failing that, to create a new Token you should first use
one of the constructors that start with null text. Then
you should call either {@link #termBuffer()} or {@link
#resizeTermBuffer(int)} to retrieve the Token's
termBuffer. Fill in the characters of your term into this
buffer, and finally call {@link #setTermLength(int)} to
set the length of the term text. See <a target="_top"
href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
for details.</p>
@see Payload
*/
class CLUCENE_EXPORT Token:LUCENE_BASE{
private:
int32_t _startOffset; ///< start in source text
int32_t _endOffset; ///< end in source text
const TCHAR* _type; ///< lexical type
int32_t positionIncrement;
size_t bufferTextLen;
#ifndef LUCENE_TOKEN_WORD_LENGTH
TCHAR* _buffer; ///< the text of the term
#else
TCHAR _buffer[LUCENE_TOKEN_WORD_LENGTH+1]; ///< the text of the term
#endif
int32_t _termTextLen; ///< the length of termText. Internal use only
CL_NS(index)::Payload* payload;
public:
static const TCHAR* getDefaultType();
Token();
virtual ~Token();
/// Constructs a Token with the given text, start and end offsets, and type.
Token(const TCHAR* text, const int32_t start, const int32_t end, const TCHAR* typ=NULL);
void set(const TCHAR* text, const int32_t start, const int32_t end, const TCHAR* typ=NULL);
size_t bufferLength();
void growBuffer(size_t size);
TCHAR* resizeTermBuffer(size_t size);
/** Set the position increment. This determines the position of this
* token relative to the previous Token in a TokenStream, used in
* phrase searching.
*
* The default value is 1.
*
* Some common uses for this are:
*
* - Set it to zero to put multiple terms in the same position. This is
* useful if, e.g., a word has multiple stems. Searches for phrases
* including either stem will match. In this case, all but the first stem's
* increment should be set to zero: the increment of the first instance
* should be one. Repeating a token with an increment of zero can also be
* used to boost the scores of matches on that token.
*
* - Set it to values greater than one to inhibit exact phrase matches.
* If, for example, one does not want phrases to match across removed stop
* words, then one could build a stop word filter that removes stop words and
* also sets the increment to the number of stop words removed before each
* non-stop word. Then exact phrase queries will only match when the terms
* occur with no intervening stop words.
*/
void setPositionIncrement(int32_t posIncr);
int32_t getPositionIncrement() const;
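// Position-increment sketch (illustrative, not part of the original header):
// stacking a synonym on the same position as the original term by setting
// its increment to zero, per the doc comment above. The terms and offsets
// are hypothetical.
//
//   Token original, synonym;
//   original.set(_T("quick"), 4, 9);    // increment stays at the default of 1
//   synonym.set(_T("fast"), 4, 9);      // same source offsets as "quick"
//   synonym.setPositionIncrement(0);    // occupies the same position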
/** Returns the internal termBuffer character array which
* you can then directly alter. If the array is too
* small for your token, use {@link
* #resizeTermBuffer(int)} to increase it. After
* altering the buffer be sure to call {@link
* #setTermLength} to record the number of valid
* characters that were placed into the termBuffer. */
TCHAR* termBuffer() const;
size_t termLength(); //< Length of the termBuffer. See #termBuffer
_CL_DEPRECATED( termBuffer ) const TCHAR* termText() const; //< See #termBuffer()
_CL_DEPRECATED( termLength ) size_t termTextLength(); //< See #termLength
void resetTermTextLen(); //< Empties the termBuffer. See #termBuffer
void setText(const TCHAR* txt, int32_t len=-1); //< Sets the termBuffer. See #termBuffer
/**
* Returns this Token's starting offset, the position of the first character
* corresponding to this token in the source text.
*
* Note that the difference between endOffset() and startOffset() may not be
* equal to termText.length(), as the term text may have been altered by a
* stemmer or some other filter.
*/
int32_t startOffset() const;
/** Set the starting offset.
@see #startOffset() */
void setStartOffset(const int32_t val);
void setTermLength(int32_t);
/**
* Returns this Token's ending offset, one greater than the position of the
* last character corresponding to this token in the source text.
*/
int32_t endOffset() const;
/** Set the ending offset.
@see #endOffset() */
void setEndOffset(const int32_t val);
/// Returns this Token's lexical type. Defaults to "word".
const TCHAR* type() const; ///<returns reference
void setType(const TCHAR* val); ///< stores the given reference; the string is not copied
/**
* Returns this Token's payload.
*/
CL_NS(index)::Payload* getPayload();
/**
* Sets this Token's payload.
*/
void setPayload(CL_NS(index)::Payload* payload);
/** Resets the term text, payload, and positionIncrement to default.
* Other fields such as startOffset, endOffset and the token type are
* not reset since they are normally overwritten by the tokenizer. */
void clear();
TCHAR* toString() const;
};
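// Usage sketch (illustrative, not part of the original header): filling a
// re-used Token through the termBuffer API recommended above. The term text
// and offsets are hypothetical; _tcsncpy is assumed to be the TCHAR string
// copy provided by the platform/CLucene headers.
//
//   Token tok;
//   TCHAR* buf = tok.resizeTermBuffer(8); // ensure capacity for the term
//   _tcsncpy(buf, _T("example"), 8);      // write the term characters
//   tok.setTermLength(7);                 // record the number of valid chars
//   tok.setStartOffset(0);                // map the token back to its
//   tok.setEndOffset(7);                  //   source text range [0,7)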
/** A TokenStream enumerates the sequence of tokens, either from
fields of a document or from query text.
<p>
This is an abstract class. Concrete subclasses are:
<ul>
<li>{@link Tokenizer}, a TokenStream
whose input is a Reader; and
<li>{@link TokenFilter}, a TokenStream
whose input is another TokenStream.
</ul>
NOTE: subclasses must override {@link #next(Token)}, which
is declared pure virtual here; {@link #next()} is deprecated.
*/
class CLUCENE_EXPORT TokenStream {
public:
/** Returns the next token in the stream, or null at EOS.
* When possible, the input Token should be used as the
* returned Token (this gives fastest tokenization
* performance), but this is not required and a new Token
* may be returned (pass NULL for this).
* Callers may re-use a single Token instance for successive
* calls to this method.
* <p>
* This implicitly defines a "contract" between
* consumers (callers of this method) and
* producers (implementations of this method
* that are the source for tokens):
* <ul>
* <li>A consumer must fully consume the previously
* returned Token before calling this method again.</li>
* <li>A producer must call {@link Token#clear()}
* before setting the fields in it and returning it</li>
* </ul>
* Note that a {@link TokenFilter} is considered a consumer.
* @param token a Token that may or may not be used to return the result
* @return next token in the stream, or NULL if end-of-stream was hit
*/
virtual Token* next(Token* token) = 0;
/** This is for backwards compatibility only. You should pass the Token you want to fill
* to next(Token), which saves a great deal of object construction and destruction.
* @deprecated Use next(Token). Kept only to avoid breaking existing code.
*/
_CL_DEPRECATED(next(Token)) Token* next();
/** Releases resources associated with this stream. */
virtual void close() = 0;
/** Resets this stream to the beginning. This is an
* optional operation, so subclasses may or may not
* implement this method. reset() is not needed for
* the standard indexing process. However, if the Tokens
* of a TokenStream are intended to be consumed more than
* once, it is necessary to implement reset().
*/
virtual void reset();
virtual ~TokenStream();
};
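// Consumer sketch (illustrative): the re-use loop implied by the
// next(Token) contract documented above. `stream` stands for any concrete
// TokenStream; a single Token instance is re-used across calls.
//
//   Token tok;
//   Token* t;
//   while ( (t = stream->next(&tok)) != NULL ) {
//       // fully consume *t here before requesting the next token
//   }
//   stream->close();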
/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
* <p>
* Typical implementations first build a Tokenizer, which breaks the stream of
* characters from the Reader into raw Tokens. One or more TokenFilters may
* then be applied to the output of the Tokenizer.
* <p>
* WARNING: You must override one of the methods defined by this class in your
* subclass or the Analyzer will enter an infinite loop.
*/
class CLUCENE_EXPORT Analyzer{
public:
Analyzer();
/** Creates a TokenStream which tokenizes all the text in the provided
Reader. Default implementation forwards to tokenStream(Reader) for
compatibility with older versions. Override to allow Analyzer to choose
strategy based on document and/or field. Must be able to handle null
field name for backward compatibility. */
virtual TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader)=0;
/** Creates a TokenStream that is allowed to be re-used
* from the previous time that the same thread called
* this method. Callers that do not need to use more
* than one TokenStream at the same time from this
* analyzer should use this method for better
* performance.
*/
virtual TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
private:
DEFINE_MUTEX(THIS_LOCK)
struct Internal;
Internal* _internal;
protected:
/** Used by Analyzers that implement reusableTokenStream
* to retrieve previously saved TokenStreams for re-use
* by the same thread. */
TokenStream* getPreviousTokenStream();
/** Used by Analyzers that implement reusableTokenStream
* to save a TokenStream for later re-use by the same
* thread. */
void setPreviousTokenStream(TokenStream* obj);
public:
/**
* Invoked before indexing a Field instance if
* terms have already been added to that field. This allows custom
* analyzers to place an automatic position increment gap between
* Field instances using the same field name. The default
* position increment gap is 0. With a 0 position increment gap and
* the typical default token position increment of 1, all terms in a field,
* including across Field instances, are in successive positions, allowing
* exact PhraseQuery matches, for instance, across Field instance boundaries.
*
* @param fieldName Field name being indexed.
* @return position increment gap, added to the next token emitted from {@link #tokenStream(TCHAR*, Reader*)}
*/
virtual int32_t getPositionIncrementGap(const TCHAR* fieldName);
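// Override sketch (illustrative): an Analyzer that leaves a gap of 100
// positions between successive Field instances sharing a name, so phrase
// queries cannot match across instance boundaries. MyAnalyzer and the gap
// value are hypothetical.
//
//   int32_t MyAnalyzer::getPositionIncrementGap(const TCHAR* fieldName) {
//       return 100;   // added to the first token of each subsequent instance
//   }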
virtual ~Analyzer();
};
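// Usage sketch (illustrative): obtaining, draining, and releasing a
// TokenStream from an Analyzer. `analyzer`, `reader`, and the field name
// are hypothetical; _CLDELETE is CLucene's deletion macro.
//
//   TokenStream* ts = analyzer->tokenStream(_T("contents"), reader);
//   Token tok;
//   while ( ts->next(&tok) != NULL ) {
//       // index or inspect tok
//   }
//   ts->close();
//   _CLDELETE(ts);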
/** A Tokenizer is a TokenStream whose input is a Reader.
<p>
This is an abstract class.
<p>
NOTE: subclasses must override {@link #next(Token)}, which
is declared pure virtual in TokenStream; {@link #next()} is deprecated.
<p>
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
*/
class CLUCENE_EXPORT Tokenizer:public TokenStream {
protected:
/** The text source for this Tokenizer. */
CL_NS(util)::Reader* input;
public:
/** Construct a tokenizer with null input. */
Tokenizer();
/** Construct a token stream processing the given input. */
Tokenizer(CL_NS(util)::Reader* _input);
/** By default, closes the input Reader. */
virtual void close();
/** Expert: Reset the tokenizer to a new reader. Typically, an
* analyzer (in its reusableTokenStream method) will use
* this to re-use a previously created tokenizer. */
virtual void reset(CL_NS(util)::Reader* _input);
virtual ~Tokenizer();
};
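// Re-use sketch (illustrative): how an Analyzer's reusableTokenStream can
// combine getPreviousTokenStream/setPreviousTokenStream with
// Tokenizer::reset. MyAnalyzer and MyTokenizer are hypothetical subclasses;
// _CLNEW is CLucene's allocation macro.
//
//   TokenStream* MyAnalyzer::reusableTokenStream(const TCHAR* fieldName,
//                                                CL_NS(util)::Reader* reader) {
//       Tokenizer* t = static_cast<Tokenizer*>( getPreviousTokenStream() );
//       if ( t == NULL ) {
//           t = _CLNEW MyTokenizer(reader);   // first use on this thread
//           setPreviousTokenStream(t);        // save for later re-use
//       } else {
//           t->reset(reader);                 // point the saved tokenizer at new input
//       }
//       return t;
//   }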
/** A TokenFilter is a TokenStream whose input is another token stream.
<p>
This is an abstract class.
*/
class CLUCENE_EXPORT TokenFilter:public TokenStream {
protected:
/** The source of tokens for this filter. */
TokenStream* input;
/** If true then input will be deleted in the destructor */
bool deleteTokenStream;
/** Construct a token stream filtering the given input.
*
* @param in The TokenStream to filter from
* @param deleteTS If true, input will be deleted in the destructor
*/
TokenFilter(TokenStream* in, bool deleteTS=false);
virtual ~TokenFilter();
public:
/** Close the input TokenStream. */
void close();
};
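// Filter sketch (illustrative): a minimal TokenFilter that drops tokens
// shorter than three characters, in the spirit of Lucene's LengthFilter.
// The class name and threshold are hypothetical; only API declared in this
// header is used.
//
//   class ShortTokenFilter : public TokenFilter {
//   public:
//       ShortTokenFilter(TokenStream* in, bool deleteTS = false)
//           : TokenFilter(in, deleteTS) {}
//       Token* next(Token* token) {
//           while ( input->next(token) != NULL ) {
//               if ( token->termLength() >= 3 )
//                   return token;   // keep sufficiently long terms
//           }
//           return NULL;            // end of stream
//       }
//   };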
CL_NS_END
#endif