/usr/include/shogun/lib/NGramTokenizer.h is in libshogun-dev 3.2.0-7.3build4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2013 Evangelos Anagnostopoulos
* Copyright (C) 2013 Evangelos Anagnostopoulos
*/
#ifndef _NGRAMTOKENIZER__H__
#define _NGRAMTOKENIZER__H__
#include <shogun/lib/Tokenizer.h>
namespace shogun
{
class CTokenizer;
/** @brief The class CNGramTokenizer is used to tokenize
* a SGVector<char> into n-grams.
*
* It derives from CTokenizer and declares the virtual methods
* set_text(), has_next(), next_token_idx(), get_name() and get_copy().
* NOTE(review): these presumably override the CTokenizer interface from
* shogun/lib/Tokenizer.h; that header is not visible here - confirm.
*
* Only declarations live in this header; the member functions (other than
* the empty destructor) are defined elsewhere.
*/
class CNGramTokenizer: public CTokenizer
{
public:
/** Constructor
*
* Because the single parameter has a default value, this also serves as
* the default constructor. It is not marked explicit, so an int32_t is
* implicitly convertible to a CNGramTokenizer.
*
* @param ns N-grams' size (defaults to 3)
*/
CNGramTokenizer(int32_t ns=3);
/** Copy constructor
*
* @param orig the original NGramTokenizer to copy from
*/
CNGramTokenizer(const CNGramTokenizer& orig);
/** Destructor (virtual, empty body) */
virtual ~CNGramTokenizer() {}
/** Set the char array that requires tokenization
*
* The vector is taken by value.
*
* @param txt the text to tokenize
*/
virtual void set_text(SGVector<char> txt);
/** Returns true or false based on whether
* there exists another token in the text
*
* @return true if another token exists, false otherwise
*/
virtual bool has_next();
/** Method that returns the indices, start and end, of
* the next token in line.
*
* The start index is passed by non-const reference while the end index
* is the return value.
* NOTE(review): start is presumably a pure output parameter that is
* overwritten by the call - confirm against the definition.
*
* @param start token's starting index
* @return token's ending index (exclusive)
*/
virtual index_t next_token_idx(index_t& start);
/** Returns the name of the SGSerializable instance. It MUST BE
* the CLASS NAME without the prefixed `C'.
*
* @return name of the SGSerializable
*/
virtual const char* get_name() const;
/** Returns a pointer to a copy of this tokenizer.
*
* NOTE(review): the definition is not visible from this header, so it is
* unclear whether the copy is freshly allocated and who is responsible
* for releasing it - confirm against the implementation before relying
* on a particular ownership convention.
*
* @return pointer to a CNGramTokenizer
*/
virtual CNGramTokenizer* get_copy();
private:
/** Private helper, defined elsewhere.
*
* NOTE(review): presumably performs the member initialisation shared by
* the constructors - confirm against the implementation.
*/
void init();
protected:
/** n-grams' size */
int32_t n;
/** last index returned */
index_t last_idx;
};
}
#endif /* _NGRAMTOKENIZER__H__ */
|