// /usr/include/ucto/tokenize.h

/*
  $Id: tokenize.h 15910 2013-04-03 13:57:51Z sloot $
  $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/include/ucto/tokenize.h $
  Copyright (c) 2006 - 2013
  Tilburg University

  This file is part of Ucto.

  Ucto is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 3 of the License, or
  (at your option) any later version.

  Ucto is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.

  For more information and updates, see:
      http://ilk.uvt.nl/frog
*/

#ifndef UCTO_TOKENIZE_H
#define UCTO_TOKENIZE_H

#include <vector>
#include <map>
#include <sstream>
#include <stdexcept>
#include "config.h"
#include "ucto/unicode.h"
#include "ticcutils/LogStream.h"

namespace Tokenizer {

  // Version information for the tokenizer library, returned as strings.
  // NOTE(review): the definitions are not in this header; presumably
  // Version() is the bare version number and VersionName() the package name
  // plus version -- confirm against the implementation.
  std::string Version();
  std::string VersionName();

  //enum RuleTrigger { PUNCTUATION, PERIOD, NUMBER }; //TODO: implement
    
  // Bit flags describing the role(s) of a token. Apart from NOROLE (no bits
  // set) every enumerator occupies its own bit, so roles can be combined and
  // toggled with the bitwise operators defined below.
  enum TokenRole {
    NOROLE            = 0,
    NOSPACE           = 1 << 0,
    BEGINOFSENTENCE   = 1 << 1,
    ENDOFSENTENCE     = 1 << 2,
    NEWPARAGRAPH      = 1 << 3,
    BEGINQUOTE        = 1 << 4,
    ENDQUOTE          = 1 << 5,
    TEMPENDOFSENTENCE = 1 << 6,
    LISTITEM          = 1 << 7, // reserved for future use
    TITLE             = 1 << 8  // reserved for future use
  };

  // Stream output for a role value (defined outside this header).
  std::ostream& operator<<( std::ostream&, const TokenRole& );

  // Union of two role sets: every bit set in T1 or in T2.
  inline TokenRole operator|( TokenRole T1, TokenRole T2 ){
    const int merged = static_cast<int>( T1 ) | static_cast<int>( T2 );
    return static_cast<TokenRole>( merged );
  }

  // Adds the bits of T2 to T1 in place and returns T1.
  inline TokenRole& operator|=( TokenRole& T1, TokenRole T2 ){
    return T1 = T1 | T2;
  }

  // Symmetric difference of two role sets: bits set in exactly one operand.
  inline TokenRole operator^( TokenRole T1, TokenRole T2 ){
    const int toggled = static_cast<int>( T1 ) ^ static_cast<int>( T2 );
    return static_cast<TokenRole>( toggled );
  }

  // Toggles the bits of T2 in T1 in place and returns T1.
  inline TokenRole& operator^=( TokenRole& T1, TokenRole T2 ){
    return T1 = T1 ^ T2;
  }
  
  // One token as handled by the tokenizer: a type pointer, a Unicode string
  // and a set of role flags. All data members are public.
  class Token {
    friend std::ostream& operator<< (std::ostream&, const Token& );
  public:
    // Pointer to a UnicodeString describing the token's type.
    // NOTE(review): Token declares no destructor, so the pointee is
    // presumably owned elsewhere and must outlive the token -- confirm.
    const UnicodeString *type;
    // NOTE(review): presumably the text of the token itself -- confirm.
    UnicodeString us;
    // Role flags of this token (values of TokenRole, possibly combined).
    TokenRole role; 
    // The role argument defaults to NOROLE when omitted.
    Token( const UnicodeString *,
	   const UnicodeString& s,
	   TokenRole role = NOROLE ); 
  };

  // Regular-expression matcher type; only used through a pointer in this
  // header, so a forward declaration is sufficient.
  class UnicodeRegexMatcher;

  // A tokenization rule: an identifier, a pattern and a pointer to a matcher
  // object. The data members are public.
  class Rule {
    friend std::ostream& operator<< (std::ostream&, const Rule& );
  public:
    // Creates a rule with empty id and pattern and no matcher (null pointer).
    Rule(): regexp(0) {}
    Rule( const UnicodeString& id, const UnicodeString& pattern );
    ~Rule();
    UnicodeString id;
    UnicodeString pattern;
    UnicodeRegexMatcher *regexp;
    // NOTE(review): the three non-const reference arguments look like output
    // parameters (presumably the text before the match, the text after it and
    // the matched parts) and the result presumably tells whether the pattern
    // matched -- confirm against the implementation.
    bool matchAll( const UnicodeString&,
                   UnicodeString&,
                   UnicodeString&,
                   std::vector<UnicodeString>& );
  private:
    // Copying is inhibited. The class holds a raw pointer and declares its
    // own destructor; NOTE(review): assuming ~Rule() releases regexp, a
    // compiler-generated copy would leave two Rule objects releasing the same
    // matcher. Both members are declared but intentionally never defined.
    Rule( const Rule& );
    Rule& operator=( const Rule& );
  };

  class Quoting {
    friend std::ostream& operator<<( std::ostream&, const Quoting& );
    struct QuotePair {
      UnicodeString openQuote;
      UnicodeString closeQuote;
    };
  public:
    void add( const UnicodeString&, const UnicodeString& );
    UnicodeString lookupOpen( const UnicodeString &) const;
    UnicodeString lookupClose( const UnicodeString & ) const;
    bool empty() const { return quotes.empty(); };
    bool emptyStack() const { return quotestack.empty(); };
    void clearStack() { quoteindexstack.clear(); quotestack.clear(); };
    int lookup( const UnicodeString&, int& );
    void eraseAtPos( int pos ) {
      quotestack.erase( quotestack.begin()+pos );
      quoteindexstack.erase( quoteindexstack.begin()+pos );
    }
    void flushStack( int ); //renamed from eraseBeforeIndex
    void push( int i, UChar c ){
      quoteindexstack.push_back(i);
      quotestack.push_back(c);
    }
  private:
    std::vector<QuotePair> quotes;
    std::vector<int> quoteindexstack;
    std::vector<UChar> quotestack;
  };
  
  // The tokenizer: holds the configuration objects (quotes, filter,
  // normalizer, rules, end-of-sentence markers), the option flags and the
  // buffer of tokens. Every set...() member below stores the new value and
  // returns the value that was in effect before the call.
  class TokenizerClass{
  public:
    TokenizerClass();
    ~TokenizerClass();
    // NOTE(review): presumably reads the settings file with the given name
    // and reports success -- confirm against the implementation.
    bool init( const std::string& );
    void setErrorLog( TiCC::LogStream *os );

    // Tokenize from input stream to FoLiA document
    folia::Document tokenize( std::istream& );

    // Tokenize a FoLiA document
    bool tokenize( folia::Document& );

    // Tokenize from input stream to output stream
    std::vector<Token> tokenizeStream( std::istream& );
    void tokenize( std::istream&, std::ostream& );

    // Pointer-based overload, kept for backward compatibility. It forwards
    // to the reference overload; both pointers are dereferenced without a
    // check, so neither may be null.
    void tokenize( std::istream* in, std::ostream* out ){
      tokenize( *in, *out );
    }

    // Tokenize a line (a line is NOT a sentence, but an arbitrary string
    //                  of characters, inclusive EOS markers, Newlines etc.)
    int tokenizeLine( const UnicodeString& ); // Unicode chars
    int tokenizeLine( const std::string& );   // UTF8 chars

    void passthruLine( const std::string&, bool& );

    // Processes tokens and initialises the sentence buffer. Returns the
    // number of sentences found (only after detectSentenceBounds; does some
    // extra validation as well).
    int countSentences( bool forceentirebuffer = false );
    // Flush n sentences from the buffer (does some extra validation as well)
    int flushSentences( const int );

    // Get the sentence with the specified index as a string (UTF-8 encoded)
    std::string getSentenceString( unsigned int );

    // Get all sentences as a vector of strings (UTF-8 encoded)
    std::vector<std::string> getSentences();

    // Enable verbose mode
    bool setVerbose( bool b=true ) {
      const bool previous = verbose;
      verbose = b;
      return previous;
    }
    bool getVerbose() const { return verbose; }

    // Set the debug value; returns the previous debug value.
    // The previous value is saved in an int, so the full value is returned
    // rather than being collapsed to 0 or 1.
    int setDebug( int d ) {
      const int previous = tokDebug;
      tokDebug = d;
      return previous;
    }
    int getDebug() const { return tokDebug; }

    // Enable conversion of all output to lowercase.
    // Switching lowercase on switches uppercase off.
    bool setLowercase( bool b=true ) {
      const bool previous = lowercase;
      lowercase = b;
      if ( b ) {
        uppercase = false;
      }
      return previous;
    }
    bool getLowercase() const { return lowercase; }

    // Enable passthru mode
    bool setPassThru( bool b=true ) {
      const bool previous = passthru;
      passthru = b;
      return previous;
    }
    bool getPassThru() const { return passthru; }

    // Enable conversion of all output to uppercase.
    // Switching uppercase on switches lowercase off.
    bool setUppercase( bool b=true ) {
      const bool previous = uppercase;
      uppercase = b;
      if ( b ) {
        lowercase = false;
      }
      return previous;
    }
    bool getUppercase() const { return uppercase; }

    // Enable sentence-bound detection
    bool setSentenceDetection( bool b=true ) {
      const bool previous = detectBounds;
      detectBounds = b;
      return previous;
    }
    bool getSentenceDetection() const { return detectBounds; }

    // Enable paragraph detection
    bool setParagraphDetection( bool b=true ) {
      const bool previous = detectPar;
      detectPar = b;
      return previous;
    }
    bool getParagraphDetection() const { return detectPar; }

    // Enable quote detection
    bool setQuoteDetection( bool b=true ) {
      const bool previous = detectQuotes;
      detectQuotes = b;
      return previous;
    }
    bool getQuoteDetection() const { return detectQuotes; }

    // Enable filtering
    bool setFiltering( bool b=true ) {
      const bool previous = doFilter;
      doFilter = b;
      return previous;
    }
    bool getFiltering() const { return doFilter; }

    // Set normalization mode; delegates to the normalizer and returns
    // whatever its setMode() returns.
    std::string setNormalization( const std::string& s ) {
      return normalizer.setMode( s );
    }
    std::string getNormalization() const { return normalizer.getMode(); }

    // Set input encoding
    std::string setInputEncoding( const std::string& );
    std::string getInputEncoding() const { return inputEncoding; }

    // Set the EOS marker from a UTF-8 string (default "<utt>");
    // returns the previous marker.
    UnicodeString setEosMarker( const std::string& s = "<utt>" ) {
      const UnicodeString previous = eosmark;
      eosmark = folia::UTF8ToUnicode( s );
      return previous;
    }
    UnicodeString getEosMarker() const { return eosmark; }

    // One sentence per line on output
    bool setSentencePerLineOutput( bool b=true ) {
      const bool previous = sentenceperlineoutput;
      sentenceperlineoutput = b;
      return previous;
    }
    bool getSentencePerLineOutput() const { return sentenceperlineoutput; }

    // One sentence per line on input
    bool setSentencePerLineInput( bool b=true ) {
      const bool previous = sentenceperlineinput;
      sentenceperlineinput = b;
      return previous;
    }
    bool getSentencePerLineInput() const { return sentenceperlineinput; }

    std::string getDocID() const { return docid; }
    bool getXMLOutput() const { return xmlout; }
    bool getXMLInput() const { return xmlin; }

    // Class used for FoLiA text; the setter returns the previous class.
    const std::string getTextClass() const { return textclass; }
    const std::string setTextClass( const std::string& cls ) {
      const std::string previous = textclass;
      textclass = cls;
      return previous;
    }

    // Switches XML output on or off and stores the document ID;
    // returns the previous XML output setting.
    bool setXMLOutput( bool b, const std::string& id ) {
      const bool previous = xmlout;
      docid = id;
      xmlout = b;
      return previous;
    }
    bool setXMLInput( bool b ) {
      const bool previous = xmlin;
      xmlin = b;
      return previous;
    }

    void outputTokens( std::ostream&, const std::vector<Token>& ) const;
  private:
    void tokenizeWord( const UnicodeString&, bool );

    bool detectEos( size_t ) const;
    void detectSentenceBounds( const int offset = 0 );
    void detectQuotedSentenceBounds( const int offset = 0 );
    void detectQuoteBounds( const int );
    // Signal the tokeniser that a paragraph is detected
    void signalParagraph( bool b=true ) { paragraphsignal = b; }

    bool resolveQuote( int, const UnicodeString& );
    bool u_isquote( UChar );
    std::string checkBOM( const std::string&, std::string& );
    bool readsettings( const std::string& );
    bool readrules( const std::string& );
    bool readfilters( const std::string& );
    bool readquotes( const std::string& );
    bool readeosmarkers( const std::string& );
    bool readabbreviations( const std::string&, UnicodeString& );

    void sortRules( std::vector<Rule *>&, std::vector<UnicodeString>& );
    void outputTokensDoc( folia::Document&, const std::vector<Token>& ) const;
    void outputTokensXML( folia::FoliaElement *, const std::vector<Token>& ) const;
    void tokenizeElement( folia::FoliaElement * );
    void tokenizeSentenceElement( folia::FoliaElement * );
    // Return the sentence with the specified index in a Token vector
    std::vector<Token> getSentence( int );

    Quoting quotes;
    UnicodeFilter filter;
    UnicodeNormalizer normalizer;
    UnicodeString eosmarkers;
    std::string inputEncoding;

    UnicodeString eosmark;
    std::vector<Token> tokens;
    std::vector<Rule *> rules;
    TiCC::LogStream *theErrLog;

    // debug flag
    int tokDebug;

    // verbose tokenisation mode
    bool verbose;

    // detect sentence bounds?
    bool detectBounds;

    // detect quotes?
    bool detectQuotes;

    // filter special characters (default on)?
    bool doFilter;

    // detect paragraphs?
    bool detectPar;

    // has a paragraph been signaled?
    bool paragraphsignal;

    // one sentence per line output / input
    bool sentenceperlineoutput;
    bool sentenceperlineinput;

    bool lowercase;
    bool uppercase;
    bool xmlout;
    bool xmlin;
    bool passthru;

    std::string settingsfilename;
    std::string docid; // document ID (UTF-8), necessary for XML output
    std::string textclass; // class for folia text
  };

  // Converts str to a value of type T by extracting it from a string stream
  // with operator>>. Only the extraction itself is checked: characters left
  // over after a successful extraction are ignored.
  // Throws std::runtime_error ("conversion from '<str>' failed") when the
  // extraction fails.
  template< typename T >
    T stringTo( const std::string& str ) {
    T value;
    std::stringstream reader( str );
    const bool failed = !( reader >> value );
    if ( failed ) {
      throw std::runtime_error( "conversion from '" + str + "' failed" );
    }
    return value;
  }
  
  // Renders val as text by inserting it into a string stream with
  // operator<< and returning the stream's contents.
  // Throws std::runtime_error ("conversion failed") when the insertion
  // leaves the stream in a failed state.
  template< typename T >
    std::string toString( const T val ) {
    std::stringstream writer;
    const bool failed = !( writer << val );
    if ( failed ) {
      throw std::runtime_error( "conversion failed" );
    }
    return writer.str();
  }
  
}
#endif
// libucto-dev 0.5.3-3.1ubuntu1 / usr / include / ucto / tokenize.h