// /usr/include/commoncpp/tokenizer.h

// Copyright (C) 1999-2005 Open Source Telecom Corporation.
// Copyright (C) 2006-2010 David Sugar, Tycho Softworks.
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// As a special exception, you may use this file as part of a free software
// library without restriction.  Specifically, if other files instantiate
// templates or use macros or inline functions from this file, or you compile
// this file and link it with other files to produce an executable, this
// file does not by itself cause the resulting executable to be covered by
// the GNU General Public License.  This exception does not however
// invalidate any other reasons why the executable file might be covered by
// the GNU General Public License.
//
// This exception applies only to the code released under the name GNU
// Common C++.  If you copy code from other releases into a copy of GNU
// Common C++, as the General Public License permits, the exception does
// not apply to the code that you add in this way.  To avoid misleading
// anyone as to the status of such modified files, you must delete
// this exception notice from them.
//
// If you write modifications of your own for GNU Common C++, it is your choice
// whether to permit this exception to apply to your modifications.
// If you do not wish that, delete this exception notice.
//

/**
 * @file tokenizer.h
 * @short string tokenizer.
 **/

#ifndef COMMONCPP_TOKENIZER_H_
#define COMMONCPP_TOKENIZER_H_

#ifndef COMMONCPP_CONFIG_H_
#include <commoncpp/config.h>
#endif

#ifndef COMMONCPP_THREAD_H_
#include <commoncpp/thread.h>
#endif

#ifndef COMMMONCPP_EXCEPTION_H_
#include <commoncpp/exception.h>
#endif

NAMESPACE_COMMONCPP

/**
 * Splits delimited string into tokens.
 *
 * The StringTokenizer takes a pointer to a string and a pointer
 * to a string containing a number of possible delimiters.
 * The StringTokenizer provides an input forward iterator which allows
 * you to iterate through all tokens. An iterator behaves like a logical
 * pointer to the tokens, i.e. to shift to the next token, you
 * increment the iterator; you get the token by dereferencing the
 * iterator.
 *
 * Memory consumption:
 * This class operates on the original string and only allocates memory
 * for the individual tokens actually requested, so this class
 * allocates at maximum the space required for the longest token in the
 * given string.
 * Since for each iteration, memory is reclaimed for the last token,
 * you MAY NOT store pointers to them; if you need them afterwards,
 * copy them. You may not modify the original string while you operate
 * on it with the StringTokenizer; the behaviour is undefined in that
 * case.
 *
 * The iterator has one special method 'nextDelimiter()' which returns
 * the delimiter character that follows the current token, or '\\0' if
 * there are no following delimiters. In case of skipAllDelim, it returns
 * the FIRST delimiter of the skipped run.
 *
 * With the method 'setDelimiters(const char*)' you may change the
 * set of delimiters. It affects all running iterators.
 *
 * Example:
 * <code><pre>
 *  StringTokenizer st("mary had a little lamb;its fleece was..", " ;");
 *  StringTokenizer::iterator i;
 *  for (i = st.begin() ; i != st.end() ; ++i) {
 *        cout << "Token: '" << *i << "'\t";
 *        cout << " next Delim: '" << i.nextDelimiter() << "'" << endl;
 *  }
 *  </pre></code>
 *
 * @author Henner Zeller <H.Zeller@acm.org>
 * @license LGPL
 */
class __EXPORT StringTokenizer {
public:
    /**
     * a delimiter string containing all usual whitespace delimiters.
     * These are space, tab, newline, carriage return,
     * formfeed and vertical tab. (see isspace() manpage).
     */
    static const char * const SPACE;

    /**
     * Exception thrown, if someone tried to read beyond the
     * end of the tokens.
     * Will not happen if you use it the 'clean' way with comparison
     * against end(), but if you skip some tokens, because you 'know'
     * they are there. Simplifies error handling a lot, since you can
     * just read your tokens the way you expect it, and if there is some
     * error in the input this Exception will be thrown.
     */
    // maybe move more global ?
    class NoSuchElementException { };

    /**
     * The input forward iterator for tokens.
     *
     * An iterator never shares its allocated token buffer: copies and
     * assignments transfer only the position within the string, and each
     * iterator releases its own buffer on destruction.
     *
     * @author Henner Zeller
     */
    class __EXPORT iterator {
        friend class StringTokenizer;  // access our private constructors
    private:
        const StringTokenizer *myTok; // my StringTokenizer
        const char *start;      // start of current token
        const char *tokEnd;     // end of current token (->nxDelimiter)
        const char *endp;       // one before next token
        char *token;            // allocated token, if requested

        // for initialization of the itEnd iterator.
        // 'start' is explicitly nulled: the end iterator is handed out by
        // reference from end() and may be copied or assigned, which reads
        // every pointer member; it must not hold an indeterminate value.
        iterator(const StringTokenizer &tok, const char *end)
            : myTok(&tok),start(0),tokEnd(0),endp(end),token(0) {}

        // begin iterator: positions itself on the first token.
        // myTok is declared before endp, so it is already initialized
        // when endp is computed from it. 'start' is nulled here as well
        // so that no member is indeterminate when operator++ runs.
        iterator(const StringTokenizer &tok)
            : myTok(&tok),start(0),tokEnd(0),endp(myTok->str-1),token(0) {
            ++(*this); // init first token.
        }

    public:
        /**
         * creates a detached iterator that belongs to no tokenizer;
         * assign a real iterator (e.g. from begin()) before use.
         */
        iterator() : myTok(0),start(0),tokEnd(0),endp(0),token(0) {}

        // see also: comment in implementation of operator++
        // The buffer is emptied (first byte set to NUL) before it is freed.
        virtual ~iterator()
            { if (token) *token='\0'; delete [] token; }

        /**
         * copy constructor.
         */
        // everything, but not responsible for the allocated token.
        iterator(const iterator& i) :
            myTok(i.myTok),start(i.start),tokEnd(i.tokEnd),
            endp(i.endp),token(0) {}

        /**
         * assignment operator.
         */
        // everything, but not responsible for the allocated token.
        iterator &operator = (const iterator &i)
        {
            myTok = i.myTok;
            start = i.start; endp = i.endp; tokEnd = i.tokEnd;
            // Drop our own buffer; delete[] on a null pointer is a no-op,
            // so no guard is needed. Self-assignment is harmless because
            // the source's token pointer is never read.
            delete [] token;
            token = 0;
            return *this;
        }

        /**
         * shifts this iterator to the next token in the string.
         */
        iterator &operator ++ () THROWS (NoSuchElementException);

        /**
         * returns the immutable string this iterator
         * points to or '0' if no token is available (i.e.
         * i == end()).
         * Do not store pointers to this token, since it is
         * invalidated for each iteration. If you need the token,
         * copy it (e.g. with strdup());
         */
        const char*  operator *  () THROWS (NoSuchElementException);

        /**
         * returns the next delimiter after the current token or
         * '\\0', if there are no following delimiters.
         * It returns the very next delimiter (even if
         * skipAllDelim=true).
         */
        inline char nextDelimiter() const
            {return (tokEnd) ? *tokEnd : '\0';}

        /**
         * compares to other iterator. Usually used to
         * compare against the end() iterator.
         */
        // only compare the end-position. speed.
        inline bool operator == (const iterator &other) const
            {return (endp == other.endp);}

        /**
         * compares to other iterator. Usually used to
         * compare against the end() iterator.
         */
        // only compare the end position. speed.
        inline bool operator != (const iterator &other) const
            {return (endp != other.endp);}
    };
private:
    friend class StringTokenizer::iterator;
    const char *str;        // string being tokenized (pointer only, not a copy)
    const char *delim;      // current set of delimiter characters
    bool skipAll, trim;     // see the constructor's skipAllDelim / trim
    iterator itEnd;         // the iterator returned by end()

public:
    /**
     * creates a new StringTokenizer for a string
     * and a given set of delimiters.
     *
     * @param  str          String to be split up. This string will
     *                      not be modified by this StringTokenizer,
     *                      but you may as well not modify this string
     *                      while tokenizing is in process, which may
     *                      lead to undefined behaviour.
     *
     * @param  delim        String containing the characters
     *                      which should be regarded as delimiters.
     *
     * @param  skipAllDelim OPTIONAL.
     *                      true, if subsequent
     *                      delimiters should be skipped at once
     *                      or false, if empty tokens should
     *                      be returned for two delimiters with
     *                      no other text in between. The first
     *                      behaviour may be desirable for whitespace
     *                      skipping, the second for input with
     *                      delimited entry e.g. /etc/passwd like files
     *                      or CSV input.
     *                      NOTE, that 'true' here resembles the
     *                      ANSI-C strtok(char *s,char *d) behaviour.
     *                      DEFAULT = false
     *
     * @param trim          OPTIONAL.
     *                      true, if the tokens returned
     *                      should be trimmed, so that they don't have
     *                      any whitespaces at the beginning or end.
     *                      Whitespaces are any of the characters
     *                      defined in StringTokenizer::SPACE.
     *                      If delim itself is StringTokenizer::SPACE,
     *                      this will result in a behaviour with
     *                      skipAllDelim = true.
     *                      DEFAULT = false
     */
    StringTokenizer (const char *str,
             const char *delim,
             bool skipAllDelim = false,
             bool trim = false);

    /**
     * create a new StringTokenizer which splits the input
     * string at whitespaces. The tokens are stripped from
     * whitespaces. This means, if you change the set of
     * delimiters in either the 'begin(const char *delim)' method
     * or in 'setDelimiters()', you then get whitespace
     * trimmed tokens, delimited by the new set.
     * Behaves like StringTokenizer(s, StringTokenizer::SPACE,false,true);
     */
    StringTokenizer (const char *s);

    /**
     * returns the begin iterator
     *
     * @return a new iterator positioned on the first token.
     */
    iterator begin() const
        {return iterator(*this);}

    /**
     * changes the set of delimiters used in subsequent
     * iterations.
     *
     * @param d  string of delimiter characters; only the pointer
     *           is stored, the string itself is not copied.
     */
    void setDelimiters (const char *d)
        {delim = d;}

    /**
     * returns a begin iterator with an alternate set of
     * delimiters.
     *
     * The new delimiter set is stored in the tokenizer itself,
     * exactly as setDelimiters() does, before the iterator is created.
     *
     * @param d  string of delimiter characters; only the pointer
     *           is stored, the string itself is not copied.
     * @return a new iterator positioned on the first token.
     */
    iterator begin(const char *d)
    {
        delim = d;
        return iterator(*this);
    }

    /**
     * the iterator marking the end.
     *
     * @return reference to the tokenizer's own end iterator.
     */
    const iterator& end() const
        {return itEnd;}
};

END_NAMESPACE

#endif

/** EMACS **
 * Local variables:
 * mode: c++
 * c-basic-offset: 4
 * End:
 */
// libucommon-dev 6.0.7-1.1 / usr / include / commoncpp / tokenizer.h