// /usr/include/htdig/WordDBCompress.h

//
// WordDBCompress.h
//
// WordDBCompress: Implements specific compression scheme for
//                 Berkeley DB pages containing WordReferences objects.
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: WordDBCompress.h,v 1.6 2004/05/28 13:15:26 lha Exp $
//

#ifndef _WordDBCompress_h_
#define _WordDBCompress_h_

// ***********************************************
// *************** WordDBCompress*****************
// ***********************************************
// Starting point for compression. 
// 
//
// Compression: HOW IT WORKS
//
// ** General outline: 
//
// BerkeleyDB pages are stored in a memory pool. When the memory pool
// is full, the least recently used pages are swapped to disk.  Page
// compression occurs at the page in/out level.  The
// WordDBCompress_compress_c functions are C callbacks that are called
// by the page compression code in BerkeleyDB. The C callbacks then
// call the WordDBCompress compress/uncompress methods. The
// WordDBCompress creates a WordDBPage which does the actual
// compress/uncompress job.
// 
// The WordDBPage compression/uncompression methods store/retrieve data
// from a bitstream.  BitStream is a simple bitstream, and Compressor is
// a bitstream with added compression capabilities.
// 

// Compression algorithm.
// 
// Most DB pages are full of really redundant data. Mifluz's choice of using
// one db entry per word makes the DB pages even more redundant.
// But this choice also gives the pages a very simple structure.
// 
// Here is a real world example of what a page can look like:
// (key structure: word + 4 numerical fields)
// 
// "trois"     1 4482    1  10b    
// "trois"     1 4482    1  142    
// "trois"     1 4484    1   40    
// "trois"     1 449f    1  11e    
// "trois"     1 4545    1   11    
// "trois"     1 45d3    1  545    
// "trois"     1 45e0    1  7e5    
// "trois"     1 45e2    1  830    
// "trois"     1 45e8    1  545    
// "trois"     1 45fe    1   ec    
// "trois"     1 4616    1  395    
// "trois"     1 461a    1  1eb    
// "trois"     1 4631    1   49    
// "trois"     1 4634    1   48    
// .... etc ....
// 
// To compress we chose to only code differences between successive entries.
// 
// Differences in words are coded by 2 numbers and some letters: 
// - the position within the word of the first letter that changes
// - the size of the new suffix
// - the letters in the new suffix
// 
// Only differences in successive numerical entries are stored.
// 
// A flag is stored for each entry indicating which fields have changed.
// 
// All this gives us a few numerical arrays which are themselves compressed 
// and sent to the bitstream.
//
//
//
// WordDBCompress: entry point of the page compression scheme. Exposes
// Compress/Uncompress on raw page buffers and a DB_CMPR_INFO accessor
// meant to be handed to WordDB::CmprInfo.
//
// NOTE(review): this header contains no #include lines, so u_int8_t and
// DB_CMPR_INFO must already be declared by whoever includes it.
//
class WordDBCompress
{
 public:
    // Default constructor (initial values are set in the implementation
    // file, not visible from this header).
    WordDBCompress();
    // NOTE(review): the two int arguments are unnamed in this declaration;
    // presumably they initialize use_zlib and zlib_level -- confirm against
    // the implementation file.
    WordDBCompress(int, int);

    // Compress the page held in inbuff (inbuff_length bytes). The result is
    // reported through outbuffp / outbuff_lengthp.
    // NOTE(review): who allocates and frees *outbuffp, and the meaning of
    // the int return value, cannot be determined from this header -- confirm.
    int Compress(const  u_int8_t* inbuff, int inbuff_length, u_int8_t** outbuffp, int* outbuff_lengthp);
    // Uncompress inbuff (inbuff_length bytes) into the caller-supplied
    // outbuff of outbuff_length bytes.
    // NOTE(review): return value convention not visible here -- confirm.
    int Uncompress(const  u_int8_t* inbuff, int inbuff_length, u_int8_t* outbuff, int outbuff_length);
    
    //
    // Return a new DB_CMPR_INFO initialized with characteristics of the
    // current object and suitable as WordDB::CmprInfo argument.
    //
    DB_CMPR_INFO *CmprInfo();

 private:
    // NOTE(review): presumably the DB_CMPR_INFO built by CmprInfo();
    // ownership/lifetime is not visible from this header.
    DB_CMPR_INFO *cmprInfo;

    // ZLIB WordDBCompression flags
    int use_zlib;
    int zlib_level;

// DEBUGGING / BENCHMARKING
    int debug;
// Meaning of the debug level:
// 0 : no debug, no check
// 1 : TestCompress before each compression (but no debug within Compress/Uncompress)
// 2 : use_tags (BitStream) within TestCompress -> Compress/Uncompress
// 3 : verbose

    // Self-check run on a page buffer before compression when debug >= 1
    // (see the debug levels above).
    // NOTE(review): return value convention not visible here -- confirm.
    int TestCompress(const  u_int8_t* pagebuff, int pagebuffsize);
};

#endif
// htdig 1:3.2.0b6-12 / usr / include / htdig / WordDBCompress.h